lattice-functions.h 21.4 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426


// lat/lattice-functions.h

// Copyright 2009-2012   Saarland University (author: Arnab Ghoshal)
//           2012-2013   Johns Hopkins University (Author: Daniel Povey);
//                       Bagher BabaAli
//                2014   Guoguo Chen

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_LAT_LATTICE_FUNCTIONS_H_
#define KALDI_LAT_LATTICE_FUNCTIONS_H_

#include <vector>
#include <map>

#include "base/kaldi-common.h"
#include "hmm/posterior.h"
#include "fstext/fstext-lib.h"
#include "hmm/transition-model.h"
#include "lat/kaldi-lattice.h"
#include "itf/decodable-itf.h"

namespace kaldi {

/**
   This function extracts the per-frame log likelihoods from a linear
   lattice (which we refer to as an 'nbest' lattice elsewhere in Kaldi code).
   The dimension of *per_frame_loglikes will be set to the
   number of input symbols in 'nbest'.  The elements of
   '*per_frame_loglikes' will be set to the .Value2() elements of the lattice
   weights, which represent the acoustic costs; you may want to scale this
   vector afterward by -1/acoustic_scale to get the original loglikes.
   If there are acoustic costs on input-epsilon arcs or the final-prob in 'nbest'
   (and this should not normally be the case in situations where it makes
   sense to call this function), they will be included to the cost of the
   preceding input symbol, or the following input symbol for input-epsilons
   encountered prior to any input symbol.  If 'nbest' has no input symbols,
   'per_frame_loglikes' will be set to the empty vector.
**/
void GetPerFrameAcousticCosts(const Lattice &nbest,
                              Vector<BaseFloat> *per_frame_loglikes);

/// This function iterates over the states of a topologically sorted lattice and
/// counts the time instance corresponding to each state. The times are returned
/// in a vector of integers 'times' which is resized to have a size equal to the
/// number of states in the lattice. The function also returns the maximum time
/// in the lattice (this will equal the number of frames in the file).
int32 LatticeStateTimes(const Lattice &lat, std::vector<int32> *times);

/// As LatticeStateTimes, but in the CompactLattice format.  Note: must
/// be topologically sorted.  Returns length of the utterance in frames, which
/// might not be the same as the maximum time in the lattice, due to frames
/// in the final-prob.
int32 CompactLatticeStateTimes(const CompactLattice &clat,
                               std::vector<int32> *times);

/// This function does the forward-backward over lattices and computes the
/// posterior probabilities of the arcs. It returns the total log-probability
/// of the lattice.  The Posterior quantities contain pairs of (transition-id, weight)
/// on each frame.
/// If the pointer "acoustic_like_sum" is provided, this value is set to
/// the sum over the arcs, of the posterior of the arc times the
/// acoustic likelihood [i.e. negated acoustic score] on that link.
/// This is used in combination with other quantities to work out
/// the objective function in MMI discriminative training.
BaseFloat LatticeForwardBackward(const Lattice &lat,
                                 Posterior *arc_post,
                                 double *acoustic_like_sum = NULL);

// This function is something similar to LatticeForwardBackward(), but it is on
// the CompactLattice lattice format. Also we only need the alpha in the forward
// path, not the posteriors.
bool ComputeCompactLatticeAlphas(const CompactLattice &lat,
                                 std::vector<double> *alpha);

// A sibling of the function CompactLatticeAlphas()... We compute the beta from
// the backward path here.
bool ComputeCompactLatticeBetas(const CompactLattice &lat,
                                std::vector<double> *beta);


// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or
// best-path negated cost) Note: in either case, the alphas and betas are
// negated costs.  Requires that lat be topologically sorted.  This code
// will work for either CompactLattice or Latice.
template<typename LatticeType>
double ComputeLatticeAlphasAndBetas(const LatticeType &lat,
                                    bool viterbi,
                                    std::vector<double> *alpha,
                                    std::vector<double> *beta);


/// Topologically sort the compact lattice if not already topologically sorted.
/// Will crash if the lattice cannot be topologically sorted.
void TopSortCompactLatticeIfNeeded(CompactLattice *clat);


/// Topologically sort the lattice if not already topologically sorted.
/// Will crash if lattice cannot be topologically sorted.
void TopSortLatticeIfNeeded(Lattice *clat);

/// Returns the depth of the lattice, defined as the average number of arcs (or
/// final-prob strings) crossing any given frame.  Returns 1 for empty lattices.
/// Requires that clat is topologically sorted!
BaseFloat CompactLatticeDepth(const CompactLattice &clat,
                              int32 *num_frames = NULL);

/// This function returns, for each frame, the number of arcs crossing that
/// frame.
void CompactLatticeDepthPerFrame(const CompactLattice &clat,
                                 std::vector<int32> *depth_per_frame);


/// This function limits the depth of the lattice, per frame: that means, it
/// does not allow more than a specified number of arcs active on any given
/// frame.  This can be used to reduce the size of the "very deep" portions of
/// the lattice.
void CompactLatticeLimitDepth(int32 max_arcs_per_frame,
                              CompactLattice *clat);


/// Given a lattice, and a transition model to map pdf-ids to phones,
/// outputs for each frame the set of phones active on that frame.  If
/// sil_phones (which must be sorted and uniq) is nonempty, it excludes
/// phones in this list.
void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans,
                         const std::vector<int32> &sil_phones,
                         std::vector<std::set<int32> > *active_phones);

/// Given a lattice, and a transition model to map pdf-ids to phones,
/// replace the output symbols (presumably words), with phones; we
/// use the TransitionModel to work out the phone sequence.  Note
/// that the phone labels are not exactly aligned with the phone
/// boundaries.  We put a phone label to coincide with any transition
/// to the final, nonemitting state of a phone (this state always exists,
/// we ensure this in HmmTopology::Check()).  This would be the last
/// transition-id in the phone if reordering is not done (but typically
/// we do reorder).
/// Also see PhoneAlignLattice, in phone-align-lattice.h.
void ConvertLatticeToPhones(const TransitionModel &trans_model,
                            Lattice *lat);

/// Prunes a lattice or compact lattice.  Returns true on success, false if
/// there was some kind of failure.
template<class LatticeType>
bool PruneLattice(BaseFloat beam, LatticeType *lat);


/// Given a lattice, and a transition model to map pdf-ids to phones,
/// replace the sequences of transition-ids with sequences of phones.
/// Note that this is different from ConvertLatticeToPhones, in that
/// we replace the transition-ids not the words.
void ConvertCompactLatticeToPhones(const TransitionModel &trans_model,
                                   CompactLattice *clat);

/// Boosts LM probabilities by b * [number of frame errors]; equivalently, adds
/// -b*[number of frame errors] to the graph-component of the cost of each arc/path.
/// There is a frame error if a particular transition-id on a particular frame
/// corresponds to a phone not matching transcription's alignment for that frame.
/// This is used in "margin-inspired" discriminative training, esp. Boosted MMI.
/// The TransitionModel is used to map transition-ids in the lattice
/// input-side to phones; the phones appearing in
/// "silence_phones" are treated specially in that we replace the frame error f
/// (either zero or 1) for a frame, with the minimum of f or max_silence_error.
/// For the normal recipe, max_silence_error would be zero.
/// Returns true on success, false if there was some kind of mismatch.
/// At input, silence_phones must be sorted and unique.
bool LatticeBoost(const TransitionModel &trans,
                  const std::vector<int32> &alignment,
                  const std::vector<int32> &silence_phones,
                  BaseFloat b,
                  BaseFloat max_silence_error,
                  Lattice *lat);


/**
   This function implements either the MPFE (minimum phone frame error) or SMBR
   (state-level minimum bayes risk) forward-backward, depending on whether
   "criterion" is "mpfe" or "smbr".  It returns the MPFE
   criterion of SMBR criterion for this utterance, and outputs the posteriors (which
   may be positive or negative) into "post".

   @param [in] trans    The transition model. Used to map the
                        transition-ids to phones or pdfs.
   @param [in] silence_phones   A list of integer ids of silence phones. The
                        silence frames i.e. the frames where num_ali
                        corresponds to a silence phones are treated specially.
                        The behavior is determined by 'one_silence_class'
                        being false (traditional behavior) or true.
                        Usually in our setup, several phones including
                        the silence, vocalized noise, non-spoken noise
                        and unk are treated as "silence phones"
   @param [in] lat      The denominator lattice
   @param [in] num_ali  The numerator alignment
   @param [in] criterion    The objective function. Must be "mpfe" or "smbr"
                        for MPFE (minimum phone frame error) or sMBR
                        (state minimum bayes risk) training.
   @param [in] one_silence_class   Determines how the silence frames are treated.
                        Setting this to false gives the old traditional behavior,
                        where the silence frames (according to num_ali) are
                        treated as incorrect. However, this means that the
                        insertions are not penalized by the objective.
                        Setting this to true gives the new behaviour, where we
                        treat silence as any other phone, except that all pdfs
                        of silence phones are collapsed into a single class for
                        the frame-error computation. This can possible reduce
                        the insertions in the trained model. This is closer to
                        the WER metric that we actually care about, since WER is
                        generally computed after filtering out noises, but
                        does penalize insertions.
    @param [out] post   The "MBR posteriors" i.e. derivatives w.r.t to the
                        pseudo log-likelihoods of states at each frame.
*/
BaseFloat LatticeForwardBackwardMpeVariants(
    const TransitionModel &trans,
    const std::vector<int32> &silence_phones,
    const Lattice &lat,
    const std::vector<int32> &num_ali,
    std::string criterion,
    bool one_silence_class,
    Posterior *post);

/**
   This function can be used to compute posteriors for MMI, with a positive contribution
   for the numerator and a negative one for the denominator.  This function is not actually
   used in our normal MMI training recipes, where it's instead done using various command
   line programs that each do a part of the job.  This function was written for use in
   neural-net MMI training.

   @param [in] trans    The transition model. Used to map the
                        transition-ids to phones or pdfs.
   @param [in] lat      The denominator lattice
   @param [in] num_ali  The numerator alignment
   @param [in] drop_frames   If "drop_frames" is true, it will not compute any
                        posteriors on frames where the num and den have disjoint
                        pdf-ids.
   @param [in] convert_to_pdf_ids   If "convert_to_pdfs_ids" is true, it will
                        convert the output to be at the level of pdf-ids, not
                        transition-ids.
   @param [in] cancel   If "cancel" is true, it will cancel out any positive and
                        negative parts from the same transition-id (or pdf-id,
                        if convert_to_pdf_ids == true).
   @param [out] arc_post   The output MMI posteriors of transition-ids (or
                        pdf-ids if convert_to_pdf_ids == true) at each frame
                        i.e. the difference between the numerator
                        and denominator posteriors.

   It returns the forward-backward likelihood of the lattice. */
BaseFloat LatticeForwardBackwardMmi(
    const TransitionModel &trans,
    const Lattice &lat,
    const std::vector<int32> &num_ali,
    bool drop_frames,
    bool convert_to_pdf_ids,
    bool cancel,
    Posterior *arc_post);


/// This function takes a CompactLattice that should only contain a single
/// linear sequence (e.g. derived from lattice-1best), and that should have been
/// processed so that the arcs in the CompactLattice align correctly with the
/// word boundaries (e.g. by lattice-align-words).  It outputs 3 vectors of the
/// same size, which give, for each word in the lattice (in sequence), the word
/// label and the begin time and length in frames.  This is done even for zero
/// (epsilon) words, generally corresponding to optional silence-- if you don't
/// want them, just ignore them in the output.
/// This function will print a warning and return false, if the lattice
/// did not have the correct format (e.g. if it is empty or it is not
/// linear).
bool CompactLatticeToWordAlignment(const CompactLattice &clat,
                                   std::vector<int32> *words,
                                   std::vector<int32> *begin_times,
                                   std::vector<int32> *lengths);

/// This function takes a CompactLattice that should only contain a single
/// linear sequence (e.g. derived from lattice-1best), and that should have been
/// processed so that the arcs in the CompactLattice align correctly with the
/// word boundaries (e.g. by lattice-align-words).  It outputs 4 vectors of the
/// same size, which give, for each word in the lattice (in sequence), the word
/// label, the begin time and length in frames, and the pronunciation (sequence
/// of phones).  This is done even for zero words, corresponding to optional
/// silences -- if you don't want them, just ignore them in the output.
/// This function will print a warning and return false, if the lattice
/// did not have the correct format (e.g. if it is empty or it is not
/// linear).
bool CompactLatticeToWordProns(
    const TransitionModel &tmodel,
    const CompactLattice &clat,
    std::vector<int32> *words,
    std::vector<int32> *begin_times,
    std::vector<int32> *lengths,
    std::vector<std::vector<int32> > *prons,
    std::vector<std::vector<int32> > *phone_lengths);


/// A form of the shortest-path/best-path algorithm that's specially coded for
/// CompactLattice.  Requires that clat be acyclic.
void CompactLatticeShortestPath(const CompactLattice &clat,
                                CompactLattice *shortest_path);

/// This function add the word insertion penalty to graph score of each word
/// in the compact lattice
void AddWordInsPenToCompactLattice(BaseFloat word_ins_penalty,
                                   CompactLattice *clat);

/// This function *adds* the negated scores obtained from the Decodable object,
/// to the acoustic scores on the arcs.  If you want to replace them, you should
/// use ScaleCompactLattice to first set the acoustic scores to zero.  Returns
/// true on success, false on error (typically some kind of mismatched inputs).
bool RescoreCompactLattice(DecodableInterface *decodable,
                           CompactLattice *clat);


/// This function returns the number of words in the longest sentence in a
/// CompactLattice (i.e. the the maximum of any path, of the count of
/// olabels on that path).
int32 LongestSentenceLength(const Lattice &lat);

/// This function returns the number of words in the longest sentence in a
/// CompactLattice, i.e. the the maximum of any path, of the count of
/// labels on that path... note, in CompactLattice, the ilabels and olabels
/// are identical because it is an acceptor.
int32 LongestSentenceLength(const CompactLattice &lat);


/// This function is like RescoreCompactLattice, but it is modified to avoid
/// computing probabilities on most frames where all the pdf-ids are the same.
/// (it needs the transition-model to work out whether two transition-ids map to
/// the same pdf-id, and it assumes that the lattice has transition-ids on it).
/// The naive thing would be to just set all probabilities to zero on frames
/// where all the pdf-ids are the same (because this value won't affect the
/// lattice posterior).  But this would become confusing when we compute
/// corpus-level diagnostics such as the MMI objective function.  Instead,
/// imagine speedup_factor = 100 (it must be >= 1.0)... with probability (1.0 /
/// speedup_factor) we compute those likelihoods and multiply them by
/// speedup_factor; otherwise we set them to zero.  This gives the right
/// expected probability so our corpus-level diagnostics will be about right.
bool RescoreCompactLatticeSpeedup(
    const TransitionModel &tmodel,
    BaseFloat speedup_factor,
    DecodableInterface *decodable,
    CompactLattice *clat);


/// This function *adds* the negated scores obtained from the Decodable object,
/// to the acoustic scores on the arcs.  If you want to replace them, you should
/// use ScaleCompactLattice to first set the acoustic scores to zero.  Returns
/// true on success, false on error (e.g. some kind of mismatched inputs).
/// The input labels, if nonzero, are interpreted as transition-ids or whatever
/// other index the Decodable object expects.
bool RescoreLattice(DecodableInterface *decodable,
                    Lattice *lat);

/// This function Composes a CompactLattice format lattice with a
/// DeterministicOnDemandFst<fst::StdFst> format fst, and outputs another
/// CompactLattice format lattice. The first element (the one that corresponds
/// to LM weight) in CompactLatticeWeight is used for composition.
///
/// Note that the DeterministicOnDemandFst interface is not "const", therefore
/// we cannot use "const" for <det_fst>.
void ComposeCompactLatticeDeterministic(
    const CompactLattice& clat,
    fst::DeterministicOnDemandFst<fst::StdArc>* det_fst,
    CompactLattice* composed_clat);

/// This function computes the mapping from the pair 
/// (frame-index, transition-id) to the pair 
/// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the 
/// transition-id in that frame.
/// frame-index in the lattice. 
/// This function is useful for retaining the acoustic scores in a 
/// non-compact lattice after a process like determinization where the 
/// frame-level acoustic scores are typically lost.
/// The function ReplaceAcousticScoresFromMap is used to restore the 
/// acoustic scores computed by this function.
///
///   @param [in] lat   Input lattice. Expected to be top-sorted. Otherwise the 
///                     function will crash. 
///   @param [out] acoustic_scores  
///                     Pointer to a map from the pair (frame-index,
///                     transition-id) to a pair (sum-of-acoustic-scores,
///                     num-of-occurences).
///                     Usually the acoustic scores for a pdf-id (and hence
///                     transition-id) on a frame will be the same for all the
///                     occurences of the pdf-id in that frame. 
///                     But if not, we will take the average of the acoustic
///                     scores. Hence, we store both the sum-of-acoustic-scores
///                     and the num-of-occurences of the transition-id in that
///                     frame.
void ComputeAcousticScoresMap(
    const Lattice &lat,
    unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
                                        PairHasher<int32> > *acoustic_scores);

/// This function restores acoustic scores computed using the function
/// ComputeAcousticScoresMap into the lattice.
///
///   @param [in] acoustic_scores  
///                      A map from the pair (frame-index, transition-id) to a
///                      pair (sum-of-acoustic-scores, num-of-occurences) of 
///                      the occurences of the transition-id in that frame.
///                      See the comments for ComputeAcousticScoresMap for 
///                      details.
///   @param [out] lat   Pointer to the output lattice.
void ReplaceAcousticScoresFromMap(
    const unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
                                        PairHasher<int32> > &acoustic_scores,
    Lattice *lat);

}  // namespace kaldi

#endif  // KALDI_LAT_LATTICE_FUNCTIONS_H_