hmm-utils.h 16 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337


// hmm/hmm-utils.h

// Copyright 2009-2011  Microsoft Corporation

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_HMM_HMM_UTILS_H_
#define KALDI_HMM_HMM_UTILS_H_

#include <cstddef>

#include "hmm/hmm-topology.h"
#include "hmm/transition-model.h"
#include "lat/kaldi-lattice.h"

namespace kaldi {


/// \defgroup hmm_group_graph Classes and functions for creating FSTs from HMMs
/// \ingroup hmm_group
/// @{

/// Options controlling the GetHTransducer() function; see
/// \ref hmm_graph_config for the surrounding documentation.
struct HTransducerConfig {
  /// Scale applied to transition log-probs (see \ref hmm_scale).  Self-loops
  /// are not affected, because GetHTransducer() does not include self-loops.
  BaseFloat transition_scale;
  /// Integer id of #nonterm_bos in phones.txt, if present; only needs to be
  /// set for grammar decoding (see doc/grammar.dox).  Defaults to -1.
  int32 nonterm_phones_offset;

  HTransducerConfig()
      : transition_scale(1.0), nonterm_phones_offset(-1) {}

  /// Registers both options, with their help strings, on the supplied
  /// options object.
  void Register(OptionsItf *opts) {
    opts->Register(
        "transition-scale", &transition_scale,
        "Scale of transition probs (relative to LM)");
    opts->Register(
        "nonterm-phones-offset", &nonterm_phones_offset,
        "The integer id of #nonterm_bos in phones.txt, if present. "
        "Only needs to be set if you are doing grammar decoding, "
        "see doc/grammar.dox.");
  }
};


/// Hash functor for the keys of HmmCacheType, i.e. pairs of
/// (int32, vector of int32).  Combines the first element, multiplied by a
/// prime, with the VectorHasher hash of the vector.
struct HmmCacheHash {
  /// Returns the hash of p.  The arithmetic is carried out in size_t, so a
  /// large p.first wraps around (well-defined for unsigned types) instead of
  /// overflowing a signed 32-bit multiply, and the combined value is returned
  /// as size_t (what hash containers expect) rather than being narrowed to int.
  size_t operator () (const std::pair<int32, std::vector<int32> >&p) const {
    VectorHasher<int32> v;
    const size_t prime = 103049;
    return prime * static_cast<size_t>(p.first) + v(p.second);
  }
};

/// HmmCacheType is a map from (central-phone, sequence of pdf-ids) to FST, used
/// as cache in GetHmmAsFsa, as an optimization.  Keys are hashed with
/// HmmCacheHash; the mapped values are raw pointers to
/// fst::VectorFst<fst::StdArc>.
typedef unordered_map<std::pair<int32, std::vector<int32> >,
                      fst::VectorFst<fst::StdArc>*,
                      HmmCacheHash> HmmCacheType;


/// Called by GetHTransducer() and probably will not need to be called directly;
/// it creates and returns the FST corresponding to the phone.  It's actually an
/// acceptor (ilabels equal to olabels), which is why this is called "Fsa" not
/// "Fst".  This acceptor does not include self-loops; you have to call
/// AddSelfLoops() for that.  (We do that at a later graph compilation phase,
/// for efficiency).  The labels on the FSA correspond to transition-ids.
///
/// For documentation in context, see \ref hmm_graph_get_hmm_as_fst
///   @param context_window  A vector representing the phonetic context; see
///            \ref tree_window "here" for explanation.
///   @param ctx_dep The object that contains the phonetic decision-tree
///   @param trans_model The transition-model object, which provides
///         the mappings to transition-ids and also the transition
///         probabilities.
///   @param config Configuration object, see \ref HTransducerConfig.
///   @param cache Optional (defaults to NULL).  Object used as a lookaside
///       buffer to save computation; if it finds that the object it needs is
///       already there, it will just return a pointer value from "cache"--
///       note that this means you have to be careful not to delete things
///       twice.
fst::VectorFst<fst::StdArc> *GetHmmAsFsa(
    std::vector<int32> context_window,
    const ContextDependencyInterface &ctx_dep,
    const TransitionModel &trans_model,
    const HTransducerConfig &config,
    HmmCacheType *cache = NULL);


/// Included mainly as a form of documentation, not used in any other code
/// currently.  Creates the acceptor FST with self-loops, and with fewer
/// options.
/// The first three arguments have the same names and types as those of
/// GetHmmAsFsa().  NOTE(review): prob_scale is presumably a scale applied to
/// the transition probabilities, and the returned pointer is presumably owned
/// by the caller -- confirm both against the implementation.
fst::VectorFst<fst::StdArc>*
GetHmmAsFsaSimple(std::vector<int32> context_window,
                  const ContextDependencyInterface &ctx_dep,
                  const TransitionModel &trans_model,
                  BaseFloat prob_scale);


/**
  * Returns the H transducer; result owned by caller.  Caution: our version of
  * the H transducer does not include self-loops; you have to add those later.
  * See \ref hmm_graph_get_h_transducer.  The H transducer has on the
  * input transition-ids, and also possibly some disambiguation symbols, which
  * will be put in disambig_syms_left.  The output side contains the
  * identifiers that are indexes into "ilabel_info" (these represent
  * phones-in-context or disambiguation symbols).  The ilabel_info vector
  * allows GetHTransducer to map from symbols to phones-in-context (i.e.
  * phonetic context windows).  Any singleton symbols in the ilabel_info vector
  * which are not phones, will be treated as disambiguation symbols.  [Not all
  * recipes use these].  The output "disambig_syms_left" will be set to a list
  * of the disambiguation symbols on the input of the transducer (i.e. same
  * symbol type as whatever is on the input of the transducer).
  */
fst::VectorFst<fst::StdArc>*
GetHTransducer(const std::vector<std::vector<int32> > &ilabel_info,
               const ContextDependencyInterface &ctx_dep,
               const TransitionModel &trans_model,
               const HTransducerConfig &config,
               std::vector<int32> *disambig_syms_left);

/**
  * GetIlabelMapping produces a mapping that's similar to HTK's
  * logical-to-physical model mapping (i.e. the xwrd.clustered.mlist files).
  * It groups together "logical HMMs" (i.e. in our world, phonetic context
  * windows) that share the same sequence of transition-ids.   This can be used
  * in an optional graph-creation step that produces a remapped form of CLG
  * that can be more productively determinized and minimized.  This is used in
  * the command-line program make-ilabel-transducer.cc.
  * @param ilabel_info_old [in] The original \ref tree_ilabel "ilabel_info" vector
  * @param ctx_dep [in] The tree
  * @param trans_model [in] The transition-model object
  * @param old2new_map [out] The output; this vector, which is of size equal to the
  *       number of new labels, is a mapping to the old labels such that we could
  *       create a vector ilabel_info_new such that
  *       ilabel_info_new[i] == ilabel_info_old[old2new_map[i]]
  *       NOTE(review): the name "old2new_map" suggests the opposite direction
  *       (indexed by old label) to what is described here -- confirm against
  *       the implementation which one is correct.
  */
void GetIlabelMapping(const std::vector<std::vector<int32> > &ilabel_info_old,
                      const ContextDependencyInterface &ctx_dep,
                      const TransitionModel &trans_model,
                      std::vector<int32> *old2new_map);


/**
  * For context, see \ref hmm_graph_add_self_loops.  Expands an FST that has been
  * built without self-loops, and adds the self-loops (it also needs to modify
  * the probability of the non-self-loop ones, as the graph without self-loops
  * was created in such a way that it was stochastic).  Note that the
  * disambig_syms will be empty in some recipes (e.g.  if you already removed
  * the disambiguation symbols).
  * This function will treat numbers over 10000000 (kNontermBigNumber) the
  * same as disambiguation symbols, assuming they are special symbols for
  * grammar decoding.
  *
  * @param trans_model [in] Transition model
  * @param disambig_syms [in] Sorted, unique list of disambiguation symbols,
  *       required if the graph contains disambiguation symbols but only needed
  *       for sanity checks.
  * @param self_loop_scale [in] Transition-probability scale for self-loops; cf.
  *                    \ref hmm_scale
  * @param reorder [in] If true, reorders the transitions (see \ref hmm_reorder).
  *                     You'll normally want this to be true.
  * @param check_no_self_loops [in]  If true, it will check that there are no
  *                      self-loops in the original graph; you'll normally want
  *                      this to be true.  If false, it will allow them, and
  *                      will add self-loops after the original self-loop
  *                      transitions, assuming reorder==true... this happens to
  *                      be what we want when converting normal to unconstrained
  *                      chain examples.  WARNING: this was added in 2018;
  *                      if you get a compilation error, add this as 'true',
  *                      which emulates the behavior of older code.
  * @param fst [in, out] The FST to be modified.
  */
void AddSelfLoops(const TransitionModel &trans_model,
                  const std::vector<int32> &disambig_syms,  // used as a check only.
                  BaseFloat self_loop_scale,
                  bool reorder,
                  bool check_no_self_loops,
                  fst::VectorFst<fst::StdArc> *fst);

/**
  * Adds transition-probs, with the supplied scales (see \ref hmm_scale), to
  * the graph.
  * Useful if you want to create a graph without transition probs, then possibly
  * train the model (including the transition probs) but keep the graph fixed,
  * and add back in the transition probs.  It assumes the fst has transition-ids
  * on it.  It is not an error if the FST has no states (nothing will be done).
  * @param trans_model [in] The transition model
  * @param disambig_syms [in] A list of disambiguation symbols, required if the
  *                       graph has disambiguation symbols on its input but only
  *                       used for checks.
  * @param transition_scale [in] A scale on transition-probabilities apart from
  *                      those involving self-loops; see \ref hmm_scale.
  * @param self_loop_scale [in] A scale on self-loop transition probabilities;
  *                      see \ref hmm_scale.
  * @param fst [in, out] The FST to be modified.
  */
void AddTransitionProbs(const TransitionModel &trans_model,
                        const std::vector<int32> &disambig_syms,
                        BaseFloat transition_scale,
                        BaseFloat self_loop_scale,
                        fst::VectorFst<fst::StdArc> *fst);

/**
   This is as the FST version of AddTransitionProbs() above, but operates on
   a Lattice, where it affects the graph part of the weight (the first element
   of the pair).  Unlike the FST version, it takes no list of disambiguation
   symbols. */
void AddTransitionProbs(const TransitionModel &trans_model,
                        BaseFloat transition_scale,
                        BaseFloat self_loop_scale,
                        Lattice *lat);


/// Returns a transducer from pdfs plus one (input) to transition-ids (output).
/// Currently of use only for testing.
fst::VectorFst<fst::StdArc>*
GetPdfToTransitionIdTransducer(const TransitionModel &trans_model);

/// Intended to convert all transition-ids in the FST to pdfs plus one.
/// WARNING: this is a placeholder; it is not implemented yet, so do not rely
/// on it.
void ConvertTransitionIdsToPdfs(const TransitionModel &trans_model,
                                const std::vector<int32> &disambig_syms,
                                fst::VectorFst<fst::StdArc> *fst);

/// @} end "defgroup hmm_group_graph"

/// \addtogroup hmm_group
/// @{

/// SplitToPhones splits up the TransitionIds in "alignment" into their
/// individual phones (one vector per instance of a phone).  At output,
/// the sum of the sizes of the vectors in split_alignment will be the same
/// as the corresponding sum for "alignment".
/// This function works out by itself whether the graph was created
/// with "reordering", and just does the right thing.
///   @param trans_model [in]  The transition model.
///   @param alignment [in]  The sequence of transition-ids to split.
///   @param split_alignment [out]  One vector of transition-ids per instance
///            of a phone.
///   @return  True on success.  If the alignment appears to be incomplete,
///            e.g. not ending at the end-state of a phone, it will still break
///            it up into phones but it will return false.  For more serious
///            errors it will die or throw an exception.
bool SplitToPhones(const TransitionModel &trans_model,
                   const std::vector<int32> &alignment,
                   std::vector<std::vector<int32> > *split_alignment);

/**
   ConvertAlignment converts an alignment that was created using one model, to
   another model.

   @param old_trans_model [in]  The transition model that the original alignment
                                used.
   @param new_trans_model [in]  The transition model that we want to use for the
                                new alignment.
   @param new_ctx_dep     [in]  The new tree
   @param old_alignment   [in]  The alignment we want to convert
   @param subsample_factor [in] The frame subsampling factor... normally 1, but
                                might be > 1 if we're converting to a reduced-frame-rate
                                system.
   @param repeat_frames [in]    Only relevant when subsample_factor != 1.
                                If true, repeat frames of alignment by
                                'subsample_factor' after alignment
                                conversion, to keep the alignment the same
                                length as the input alignment.
                                [note: we actually do this by interpolating
                                'subsample_factor' separately generated
                                alignments, to keep the phone boundaries
                                the same as the input where possible.]
   @param reorder [in]          True if you want the pdf-ids on the new alignment to
                                be 'reordered'. (vs. the way they appear in
                                the HmmTopology object)
   @param phone_map [in]        If non-NULL, map from old to new phones.
   @param new_alignment [out]   The converted alignment.
   @return  False if the alignment could not be split to phones (e.g. because
            it was partial), or if some other error happened, such as there
            being too few frames for the new topology.
*/

bool ConvertAlignment(const TransitionModel &old_trans_model,
                      const TransitionModel &new_trans_model,
                      const ContextDependencyInterface &new_ctx_dep,
                      const std::vector<int32> &old_alignment,
                      int32 subsample_factor,  // 1 in the normal case -> no subsampling.
                      bool repeat_frames,
                      bool reorder,
                      const std::vector<int32> *phone_map,  // may be NULL
                      std::vector<int32> *new_alignment);

// ConvertPhnxToProns is only needed in bin/phones-to-prons.cc and
// isn't closely related to HMMs, but we put it here as there isn't
// any other obvious place for it and it needs to be tested.
// This function takes a phone-sequence with word-start and word-end
// markers in it ("phnx"; the markers are word_start_sym and word_end_sym),
// and a word-sequence ("words"), and outputs the pronunciations "prons".
// The format of "prons" is: each element is a vector, where the first
// element is the word (or zero meaning no word, e.g. for optional silence
// introduced by the lexicon), and the remaining elements are the phones in
// the word's pronunciation.
// It returns false if it encounters a problem of some kind, e.g.
// if the phone-sequence doesn't seem to have the right number of
// words in it.
bool ConvertPhnxToProns(const std::vector<int32> &phnx,
                        const std::vector<int32> &words,
                        int32 word_start_sym,
                        int32 word_end_sym,
                        std::vector<std::vector<int32> > *prons);


/* Generates a random alignment for this phone, of length equal to
   alignment->size(), which is required to be at least the MinLength() of the
   topology for this phone, or this function will crash.
   Note that "alignment" is therefore both an input (its size on entry fixes
   the length of the alignment) and the output.
   The alignment will be without 'reordering'.
*/
void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep,
                                const TransitionModel &trans_model,
                                const std::vector<int32> &phone_window,
                                std::vector<int32> *alignment);

/*
  If the alignment was non-reordered, makes it reordered, and vice versa.
  The result is written back through the "alignment" pointer.
*/
void ChangeReorderingOfAlignment(const TransitionModel &trans_model,
                                 std::vector<int32> *alignment);

/// @} end "addtogroup hmm_group"

} // end namespace kaldi


#endif