grammar-context-fst.h
12.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
// fstext/grammar-context-fst.h
// Copyright 2018 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
#ifndef KALDI_FSTEXT_GRAMMAR_CONTEXT_FST_H_
#define KALDI_FSTEXT_GRAMMAR_CONTEXT_FST_H_
/* This header defines a special form of the context FST "C" (the "C" in "HCLG")
that integrates with our framework for building dynamic graphs for grammars
that are too big to statically create, or graphs with on-the-fly pieces that
you want to create at recognition time without building the whole graph.
This framework is limited to only work with models with left-biphone context.
(Fortunately this doesn't impact results, as our best models are all 'chain'
models with left biphone context).
The main code exported from here is the class InverseLeftBiphoneContextFst,
which is similar to the InverseContextFst defined in context-fst.h, but
is limited to left-biphone context and also supports certain special
extensions we need to compile grammars.
See \ref grammar (../doc/grammar.dox) for high-level
documentation on how this framework works.
*/
#include <algorithm>
#include <string>
#include <vector>
#include <fst/fstlib.h>
#include <fst/fst-decl.h>
#include "util/const-integer-set.h"
#include "fstext/deterministic-fst.h"
#include "fstext/context-fst.h"
namespace fst {
/**
   Symbolic values used by the grammar-fst framework; read this together with
   the documentation in \ref grammar (../doc/grammar.dox).

   The first group of enumerators gives the *relative* positions of the
   nonterminal-related symbols in phones.txt.  They are not the actual symbol
   ids: the real id is obtained by adding nonterm_phones_offset, which is
   supplied via the command-line flag --nonterm-phones-offset.

   The second group (kNontermMediumNumber, kNontermBigNumber) is used when
   encoding nonterminal-related symbols in HCLG.fst.
*/
enum NonterminalValues {
  // --- Offsets of the special symbols relative to nonterm_phones_offset. ---
  kNontermBos         = 0,  // #nonterm_bos
  kNontermBegin       = 1,  // #nonterm_begin
  kNontermEnd         = 2,  // #nonterm_end
  kNontermReenter     = 3,  // #nonterm_reenter
  kNontermUserDefined = 4,  // first (lowest-numbered) user-defined
                            // nonterminal, e.g. #nonterm:foo

  // --- Constants used in the HCLG.fst symbol encoding. ---
  // Hard constraints: kNontermBigNumber has to exceed the largest
  // transition-id in your system, and kNontermMediumNumber has to be > 0.
  // Round decimal values were picked so that encoded numbers are easy for a
  // human to decode by eye.
  kNontermMediumNumber = 1000,
  kNontermBigNumber    = 10000000
};
// Computes the "encoding multiple" used when encoding special symbols in HCLG:
//
//   special_symbol =
//       kNontermBigNumber + (nonterminal * encoding_multiple) + phone_index
//
// The result is a multiple of kNontermMediumNumber (1000); for a non-negative
// nonterm_phones_offset it is the smallest such multiple that is strictly
// greater than nonterm_phones_offset.
inline int32 GetEncodingMultiple(int32 nonterm_phones_offset) {
  const int32 step = static_cast<int32>(kNontermMediumNumber);
  // Adding one full step before the truncating division rounds up to the
  // next multiple, and moves an exact multiple on to the following one
  // (hence "strictly greater").
  const int32 num_steps = (nonterm_phones_offset + step) / step;
  return step * num_steps;
}
/**
   This is a variant of the function ComposeContext() which is to be used
   with our "grammar FST" framework (see \ref graph_context, i.e.
   ../doc/grammar.dox, for more details).  This does not take
   the 'context_width' and 'central_position' arguments because they are
   assumed to be 2 and 1 respectively (meaning, left-biphone phonetic context).

   This function creates a context FST and composes it on the left with "ifst"
   to make "ofst".

   @param [in] nonterm_phones_offset  The integer id of the symbol
                   #nonterm_bos in the phones.txt file.  You can just set this
                   to a large value (like 1 million) if you are not actually
                   using nonterminals (e.g. for testing purposes).
   @param [in] disambig_syms  List of disambiguation symbols, e.g. the integer
                   ids of #0, #1, #2 ... in the phones.txt.
   @param [in] ifst  The FST we are composing with C (e.g. LG.fst).  It is
                   taken by const reference, so it is an input only.
   @param [out] ofst  Composed output FST (would be CLG.fst).
   @param [out] ilabels  Vector, indexed by ilabel of CLG.fst, providing
                   information about the meaning of that ilabel; see
                   \ref tree_ilabel
                   (http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel)
                   and also \ref grammar_special_clg
                   (http://kaldi-asr.org/doc/grammar#grammar_special_clg).
*/
void ComposeContextLeftBiphone(
int32 nonterm_phones_offset,
const vector<int32> &disambig_syms,
const VectorFst<StdArc> &ifst,
VectorFst<StdArc> *ofst,
vector<vector<int32> > *ilabels);
/*
   InverseLeftBiphoneContextFst represents the inverse of the context FST "C"
   (the "C" in "HCLG").  C transduces from symbols representing phone context
   windows (e.g. "a, b, c") to individual phones, e.g. "a"; so
   InverseLeftBiphoneContextFst, being its inverse, transduces from phones to
   symbols representing phone context windows.  The point is that the inverse
   is deterministic, so the DeterministicOnDemandFst interface is applicable,
   which turns out to be a convenient way to implement this.

   This doesn't implement the full Fst interface, it implements the
   DeterministicOnDemandFst interface which is much simpler and which is
   sufficient for what we need to do with this.

   Search for "hbka.pdf" ("Speech Recognition with Weighted Finite State
   Transducers") by M. Mohri, for more context.
*/
class InverseLeftBiphoneContextFst: public DeterministicOnDemandFst<StdArc> {
 public:
  typedef StdArc Arc;
  typedef typename StdArc::StateId StateId;
  typedef typename StdArc::Weight Weight;
  typedef typename StdArc::Label Label;

  /**
     Constructor.  This does not take the arguments 'context_width' or
     'central_position' because they are assumed to be (2, 1) meaning a
     system with left-biphone context; and there is no subsequential
     symbol because it is not needed in systems without right context.

     @param [in] nonterm_phones_offset  The integer id of the symbol
                  #nonterm_bos in the phones.txt file.  You can just set this
                  to a large value (like 1 million) if you are not actually
                  using nonterminals (e.g. for testing purposes).
     @param [in] phones  List of integer ids of phones, as you would see in
                  phones.txt
     @param [in] disambig_syms  List of integer ids of disambiguation symbols,
                  e.g. the ids of #0, #1, #2 in phones.txt

     See \ref graph_context for more details.
  */
  InverseLeftBiphoneContextFst(Label nonterm_phones_offset,
                               const vector<int32>& phones,
                               const vector<int32>& disambig_syms);

  /**
     Here is a note on the state space of InverseLeftBiphoneContextFst;
     see \ref grammar_special_c which has some documentation on this.

     The state space uses the same numbering as phones.txt.

     State 0 means the beginning-of-sequence state, where there is no left
     context.

     For each phone p in the list 'phones' passed to the constructor, the
     state 'p' corresponds to a left-context of that phone.

     If p is equal to nonterm_phones_offset_ + kNontermBegin (i.e. the
     integer form of `\#nonterm_begin`), then this is the state we transition
     to when we see that symbol starting from left-context==0 (no context).
     The transition to this special state will have epsilon on the output
     (talking here about inv(C), not C, so input/output are reversed).
     When, in this state, we see a regular phone p1 or #nonterm_bos, instead
     of outputting that phone in context, we output the pair
     (#nonterm_begin,p1) or (#nonterm_begin,#nonterm_bos).
     This state is not final.

     If p is equal to nonterm_phones_offset_ + kNontermUserDefined, then this
     is the state we transition to when we see any user-defined nonterminal.
     Transitions to this special state have olabels of the form
     (#nonterm:foo,p1) where p1 is the preceding context (with #nonterm_begin
     if that context was 0); transitions out of it have olabels of the form
     (#nonterm_reenter,p2), where p2 is the phone on the ilabel of that
     transition.  Again: talking about inv(C).
     This state is not final.

     If p is equal to nonterm_phones_offset_ + kNontermEnd, then this is
     the state we transition to when we see the ilabel #nonterm_end.  The
     olabels on the transitions to it (talking here about inv(C), so ilabels
     and olabels are reversed) are of the form (#nonterm_end, p1) where p1
     corresponds to the context we were in.
     This state is final.
  */

  /// The start state is always state 0 (the "no left context" state).
  virtual StateId Start() { return 0; }

  /// Returns the final-weight of state s (see the state-space note above for
  /// which states are described as final).
  virtual Weight Final(StateId s);

  /// Looks up the arc leaving state s with input label 'ilabel' and writes it
  /// to *arc; the bool return reports the outcome of the lookup.
  /// Note: ilabel must not be epsilon.
  virtual bool GetArc(StateId s, Label ilabel, Arc *arc);

  ~InverseLeftBiphoneContextFst() { }

  // Returns a reference to a vector<vector<int32> > with information about all
  // the input symbols of C (i.e. all the output symbols of this
  // InverseLeftBiphoneContextFst).  See
  // "http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel".
  const vector<vector<int32> > &IlabelInfo() const {
    return ilabel_info_;
  }

  // A way to destructively obtain the ilabel-info.  Only do this if you
  // are just about to destroy this object: after the call, ilabel_info_ holds
  // whatever *vec held before, so it no longer matches ilabel_map_.
  void SwapIlabelInfo(vector<vector<int32> > *vec) { ilabel_info_.swap(*vec); }

 private:
  /// Converts one of the relative NonterminalValues into the actual phone
  /// symbol id by adding nonterm_phones_offset_.  Declared const because it
  /// only reads nonterm_phones_offset_.
  inline int32 GetPhoneSymbolFor(enum NonterminalValues n) const {
    return nonterm_phones_offset_ + static_cast<int32>(n);
  }

  /// Finds the label index corresponding to this ilabel-info vector (e.g. a
  /// context-window of phones).  Inserts it into the
  /// ilabel_info_/ilabel_map_ tables if necessary.
  Label FindLabel(const vector<int32> &label_info);

  // Map type to map from vectors of int32 (representing ilabel-info,
  // see http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel) to
  // Label (the output label in this FST).
  typedef unordered_map<vector<int32>, Label,
                        kaldi::VectorHasher<int32> > VectorToLabelMap;

  // The following three variables correspond to what the caller passed to the
  // constructor (nonterm_phones_offset, phones, disambig_syms):

  // The integer id of #nonterm_bos in phones.txt.
  int32 nonterm_phones_offset_;

  // 'phone_syms_' are a set of phone-ids, typically 1, 2, .. num_phones.
  kaldi::ConstIntegerSet<Label> phone_syms_;

  // disambig_syms_ is the set of integer ids of the disambiguation symbols,
  // usually represented in text form as #0, #1, #2, etc.  These are inserted
  // into the grammar (for #0) and the lexicon (for #1, #2, ...) in order to
  // make the composed FSTs determinizable.  They are treated "specially" by the
  // context FST in that they are not part of the context, they are just "passed
  // through" via self-loops.  See the Mohri chapter mentioned above for more
  // information.
  kaldi::ConstIntegerSet<Label> disambig_syms_;

  // Maps from vector<int32> (an ilabel-info entry, as described for
  // VectorToLabelMap above) to Label.  These are actually the output labels of
  // this FST (because of the "Inverse" part), but for historical reasons and
  // because we've used the term "ilabels" in the documentation, we still call
  // these "ilabels".
  VectorToLabelMap ilabel_map_;

  // ilabel_info_ is the reverse map of ilabel_map_.
  // Indexed by olabel (although we call this ilabel_info_ for historical
  // reasons and because it is for the ilabels of C), ilabel_info_[i] gives
  // information about the meaning of each symbol on the input of C
  // aka the output of inv(C).
  // See "http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel".
  vector<vector<int32> > ilabel_info_;
};
} // namespace fst
#endif // KALDI_FSTEXT_GRAMMAR_CONTEXT_FST_H_