make_subword_lexicon_fst.py
14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env python3
# 2019 Dongji Gao
# Apache 2.0.
from make_lexicon_fst import read_lexiconp
import argparse
import math
import sys
# see get_args() below for usage message
def get_args():
    """Define the command-line interface of this script and parse sys.argv.

    Returns the argparse namespace holding the attributes sil_phone, sil_prob,
    sil_disambig, position_dependent, separator and lexiconp.
    """
    # NOTE: continuation lines of the texts below are flush-left on purpose so
    # that no source indentation ends up inside the string contents.
    description = """This script creates the
text form of a subword lexicon FST to be compiled by fstcompile using
the appropriate symbol tables (phones.txt and words.txt). It will mostly
be invoked indirectly via utils/prepare_lang_subword.sh. The output
goes to the stdout. This script is the subword version of make_lexicon_fst.py.
It only allows optional silence to appear after end-subword or singleton-subword,
(i.e., subwords without separator). In this version we do not support
pronunciation probability. (i.e., pron-prob = 1.0)"""
    arg_parser = argparse.ArgumentParser(description=description)

    sil_phone_help = """Text form of
optional-silence phone, e.g. 'SIL'. See also the --sil-prob option."""
    arg_parser.add_argument('--sil-phone', type=str, help=sil_phone_help)

    sil_prob_help = """Probability
of silence between words (including the beginning and end of word sequence).
Must be in range [0.0, 1.0). This refer to the optional silence inserted by
the lexicon; see the --sil-phone option."""
    arg_parser.add_argument('--sil-prob', type=float, default=0.0,
                            help=sil_prob_help)

    sil_disambig_help = """Disambiguation symbol
to disambiguate silence, e.g. #5. Will only be supplied if you are creating
the version of L.fst with disambiguation symbols, intended for use with cyclic
G.fst. This symbol was introduced to fix a rather obscure source of nondeterminism
of CLG.fst, that has to do with reordering of disambiguation symbols and phone symbols."""
    arg_parser.add_argument('--sil-disambig', type=str, help=sil_disambig_help)

    position_dependent_help = """Whether
the input lexicon is position-dependent."""
    arg_parser.add_argument('--position-dependent', action="store_true",
                            help=position_dependent_help)

    separator_help = """Separator
indicates the position of a subword in a word.
Subword followed by separator can only appear at the beginning or middle of a word.
Subword without separator can only appear at the end of a word or is a word itself.
E.g. "international -> inter@@ nation@@ al";
"nation -> nation"
The separator should match the separator used in the input lexicon."""
    arg_parser.add_argument("--separator", type=str, default="@@",
                            help=separator_help)

    lexiconp_help = """Filename of lexicon with
pronunciation probabilities (normally lexiconp.txt), with lines of the
form 'subword prob p1 p2...', e.g. 'a, 1.0 ay'"""
    arg_parser.add_argument('lexiconp', type=str, help=lexiconp_help)

    return arg_parser.parse_args()
def contain_disambig_symbol(phones):
    """Return True if the phone sequence ends with a disambiguation symbol.

    Disambiguation symbols have the form #1, #2, ... and, when present, are the
    last element of 'phones' (there is at most one per phone sequence).

    'phones' is a sequence of phone strings. An empty sequence contains no
    disambiguation symbol, so False is returned for it instead of raising
    IndexError.
    """
    # bool(phones) guards the [-1] lookup against an empty pronunciation.
    return bool(phones) and phones[-1].startswith("#")
def print_arc(src, dest, phone, word, cost):
    """Write one arc to the standard output as a single tab-separated line.

    The fields are written in the order: source state, destination state,
    phone (input label), word (output label), cost.
    """
    print(src, dest, phone, word, cost, sep='\t')
def is_end(word, separator):
    """Tell whether a subword may terminate a word.

    A subword that ends with 'separator' can only appear at the beginning or
    in the middle of a word, so False is returned for it; any other subword
    yields True.
    """
    if word.endswith(separator):
        return False
    return True
def get_suffix(phone):
    """Return the last two characters of a phone, i.e. its suffix ('_B', '_I'...).

    A phone shorter than three characters cannot carry such a suffix; in that
    case an error is reported on the standard error and the script exits with
    status 1.
    """
    if len(phone) >= 3:
        return phone[-2:]

    message = "{}: invalid phone {} (please check if the phone is position-dependent)".format(
        sys.argv[0], phone)
    print(message, file=sys.stderr)
    sys.exit(1)
def write_fst_no_silence(lexicon, position_dependent, separator):
    """Writes the text format of L.fst to the standard output. This version is for
    when --sil-prob=0.0, meaning there is no optional silence allowed.

    loop_state here is the start and final state of the fst. It goes to word_start_state
    via epsilon transition.

    In position-independent case, there is no difference between beginning word and
    middle word. So all subwords with separator would leave from and enter word_start_state.
    All subwords without separator would leave from word_start_state and enter loop_state.
    This guarantees that optional silence can only follow a word-end subword.

    In position-dependent case, there are 4 types of position-dependent subword:
    1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
        n@@ 1.0 n_B
    2) Middle subword. All phone suffixes should be "_I"s:
        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
    3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
        nation 1.0 n_I ey_I sh_I ih_I n_E
        n 1.0 n_E
    4) Singleton subword (i.e., the subword is a word itself).
       The first phone suffix should be "_B" and the last suffix should be "_E".
       All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
        nation 1.0 n_B ey_I sh_I ih_I n_E
        n 1.0 n_S

    So we need an extra word_internal_state. The beginning subword
    would leave from word_start_state and enter word_internal_state and the middle subword
    would leave from and enter word_internal_state. The rest is the same.

    'lexicon' is a list of 3-tuples (subword, pron-prob, prons) as returned by
        read_lexiconp(). The pron-prob is ignored (treated as 1.0).
    'position_dependent', which is true if the lexicon is position-dependent.
    'separator' is a symbol which indicates the position of a subword in word.
        It is only consulted in the position-independent case.

    In the position-dependent case the script exits with status 1 (after
    printing a message to the standard error) if an entry has no phones or if
    its phone suffixes do not determine the start/end states.
    """
    # regular setting
    loop_state = 0
    word_start_state = 1
    next_state = 2

    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

    # optional setting for word_internal_state
    if position_dependent:
        word_internal_state = next_state
        next_state += 1

    for (word, pron_prob, phones) in lexicon:
        pron_cost = 0.0  # do not support pron_prob
        phones_len = len(phones)

        # Reset for every entry, so that an entry with unexpected suffixes can
        # never silently reuse the states chosen for the previous entry.
        current_state = None
        end_state = None

        # set start and end state for different cases
        if position_dependent:
            if phones_len == 0:
                print("{}: empty pronunciation for subword {} in "
                      "position-dependent lexicon".format(sys.argv[0], word),
                      file=sys.stderr)
                sys.exit(1)

            first_phone_suffix = get_suffix(phones[0])
            last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
            last_phone_suffix = get_suffix(last_phone)

            # singleton word
            if first_phone_suffix == "_S":
                current_state = word_start_state
                end_state = loop_state
            # set the current_state
            elif first_phone_suffix == "_B":
                current_state = word_start_state
            elif first_phone_suffix in ("_I", "_E"):
                current_state = word_internal_state

            # then set the end_state
            if last_phone_suffix in ("_B", "_I"):
                end_state = word_internal_state
            elif last_phone_suffix == "_E":
                end_state = loop_state

            if current_state is None or end_state is None:
                print("{}: cannot determine FST states for subword {} with phones {} "
                      "(please check the position-dependent phone suffixes)".format(
                          sys.argv[0], word, " ".join(phones)), file=sys.stderr)
                sys.exit(1)
        else:
            current_state = word_start_state
            end_state = loop_state if is_end(word, separator) else word_start_state

        # print arcs (except the last one) for the subword; only the first arc
        # carries the subword as output label, the others output epsilon
        for i in range(phones_len - 1):
            olabel = word if i == 0 else "<eps>"
            cost = pron_cost if i == 0 else 0.0
            print_arc(current_state, next_state, phones[i], olabel, cost)
            current_state = next_state
            next_state += 1

        # print the last arc; it is also the first arc when there is at most
        # one phone, in which case it carries the subword and the pron cost
        # (an empty pronunciation gives a single <eps> input arc)
        last_is_first = phones_len <= 1
        phone = phones[-1] if phones_len > 0 else "<eps>"
        olabel = word if last_is_first else "<eps>"
        cost = pron_cost if last_is_first else 0.0
        print_arc(current_state, end_state, phone, olabel, cost)

    # set the final state
    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))
def write_fst_with_silence(lexicon, sil_phone, sil_prob, sil_disambig, position_dependent, separator):
    """Writes the text format of L.fst to the standard output. This version is for
    when --sil-prob != 0.0, meaning optional silence is allowed.

    start_state goes via epsilon transitions either to loop_state (no silence)
    or to sil_state (silence). loop_state is the final state of the fst. It goes
    to word_start_state via epsilon transition. sil_state goes to loop_state via
    the silence phone (followed by the silence disambiguation symbol, if given).

    In position-independent case, there is no difference between beginning word and
    middle word. So all subwords with separator would leave from and enter word_start_state.
    All subwords without separator would leave from word_start_state and enter either
    loop_state (not followed by silence) or sil_state (followed by silence).
    This guarantees that optional silence can only follow a word-end subword.

    In position-dependent case, there are 4 types of position-dependent subword:
    1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
        nation@@ 1.0 n_B ey_I sh_I ih_I n_I
        n@@ 1.0 n_B
    2) Middle subword. All phone suffixes should be "_I"s:
        nation@@ 1.0 n_I ey_I sh_I ih_I n_I
    3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
        nation 1.0 n_I ey_I sh_I ih_I n_E
        n 1.0 n_E
    4) Singleton subword (i.e., the subword is a word itself).
       The first phone suffix should be "_B" and the last suffix should be "_E".
       All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
        nation 1.0 n_B ey_I sh_I ih_I n_E
        n 1.0 n_S

    So we need an extra word_internal_state. The beginning subword
    would leave from word_start_state and enter word_internal_state and the middle subword
    would leave from and enter word_internal_state. The rest is the same.

    'lexicon' is a list of 3-tuples (subword, pron-prob, prons)
        as returned by read_lexiconp(). The pron-prob is ignored (treated as 1.0).
    'sil_phone' is the silence phone, e.g. "SIL".
    'sil_prob', which is expected to be strictly between 0.0 and 1.0, is the
        probability of silence.
    'sil_disambig' is either None, or the silence disambiguation symbol, e.g. "#5".
    'position_dependent', which is True if the lexicon is position-dependent.
    'separator' is the symbol we use to indicate the position of a subword in word.
        It is only consulted in the position-independent case.

    In the position-dependent case the script exits with status 1 (after
    printing a message to the standard error) if an entry has no phones or if
    its phone suffixes do not determine the start/end states.
    """
    sil_cost = -math.log(sil_prob)
    no_sil_cost = -math.log(1 - sil_prob)

    # regular setting
    start_state = 0
    loop_state = 1         # also the final state
    sil_state = 2          # words terminate here when followed by silence; this state
                           # has a silence transition to loop_state
    word_start_state = 3   # subword leave from here
    next_state = 4         # the next un-allocated state, will be incremented as we go

    print_arc(start_state, loop_state, "<eps>", "<eps>", no_sil_cost)
    print_arc(start_state, sil_state, "<eps>", "<eps>", sil_cost)
    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

    # optional setting for disambig_state
    if sil_disambig is None:
        print_arc(sil_state, loop_state, sil_phone, "<eps>", 0.0)
    else:
        disambig_state = next_state
        next_state += 1
        print_arc(sil_state, disambig_state, sil_phone, "<eps>", 0.0)
        print_arc(disambig_state, loop_state, sil_disambig, "<eps>", 0.0)

    # optional setting for word_internal_state
    if position_dependent:
        word_internal_state = next_state
        next_state += 1

    for (word, pron_prob, phones) in lexicon:
        pron_cost = 0.0  # do not support pron_prob
        phones_len = len(phones)

        # Reset for every entry, so that an entry with unexpected suffixes can
        # never silently reuse the states chosen for the previous entry.
        current_state = None
        end_state_list = None
        end_cost_list = None

        # set start and end state for different cases
        if position_dependent:
            if phones_len == 0:
                print("{}: empty pronunciation for subword {} in "
                      "position-dependent lexicon".format(sys.argv[0], word),
                      file=sys.stderr)
                sys.exit(1)

            first_phone_suffix = get_suffix(phones[0])
            last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
            last_phone_suffix = get_suffix(last_phone)

            # singleton subword
            if first_phone_suffix == "_S":
                current_state = word_start_state
                end_state_list = [loop_state, sil_state]
                end_cost_list = [no_sil_cost, sil_cost]
            # first set the current_state
            elif first_phone_suffix == "_B":
                current_state = word_start_state
            elif first_phone_suffix in ("_I", "_E"):
                current_state = word_internal_state

            # then set the end_state (end_state_list)
            if last_phone_suffix in ("_B", "_I"):
                end_state_list = [word_internal_state]
                end_cost_list = [0.0]
            elif last_phone_suffix == "_E":
                end_state_list = [loop_state, sil_state]
                end_cost_list = [no_sil_cost, sil_cost]

            if current_state is None or end_state_list is None:
                print("{}: cannot determine FST states for subword {} with phones {} "
                      "(please check the position-dependent phone suffixes)".format(
                          sys.argv[0], word, " ".join(phones)), file=sys.stderr)
                sys.exit(1)
        else:
            current_state = word_start_state
            if is_end(word, separator):
                end_state_list = [loop_state, sil_state]
                end_cost_list = [no_sil_cost, sil_cost]
            else:
                end_state_list = [word_start_state]
                end_cost_list = [0.0]

        # print arcs (except the last one) for the subword; only the first arc
        # carries the subword as output label, the others output epsilon
        for i in range(phones_len - 1):
            olabel = word if i == 0 else "<eps>"
            cost = pron_cost if i == 0 else 0.0
            print_arc(current_state, next_state, phones[i], olabel, cost)
            current_state = next_state
            next_state += 1

        # print the last arc once per possible end state; it is also the first
        # arc when there is at most one phone, in which case it carries the
        # subword and the pron cost (an empty pronunciation gives <eps> input)
        last_is_first = phones_len <= 1
        phone = phones[-1] if phones_len > 0 else "<eps>"
        olabel = word if last_is_first else "<eps>"
        cost = pron_cost if last_is_first else 0.0
        for (end_state, end_cost) in zip(end_state_list, end_cost_list):
            print_arc(current_state, end_state, phone, olabel, cost + end_cost)

    # set the final state
    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))
def main():
    """Parse the command line, read the lexicon and write L.fst in text form.

    Exits with status 1 (after printing a message to the standard error) if
    --sil-prob is outside [0.0, 1.0), or if --sil-prob is non-zero but no
    --sil-phone was given. With --sil-prob=0.0 the FST without optional
    silence is written, otherwise the FST with optional silence.
    """
    args = get_args()

    # Written as a negated range test so that NaN is rejected as well.
    if not (0.0 <= args.sil_prob < 1.0):
        print("{}: invalid value specified --sil-prob={}".format(
            sys.argv[0], args.sil_prob), file=sys.stderr)
        sys.exit(1)

    # Without this check a missing --sil-phone would be written to the FST as
    # the literal phone label "None".
    if args.sil_prob != 0.0 and not args.sil_phone:
        print("{}: you specified --sil-prob={} but --sil-phone is not set".format(
            sys.argv[0], args.sil_prob), file=sys.stderr)
        sys.exit(1)

    lexicon = read_lexiconp(args.lexiconp)

    if args.sil_prob == 0.0:
        write_fst_no_silence(lexicon, args.position_dependent, args.separator)
    else:
        write_fst_with_silence(lexicon, args.sil_phone, args.sil_prob,
                               args.sil_disambig, args.position_dependent, args.separator)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()