Blame view

egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py 14 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
  #!/usr/bin/env python3
  
  # 2019 Dongji Gao
  # Apache 2.0.
  
  from make_lexicon_fst import read_lexiconp
  import argparse
  import math
  import sys
  
  # see get_args() below for usage message
  def get_args():
      """Build the command-line parser and return the parsed arguments.

      The parser defines the options --sil-phone, --sil-prob (default 0.0),
      --sil-disambig, --position-dependent (a flag) and --separator
      (default "@@"), plus the positional 'lexiconp' filename.
      Returns the argparse namespace produced by parse_args().
      """
      parser = argparse.ArgumentParser(description="""This script creates the
          text form of a subword lexicon FST to be compiled by fstcompile using
          the appropriate symbol tables (phones.txt and words.txt). It will mostly
          be invoked indirectly via utils/prepare_lang_subword.sh. The output
          goes to the stdout. This script is the subword version of make_lexicon_fst.py.
          It only allows optional silence to appear after end-subword or singleton-subword,
          (i.e., subwords without separator). In this version we do not support
          pronunciation probability. (i.e., pron-prob = 1.0)""")
  
      parser.add_argument('--sil-phone', type=str, help="""Text form of
          optional-silence phone, e.g. 'SIL'. See also the --sil-prob option.""")
      parser.add_argument('--sil-prob', type=float, default=0.0, help="""Probability
          of silence between words (including the beginning and end of word sequence).
          Must be in range [0.0, 1.0). This refer to the optional silence inserted by
          the lexicon; see the --sil-phone option.""")
      parser.add_argument('--sil-disambig', type=str, help="""Disambiguation symbol
          to disambiguate silence, e.g. #5. Will only be supplied if you are creating 
          the version of L.fst with disambiguation symbols, intended for use with cyclic 
          G.fst. This symbol was introduced to fix a rather obscure source of nondeterminism 
          of CLG.fst, that has to do with reordering of disambiguation symbols and phone symbols.""")
      parser.add_argument('--position-dependent', action="store_true", help="""Whether 
          the input lexicon is position-dependent.""")
      parser.add_argument("--separator", type=str, default="@@", help="""Separator
          indicates the position of a subword in a word.
          Subword followed by separator can only appear at the beginning or middle of a word.
          Subword without separator can only appear at the end of a word or is a word itself.
          E.g. "international -> inter@@ nation@@ al";
               "nation        -> nation"
      The separator should match the separator used in the input lexicon.""")
      parser.add_argument('lexiconp', type=str, help="""Filename of lexicon with
          pronunciation probabilities (normally lexiconp.txt), with lines of the
          form 'subword prob p1 p2...', e.g. 'a, 1.0 ay'""")
      # parse_args() reads the arguments from sys.argv.
      args = parser.parse_args()
      return args
  
  def contain_disambig_symbol(phones):
      """Return True if the phone sequence ends with a disambiguation symbol.

      A disambiguation symbol has the form #1, #2, ... and, when present, is the
      last element of 'phones' (there is at most one per phone sequence).
      An empty sequence contains no disambiguation symbol, so False is returned
      for it rather than raising IndexError.
      """
      return bool(phones) and phones[-1].startswith("#")
  
  def print_arc(src, dest, phone, word, cost):
      """Write one FST arc to stdout as a single line of five tab-separated
      fields: source state, destination state, phone label, word label, cost."""
      print(src, dest, phone, word, cost, sep='\t')
  
  def is_end(word, separator):
      """Tell whether a subword may be the last subword of a word.

      A subword ending with 'separator' must be followed by another subword, so
      False is returned for it; every other subword yields True."""
      if word.endswith(separator):
          return False
      return True
  
  def get_suffix(phone):
      """Return the last two characters of a phone, i.e. its position suffix
      ('_B', '_I', ...).

      A phone shorter than three characters cannot hold a name plus a suffix; in
      that case an error is printed to stderr and the script exits with status 1."""
      if len(phone) >= 3:
          return phone[-2:]
      message = "{}: invalid phone {} (please check if the phone is position-dependent)".format(
          sys.argv[0], phone)
      print(message, file=sys.stderr)
      sys.exit(1)
  
  def write_fst_no_silence(lexicon, position_dependent, separator):
      """Writes the text format of L.fst to the standard output.  This version is for
      when --sil-prob=0.0, meaning there is no optional silence allowed.
      loop_state here is the start and final state of the fst. It goes to word_start_state
      via epsilon transition.

      In position-independent case, there is no difference between beginning word and
      middle word. So all subwords with separator would leave from and enter word_start_state.
      All subwords without separator would leave from word_start_state and enter loop_state.

      In position-dependent case, there are 4 types of position-dependent subword:
      1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
          nation@@ 1.0 n_B ey_I sh_I ih_I n_I
          n@@      1.0 n_B
      2) Middle subword. All phone suffixes should be "_I"s:
          nation@@ 1.0 n_I ey_I sh_I ih_I n_I
      3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
          nation   1.0 n_I ey_I sh_I ih_I n_E
          n        1.0 n_E
      4) Singleton subword (i.e., the subword is the word itself).
         The first phone suffix should be "_B" and the last suffix should be "_E".
         All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
          nation   1.0 n_B ey_I sh_I ih_I n_E
          n        1.0 n_S

      So we need an extra word_internal_state. The beginning word
      would leave from word_start_state and enter word_internal_state and middle word
      would leave from and enter word_internal_state. The rest part is same.

      If, in the position-dependent case, the first/last phone suffixes of an entry
      do not determine both its start and end state, an error is printed to stderr
      and the script exits with status 1.

        'lexicon' is a list of 3-tuples (subword, pron-prob, prons) as returned by
           read_lexiconp().
        'position_dependent', which is True if the lexicon is position-dependent.
        'separator' is a symbol which indicates the position of a subword in word.
      """
      # regular setting
      loop_state = 0
      word_start_state = 1
      next_state = 2

      print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

      # optional setting for word_internal_state
      if position_dependent:
          word_internal_state = next_state
          next_state += 1

      for (word, pron_prob, phones) in lexicon:
          pron_cost = 0.0                # do not support pron_prob
          phones_len = len(phones)

          # set start and end state for different cases
          if position_dependent:
              # Reset for every entry: otherwise an unrecognized suffix would
              # silently reuse the states left over from the previous entry.
              current_state = None
              end_state = None

              first_phone_suffix = get_suffix(phones[0])
              # a trailing disambiguation symbol is not a phone; skip it
              last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
              last_phone_suffix = get_suffix(last_phone)

              # singleton word
              if first_phone_suffix == "_S":
                  current_state = word_start_state
                  end_state = loop_state
              # set the current_state
              elif first_phone_suffix == "_B":
                  current_state = word_start_state
              elif first_phone_suffix in ("_I", "_E"):
                  current_state = word_internal_state
              # then set the end_state
              if last_phone_suffix in ("_B", "_I"):
                  end_state = word_internal_state
              elif last_phone_suffix == "_E":
                  end_state = loop_state

              if current_state is None or end_state is None:
                  print("{}: invalid position-dependent pronunciation for subword {}: {}".format(
                        sys.argv[0], word, " ".join(phones)), file=sys.stderr)
                  sys.exit(1)
          else:
              current_state = word_start_state
              end_state = loop_state if is_end(word, separator) else word_start_state

          # print arcs (except the last one) for the subword; only the first
          # arc carries the subword as output label
          for i in range(phones_len - 1):
              olabel = word if i == 0 else "<eps>"
              cost = pron_cost if i == 0 else 0.0
              print_arc(current_state, next_state, phones[i], olabel, cost)
              current_state = next_state
              next_state += 1

          # print the last arc; with an empty pronunciation (i == -1) this is a
          # single arc with <eps> input that still outputs the subword
          i = phones_len - 1
          phone = phones[i] if i >= 0 else "<eps>"
          olabel = word if i <= 0 else "<eps>"
          cost = pron_cost if i <= 0 else 0.0
          print_arc(current_state, end_state, phone, olabel, cost)

      # set the final state
      print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))
  
  def write_fst_with_silence(lexicon, sil_phone, sil_prob, sil_disambig, position_dependent, separator):
      """Writes the text format of L.fst to the standard output.  This version is for
      when --sil-prob != 0.0, meaning optional silence is allowed.
      start_state is the start state and loop_state is the final state of the fst.
      From start_state we go to loop_state (no silence) or to sil_state (silence);
      loop_state goes to word_start_state via epsilon transition.

      In position-independent case, there is no difference between beginning word and
      middle word. So all subwords with separator would leave from and enter word_start_state.
      All subwords without separator would leave from word_start_state and enter either
      loop_state (no silence) or sil_state (followed by silence).
      This guarantees that optional silence can only follow a word-end subword.

      In position-dependent case, there are 4 types of position-dependent subword:
      1) Beginning subword. The first phone suffix should be "_B" and other suffixes should be "_I"s:
          nation@@ 1.0 n_B ey_I sh_I ih_I n_I
          n@@      1.0 n_B
      2) Middle subword. All phone suffixes should be "_I"s:
          nation@@ 1.0 n_I ey_I sh_I ih_I n_I
      3) End subword. The last phone suffix should be "_E" and other suffixes should be "_I"s:
          nation   1.0 n_I ey_I sh_I ih_I n_E
          n        1.0 n_E
      4) Singleton subword (i.e., the subword is the word itself).
         The first phone suffix should be "_B" and the last suffix should be "_E".
         All other suffixes should be "_I"s. If there is only one phone, its suffix should be "_S":
          nation   1.0 n_B ey_I sh_I ih_I n_E
          n        1.0 n_S

      So we need an extra word_internal_state. The beginning word
      would leave from word_start_state and enter word_internal_state and middle word
      would leave from and enter word_internal_state. The rest part is same.

      If, in the position-dependent case, the first/last phone suffixes of an entry
      do not determine both its start and end state(s), an error is printed to stderr
      and the script exits with status 1.

        'lexicon' is a list of 3-tuples (subword, pron-prob, prons)
           as returned by read_lexiconp().
        'sil_prob', which is expected to be strictly between 0.0 and 1.0, is the
           probability of silence
        'sil_phone' is the silence phone, e.g. "SIL".
        'sil_disambig' is either None, or the silence disambiguation symbol, e.g. "#5".
        'position_dependent', which is True if the lexicon is position-dependent.
        'separator' is the symbol we use to indicate the position of a subword in word.
      """

      sil_cost = -math.log(sil_prob)
      no_sil_cost = -math.log(1 - sil_prob)

      # regular setting
      start_state = 0
      loop_state = 1         # also the final state
      sil_state = 2          # words terminate here when followed by silence; this state
                             # has a silence transition to loop_state
      word_start_state = 3   # subword leave from here
      next_state = 4         # the next un-allocated state, will be incremented as we go

      print_arc(start_state, loop_state, "<eps>", "<eps>", no_sil_cost)
      print_arc(start_state, sil_state, "<eps>", "<eps>", sil_cost)
      print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

      # optional setting for disambig_state
      if sil_disambig is None:
          print_arc(sil_state, loop_state, sil_phone, "<eps>", 0.0)
      else:
          disambig_state = next_state
          next_state += 1
          print_arc(sil_state, disambig_state, sil_phone, "<eps>", 0.0)
          print_arc(disambig_state, loop_state, sil_disambig, "<eps>", 0.0)

      # optional setting for word_internal_state
      if position_dependent:
          word_internal_state = next_state
          next_state += 1

      for (word, pron_prob, phones) in lexicon:
          pron_cost = 0.0           # do not support pron_prob
          phones_len = len(phones)

          # set start and end state for different cases
          if position_dependent:
              # Reset for every entry: otherwise an unrecognized suffix would
              # silently reuse the states left over from the previous entry.
              current_state = None
              end_state_list = None
              end_cost_list = None

              first_phone_suffix = get_suffix(phones[0])
              # a trailing disambiguation symbol is not a phone; skip it
              last_phone = phones[-2] if contain_disambig_symbol(phones) else phones[-1]
              last_phone_suffix = get_suffix(last_phone)

              # singleton subword
              if first_phone_suffix == "_S":
                  current_state = word_start_state
                  end_state_list = [loop_state, sil_state]
                  end_cost_list = [no_sil_cost, sil_cost]
              # first set the current_state
              elif first_phone_suffix == "_B":
                  current_state = word_start_state
              elif first_phone_suffix in ("_I", "_E"):
                  current_state = word_internal_state
              # then set the end_state (end_state_list)
              if last_phone_suffix in ("_B", "_I"):
                  end_state_list = [word_internal_state]
                  end_cost_list = [0.0]
              elif last_phone_suffix == "_E":
                  end_state_list = [loop_state, sil_state]
                  end_cost_list = [no_sil_cost, sil_cost]

              if current_state is None or end_state_list is None:
                  print("{}: invalid position-dependent pronunciation for subword {}: {}".format(
                        sys.argv[0], word, " ".join(phones)), file=sys.stderr)
                  sys.exit(1)
          else:
              current_state = word_start_state
              if is_end(word, separator):
                  end_state_list = [loop_state, sil_state]
                  end_cost_list = [no_sil_cost, sil_cost]
              else:
                  end_state_list = [word_start_state]
                  end_cost_list = [0.0]

          # print arcs (except the last one) for the subword; only the first
          # arc carries the subword as output label
          for i in range(phones_len - 1):
              olabel = word if i == 0 else "<eps>"
              cost = pron_cost if i == 0 else 0.0
              print_arc(current_state, next_state, phones[i], olabel, cost)
              current_state = next_state
              next_state += 1

          # print the last arc, once per possible end state; with an empty
          # pronunciation (i == -1) the arc has <eps> input and still outputs
          # the subword
          i = phones_len - 1
          phone = phones[i] if i >= 0 else "<eps>"
          olabel = word if i <= 0 else "<eps>"
          cost = pron_cost if i <= 0 else 0.0
          for (end_state, end_cost) in zip(end_state_list, end_cost_list):
              print_arc(current_state, end_state, phone, olabel, cost + end_cost)

      # set the final state
      print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))
  
  def main():
      """Parse the command line, read the lexicon and print the lexicon FST to stdout.

      Exits with status 1 (after printing a message to stderr) if --sil-prob is
      not in the range [0.0, 1.0), or if --sil-prob is nonzero but --sil-phone
      was not given.
      """
      args = get_args()
      # Written as a negated chained comparison so that NaN is rejected too.
      if not 0.0 <= args.sil_prob < 1.0:
          print("{}: invalid value specified --sil-prob={}".format(
                sys.argv[0], args.sil_prob), file=sys.stderr)
          sys.exit(1)
      if args.sil_prob > 0.0 and args.sil_phone is None:
          # Without this check the literal text "None" would be written as the
          # silence phone label of the FST.
          print("{}: --sil-phone must be specified when --sil-prob is nonzero".format(
                sys.argv[0]), file=sys.stderr)
          sys.exit(1)
      lexicon = read_lexiconp(args.lexiconp)
      if args.sil_prob == 0.0:
          write_fst_no_silence(lexicon, args.position_dependent, args.separator)
      else:
          write_fst_with_silence(lexicon, args.sil_phone, args.sil_prob,
              args.sil_disambig, args.position_dependent, args.separator)
  
  # Run main() only when this file is executed as a script, not when it is imported.
  if __name__ == "__main__":
      main()