Blame view
egs/wsj/s5/utils/lang/make_subword_lexicon_fst.py
14 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 |
#!/usr/bin/env python3

# 2019 Dongji Gao
# Apache 2.0.

"""Write the text form of a subword lexicon FST (L.fst) to standard output.

See get_args() below for the command-line usage message.
"""

import argparse
import math
import sys

from make_lexicon_fst import read_lexiconp


def get_args():
    """Parse the command-line arguments and return the argparse namespace."""
    parser = argparse.ArgumentParser(description="""This script creates the
        text form of a subword lexicon FST to be compiled by fstcompile using
        the appropriate symbol tables (phones.txt and words.txt). It will mostly
        be invoked indirectly via utils/prepare_lang_subword.sh. The output
        goes to the stdout. This script is the subword version of
        make_lexicon_fst.py. It only allows optional silence to appear after
        end-subword or singleton-subword, (i.e., subwords without separator).
        In this version we do not support pronunciation probability.
        (i.e., pron-prob = 1.0)""")

    parser.add_argument('--sil-phone', type=str, help="""Text form of
        optional-silence phone, e.g. 'SIL'. See also the --sil-prob option.""")
    parser.add_argument('--sil-prob', type=float, default=0.0,
                        help="""Probability of silence between words
        (including the beginning and end of word sequence). Must be in range
        [0.0, 1.0). This refer to the optional silence inserted by the
        lexicon; see the --sil-phone option.""")
    parser.add_argument('--sil-disambig', type=str, help="""Disambiguation
        symbol to disambiguate silence, e.g. #5. Will only be supplied if you
        are creating the version of L.fst with disambiguation symbols,
        intended for use with cyclic G.fst. This symbol was introduced to fix
        a rather obscure source of nondeterminism of CLG.fst, that has to do
        with reordering of disambiguation symbols and phone symbols.""")
    parser.add_argument('--position-dependent', action="store_true",
                        help="""Whether the input lexicon is
        position-dependent.""")
    parser.add_argument("--separator", type=str, default="@@",
                        help="""Separator indicates the position of a subword
        in a word. Subword followed by separator can only appear at the
        beginning or middle of a word. Subword without separator can only
        appear at the end of a word or is a word itself. E.g. "international
        -> inter@@ nation@@ al"; "nation -> nation" The separator should match
        the separator used in the input lexicon.""")
    parser.add_argument('lexiconp', type=str, help="""Filename of lexicon with
        pronunciation probabilities (normally lexiconp.txt), with lines of the
        form 'subword prob p1 p2...', e.g. 'a, 1.0 ay'""")
    return parser.parse_args()


def _exit_with_error(message):
    """Print '<script name>: <message>' to stderr and exit with status 1."""
    print("{}: {}".format(sys.argv[0], message), file=sys.stderr)
    sys.exit(1)


def contain_disambig_symbol(phones):
    """Return True if the phone sequence ends with a disambiguation symbol.

    A disambiguation symbol has the form #1, #2, ... and, when present, is
    the last element of 'phones' (there is at most one per sequence).
    An empty sequence contains no disambiguation symbol.
    """
    return bool(phones) and phones[-1].startswith("#")


def print_arc(src, dest, phone, word, cost):
    """Print one tab-separated arc line: src, dest, phone, word, cost."""
    print('{}\t{}\t{}\t{}\t{}'.format(src, dest, phone, word, cost))


def is_end(word, separator):
    """Return True if the subword can appear at the end of a word.

    That is the case when the subword does not end with the separator.
    """
    return not word.endswith(separator)


def get_suffix(phone):
    """Return the position suffix of a phone, i.e. its last two characters
    (expected to look like '_B', '_I', ...).

    Exits with status 1 if the phone is shorter than three characters and
    therefore cannot be a base phone followed by a two-character suffix.
    """
    if len(phone) < 3:
        _exit_with_error(
            "invalid phone {} (please check if the phone is "
            "position-dependent)".format(phone))
    return phone[-2:]


def _position_dependent_boundaries(word, phones):
    """Classify a position-dependent lexicon entry by its phone suffixes.

    Returns a pair of booleans (starts_inside_word, ends_inside_word):
      - starts_inside_word is False when the first phone is "_S" or "_B"
        (the subword begins a word) and True when it is "_I" or "_E".
      - ends_inside_word is True when the last real phone is "_B" or "_I"
        (more subwords must follow) and False when it is "_E", or when the
        entry is a one-phone singleton ("_S").

    A trailing disambiguation symbol is ignored.  Exits with status 1 when
    the entry has no real phones or when a suffix is not one of the above;
    without this check the caller would silently re-use the states left
    over from the previous lexicon entry.
    """
    real_phones = phones[:-1] if contain_disambig_symbol(phones) else phones
    if not real_phones:
        _exit_with_error(
            "subword {} has no phones, so its position in the word cannot "
            "be determined".format(word))

    first_suffix = get_suffix(real_phones[0])
    last_suffix = get_suffix(real_phones[-1])

    if first_suffix in ("_S", "_B"):
        starts_inside_word = False
    elif first_suffix in ("_I", "_E"):
        starts_inside_word = True
    else:
        _exit_with_error(
            "subword {}: unexpected position suffix on first phone "
            "{}".format(word, real_phones[0]))

    if last_suffix in ("_B", "_I"):
        ends_inside_word = True
    elif last_suffix == "_E" or first_suffix == "_S":
        ends_inside_word = False
    else:
        _exit_with_error(
            "subword {}: unexpected position suffix on last phone "
            "{}".format(word, real_phones[-1]))

    return starts_inside_word, ends_inside_word


def _print_pron_arcs(src_state, next_state, phones, word, pron_cost, end_arcs):
    """Print the chain of arcs for one pronunciation.

    The first arc carries 'word' as output label and 'pron_cost' as cost; all
    later arcs carry "<eps>" and 0.0.  Intermediate states are allocated from
    'next_state' upwards.  The last phone (or "<eps>" if 'phones' is empty) is
    printed once per (end_state, end_cost) pair in 'end_arcs', with
    'end_cost' added to the arc cost.

    Returns the updated value of next_state.
    """
    out_label, cost = word, pron_cost
    for phone in phones[:-1]:
        print_arc(src_state, next_state, phone, out_label, cost)
        src_state = next_state
        next_state += 1
        # Only the first arc of the chain emits the word and its cost.
        out_label, cost = "<eps>", 0.0

    last_phone = phones[-1] if phones else "<eps>"
    for (end_state, end_cost) in end_arcs:
        print_arc(src_state, end_state, last_phone, out_label, cost + end_cost)
    return next_state


def write_fst_no_silence(lexicon, position_dependent, separator):
    """Writes the text format of L.fst to the standard output.  This version
    is for when --sil-prob=0.0, meaning there is no optional silence allowed.

    loop_state here is the start and final state of the fst. It goes to
    word_start_state via epsilon transition.

    In position-independent case, there is no difference between beginning
    word and middle word. So all subwords with separator leave from and enter
    word_start_state. All subwords without separator leave from
    word_start_state and enter loop_state.

    In position-dependent case, there are 4 types of position-dependent
    subword:
      1) Beginning subword. The first phone suffix should be "_B" and other
         suffixes should be "_I"s:
             nation@@ 1.0 n_B ey_I sh_I ih_I n_I
             n@@ 1.0 n_B
      2) Middle subword. All phone suffixes should be "_I"s:
             nation@@ 1.0 n_I ey_I sh_I ih_I n_I
      3) End subword. The last phone suffix should be "_E" and other suffixes
         should be "_I"s:
             nation 1.0 n_I ey_I sh_I ih_I n_E
             n 1.0 n_E
      4) Singleton subword (i.e., the subword is a word itself). The first
         phone suffix should be "_B" and the last suffix should be "_E". All
         other suffixes should be "_I"s. If there is only one phone, its
         suffix should be "_S":
             nation 1.0 n_B ey_I sh_I ih_I n_E
             n 1.0 n_S
    So we need an extra word_internal_state. A beginning subword leaves from
    word_start_state and enters word_internal_state; a middle subword leaves
    from and enters word_internal_state. The rest is the same.

    'lexicon' is a list of 3-tuples (subword, pron-prob, prons) as returned
        by read_lexiconp(). The pron-prob is ignored (treated as 1.0).
    'position_dependent', which is True if the lexicon is position-dependent.
    'separator' is the symbol which indicates the position of a subword in
        a word.
    """
    # regular setting
    loop_state = 0
    word_start_state = 1
    next_state = 2  # the next un-allocated state, incremented as we go

    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

    # Only the position-dependent case needs a dedicated word-internal state;
    # otherwise subwords that continue a word simply return to
    # word_start_state.
    if position_dependent:
        word_internal_state = next_state
        next_state += 1
    else:
        word_internal_state = word_start_state

    for (word, _pron_prob, phones) in lexicon:
        pron_cost = 0.0  # do not support pron_prob

        if position_dependent:
            starts_inside, ends_inside = _position_dependent_boundaries(
                word, phones)
        else:
            starts_inside = False
            ends_inside = not is_end(word, separator)

        src_state = word_internal_state if starts_inside else word_start_state
        end_state = word_internal_state if ends_inside else loop_state

        next_state = _print_pron_arcs(src_state, next_state, phones, word,
                                      pron_cost, [(end_state, 0.0)])

    # set the final state
    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))


def write_fst_with_silence(lexicon, sil_phone, sil_prob, sil_disambig,
                           position_dependent, separator):
    """Writes the text format of L.fst to the standard output.  This version
    is for when --sil-prob != 0.0, meaning optional silence is allowed.

    start_state goes to loop_state (no silence) or sil_state (silence) via
    epsilon transitions; loop_state is the final state and goes to
    word_start_state via epsilon transition.

    In position-independent case, there is no difference between beginning
    word and middle word. So all subwords with separator leave from and enter
    word_start_state. All subwords without separator leave from
    word_start_state and enter either loop_state (no silence follows) or
    sil_state (silence follows). This guarantees that optional silence can
    only follow a word-end subword.

    In position-dependent case, there are 4 types of position-dependent
    subword:
      1) Beginning subword. The first phone suffix should be "_B" and other
         suffixes should be "_I"s:
             nation@@ 1.0 n_B ey_I sh_I ih_I n_I
             n@@ 1.0 n_B
      2) Middle subword. All phone suffixes should be "_I"s:
             nation@@ 1.0 n_I ey_I sh_I ih_I n_I
      3) End subword. The last phone suffix should be "_E" and other suffixes
         should be "_I"s:
             nation 1.0 n_I ey_I sh_I ih_I n_E
             n 1.0 n_E
      4) Singleton subword (i.e., the subword is a word itself). The first
         phone suffix should be "_B" and the last suffix should be "_E". All
         other suffixes should be "_I"s. If there is only one phone, its
         suffix should be "_S":
             nation 1.0 n_B ey_I sh_I ih_I n_E
             n 1.0 n_S
    So we need an extra word_internal_state. A beginning subword leaves from
    word_start_state and enters word_internal_state; a middle subword leaves
    from and enters word_internal_state. The rest is the same.

    'lexicon' is a list of 3-tuples (subword, pron-prob, prons)
        as returned by read_lexiconp(). The pron-prob is ignored (treated
        as 1.0).
    'sil_phone' is the silence phone, e.g. "SIL".
    'sil_prob', which is expected to be strictly between 0.0 and 1.0, is the
        probability of silence (its log is taken, so 0.0 is not allowed).
    'sil_disambig' is either None, or the silence disambiguation symbol,
        e.g. "#5".
    'position_dependent', which is True if the lexicon is position-dependent.
    'separator' is the symbol we use to indicate the position of a subword in
        a word.
    """
    sil_cost = -math.log(sil_prob)
    no_sil_cost = -math.log(1 - sil_prob)

    # regular setting
    start_state = 0
    loop_state = 1          # also the final state
    sil_state = 2           # words terminate here when followed by silence;
                            # this state has a silence transition to
                            # loop_state
    word_start_state = 3    # subwords leave from here
    next_state = 4          # the next un-allocated state, incremented as we go

    print_arc(start_state, loop_state, "<eps>", "<eps>", no_sil_cost)
    print_arc(start_state, sil_state, "<eps>", "<eps>", sil_cost)
    print_arc(loop_state, word_start_state, "<eps>", "<eps>", 0.0)

    # optional setting for disambig_state
    if sil_disambig is None:
        print_arc(sil_state, loop_state, sil_phone, "<eps>", 0.0)
    else:
        disambig_state = next_state
        next_state += 1
        print_arc(sil_state, disambig_state, sil_phone, "<eps>", 0.0)
        print_arc(disambig_state, loop_state, sil_disambig, "<eps>", 0.0)

    # Only the position-dependent case needs a dedicated word-internal state;
    # otherwise subwords that continue a word simply return to
    # word_start_state.
    if position_dependent:
        word_internal_state = next_state
        next_state += 1
    else:
        word_internal_state = word_start_state

    # A subword that finishes a word is optionally followed by silence.
    word_end_arcs = [(loop_state, no_sil_cost), (sil_state, sil_cost)]
    word_continue_arcs = [(word_internal_state, 0.0)]

    for (word, _pron_prob, phones) in lexicon:
        pron_cost = 0.0  # do not support pron_prob

        if position_dependent:
            starts_inside, ends_inside = _position_dependent_boundaries(
                word, phones)
        else:
            starts_inside = False
            ends_inside = not is_end(word, separator)

        src_state = word_internal_state if starts_inside else word_start_state
        end_arcs = word_continue_arcs if ends_inside else word_end_arcs

        next_state = _print_pron_arcs(src_state, next_state, phones, word,
                                      pron_cost, end_arcs)

    # set the final state
    print("{state}\t{final_cost}".format(state=loop_state, final_cost=0.0))


def main():
    """Validate the options, read the lexicon and print the FST to stdout."""
    args = get_args()

    # Written as a chained comparison so that NaN is rejected as well.
    if not 0.0 <= args.sil_prob < 1.0:
        _exit_with_error(
            "invalid value specified --sil-prob={}".format(args.sil_prob))
    if args.sil_prob > 0.0 and args.sil_phone is None:
        # Otherwise the literal string "None" would be written as a phone.
        _exit_with_error(
            "--sil-phone must be specified when --sil-prob is greater than 0")

    lexicon = read_lexiconp(args.lexiconp)
    if args.sil_prob == 0.0:
        write_fst_no_silence(lexicon, args.position_dependent, args.separator)
    else:
        write_fst_with_silence(lexicon, args.sil_phone, args.sil_prob,
                               args.sil_disambig, args.position_dependent,
                               args.separator)


if __name__ == "__main__":
    main()