egs/mini_librispeech/s5/local/grammar/extend_vocab_demo_silprobs.sh
#!/usr/bin/env bash

# This script demonstrates how to use the grammar-decoding framework to build
# graphs made out of more than one part.  (This version uses word-specific
# silence probabilities.)  It demonstrates using `fstequivalent` that the
# graph constructed this way is equivalent to what you would create if you had
# the LM all as a single piece.  This uses the command line tools to expand to
# a regular FST (--write-as-grammar=false).  In practice you might not want to
# do that, since the result might be large, and since writing the entire thing
# might take too much time.  The code itself allows you to construct these
# GrammarFst objects in a lightweight way and decode using them.

# Unfortunately the filenames here are not very well thought through.  I hope
# to rework this when I have time.

stage=0
run_g2p=false  # set this to true to run the g2p stuff; it's slow, so by
               # default we fake it by providing what it previously output.

set -e

. ./path.sh
. utils/parse_options.sh

tree_dir=exp/chain/tree_sp
lang_base=data/lang_basevocab
lang_ext=data/lang_extvocab

# For the purposes of this script we just need a biphone tree and associated
# transition-model for testing, because we're mostly working at the graph
# level, i.e. comparing compiled HCLG graphs; decoding only happens in the
# last couple of stages.

# We're doing this with the dictionary dir that includes word-specific
# silence probabilities (data/local/dict).

# For reference, here is how we could create the 'lang' dir for the
# baseline:
# utils/prepare_lang.sh data/local/dict \
#    "<UNK>" data/local/lang_tmp data/lang

if [ $stage -le 0 ]; then
  cp -r data/local/dict data/local/dict_basevocab
  echo "#nonterm:unk" > data/local/dict_basevocab/nonterminals.txt

  utils/prepare_lang.sh data/local/dict_basevocab \
       "<UNK>" data/local/lang_tmp $lang_base
fi

if [ $stage -le 1 ]; then
  # Note: <UNK> does appear in that ARPA file, with a reasonable probability
  # (0.0)... presumably because the vocab that the ARPA file was built with
  # was not vast, so there were plenty of OOVs.  It would be possible to adjust
  # its probability with adjust_unk_arpa.pl, but for now we just leave it
  # as-is.  The <UNK> appears quite a few times in the ARPA.  In the language
  # model we replace it with #nonterm:unk, which will later expand to our
  # custom graph of new words.
  #
  # We don't want the #nonterm:unk on the output side of G.fst, or it would
  # appear in the decoded output, so we remove it using the 'fstrmsymbols'
  # command.

  nonterm_unk=$(grep '#nonterm:unk' $lang_base/words.txt | awk '{print $2}')

  gunzip -c data/local/lm/lm_tgsmall.arpa.gz | \
    sed 's/<UNK>/#nonterm:unk/g' | \
    arpa2fst --disambig-symbol=#0 \
             --read-symbol-table=$lang_base/words.txt - | \
    fstrmsymbols --remove-from-output=true "echo $nonterm_unk|" - $lang_base/G.fst
fi
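# For illustration only (not run by this script): one way to sanity-check the
# fstrmsymbols step above is to print G.fst and confirm that #nonterm:unk now
# appears only as an input label, with <eps> on the output side, e.g.:
#   fstprint --isymbols=$lang_base/words.txt --osymbols=$lang_base/words.txt \
#     $lang_base/G.fst | grep 'nonterm:unk' | head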
if [ $stage -le 2 ]; then
  # Make the top-level part of the graph.
  utils/mkgraph.sh --self-loop-scale 1.0 $lang_base $tree_dir $tree_dir/extvocab_top
fi

if [ $stage -le 3 ] && $run_g2p; then
  # You may have to do some stuff manually to install sequitur, to get this to
  # work.
  dict=data/local/dict_basevocab
  steps/dict/train_g2p.sh --silence-phones $dict/silence_phones.txt $dict/lexicon.txt $tree_dir/extvocab_g2p
fi

if [ $stage -le 4 ]; then
  # Create $tree_dir/extvocab_lexicon as a dict-dir containing just the newly
  # created vocabulary entries (but the same phone list as our old setup, not
  # that it matters).
  mkdir -p $tree_dir/extvocab_lexicon

  # First find a list of words in the test set that are out of vocabulary.
  # Of course this is totally cheating.
  awk -v w=data/lang/words.txt \
    'BEGIN{while(getline <w) seen[$1] = $1}
     {for(n=2;n<=NF;n++) if(!($n in seen)) oov[$n] = 1}
     END{ for(k in oov) print k;}' \
    < data/dev_clean_2/text > $tree_dir/extvocab_lexicon/words

  echo "$0: generating g2p entries for $(wc -l <$tree_dir/extvocab_lexicon/words) words"

  if $run_g2p; then
    steps/dict/apply_g2p.sh $tree_dir/extvocab_lexicon/words \
      $tree_dir/extvocab_g2p $tree_dir/extvocab_lexicon
  else
    cat <<EOF >$tree_dir/extvocab_lexicon/lexicon.lex
HARDWIGG 0.962436 HH AA1 R D W IH1 G
SUDVESTR 0.162048 S AH1 D V EY1 S T R
SUDVESTR 0.133349 S AH1 D V EH1 S T R
SUDVESTR 0.114376 S AH1 D V EH1 S T ER0
VINOS 0.558345 V IY1 N OW0 Z
VINOS 0.068883 V AY1 N OW0 Z
VINOS 0.068431 V IY1 N OW0 S
DOMA 0.645714 D OW1 M AH0
DOMA 0.118255 D UW1 M AH0
DOMA 0.080682 D OW0 M AH0
GWYNPLAINE'S 0.983053 G W IH1 N P L EY1 N Z
SHIMERDA 0.610922 SH IH0 M EH1 R D AH0
SHIMERDA 0.175678 SH IY0 M EH1 R D AH0
SHIMERDA 0.069785 SH AY1 M ER1 D AH0
MYRDALS 0.479183 M IH1 R D AH0 L Z
MYRDALS 0.135225 M ER1 D AH0 L Z
MYRDALS 0.115478 M IH1 R D L Z
HEUCHERA 0.650042 HH OY1 K IH1 R AH0
HEUCHERA 0.119363 HH OY1 K EH1 R AH0
HEUCHERA 0.077907 HH OY1 K ER0 AH0
IMPARA 0.906222 IH0 M P AA1 R AH0
VERLOC'S 0.564847 V ER0 L AA1 K S
VERLOC'S 0.173540 V ER1 L AH0 K S
VERLOC'S 0.050543 V ER1 L AA1 K S
UNTRUSSING 0.998019 AH0 N T R AH1 S IH0 NG
DARFHULVA 0.317057 D AA2 F UH1 L V AH0
DARFHULVA 0.262882 D AA2 F HH UH1 L V AH0
DARFHULVA 0.064055 D AA2 F HH UW1 L V AH0
FINNACTA 0.594586 F IH1 N AH0 K T AH0
FINNACTA 0.232454 F IH1 N AE1 K T AH0
FINNACTA 0.044733 F IH1 N IH0 K T AH0
YOKUL 0.845279 Y OW1 K AH0 L
YOKUL 0.051082 Y OW2 K AH0 L
YOKUL 0.029435 Y OW0 K AH0 L
CONGAL 0.504228 K AA1 NG G AH0 L
CONGAL 0.151648 K AA2 NG G AH0 L
CONGAL 0.137837 K AH0 N JH AH0 L
DELECTASTI 0.632180 D IH0 L EH0 K T EY1 S T IY0
DELECTASTI 0.203808 D IH0 L EH1 K T EY1 S T IY0
DELECTASTI 0.066722 D IH0 L EH0 K T AE1 S T IY0
YUNDT 0.975077 Y AH1 N T
QUINCI 0.426115 K W IH1 N S IY0
QUINCI 0.369324 K W IH1 N CH IY0
QUINCI 0.064507 K W IY0 N CH IY0
BIRDIKINS 0.856979 B ER1 D IH0 K AH0 N Z
BIRDIKINS 0.045315 B ER1 D AH0 K AH0 N Z
SNEFFELS 0.928413 S N EH1 F AH0 L Z
FJORDUNGR 0.130629 F Y AO1 R D UW0 NG G R
FJORDUNGR 0.125082 F Y AO1 R D AH0 NG G R
FJORDUNGR 0.111035 F Y AO1 R D UH1 NG R
YULKA 0.540253 Y UW1 L K AH0
YULKA 0.295588 Y AH1 L K AH0
YULKA 0.076631 Y UH1 L K AH0
LACQUEY'S 0.987908 L AE1 K IY0 Z
OSSIPON'S 0.651400 AA1 S AH0 P AA2 N Z
OSSIPON'S 0.118444 AA1 S AH0 P AA0 N Z
OSSIPON'S 0.106377 AA1 S AH0 P AH0 N Z
SAKNUSSEMM 0.060270 S AE1 K N AH1 S EH1 M
SAKNUSSEMM 0.044992 S AE1 K N AH0 S EH1 M
SAKNUSSEMM 0.044084 S AA0 K N AH1 S EH1 M
CONGAL'S 0.618287 K AA1 NG G AH0 L Z
CONGAL'S 0.185952 K AA2 NG G AH0 L Z
CONGAL'S 0.115143 K AH0 N G AH0 L Z
TARRINZEAU 0.159153 T AA1 R IY0 N Z OW1
TARRINZEAU 0.136536 T AA1 R AH0 N Z OW1
TARRINZEAU 0.100924 T EH1 R IY0 N Z OW1
SHIMERDAS 0.230819 SH IH0 M EH1 R D AH0 Z
SHIMERDAS 0.216235 SH IH0 M EH1 R D AH0 S
SHIMERDAS 0.073311 SH AY1 M ER1 D AH0 Z
RUGGEDO'S 0.821285 R UW0 JH EY1 D OW0 Z
RUGGEDO'S 0.166825 R AH1 G AH0 D OW0 Z
CORNCAKES 0.934118 K AO1 R N K EY2 K S
VENDHYA 0.616662 V EH0 N D Y AH0
VENDHYA 0.178349 V EH1 N D Y AH0
VENDHYA 0.160768 V AA1 N D Y AH0
GINGLE 0.919815 G IH1 NG G AH0 L
STUPIRTI 0.422653 S T UW0 P IH1 R T IY0
STUPIRTI 0.126925 S T UW1 P IH0 R T IY0
STUPIRTI 0.078422 S T UW1 P AH0 R T IY0
HERBIVORE 0.950887 HH ER1 B IH0 V AO2 R
BRION'S 0.838326 B R AY1 AH0 N Z
BRION'S 0.140310 B R IY0 AH0 N Z
DELAUNAY'S 0.993259 D EH1 L AO0 N EY0 Z
KHOSALA 0.920908 K OW0 S AA1 L AH0
BRANDD 0.827461 B R AE1 N D
BRANDD 0.085646 B R AE2 N D
GARDAR 0.598675 G AA0 R D AA1 R
GARDAR 0.289831 G AA1 R D AA2 R
GARDAR 0.057983 G AA0 R D AA2 R
MACKLEWAIN 0.570209 M AE1 K AH0 L W EY0 N
MACKLEWAIN 0.101477 M AH0 K AH0 L W EY0 N
MACKLEWAIN 0.067905 M AE1 K AH0 L W EY2 N
LIBANO 0.993297 L IY0 B AA1 N OW0
MOLING 0.782578 M OW1 L IH0 NG
MOLING 0.059362 M OW2 L IH0 NG
MOLING 0.056217 M AA1 L IH0 NG
BENNYDECK'S 0.583859 B EH1 N IY0 D EH0 K S
BENNYDECK'S 0.276699 B EH1 N IH0 D EH0 K S
BENNYDECK'S 0.028343 B EH1 N IH0 D IH0 K S
MACKLEWAIN'S 0.615766 M AE1 K AH0 L W EY0 N Z
MACKLEWAIN'S 0.109585 M AH0 K AH0 L W EY0 N Z
MACKLEWAIN'S 0.039423 M AE1 K AH0 L W AH0 N Z
PRESTY 0.616071 P R EH1 S T IY0
PRESTY 0.288701 P R AH0 S T IY0
BREADHOUSE 0.995874 B R EH1 D HH AW2 S
BUZZER'S 0.992495 B AH1 Z ER0 Z
BHUNDA 0.502439 B UW1 N D AH0
BHUNDA 0.267733 B AH0 N D AH0
BHUNDA 0.193772 B UH1 N D AH0
PINKIES 0.998440 P IH1 NG K IY0 Z
TROKE 0.723320 T R OW1 K
TROKE 0.269707 T R OW2 K
OSSIPON 0.728486 AA1 S AH0 P AA2 N
OSSIPON 0.098752 AA1 S AH0 P AH0 N
OSSIPON 0.033957 AA1 S AH0 P AO0 N
RIVERLIKE 0.991731 R IH1 V ER0 L AY2 K
NICLESS 0.478183 N IH1 K L AH0 S
NICLESS 0.159889 N IH0 K L AH0 S
NICLESS 0.120611 N IH1 K L IH0 S
TRAMPE 0.959184 T R AE1 M P
VERLOC 0.610461 V ER0 L AA1 K
VERLOC 0.128479 V ER1 L AH0 K
VERLOC 0.073687 V ER1 L AA0 K
GANNY 0.991703 G AE1 N IY0
AMBROSCH 0.302906 AE0 M B R OW1 SH
AMBROSCH 0.201163 AE0 M B R AO1 SH
AMBROSCH 0.109274 AE1 M B R AO1 SH
FIBI 0.619154 F IH1 B IY0
FIBI 0.163168 F IY1 B IY0
FIBI 0.083443 F AY1 B IY0
IROLG 0.823123 IH0 R OW1 L G
IROLG 0.053196 IH0 R OW1 L JH
IROLG 0.021038 IH0 R OW1 L JH IY1
BALVASTRO 0.251546 B AA0 L V AA1 S T R OW0
BALVASTRO 0.213351 B AE0 L V AE1 S T R OW0
BALVASTRO 0.133005 B AA0 L V AE1 S T R OW0
BOOLOOROO 0.676757 B UW1 L UW1 R UW0
BOOLOOROO 0.173653 B UW1 L UH2 R UW0
BOOLOOROO 0.086501 B UW1 L UH0 R UW0
EOF
  fi

  # extend_lang.sh needs it to have the basename 'lexiconp.txt'.
  mv $tree_dir/extvocab_lexicon/lexicon.lex $tree_dir/extvocab_lexicon/lexiconp.txt

  [ -f data/lang_extvocab/G.fst ] && rm data/lang_extvocab/G.fst
  utils/lang/extend_lang.sh data/lang_basevocab $tree_dir/extvocab_lexicon/lexiconp.txt data/lang_extvocab
fi
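# For illustration only (not run by this script): a quick way to confirm that
# extend_lang.sh added the new words and the nonterminal symbols to the
# extended lang directory, e.g.:
#   grep -E 'HARDWIGG|nonterm' $lang_ext/words.txt | head
# should show the newly added words alongside symbols such as #nonterm_begin,
# #nonterm_end and #nonterm:unk.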
if [ $stage -le 5 ]; then
  # Make the G.fst for the extra words.  Just assign equal probabilities to
  # all of them.  The words will all transition from state 1 to 2.
  cat <<EOF > $lang_ext/G.txt
0 1 #nonterm_begin <eps>
2 3 #nonterm_end <eps>
3
EOF
  lexicon=$tree_dir/extvocab_lexicon/lexiconp.txt
  num_words=$(wc -l <$lexicon)
  cost=$(perl -e "print log($num_words)");
  awk -v cost=$cost '{print 1, 2, $1, $1, cost}' <$lexicon >>$lang_ext/G.txt
  fstcompile --isymbols=$lang_ext/words.txt --osymbols=$lang_ext/words.txt <$lang_ext/G.txt | \
    fstarcsort --sort_type=ilabel >$lang_ext/G.fst
fi
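# For illustration only (a sketch, not run by this script): with the commands
# above, $lang_ext/G.txt ends up looking like the following, where COST is
# log(num_words) and there is one arc from state 1 to state 2 per line of the
# lexicon:
#   0 1 #nonterm_begin <eps>
#   2 3 #nonterm_end <eps>
#   3
#   1 2 HARDWIGG HARDWIGG COST
#   1 2 SUDVESTR SUDVESTR COST
#   ...
# so every path through this little grammar accepts
# "#nonterm_begin <new-word> #nonterm_end".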
if [ $stage -le 6 ]; then
  # Make the part of the graph that will be included.
  # Refer to the 'compile-graph' commands in ./simple_demo.sh for how you'd do
  # this in code.
  utils/mkgraph.sh --self-loop-scale 1.0 $lang_ext $tree_dir $tree_dir/extvocab_part
fi

if [ $stage -le 7 ]; then
  offset=$(grep nonterm_bos $lang_ext/phones.txt | awk '{print $2}')
  nonterm_unk=$(grep nonterm:unk $lang_ext/phones.txt | awk '{print $2}')

  mkdir -p $tree_dir/extvocab_combined
  [ -d $tree_dir/extvocab_combined/phones ] && rm -r $tree_dir/extvocab_combined/phones
  # The decoding script expects words.txt and phones/; copy them from the
  # extvocab_part graph directory, where they will have suitable values.
  cp -r $tree_dir/extvocab_part/{words.txt,phones.txt,phones/} $tree_dir/extvocab_combined

  # The following, due to --write-as-grammar=false, compiles it into an FST
  # which can be decoded by our normal decoder.
  make-grammar-fst --write-as-grammar=false --nonterm-phones-offset=$offset $tree_dir/extvocab_top/HCLG.fst \
     $nonterm_unk $tree_dir/extvocab_part/HCLG.fst $tree_dir/extvocab_combined/HCLG.fst

  # The following compiles it and writes it as a GrammarFst.  The size is
  # 176M, vs. 182M for HCLG.fst.  In other examples, of course, the difference
  # might be greater.
  make-grammar-fst --write-as-grammar=true --nonterm-phones-offset=$offset $tree_dir/extvocab_top/HCLG.fst \
     $nonterm_unk $tree_dir/extvocab_part/HCLG.fst $tree_dir/extvocab_combined/HCLG.gra
fi

if [ $stage -le 8 ]; then
  # OK, now we actually decode the test data.  For reference, the command that
  # was used to decode the test data in the current (at the time of writing)
  # chain TDNN system, local/chain/run_tdnn.sh (as figured out by running it
  # from that stage), was:
  # steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
  #   --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
  #   exp/chain/tree_sp/graph_tgsmall data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2
  #
  # We just replace the graph with the one in $tree_dir/extvocab_combined.

  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
    --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
    exp/chain/tree_sp/extvocab_combined data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_comb

  # s5: grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_comb/wer_* | utils/best_wer.sh
  # %WER 11.42 [ 2300 / 20138, 227 ins, 275 del, 1798 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_comb/wer_12_0.0
  # ... versus the baseline below:
  # s5: grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2/wer_* | utils/best_wer.sh
  # %WER 12.01 [ 2418 / 20138, 244 ins, 307 del, 1867 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2/wer_13_0.0
fi

if [ $stage -le 9 ]; then
  steps/nnet3/decode_grammar.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
    --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
    exp/chain/tree_sp/extvocab_combined data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_comb_gra

  # The WER with grammar decoding is exactly the same as decoding from the
  # converted FST.
  # grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_comb_gra/wer_* | utils/best_wer.sh
  # %WER 11.42 [ 2300 / 20138, 227 ins, 275 del, 1798 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_comb_gra/wer_12_0.0
fi
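# For reference only (not run by this script): the two decode directories
# created above can be compared side by side with the same best_wer.sh command
# used in the comments, e.g.:
#   for d in exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_comb{,_gra}; do
#     grep WER $d/wer_* | utils/best_wer.sh
#   done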