#!/bin/bash

stage=0
train=true   # set to false to disable the training-related scripts
             # note: you probably only want to set --train false if you
             # are using at least --stage 1.
decode=true  # set to false to disable the decoding-related scripts.

. ./cmd.sh  ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.

. utils/parse_options.sh  # e.g. this parses the --stage option if supplied.

# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.

#wsj0=/ais/gobi2/speech/WSJ/csr_?_senn_d?
#wsj1=/ais/gobi2/speech/WSJ/csr_senn_d?
#wsj0=/mnt/matylda2/data/WSJ0
#wsj1=/mnt/matylda2/data/WSJ1
#wsj0=/data/corpora0/LDC93S6B
#wsj1=/data/corpora0/LDC94S13B
wsj0=/export/corpora5/LDC/LDC93S6B
wsj1=/export/corpora5/LDC/LDC94S13B

if [ $stage -le 0 ]; then
  # data preparation.
  local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1;

  # Sometimes, we have seen WSJ distributions that do not have subdirectories
  # like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the
  # wsj0 or wsj1 directories.  In such cases, try the following:
  #
  # corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj
  # local/cstr_wsj_data_prep.sh $corpus
  # rm data/local/dict/lexiconp.txt
  # $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work.
  #
  # "nosp" refers to the dictionary before silence probabilities and pronunciation
  # probabilities are added.
  local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1;

  utils/prepare_lang.sh data/local/dict_nosp \
    "<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp || exit 1;

  local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1;

  # We suggest running the next three commands in the background,
  # as they are not a precondition for the system building and
  # most of the tests: these commands build a dictionary
  # containing many of the OOVs in the WSJ LM training data,
  # and an LM trained directly on that data (i.e. not just
  # copying the arpa files from the disks from LDC).
  # Caution: the commands below will only work if $decode_cmd
  # is set up to use qsub.  Otherwise, just remove the --cmd option.
  # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style,
  # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead.
  (
    local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \
      utils/prepare_lang.sh data/local/dict_nosp_larger \
        "<SPOKEN_NOISE>" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \
      local/wsj_train_lms.sh --dict-suffix "_nosp" &&
      local/wsj_format_local_lms.sh --lang-suffix "_nosp" # &&
  ) &

  # Now make MFCC features.
  # mfccdir should be some place with a largish disk where you
  # want to store MFCC features (see the sketch just after this stage for how
  # to point the feature-extraction scripts at such a directory).
  for x in test_eval92 test_eval93 test_dev93 train_si284; do
    steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1;
    steps/compute_cmvn_stats.sh data/$x || exit 1;
  done

  utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1

  # Now make subset with the shortest 2k utterances from si-84.
  utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1;

  # Now make subset with half of the data from si-84.
  utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1;
fi
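
# Sketch only: if you want the features written somewhere other than the
# default location under each data/$x directory (e.g. a scratch disk with more
# space), steps/make_mfcc.sh and steps/compute_cmvn_stats.sh accept optional
# log and feature directories.  The mfccdir path below is purely illustrative,
# not part of this recipe:
#
# mfccdir=/some/largish/disk/wsj_mfcc
# for x in test_eval92 test_eval93 test_dev93 train_si284; do
#   steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x exp/make_mfcc/$x $mfccdir
#   steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
# done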

if [ $stage -le 1 ]; then
  # monophone

  # Note: the --boost-silence option should probably be omitted by default
  # for normal setups.  It doesn't always help. [it's to discourage non-silence
  # models from modeling silence.]
  if $train; then
    steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
      data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1;
  fi

  if $decode; then
    utils/mkgraph.sh data/lang_nosp_test_tgpr exp/mono0a exp/mono0a/graph_nosp_tgpr && \
      steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \
        data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \
      steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \
        data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92
  fi
fi

if [ $stage -le 2 ]; then
  # tri1
  if $train; then
    steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
      data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1;

    steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \
      data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1;
  fi

  if $decode; then
    utils/mkgraph.sh data/lang_nosp_test_tgpr \
      exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1;

    for data in dev93 eval92; do
      nspk=$(wc -l <data/test_${data}/spk2utt)
      steps/decode.sh --nj $nspk --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \
        data/test_${data} exp/tri1/decode_nosp_tgpr_${data} || exit 1;

      # test various modes of LM rescoring (4 is the default one).
      # This is just confirming they're equivalent.
      for mode in 1 2 3 4 5; do
        steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \
          data/lang_nosp_test_{tgpr,tg} data/test_${data} \
          exp/tri1/decode_nosp_tgpr_${data} \
          exp/tri1/decode_nosp_tgpr_${data}_tg$mode || exit 1;
      done
      # later on we'll demonstrate const-arpa LM rescoring, which is now
      # the recommended method.
    done

    ## the following command demonstrates how to get lattices that are
    ## "word-aligned" (arcs coincide with words, with boundaries in the right
    ## place).
    #sil_label=`grep '!SIL' data/lang_nosp_test_tgpr/words.txt | awk '{print $2}'`
    #steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \
    #  data/lang_nosp_test_tgpr exp/tri1/decode_nosp_tgpr_dev93 \
    #  exp/tri1/decode_nosp_tgpr_dev93_aligned || exit 1;
  fi
fi

if [ $stage -le 3 ]; then
  # tri2b.  there is no special meaning in the "b" -- it's historical.
  if $train; then
    steps/align_si.sh --nj 10 --cmd "$train_cmd" \
      data/train_si84 data/lang_nosp exp/tri1 exp/tri1_ali_si84 || exit 1;

    steps/train_lda_mllt.sh --cmd "$train_cmd" \
      --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
      data/train_si84 data/lang_nosp exp/tri1_ali_si84 exp/tri2b || exit 1;
  fi

  if $decode; then
    utils/mkgraph.sh data/lang_nosp_test_tgpr \
      exp/tri2b exp/tri2b/graph_nosp_tgpr || exit 1;
    for data in dev93 eval92; do
      nspk=$(wc -l <data/test_${data}/spk2utt)
      steps/decode.sh --nj ${nspk} --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \
        data/test_${data} exp/tri2b/decode_nosp_tgpr_${data} || exit 1;

      # compare lattice rescoring with biglm decoding, going from tgpr to tg.
      steps/decode_biglm.sh --nj ${nspk} --cmd "$decode_cmd" \
        exp/tri2b/graph_nosp_tgpr data/lang_nosp_test_{tgpr,tg}/G.fst \
        data/test_${data} exp/tri2b/decode_nosp_tgpr_${data}_tg_biglm
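
      # Once the lattice rescoring just below has also run, the biglm decode
      # above and the rescored decode should give very similar WERs.  If you
      # want to check, something along these lines should work (an illustrative
      # comparison only, not part of the recipe):
      #
      # for d in exp/tri2b/decode_nosp_tgpr_${data}_tg_biglm \
      #          exp/tri2b/decode_nosp_tgpr_${data}_tg; do
      #   [ -d $d ] && grep WER $d/wer_* | utils/best_wer.sh
      # done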

      # baseline via LM rescoring of lattices.
      steps/lmrescore.sh --cmd "$decode_cmd" \
        data/lang_nosp_test_tgpr/ data/lang_nosp_test_tg/ \
        data/test_${data} exp/tri2b/decode_nosp_tgpr_${data} \
        exp/tri2b/decode_nosp_tgpr_${data}_tg || exit 1;

      # Demonstrating Minimum Bayes Risk decoding (like Confusion Network decoding):
      mkdir exp/tri2b/decode_nosp_tgpr_${data}_tg_mbr
      cp exp/tri2b/decode_nosp_tgpr_${data}_tg/lat.*.gz \
        exp/tri2b/decode_nosp_tgpr_${data}_tg_mbr;
      local/score_mbr.sh --cmd "$decode_cmd" \
        data/test_${data}/ data/lang_nosp_test_tgpr/ \
        exp/tri2b/decode_nosp_tgpr_${data}_tg_mbr
    done
  fi

  # At this point, you could run the example scripts that show how VTLN works.
  # We haven't included this in the default recipes.
  # local/run_vtln.sh --lang-suffix "_nosp"
  # local/run_vtln2.sh --lang-suffix "_nosp"
fi

# local/run_deltas.sh trains a delta+delta-delta system.  It's not really recommended or
# necessary, but it does contain a demonstration of the decode_fromlats.sh
# script which isn't used elsewhere.
# local/run_deltas.sh

if [ $stage -le 4 ]; then
  # From the 2b system, train 3b which is LDA + MLLT + SAT.

  # Align the tri2b system with all the si284 data.
  if $train; then
    steps/align_si.sh --nj 10 --cmd "$train_cmd" \
      data/train_si284 data/lang_nosp exp/tri2b exp/tri2b_ali_si284 || exit 1;

    steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
      data/train_si284 data/lang_nosp exp/tri2b_ali_si284 exp/tri3b || exit 1;
  fi

  if $decode; then
    utils/mkgraph.sh data/lang_nosp_test_tgpr \
      exp/tri3b exp/tri3b/graph_nosp_tgpr || exit 1;

    # the larger dictionary ("big-dict"/bd) + locally produced LM.
    utils/mkgraph.sh data/lang_nosp_test_bd_tgpr \
      exp/tri3b exp/tri3b/graph_nosp_bd_tgpr || exit 1;

    # At this point you could run the command below; this gets
    # results that demonstrate the basis-fMLLR adaptation (adaptation
    # on small amounts of adaptation data).
    # local/run_basis_fmllr.sh --lang-suffix "_nosp"

    for data in dev93 eval92; do
      nspk=$(wc -l <data/test_${data}/spk2utt)
      steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
        exp/tri3b/graph_nosp_tgpr data/test_${data} \
        exp/tri3b/decode_nosp_tgpr_${data} || exit 1;
      steps/lmrescore.sh --cmd "$decode_cmd" \
        data/lang_nosp_test_tgpr data/lang_nosp_test_tg \
        data/test_${data} exp/tri3b/decode_nosp_{tgpr,tg}_${data} || exit 1

      # decode with big dictionary.
      steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 8 \
        exp/tri3b/graph_nosp_bd_tgpr data/test_${data} \
        exp/tri3b/decode_nosp_bd_tgpr_${data} || exit 1;

      # Example of rescoring with ConstArpaLm.
      steps/lmrescore_const_arpa.sh \
        --cmd "$decode_cmd" data/lang_nosp_test_bd_{tgpr,fgconst} \
        data/test_${data} exp/tri3b/decode_nosp_bd_tgpr_${data}{,_fg} || exit 1;
    done
  fi
fi

if [ $stage -le 5 ]; then
  # Estimate pronunciation and silence probabilities.

  # Silprob for normal lexicon.
  steps/get_prons.sh --cmd "$train_cmd" \
    data/train_si284 data/lang_nosp exp/tri3b || exit 1;
  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_nosp \
    exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \
    exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict || exit 1

  utils/prepare_lang.sh data/local/dict \
    "<SPOKEN_NOISE>" data/local/lang_tmp data/lang || exit 1;

  for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do
    mkdir -p data/lang_test_${lm_suffix}
    cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1;
    rm -rf data/lang_test_${lm_suffix}/tmp
    cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/
  done
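
  # If you are curious what the estimated probabilities look like, the new
  # dictionary directory should contain a lexicon with pronunciation and
  # word-dependent silence probabilities; e.g. (an illustrative check only):
  #
  # head data/local/dict/lexiconp_silprob.txt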

  # Silprob for larger ("bd") lexicon.
  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_nosp_larger \
    exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \
    exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1

  utils/prepare_lang.sh data/local/dict_larger \
    "<SPOKEN_NOISE>" data/local/lang_tmp_larger data/lang_bd || exit 1;

  for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do
    mkdir -p data/lang_test_bd_${lm_suffix}
    cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1;
    rm -rf data/lang_test_bd_${lm_suffix}/tmp
    cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/
  done
fi

if [ $stage -le 6 ]; then
  # From the 3b system, now using data/lang as the lang directory (we have now added
  # pronunciation and silence probabilities), train another SAT system (tri4b).
  if $train; then
    steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
      data/train_si284 data/lang exp/tri3b exp/tri4b || exit 1;
  fi

  if $decode; then
    utils/mkgraph.sh data/lang_test_tgpr \
      exp/tri4b exp/tri4b/graph_tgpr || exit 1;
    utils/mkgraph.sh data/lang_test_bd_tgpr \
      exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1;

    for data in dev93 eval92; do
      nspk=$(wc -l <data/test_${data}/spk2utt)
      steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
        exp/tri4b/graph_tgpr data/test_${data} \
        exp/tri4b/decode_tgpr_${data} || exit 1;
      steps/lmrescore.sh --cmd "$decode_cmd" \
        data/lang_test_tgpr data/lang_test_tg \
        data/test_${data} exp/tri4b/decode_{tgpr,tg}_${data} || exit 1

      steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
        exp/tri4b/graph_bd_tgpr data/test_${data} \
        exp/tri4b/decode_bd_tgpr_${data} || exit 1;
      steps/lmrescore_const_arpa.sh \
        --cmd "$decode_cmd" data/lang_test_bd_{tgpr,fgconst} \
        data/test_${data} exp/tri4b/decode_bd_tgpr_${data}{,_fg} || exit 1;
    done
  fi
fi

if [ $stage -le 7 ]; then
  # Caution: this part needs a GPU.
  local/chain/run_tdnn.sh
fi

exit 0;

# Below are some commented-out commands that demonstrate how to run various
# other things -- mainly outdated methods.

# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
# all the data).  Use 30 jobs.
# Note: there isn't much use for this these days.
#steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
#  data/train_si284 data/lang exp/tri4b exp/tri4b_ali_si284 || exit 1;
#local/run_mmi_tri4b.sh

# The following are the old nnet2 recipes.
#local/online/run_nnet2.sh
#local/online/run_nnet2_baseline.sh
#local/online/run_nnet2_discriminative.sh
#local/run_nnet2.sh

# The following demonstrates RNNLM rescoring on nnet2 TDNN models.  This is
# outdated now.
# local/run_rnnlms.sh

# The SGMM2 recipe.  It's generally a bit better than plain GMMs, but you
# probably just want the neural net.
# local/run_sgmm2.sh

# We demonstrate MAP adaptation of GMMs to gender-dependent systems here.  This also serves
# as a generic way to demonstrate MAP adaptation to different domains.
# local/run_gender_dep.sh

# This is the old "nnet1" neural net.
#local/nnet/run_dnn.sh

# The following demonstrates how to re-segment long audios.
# local/run_segmentation_long_utts.sh

# The next two commands show how to train a bottleneck network based on the nnet2 setup,
# and build an SGMM system on top of it.
#local/run_bnf.sh
#local/run_bnf_sgmm.sh

# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
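
# e.g. to look at just the eval92 decodes, a variant like the following should
# work (illustrative only; it assumes the decode-directory naming used above):
# for x in exp/*/decode*eval92*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done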

# KWS setup.  We leave it commented out by default.

# $duration is the length of the search collection, in seconds
#duration=`feat-to-len scp:data/test_eval92/feats.scp ark,t:- | awk '{x+=$2} END{print x/100;}'`
#local/generate_example_kws.sh data/test_eval92/ data/kws/
#local/kws_data_prep.sh data/lang_test_bd_tgpr/ data/test_eval92/ data/kws/
#
#steps/make_index.sh --cmd "$decode_cmd" --acwt 0.1 \
#  data/kws/ data/lang_test_bd_tgpr/ \
#  exp/tri4b/decode_bd_tgpr_eval92/ \
#  exp/tri4b/decode_bd_tgpr_eval92/kws
#
#steps/search_index.sh --cmd "$decode_cmd" \
#  data/kws \
#  exp/tri4b/decode_bd_tgpr_eval92/kws
#
# If you want to provide the start time for each utterance, you can use the --segments
# option.  In WSJ each file is an utterance, so we don't have to set the start time.
#cat exp/tri4b/decode_bd_tgpr_eval92/kws/result.* | \
#  utils/write_kwslist.pl --flen=0.01 --duration=$duration \
#    --normalize=true --map-utter=data/kws/utter_map \
#    - exp/tri4b/decode_bd_tgpr_eval92/kws/kwslist.xml

# A couple of nnet3 recipes:
# local/nnet3/run_tdnn_baseline.sh  # designed for exact comparison with the nnet2 recipe
# local/nnet3/run_tdnn.sh           # better absolute results
# local/nnet3/run_lstm.sh           # lstm recipe

# bidirectional lstm recipe
# local/nnet3/run_lstm.sh --affix bidirectional \
#   --lstm-delay " [-1,1] [-2,2] [-3,3] " \
#   --label-delay 0 \
#   --cell-dim 640 \
#   --recurrent-projection-dim 128 \
#   --non-recurrent-projection-dim 128 \
#   --chunk-left-context 40 \
#   --chunk-right-context 40
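
# Example invocations (illustrative only; --stage, --train and --decode map onto
# the variables defined at the top of this script via utils/parse_options.sh):
#
# ./run.sh                            # run the whole recipe from scratch
# ./run.sh --stage 3                  # resume from the tri2b stage
# ./run.sh --stage 3 --decode false   # resume from tri2b but skip the decoding passes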