#!/bin/bash
#
# Based mostly on the Switchboard recipe. The training database is TED-LIUM;
# it consists of TED talks with cleaned automatic transcripts:
#
# https://lium.univ-lemans.fr/ted-lium3/
# http://www.openslr.org/resources (Mirror).
#
# The data is distributed under the 'Creative Commons BY-NC-ND 3.0' license,
# which allows free non-commercial use; only a citation is required.
#
# Copyright  2014  Nickolay V. Shmyrev
#            2014  Brno University of Technology (Author: Karel Vesely)
#            2016  Vincent Nguyen
#            2016  Johns Hopkins University (Author: Daniel Povey)
#            2018  François Hernandez
#
# Apache 2.0
#

. ./cmd.sh
. ./path.sh

set -e -o pipefail -u

nj=35
decode_nj=30   # note: should not be >38, which is the number of speakers in the dev set
               # after applying --seconds-per-spk-max 180. We decode with 4 threads, so
               # this will be too many jobs if you're using run.pl.
stage=0
train_rnnlm=false
train_lm=false

. utils/parse_options.sh # accept options

# Data preparation
if [ $stage -le 0 ]; then
  local/download_data.sh
fi

if [ $stage -le 1 ]; then
  local/prepare_data.sh
  # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and
  # lets us use more jobs for decoding etc.
  # [we chose 3 minutes because that gives us 38 speakers for the dev data, which is
  #  more than our normal 30 jobs.]
  for dset in dev test train; do
    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset}
  done
fi

if [ $stage -le 2 ]; then
  local/prepare_dict.sh
fi

if [ $stage -le 3 ]; then
  utils/prepare_lang.sh data/local/dict_nosp \
    "<unk>" data/local/lang_nosp data/lang_nosp
fi

if [ $stage -le 4 ]; then
  # later on we'll change this script so you have the option to
  # download the pre-built LMs from openslr.org instead of building them
  # locally.
  if $train_lm; then
    local/ted_train_lm.sh
  else
    local/ted_download_lm.sh
  fi
fi

if [ $stage -le 5 ]; then
  local/format_lms.sh
fi

# Feature extraction
if [ $stage -le 6 ]; then
  for set in test dev train; do
    dir=data/$set
    steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir
    steps/compute_cmvn_stats.sh $dir
  done
fi

# Now we have 452 hours of training data.
# We'll create a subset with 10k short segments to make flat-start training easier:
if [ $stage -le 7 ]; then
  utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort
  utils/data/remove_dup_utts.sh 10 data/train_10kshort data/train_10kshort_nodup
fi

# Train
if [ $stage -le 8 ]; then
  steps/train_mono.sh --nj 20 --cmd "$train_cmd" \
    data/train_10kshort_nodup data/lang_nosp exp/mono
fi

if [ $stage -le 9 ]; then
  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
    data/train data/lang_nosp exp/mono exp/mono_ali
  steps/train_deltas.sh --cmd "$train_cmd" \
    2500 30000 data/train data/lang_nosp exp/mono_ali exp/tri1
fi

if [ $stage -le 10 ]; then
  utils/mkgraph.sh data/lang_nosp exp/tri1 exp/tri1/graph_nosp
  # The slowest part about this decoding is the scoring, which we can't really
  # control as the bottleneck is the NIST tools.
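  # (Illustrative note, not part of the original recipe:) once a decode below
  # has finished, the usual Kaldi idiom for summarizing its WER is something like
  #   grep Sum exp/tri1/decode_nosp_dev/score_*/*.sys | utils/best_wer.sh
  # assuming sclite-style scoring here; with plain compute-wer scoring you would
  # grep the wer_* files in the decode directory instead.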
  for dset in dev test; do
    steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
      exp/tri1/graph_nosp data/${dset} exp/tri1/decode_nosp_${dset}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \
      data/${dset} exp/tri1/decode_nosp_${dset} exp/tri1/decode_nosp_${dset}_rescore
  done
fi

if [ $stage -le 11 ]; then
  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
    data/train data/lang_nosp exp/tri1 exp/tri1_ali
  steps/train_lda_mllt.sh --cmd "$train_cmd" \
    4000 50000 data/train data/lang_nosp exp/tri1_ali exp/tri2
fi

if [ $stage -le 12 ]; then
  utils/mkgraph.sh data/lang_nosp exp/tri2 exp/tri2/graph_nosp
  for dset in dev test; do
    steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
      exp/tri2/graph_nosp data/${dset} exp/tri2/decode_nosp_${dset}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \
      data/${dset} exp/tri2/decode_nosp_${dset} exp/tri2/decode_nosp_${dset}_rescore
  done
fi

if [ $stage -le 13 ]; then
  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp exp/tri2
  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \
    exp/tri2/sil_counts_nowb.txt \
    exp/tri2/pron_bigram_counts_nowb.txt data/local/dict
fi

if [ $stage -le 14 ]; then
  utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
  cp -rT data/lang data/lang_rescore
  cp data/lang_nosp/G.fst data/lang/
  cp data/lang_nosp_rescore/G.carpa data/lang_rescore/

  utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
  for dset in dev test; do
    steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
      exp/tri2/graph data/${dset} exp/tri2/decode_${dset}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
      data/${dset} exp/tri2/decode_${dset} exp/tri2/decode_${dset}_rescore
  done
fi

if [ $stage -le 15 ]; then
  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
    data/train data/lang exp/tri2 exp/tri2_ali
  steps/train_sat.sh --cmd "$train_cmd" \
    5000 100000 data/train data/lang exp/tri2_ali exp/tri3

  utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph

  for dset in dev test; do
    steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
      exp/tri3/graph data/${dset} exp/tri3/decode_${dset}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
      data/${dset} exp/tri3/decode_${dset} exp/tri3/decode_${dset}_rescore
  done
fi

if [ $stage -le 16 ]; then
  # this does some data-cleaning. It actually degrades the GMM-level results
  # slightly, but the cleaned data should be useful when we add the neural net and chain
  # systems. If not we'll remove this stage.
  local/run_cleanup_segmentation.sh
fi

if [ $stage -le 17 ]; then
  # This will only work if you have GPUs on your system (and note that it requires
  # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html)
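  # (Illustrative note, not part of the original recipe:) on a GridEngine-style
  # cluster, cmd.sh is typically configured along these lines, with queue and
  # memory options adjusted to your site:
  #   export train_cmd="queue.pl --mem 4G"
  #   export decode_cmd="queue.pl --mem 4G"
  #   export cuda_cmd="queue.pl --gpu 1"
  # On a single machine you can substitute run.pl, but this chain stage still
  # needs a CUDA-capable GPU to train in a reasonable time.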
  local/chain/run_tdnnf.sh
fi

if [ $stage -le 18 ]; then
  # You can either train your own rnnlm or download a pre-trained one
  if $train_rnnlm; then
    local/rnnlm/tuning/run_lstm_tdnn_a.sh
    local/rnnlm/average_rnnlm.sh
  else
    local/ted_download_rnnlm.sh
  fi
fi

if [ $stage -le 19 ]; then
  # Here we rescore the lattices generated at stage 17
  rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged
  lang_dir=data/lang_chain
  ngram_order=4

  for dset in dev test; do
    data_dir=data/${dset}_hires
    decoding_dir=exp/chain_cleaned/tdnnf_1a/decode_${dset}
    suffix=$(basename $rnnlm_dir)
    output_dir=${decoding_dir}_$suffix
    rnnlm/lmrescore_pruned.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.5 --max-ngram-order $ngram_order \
      $lang_dir $rnnlm_dir \
      $data_dir $decoding_dir \
      $output_dir
  done
fi

echo "$0: success."
exit 0
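
# (Illustrative note, not part of the original recipe:) typical invocations,
# run from egs/tedlium/s5_r3 with Kaldi compiled and cmd.sh/path.sh set up:
#   bash run.sh                                      # run everything from stage 0
#   bash run.sh --stage 6                            # resume from feature extraction
#   bash run.sh --train-lm true --train-rnnlm true   # build the n-gram LM and RNNLM locally
# (--stage, --train-lm and --train-rnnlm are parsed by utils/parse_options.sh above.)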