#!/bin/bash

# It's best to run the commands in this script one by one.

. ./cmd.sh
. ./path.sh
mfccdir=mfcc
set -e
rescore=true

# check for kaldi_lm (the "if !" form keeps set -e from exiting before the message prints)
if ! which get_word_map.pl > /dev/null; then
  echo "This recipe requires installation of tools/kaldi_lm. Please run extras/kaldi_lm.sh in tools/" && exit 1;
fi

# prepare fisher data and put it under data/train_fisher
local/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
  /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
# at BUT:
####local/fisher_data_prep.sh /mnt/matylda6/jhu09/qpovey/FISHER/LDC2005T19 /mnt/matylda2/data/FISHER/

local/swbd1_data_download.sh /export/corpora3/LDC/LDC97S62

# prepare dictionary and acronym mapping list
local/fisher_swbd_prepare_dict.sh

# prepare swbd data and put it under data/train_swbd
local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62
# local/swbd1_data_prep.sh /data/corpora0/LDC97S62
# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2
# local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1

utils/prepare_lang.sh data/local/dict_nosp \
  "<unk>" data/local/lang_nosp data/lang_nosp

# LM for swbd could be used for decoding purposes
#fisher_opt="--fisher /scail/group/deeplearning/speech/datasets/LDC2004T19-Fisher-Transcripts"
#local/swbd1_train_lms.sh $fisher_opt \
#  data/local/train_swbd/text data/local/dict/lexicon.txt data/local/lm

# merge two datasets into one
mkdir -p data/train_all
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
  cat data/train_fisher/$f data/train_swbd/$f > data/train_all/$f
done

# LM for train_all
local/fisher_train_lms.sh
#local/fisher_create_test_lang.sh

# Compiles G for trigram LM
LM=data/local/lm/3gram-mincount/lm_unpruned.gz
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_fsh_sw1_tg

LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz
[ -f $LM_fg ] || rescore=false
if $rescore; then
  utils/build_const_arpa_lm.sh $LM_fg data/lang_nosp data/lang_nosp_fsh_sw1_fg
fi

# Prepare Eval2000 and RT-03 test sets
#local/eval2000_data_prep.sh /scail/group/deeplearning/speech/datasets/LDC2002S09/hub5e_00/ /scail/group/deeplearning/speech/datasets/LDC2002T43 || exit 1
local/eval2000_data_prep.sh /export/corpora/LDC/LDC2002S09/hub5e_00 /export/corpora/LDC/LDC2002T43 || exit 1
#local/rt03_data_prep.sh /scail/group/deeplearning/speech/datasets/rt_03 || exit 1
local/rt03_data_prep.sh /export/corpora/LDC/LDC2007S10 || exit 1

utils/fix_data_dir.sh data/train_all

# Make MFCCs for the training set
# spread the mfccs over various machines, as this data-set is quite large.
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
  mfcc=$(basename $mfccdir) # in case was absolute pathname (unlikely), get basename.
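  # utils/create_split_dir.pl turns $mfccdir/storage into a set of numbered
  # subdirectories symlinked into the /export/b{05..08} volumes, so the
  # feature archives written below are spread over several disks.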
  utils/create_split_dir.pl /export/b{05,06,07,08}/$USER/kaldi-data/egs/fisher_swbd/s5/$mfcc/storage \
    $mfccdir/storage
fi
steps/make_mfcc.sh --nj 100 --cmd "$train_cmd" data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
utils/fix_data_dir.sh data/train_all
utils/validate_data_dir.sh data/train_all
steps/compute_cmvn_stats.sh data/train_all exp/make_mfcc/train_all $mfccdir

# subset swbd features and put them back into train_swbd in case separate training is needed
# (spk2utt is space-separated, with the speaker-id as the first field)
awk '{print $1}' data/train_swbd/spk2utt > data/swbd_spklist
utils/subset_data_dir.sh --spk-list data/swbd_spklist data/train_all data/train_swbd
steps/compute_cmvn_stats.sh data/train_swbd exp/make_mfcc/train_all $mfccdir

# Make MFCCs for the test sets
utils/fix_data_dir.sh data/eval2000
steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/eval2000 exp/make_mfcc/eval2000 $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/eval2000 exp/make_mfcc/eval2000 $mfccdir
utils/fix_data_dir.sh data/rt03
steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/rt03 exp/make_mfcc/rt03 $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/rt03 exp/make_mfcc/rt03 $mfccdir

utils/fix_data_dir.sh data/eval2000
utils/validate_data_dir.sh data/eval2000
utils/fix_data_dir.sh data/rt03
utils/validate_data_dir.sh data/rt03

n=$(wc -l <data/train_all/segments)
utils/subset_data_dir.sh --last data/train_all $n data/train

# Now there are 2.1 million utterances, and we want to start the monophone training
# on relatively short utterances (easier to align), but not only the very shortest
# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
# utterances from those. We also take these subsets from Switchboard, which has
# more carefully hand-labeled alignments
utils/subset_data_dir.sh --shortest data/train_swbd 100000 data/train_100kshort
utils/data/remove_dup_utts.sh 10 data/train_100kshort data/train_100kshort_nodup
utils/subset_data_dir.sh data/train_100kshort_nodup 10000 data/train_10k_nodup
utils/subset_data_dir.sh --speakers data/train_swbd 30000 data/train_30k
utils/subset_data_dir.sh --speakers data/train_swbd 100000 data/train_100k
utils/data/remove_dup_utts.sh 200 data/train_30k data/train_30k_nodup
utils/data/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup
utils/data/remove_dup_utts.sh 300 data/train data/train_nodup

# The next commands are not necessary for the scripts to run, but increase
# efficiency of data access by putting the mfcc's of the subset
# in a contiguous place in a file.
( . ./path.sh; # make sure mfccdir is defined as above..
  cp data/train_10k_nodup/feats.scp{,.bak}
  copy-feats scp:data/train_10k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_10k_nodup.ark,$mfccdir/kaldi_fish_10k_nodup.scp \
    && cp $mfccdir/kaldi_fish_10k_nodup.scp data/train_10k_nodup/feats.scp
)
( . ./path.sh; # make sure mfccdir is defined as above..
  cp data/train_30k_nodup/feats.scp{,.bak}
  copy-feats scp:data/train_30k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_30k_nodup.ark,$mfccdir/kaldi_fish_30k_nodup.scp \
    && cp $mfccdir/kaldi_fish_30k_nodup.scp data/train_30k_nodup/feats.scp
)
( . ./path.sh; # make sure mfccdir is defined as above..
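  # same contiguity trick as above, this time for the 100k subset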
  cp data/train_100k_nodup/feats.scp{,.bak}
  copy-feats scp:data/train_100k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_100k_nodup.ark,$mfccdir/kaldi_fish_100k_nodup.scp \
    && cp $mfccdir/kaldi_fish_100k_nodup.scp data/train_100k_nodup/feats.scp
)

# Start training on the Switchboard subset, which has cleaner alignments
steps/train_mono.sh --nj 3 --cmd "$train_cmd" \
  data/train_10k_nodup data/lang_nosp exp/mono0a

steps/align_si.sh --nj 10 --cmd "$train_cmd" \
  data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
  3200 30000 data/train_30k_nodup data/lang_nosp exp/mono0a_ali exp/tri1a || exit 1; #used to be 2500 20000
(
  graph_dir=exp/tri1a/graph_nosp_fsh_sw1_tg
  utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri1a $graph_dir
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri1a/decode_eval2000_nosp_fsh_sw1_tg
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri1a/decode_rt03_nosp_fsh_sw1_tg
)&

steps/align_si.sh --nj 10 --cmd "$train_cmd" \
  data/train_30k_nodup data/lang_nosp exp/tri1a exp/tri1a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
  3200 30000 data/train_30k_nodup data/lang_nosp exp/tri1a_ali exp/tri1b || exit 1; #used to be 2500 20000
(
  graph_dir=exp/tri1b/graph_nosp_fsh_sw1_tg
  utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri1b $graph_dir
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri1b/decode_eval2000_nosp_fsh_sw1_tg
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri1b/decode_rt03_nosp_fsh_sw1_tg
)&

steps/align_si.sh --nj 50 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang_nosp exp/tri1b exp/tri1b_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
  5500 90000 data/train_100k_nodup data/lang_nosp exp/tri1b_ali exp/tri2 || exit 1; #used to be 2500 20000 on 30k
(
  graph_dir=exp/tri2/graph_nosp_fsh_sw1_tg
  utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri2 $graph_dir || exit 1;
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri2/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri2/decode_rt03_nosp_fsh_sw1_tg || exit 1;
)&

# Train tri3a, the last speaker-independent triphone stage,
# on the whole Switchboard training set
steps/align_si.sh --nj 100 --cmd "$train_cmd" \
  data/train_swbd data/lang_nosp exp/tri2 exp/tri2_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
  11500 200000 data/train_swbd data/lang_nosp exp/tri2_ali exp/tri3a || exit 1; #used to be 2500 20000
(
  graph_dir=exp/tri3a/graph_nosp_fsh_sw1_tg
  utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3a $graph_dir || exit 1;
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri3a/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
  steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri3a/decode_rt03_nosp_fsh_sw1_tg || exit 1;
)&

# Train tri3b, which is LDA+MLLT, on the whole Switchboard+Fisher training set
steps/align_si.sh --nj 100 --cmd "$train_cmd" \
  data/train_nodup data/lang_nosp exp/tri3a exp/tri3a_ali || exit 1;
steps/train_lda_mllt.sh --cmd "$train_cmd" \
  --splice-opts "--left-context=3 --right-context=3" \
  11500 400000 data/train_nodup data/lang_nosp exp/tri3a_ali exp/tri3b || exit 1;
(
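  # decode tri3b (LDA+MLLT) in the background, as with the earlier stages;
  # steps/decode_fmllr.sh estimates per-speaker fMLLR transforms at decode time.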
  graph_dir=exp/tri3b/graph_nosp_fsh_sw1_tg
  utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3b $graph_dir || exit 1;
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri3b/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri3b/decode_rt03_nosp_fsh_sw1_tg || exit 1;
)&

steps/get_prons.sh --cmd "$train_cmd" data/train_nodup data/lang_nosp exp/tri3b
utils/dict_dir_add_pronprobs.sh --max-normalize true \
  data/local/dict_nosp exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \
  exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict

utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang

LM=data/local/lm/3gram-mincount/lm_unpruned.gz
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  data/lang $LM data/local/dict/lexicon.txt data/lang_fsh_sw1_tg
LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz
if $rescore; then
  utils/build_const_arpa_lm.sh $LM_fg data/lang data/lang_fsh_sw1_fg
fi
(
  graph_dir=exp/tri3b/graph_fsh_sw1_tg
  utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri3b $graph_dir || exit 1;
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri3b/decode_eval2000_fsh_sw1_tg || exit 1;
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri3b/decode_rt03_fsh_sw1_tg || exit 1;
)&

# Next we'll use fMLLR and train with SAT (i.e. on
# fMLLR features)
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri3b exp/tri3b_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
  11500 800000 data/train_nodup data/lang exp/tri3b_ali exp/tri4a || exit 1;
(
  graph_dir=exp/tri4a/graph_fsh_sw1_tg
  utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri4a $graph_dir
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri4a/decode_eval2000_fsh_sw1_tg || exit 1;
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri4a/decode_rt03_fsh_sw1_tg || exit 1;
)&
wait

if $rescore; then
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/eval2000 \
    exp/tri4a/decode_eval2000_fsh_sw1_{tg,fg}
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/rt03 \
    exp/tri4a/decode_rt03_fsh_sw1_{tg,fg}
fi

steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri4a exp/tri4a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
  11500 1600000 data/train_nodup data/lang exp/tri4a_ali exp/tri5a || exit 1;
(
  graph_dir=exp/tri5a/graph_fsh_sw1_tg
  utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri5a $graph_dir
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri5a/decode_eval2000_fsh_sw1_tg || exit 1;
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri5a/decode_rt03_fsh_sw1_tg || exit 1;
)&
wait

if $rescore; then
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/eval2000 \
    exp/tri5a/decode_eval2000_fsh_sw1_{tg,fg}
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/rt03 \
    exp/tri5a/decode_rt03_fsh_sw1_{tg,fg}
fi

hours=$(awk '{x += $4 - $3;} END{print x/3600;}' <data/train_fisher/segments)
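# Columns 3 and 4 of a segments file are utterance start/end times in seconds,
# so the awk command above sums the durations and converts to hours; abort if
# the Fisher portion does not contain the expected amount of data.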
! [ $hours == 1915 ] && echo "$0: expected 1915 hours of data, got $hours hours, please check." && exit 1;

steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri5a exp/tri5a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
  11500 3200000 data/train_nodup data/lang exp/tri5a_ali exp/tri6a || exit 1;
(
  graph_dir=exp/tri6a/graph_fsh_sw1_tg
  utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri6a $graph_dir
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/eval2000 exp/tri6a/decode_eval2000_fsh_sw1_tg || exit 1;
  steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/rt03 exp/tri6a/decode_rt03_fsh_sw1_tg || exit 1;
)&
wait

if $rescore; then
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/eval2000 \
    exp/tri6a/decode_eval2000_fsh_sw1_{tg,fg}
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/rt03 \
    exp/tri6a/decode_rt03_fsh_sw1_{tg,fg}
fi

# Optional tri6a alignment for further training purposes
#steps/align_fmllr.sh --nj 200 --cmd "$train_cmd" \
#  data/train_nodup data/lang exp/tri6a exp/tri6a_ali || exit 1;

# The following is the current online-nnet2 recipe, with "multi-splice".
local/online/run_nnet2_ms.sh
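
# Not part of the original recipe: once the decodes above have finished, a
# loop like the one below (the usual Kaldi idiom, using utils/best_wer.sh on
# the sclite .sys reports) can be uncommented to summarize the best WER from
# each decode directory.
# for x in exp/*/decode*; do
#   [ -d $x ] && grep Sum $x/score_*/*.sys 2>/dev/null | utils/best_wer.sh;
# done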