egs/csj/s5/run.sh
#!/bin/bash

# Copyright 2015 Tokyo Institute of Technology
#               (Authors: Takafumi Moriya, Tomohiro Tanaka and Takahiro Shinozaki)
#           2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0
# Acknowledgement: This work was supported by JSPS KAKENHI Grant Number 26280055.

# This recipe is based on the Switchboard corpus recipe, by Arnab Ghoshal,
# in the egs/swbd/s5c/ directory.

# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
# Caution: some of the graph creation steps use quite a bit of memory, so you
# should run this on a machine that has sufficient memory.

. ./cmd.sh
. ./path.sh
set -e # exit on error

#: << '#SKIP'

use_dev=false # Use the first 4k sentences from training data as dev set. (39 speakers.)

CSJDATATOP=/export/corpora5/CSJ/USB
#CSJDATATOP=/db/laputa1/data/processed/public/CSJ ## CSJ database top directory.
CSJVER=usb  ## Set your CSJ format (dvd or usb).
            ## Usage    :
            ## Case DVD : We assume CSJ DVDs are copied in this directory with the names dvd1, dvd2, ..., dvd17.
            ##            Necessary directories are dvd3 - dvd17.
            ##            e.g. $ ls $CSJDATATOP(DVD) => 00README.txt dvd1 dvd2 ... dvd17
            ##
            ## Case USB : Necessary directories are MORPH/SDB and WAV.
            ##            e.g. $ ls $CSJDATATOP(USB) => 00README.txt DOC MORPH ... WAV fileList.csv
            ## Case merl: MERL setup. Necessary directories are WAV and sdb.

if [ ! -e data/csj-data/.done_make_all ]; then
  echo "CSJ transcription file does not exist"
  #local/csj_make_trans/csj_autorun.sh <RESOURCE_DIR> <MAKING_PLACE(no change)> || exit 1;
  local/csj_make_trans/csj_autorun.sh $CSJDATATOP data/csj-data $CSJVER
fi
wait

[ ! -e data/csj-data/.done_make_all ] \
  && echo "Not finished processing CSJ data" && exit 1;

# Prepare Corpus of Spontaneous Japanese (CSJ) data.
# Processing CSJ data to KALDI format based on the Switchboard recipe.
# local/csj_data_prep.sh <SPEECH_and_TRANSCRIPTION_DATA_DIRECTORY> [ <mode_number> ]
# mode_number can be 0, 1, 2, 3:
#   0 = default, using "Academic lecture" and "other" data
#   1 = using "Academic lecture" data
#   2 = using all data except for "dialog" data
#   3 = using all data
local/csj_data_prep.sh data/csj-data
# local/csj_data_prep.sh data/csj-data 1
# local/csj_data_prep.sh data/csj-data 2
# local/csj_data_prep.sh data/csj-data 3

local/csj_prepare_dict.sh

utils/prepare_lang.sh --num-sil-states 4 \
  data/local/dict_nosp "<unk>" data/local/lang_nosp data/lang_nosp

# Now train the language models.
local/csj_train_lms.sh data/local/train/text data/local/dict_nosp/lexicon.txt data/local/lm

# We don't really need all these options for SRILM, since the LM training script
# does some of the same processing (e.g. -subset -tolower).
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
LM=data/local/lm/csj.o3g.kn.gz
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_csj_tg

# Data preparation and formatting for the evaluation sets.
# CSJ has 3 types of evaluation data.
#local/csj_eval_data_prep.sh <SPEECH_and_TRANSCRIPTION_DATA_DIRECTORY_ABOUT_EVALUATION_DATA> <EVAL_NUM>
for eval_num in eval1 eval2 eval3 ; do
  local/csj_eval_data_prep.sh data/csj-data/eval $eval_num
done
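# (Optional; not part of the original recipe.) A hedged sanity check: the
# prepared train/eval directories can be validated before feature extraction.
# --no-feats is needed here because feats.scp has not been created yet.
# for x in train eval1 eval2 eval3; do
#   utils/validate_data_dir.sh --no-feats data/$x
# done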
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
for x in train eval1 eval2 eval3; do
  steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \
    data/$x exp/make_mfcc/$x $mfccdir
  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
  utils/fix_data_dir.sh data/$x
done
echo "Finished creating MFCCs"

#SKIP

##### Training and Decoding steps start from here #####

# Use the first 4k sentences as dev set.  Note: when we trained the LM, we used
# the 1st 10k sentences as dev set, so the 1st 4k won't have been used in the
# LM training data.  However, they will be in the lexicon, plus speakers
# may overlap, so it's still not quite equivalent to a test set.
if $use_dev ; then
  dev_set=train_dev
  utils/subset_data_dir.sh --first data/train 4000 data/$dev_set # 6hr 31min
  n=$[`cat data/train/segments | wc -l` - 4000]
  utils/subset_data_dir.sh --last data/train $n data/train_nodev
else
  cp -r data/train data/train_nodev
fi

# Calculate the total amount of speech in the utterance segmentations.
# perl -ne 'split; $s+=($_[3]-$_[2]); END{$h=int($s/3600); $r=($s-$h*3600); $m=int($r/60); $r-=$m*60; printf "%.1f sec -- %d:%d:%.1f ", $s, $h, $m, $r;}' data/train/segments

# Now there are 162k utterances (240hr 8min), and we want to start the
# monophone training on relatively short utterances (easier to align), but want
# to exclude the shortest ones.
# Therefore, we first take the 100k shortest ones, and then take 30k random
# utterances from those (about 8hr 9min).
utils/subset_data_dir.sh --shortest data/train_nodev 100000 data/train_100kshort
utils/subset_data_dir.sh data/train_100kshort 30000 data/train_30kshort

# Take the first 100k utterances (about half the data); we'll use
# this for later stages of training.
utils/subset_data_dir.sh --first data/train_nodev 100000 data/train_100k
utils/data/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup # 147hr 6min

# Finally, the full training set:
utils/data/remove_dup_utts.sh 300 data/train_nodev data/train_nodup # 233hr 36min

## Start basic training on MFCC features.
steps/train_mono.sh --nj 50 --cmd "$train_cmd" \
  data/train_30kshort data/lang_nosp exp/mono

steps/align_si.sh --nj 50 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali

steps/train_deltas.sh --cmd "$train_cmd" \
  3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1

graph_dir=exp/tri1/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
  utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri1 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri1/decode_${eval_num}_csj
done

steps/align_si.sh --nj 50 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali

steps/train_deltas.sh --cmd "$train_cmd" \
  4000 70000 data/train_100k_nodup data/lang_nosp exp/tri1_ali exp/tri2

# The previous mkgraph might still be writing to this file.  If it is no longer
# running, you can remove this loop and the mkgraph below will create the file.
while [ ! -s data/lang_nosp_csj_tg/tmp/CLG_3_1.fst ]; do sleep 60; done
sleep 20; # in case still writing.
graph_dir=exp/tri2/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
  utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri2 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri2/decode_${eval_num}_csj
done

# From now on, we move to the LDA+MLLT system.
steps/align_si.sh --nj 50 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang_nosp exp/tri2 exp/tri2_ali_100k_nodup

# From now on, we use all of the data (except some duplicates of common
# utterances, which don't really contribute much).
steps/align_si.sh --nj 50 --cmd "$train_cmd" \
  data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup

# Train the LDA+MLLT system on all the data.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
  6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3

graph_dir=exp/tri3/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
  utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri3 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri3/decode_${eval_num}_csj_nosp
done

# Now we compute the pronunciation and silence probabilities from training data,
# and re-create the lang directory.
steps/get_prons.sh --cmd "$train_cmd" data/train_nodup data/lang_nosp exp/tri3
utils/dict_dir_add_pronprobs.sh --max-normalize true \
  data/local/dict_nosp exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \
  exp/tri3/pron_bigram_counts_nowb.txt data/local/dict

utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang

LM=data/local/lm/csj.o3g.kn.gz
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  data/lang $LM data/local/dict/lexicon.txt data/lang_csj_tg

graph_dir=exp/tri3/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
  utils/mkgraph.sh data/lang_csj_tg exp/tri3 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri3/decode_${eval_num}_csj
done

# Train tri4, which is LDA+MLLT+SAT, on all the (nodup) data.
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup

steps/train_sat.sh --cmd "$train_cmd" \
  11500 200000 data/train_nodup data/lang exp/tri3_ali_nodup exp/tri4

graph_dir=exp/tri4/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
  utils/mkgraph.sh data/lang_csj_tg exp/tri4 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri4/decode_${eval_num}_csj
done

steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri4 exp/tri4_ali_nodup || exit 1

# You can execute a DNN training script [e.g. local/chain/run_dnn.sh] from here.

# MMI training
# local/run_mmi.sh

# This will help find issues with the lexicon.
# steps/cleanup/debug_lexicon.sh --nj 300 --cmd "$train_cmd" data/train_nodev data/lang exp/tri4 data/local/dict/lexicon.txt exp/debug_lexicon

# SGMM system
# local/run_sgmm2.sh

#SKIP

##### Start DNN training #####

# Karel's DNN recipe on top of fMLLR features
# local/nnet/run_dnn.sh

# nnet3 TDNN+Chain recipe
local/chain/run_tdnn.sh

# nnet3 TDNN recipe
# local/nnet3/run_tdnn.sh

##### Start RNN-LM training for rescoring #####
# local/csj_run_rnnlm.sh

# Getting results (see the RESULTS file):
# for eval_num in eval1 eval2 eval3 $dev_set ; do
#   echo "=== evaluation set $eval_num ===" ;
#   for x in exp/{tri,dnn}*/decode_${eval_num}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done ;
# done
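# The same best-WER lookup can also be run on a single decode directory; a
# minimal sketch (exp/tri4/decode_eval1_csj is only an illustrative directory
# name, and scoring must already have produced the wer_* files there):
# grep WER exp/tri4/decode_eval1_csj/wer_* | utils/best_wer.sh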