  #!/bin/bash
  
  # Copyright  2015 Tokyo Institute of Technology
  #                 (Authors: Takafumi Moriya, Tomohiro Tanaka and Takahiro Shinozaki)
  #            2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
  # Apache 2.0
  # Acknowledgement  This work was supported by JSPS KAKENHI Grant Number 26280055.
  
  # This recipe is based on the Switchboard corpus recipe, by Arnab Ghoshal,
  # in the egs/swbd/s5c/ directory.
  
  # This is a shell script, but it's recommended that you run the commands one by
  # one by copying and pasting into the shell.
  # Caution: some of the graph creation steps use quite a bit of memory, so you
  # should run this on a machine that has sufficient memory.
  
  . ./cmd.sh
  . ./path.sh
  set -e # exit on error
  
  #: << '#SKIP'
  
  use_dev=false # Use the first 4k sentences from training data as dev set. (39 speakers.)
  
  CSJDATATOP=/export/corpora5/CSJ/USB
  #CSJDATATOP=/db/laputa1/data/processed/public/CSJ ## CSJ database top directory.
  CSJVER=usb  ## Set your CSJ format (dvd or usb).
              ## Usage    :
              ## Case DVD : We assume the CSJ DVDs are copied into this directory with the names dvd1, dvd2, ..., dvd17.
              ##            The necessary directories are dvd3 - dvd17.
              ##            e.g. $ ls $CSJDATATOP(DVD) => 00README.txt dvd1 dvd2 ... dvd17
              ##
              ## Case USB : The necessary directories are MORPH/SDB and WAV.
              ##            e.g. $ ls $CSJDATATOP(USB) => 00README.txt DOC MORPH ... WAV fileList.csv
              ## Case merl: MERL setup. The necessary directories are WAV and sdb.
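  ## A minimal sanity check of the expected layout (a sketch only, based on the
  ## directory names listed above; adjust to your local copy before relying on it):
  # case $CSJVER in
  #   usb) [ -d $CSJDATATOP/WAV -a -d $CSJDATATOP/MORPH/SDB ] || echo "Warning: $CSJDATATOP does not look like a CSJ USB layout";;
  #   dvd) [ -d $CSJDATATOP/dvd3 -a -d $CSJDATATOP/dvd17 ] || echo "Warning: $CSJDATATOP does not look like a CSJ DVD layout";;
  # esac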
  
  if [ ! -e data/csj-data/.done_make_all ]; then
   echo "CSJ transcription file does not exist"
   #local/csj_make_trans/csj_autorun.sh <RESOUCE_DIR> <MAKING_PLACE(no change)> || exit 1;
   local/csj_make_trans/csj_autorun.sh $CSJDATATOP data/csj-data $CSJVER
  fi
  wait
  
  [ ! -e data/csj-data/.done_make_all ]\
      && echo "Not finished processing CSJ data" && exit 1;
  
  # Prepare the Corpus of Spontaneous Japanese (CSJ) data.
  # This converts the CSJ data into Kaldi format, following the Switchboard recipe.
  # local/csj_data_prep.sh <SPEECH_and_TRANSCRIPTION_DATA_DIRECTORY> [ <mode_number> ]
  # mode_number can be 0, 1, 2 or 3:
  #   0 = default, using "Academic lecture" and "other" data
  #   1 = using "Academic lecture" data only
  #   2 = using all data except "dialog" data
  #   3 = using all data
  local/csj_data_prep.sh data/csj-data
  # local/csj_data_prep.sh data/csj-data 1
  # local/csj_data_prep.sh data/csj-data 2
  # local/csj_data_prep.sh data/csj-data 3
  
  local/csj_prepare_dict.sh
  
  utils/prepare_lang.sh --num-sil-states 4 data/local/dict_nosp "<unk>" data/local/lang_nosp data/lang_nosp
  
  # Now train the language models.
  local/csj_train_lms.sh data/local/train/text data/local/dict_nosp/lexicon.txt data/local/lm
  
  # We don't really need all these options for SRILM, since the LM training script
  # does some of the same processing (e.g. -subset -tolower)
  srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
  LM=data/local/lm/csj.o3g.kn.gz
  utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
    data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_csj_tg
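
  # Optionally sanity-check the trained LM with SRILM's ngram tool (a sketch; the
  # held-out text path below is an assumption -- point it at whatever dev/held-out
  # text your LM training step wrote under data/local/lm):
  # ngram -order 3 -unk -lm $LM -ppl data/local/lm/heldout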
  
  # Data preparation and formatting for the evaluation sets.
  # CSJ has 3 evaluation sets (eval1, eval2 and eval3).
  #local/csj_eval_data_prep.sh <SPEECH_and_TRANSCRIPTION_DATA_DIRECTORY_ABOUT_EVALUATION_DATA> <EVAL_NUM>
  for eval_num in eval1 eval2 eval3 ; do
      local/csj_eval_data_prep.sh data/csj-data/eval $eval_num
  done
  
  # Now make MFCC features.
  # mfccdir should be some place with a largish disk where you
  # want to store MFCC features.
  mfccdir=mfcc
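  # If local disk space is tight, the feature storage can be spread over other
  # disks before running make_mfcc (a sketch; the /export/... paths are
  # placeholders, utils/create_split_dir.pl is a standard Kaldi helper):
  # utils/create_split_dir.pl /export/disk0{1,2}/$USER/kaldi-data/csj/mfcc/storage $mfccdir/storage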
  
  for x in train eval1 eval2 eval3; do
    steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \
      data/$x exp/make_mfcc/$x $mfccdir
    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
    utils/fix_data_dir.sh data/$x
  done
  
  echo "Finish creating MFCCs"
  
  #SKIP
  
  ##### Training and Decoding steps start from here #####
  
  # Use the first 4k sentences as dev set.  Note: when we trained the LM, we used
  # the 1st 10k sentences as dev set, so the 1st 4k won't have been used in the
  # LM training data.   However, they will be in the lexicon, plus speakers
  # may overlap, so it's still not quite equivalent to a test set.
  
  if $use_dev ;then
      dev_set=train_dev
      utils/subset_data_dir.sh --first data/train 4000 data/$dev_set # 6hr 31min
      n=$[`cat data/train/segments | wc -l` - 4000]
      utils/subset_data_dir.sh --last data/train $n data/train_nodev
  else
      cp -r data/train data/train_nodev
  fi
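
  # Optional check related to the note above: count how many dev-set speakers
  # also appear in the remaining training data (only meaningful when
  # use_dev=true; some speaker overlap is expected here).
  # comm -12 <(cut -d' ' -f2 data/train_dev/utt2spk | sort -u) \
  #          <(cut -d' ' -f2 data/train_nodev/utt2spk | sort -u) | wc -l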
  
  # Calculate the total duration of the utterance segments.
  # perl -ane '$s+=($F[3]-$F[2]); END{$h=int($s/3600); $r=($s-$h*3600); $m=int($r/60); $r-=$m*60; printf "%.1f sec -- %d:%d:%.1f\n", $s, $h, $m, $r;}' data/train/segments
  
  # Now-- there are 162k utterances (240hr 8min), and we want to start the
  # monophone training on relatively short utterances (easier to align), but want
  # to exclude the shortest ones.
  # Therefore, we first take the 100k shortest utterances and then take a
  # random 30k subset of those (about 8hr 9min) for monophone training.
  utils/subset_data_dir.sh --shortest data/train_nodev 100000 data/train_100kshort
  utils/subset_data_dir.sh data/train_100kshort 30000 data/train_30kshort
  
  # Take the first 100k utterances (about half the data); we'll use
  # this for later stages of training.
  utils/subset_data_dir.sh --first data/train_nodev 100000 data/train_100k
  utils/data/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup  # 147hr 6min
  
  # Finally, the full training set:
  utils/data/remove_dup_utts.sh 300 data/train_nodev data/train_nodup  # 233hr 36min
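
  # Optional: report the size (in utterances) of each training subset just created:
  # for d in train_30kshort train_100k_nodup train_nodup; do
  #   echo -n "$d: "; wc -l < data/$d/utt2spk
  # done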
  
  ## Starting basic training on MFCC features
  steps/train_mono.sh --nj 50 --cmd "$train_cmd" \
    data/train_30kshort data/lang_nosp exp/mono
  
  steps/align_si.sh --nj 50 --cmd "$train_cmd" \
    data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali
  
  steps/train_deltas.sh --cmd "$train_cmd" \
    3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1
  
  graph_dir=exp/tri1/graph_csj_tg
  $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri1 $graph_dir
  for eval_num in eval1 eval2 eval3 $dev_set ; do
      steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
  	$graph_dir data/$eval_num exp/tri1/decode_${eval_num}_csj
  done
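
  # Optional: summarize tri1 WERs so far (same pattern as the scoring loop at the
  # bottom of this script):
  # for x in exp/tri1/decode_*_csj; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done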
  
  steps/align_si.sh --nj 50 --cmd "$train_cmd" \
    data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali
  
  steps/train_deltas.sh --cmd "$train_cmd" \
    4000 70000 data/train_100k_nodup data/lang_nosp exp/tri1_ali exp/tri2
  
  # The previous mkgraph run might still be writing to this file.  If it is no
  # longer running, you can remove this wait loop and the mkgraph below will
  # create the file itself.
  while [ ! -s data/lang_nosp_csj_tg/tmp/CLG_3_1.fst ]; do sleep 60; done
  sleep 20; # in case still writing.
  graph_dir=exp/tri2/graph_csj_tg
  $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri2 $graph_dir
  for eval_num in eval1 eval2 eval3 $dev_set ; do
      steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
  	$graph_dir data/$eval_num exp/tri2/decode_${eval_num}_csj
  done
  
  # From here on, we move to the LDA+MLLT system.
  steps/align_si.sh --nj 50 --cmd "$train_cmd" \
    data/train_100k_nodup data/lang_nosp exp/tri2 exp/tri2_ali_100k_nodup
  
  # From now, we start using all of the data (except some duplicates of common
  # utterances, which don't really contribute much).
  steps/align_si.sh --nj 50 --cmd "$train_cmd" \
    data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup
  
  # Train an LDA+MLLT system on all of the (nodup) data.
  steps/train_lda_mllt.sh --cmd "$train_cmd" \
    6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3
  
  graph_dir=exp/tri3/graph_csj_tg
  $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri3 $graph_dir
  for eval_num in eval1 eval2 eval3 $dev_set ; do
      steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
  	$graph_dir data/$eval_num exp/tri3/decode_${eval_num}_csj_nosp
  done
  
  # Now we compute the pronunciation and silence probabilities from training data,
  # and re-create the lang directory.
  steps/get_prons.sh --cmd "$train_cmd" data/train_nodup data/lang_nosp exp/tri3
  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_nosp exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \
    exp/tri3/pron_bigram_counts_nowb.txt data/local/dict
  
  utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
  LM=data/local/lm/csj.o3g.kn.gz
  srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
  utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
    data/lang $LM data/local/dict/lexicon.txt data/lang_csj_tg
  
  graph_dir=exp/tri3/graph_csj_tg
  $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_csj_tg exp/tri3 $graph_dir
  for eval_num in eval1 eval2 eval3 $dev_set ; do
      steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
          $graph_dir data/$eval_num exp/tri3/decode_${eval_num}_csj
  done
  
  
  # Train tri4, which is LDA+MLLT+SAT, on all the (nodup) data.
  steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
    data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup
  
  steps/train_sat.sh  --cmd "$train_cmd" \
    11500 200000 data/train_nodup data/lang exp/tri3_ali_nodup exp/tri4
  
  graph_dir=exp/tri4/graph_csj_tg
  $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_csj_tg exp/tri4 $graph_dir
  for eval_num in eval1 eval2 eval3 $dev_set ; do
      steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
  	$graph_dir data/$eval_num exp/tri4/decode_${eval_num}_csj
  done
  
  steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
    data/train_nodup data/lang exp/tri4 exp/tri4_ali_nodup || exit 1
  
  # You can run a DNN training script [e.g. local/chain/run_dnn.sh] from this point.
  
  # MMI training
  # local/run_mmi.sh
  
  # this will help find issues with the lexicon.
  # steps/cleanup/debug_lexicon.sh --nj 300 --cmd "$train_cmd" data/train_nodev data/lang exp/tri4 data/local/dict/lexicon.txt exp/debug_lexicon
  
  # SGMM system
  # local/run_sgmm2.sh
  
  #SKIP
  
  ##### Start DNN training #####
  # Karel's DNN recipe on top of fMLLR features
  # local/nnet/run_dnn.sh
  
  # nnet3 TDNN+Chain 
  local/chain/run_tdnn.sh
  
  # nnet3 TDNN recipe
  # local/nnet3/run_tdnn.sh
  
  ##### Start RNN-LM training for rescoring #####
  # local/csj_run_rnnlm.sh
  
  # getting results (see RESULTS file)
  # for eval_num in eval1 eval2 eval3 $dev_set ; do
  #     echo "=== evaluation set $eval_num ===" ;
  #     for x in exp/{tri,dnn}*/decode_${eval_num}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done ;
  # done