  #!/bin/bash
  
  # It's best to run the commands in this script one by one.
  . ./cmd.sh
  . ./path.sh
  mfccdir=mfcc
  set -e
  rescore=true
  
  # check for kaldi_lm
  if ! which get_word_map.pl >/dev/null; then
    echo "This recipe requires installation of tools/kaldi_lm. Please run extras/install_kaldi_lm.sh in tools/" && exit 1;
  fi
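  # (kaldi_lm provides train_lm.sh, which local/fisher_train_lms.sh below uses.)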
  
  # prepare fisher data and put it under data/train_fisher
  local/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
     /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
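  # The first two arguments are the Fisher transcript releases (LDC2004T19,
  # LDC2005T19) and the last two are the corresponding audio releases
  # (LDC2004S13, LDC2005S13); adjust the paths to match your local LDC layout.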
  
  # at BUT:
  ####local/fisher_data_prep.sh /mnt/matylda6/jhu09/qpovey/FISHER/LDC2005T19 /mnt/matylda2/data/FISHER/
  
  local/swbd1_data_download.sh /export/corpora3/LDC/LDC97S62
  
  # prepare dictionary and acronym mapping list
  local/fisher_swbd_prepare_dict.sh
  
  # prepare swbd data and put it under data/train_swbd
  local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62
  # local/swbd1_data_prep.sh /data/corpora0/LDC97S62
  # local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2
  # local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1
  
  utils/prepare_lang.sh data/local/dict_nosp \
      "<unk>" data/local/lang_nosp data/lang_nosp
  
  # An LM trained only on swbd could alternatively be used for decoding:
  #fisher_opt="--fisher /scail/group/deeplearning/speech/datasets/LDC2004T19-Fisher-Transcripts"
  #local/swbd1_train_lms.sh $fisher_opt \
  #  data/local/train_swbd/text data/local/dict/lexicon.txt data/local/lm
  
  # merge two datasets into one
  mkdir -p data/train_all
  for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
    cat data/train_fisher/$f data/train_swbd/$f > data/train_all/$f
  done
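  # Each of these files is a plain-text table keyed on its first column, e.g.
  #   text:     <utterance-id> <transcript>
  #   segments: <utterance-id> <recording-id> <start-time> <end-time>
  #   wav.scp:  <recording-id> <wav path, or a command producing a wav on stdout, ending in |>
  # so plain concatenation merges the two corpora; fix_data_dir.sh below re-sorts them.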
  
  # LM for train_all
  local/fisher_train_lms.sh
  #local/fisher_create_test_lang.sh
  # Compile G.fst for the trigram LM
  LM=data/local/lm/3gram-mincount/lm_unpruned.gz
  srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
  utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
    data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_fsh_sw1_tg
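  # format_lm_sri.sh uses SRILM (with the options above) to restrict the ARPA LM
  # to the lexicon's vocabulary, then compiles it into G.fst in data/lang_nosp_fsh_sw1_tg.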
  
  LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz
  [ -f $LM_fg ] || rescore=false
  if $rescore; then
    utils/build_const_arpa_lm.sh $LM_fg data/lang_nosp data/lang_nosp_fsh_sw1_fg
  fi
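  # build_const_arpa_lm.sh converts the 4-gram ARPA LM into Kaldi's ConstArpaLm
  # format (G.carpa), used later by steps/lmrescore_const_arpa.sh for cheap
  # lattice rescoring; if the 4-gram LM is missing, rescoring is skipped.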
  
  # Prepare Eval2000 and RT-03 test sets
  
  #local/eval2000_data_prep.sh /scail/group/deeplearning/speech/datasets/LDC2002S09/hub5e_00/ /scail/group/deeplearning/speech/datasets/LDC2002T43 || exit 1
  local/eval2000_data_prep.sh /export/corpora/LDC/LDC2002S09/hub5e_00 /export/corpora/LDC/LDC2002T43 || exit 1
  
  #local/rt03_data_prep.sh /scail/group/deeplearning/speech/datasets/rt_03 || exit 1
  local/rt03_data_prep.sh /export/corpora/LDC/LDC2007S10 || exit 1
  
  utils/fix_data_dir.sh data/train_all
  
  
  # Make MFCCs for the training set.
  # Spread the MFCCs over various machines, as this dataset is quite large.
  if [[  $(hostname -f) ==  *.clsp.jhu.edu ]]; then
    mfcc=$(basename $mfccdir) # in case was absolute pathname (unlikely), get basename.
    utils/create_split_dir.pl /export/b{05,06,07,08}/$USER/kaldi-data/egs/fisher_swbd/s5/$mfcc/storage \
      $mfccdir/storage
  fi
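  # create_split_dir.pl creates the listed directories and puts symlinks to them
  # under $mfccdir/storage, so the feature archives get spread across several
  # filesystems; this block only applies on the CLSP grid and is skipped elsewhere.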
  steps/make_mfcc.sh --nj 100 --cmd "$train_cmd" data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
  utils/fix_data_dir.sh data/train_all
  utils/validate_data_dir.sh data/train_all
  steps/compute_cmvn_stats.sh data/train_all exp/make_mfcc/train_all $mfccdir
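  # compute_cmvn_stats.sh writes per-speaker cepstral mean/variance statistics
  # (data/train_all/cmvn.scp), which later stages apply to the features on the fly.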
  
  # Subset the swbd features and put them back into train_swbd in case separate training is needed
  awk '{print $1}' data/train_swbd/spk2utt > data/swbd_spklist
  utils/subset_data_dir.sh --spk-list data/swbd_spklist data/train_all data/train_swbd
  steps/compute_cmvn_stats.sh data/train_swbd exp/make_mfcc/train_all $mfccdir
  
  # Make MFCCs for the test sets
  utils/fix_data_dir.sh data/eval2000
  steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/eval2000 exp/make_mfcc/eval2000 $mfccdir || exit 1;
  steps/compute_cmvn_stats.sh data/eval2000 exp/make_mfcc/eval2000 $mfccdir
  
  utils/fix_data_dir.sh data/rt03
  steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/rt03 exp/make_mfcc/rt03 $mfccdir || exit 1;
  steps/compute_cmvn_stats.sh data/rt03 exp/make_mfcc/rt03 $mfccdir
  
  utils/fix_data_dir.sh data/eval2000
  utils/validate_data_dir.sh data/eval2000
  
  utils/fix_data_dir.sh data/rt03
  utils/validate_data_dir.sh data/rt03
  
  n=$(wc -l <data/train_all/segments)
  utils/subset_data_dir.sh --last data/train_all $n data/train
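  # n is the full segment count, so this effectively copies data/train_all to data/train.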
  
  # Now there are 2.1 million utterances, and we want to start the monophone
  # training on relatively short utterances (easier to align), but not only the
  # very shortest ones (mostly "uh-huh").  So take the 100k shortest ones, and
  # then take 10k random utterances from those.  We also take these subsets from
  # Switchboard, which has more carefully hand-labeled alignments.
  
  utils/subset_data_dir.sh --shortest data/train_swbd 100000 data/train_100kshort
  utils/data/remove_dup_utts.sh 10 data/train_100kshort data/train_100kshort_nodup
  utils/subset_data_dir.sh  data/train_100kshort_nodup 10000 data/train_10k_nodup
  
  utils/subset_data_dir.sh --speakers data/train_swbd 30000 data/train_30k
  utils/subset_data_dir.sh --speakers data/train_swbd 100000 data/train_100k
  
  utils/data/remove_dup_utts.sh 200 data/train_30k data/train_30k_nodup
  utils/data/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup
  utils/data/remove_dup_utts.sh 300 data/train data/train_nodup
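  # remove_dup_utts.sh caps the number of utterances sharing an identical
  # transcript (at 10, 200 and 300 copies here), so that fillers like "uh-huh"
  # don't dominate the training data.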
  
  # The next commands are not necessary for the scripts to run, but they increase
  # the efficiency of data access by putting the MFCCs of each subset in a
  # contiguous place in one file.
  ( . ./path.sh;
    # make sure mfccdir is defined as above.
    for subset in 10k 30k 100k; do
      cp data/train_${subset}_nodup/feats.scp{,.bak}
      copy-feats scp:data/train_${subset}_nodup/feats.scp \
        ark,scp:$mfccdir/kaldi_fish_${subset}_nodup.ark,$mfccdir/kaldi_fish_${subset}_nodup.scp \
        && cp $mfccdir/kaldi_fish_${subset}_nodup.scp data/train_${subset}_nodup/feats.scp
    done
  )
  
  # Start training on the Switchboard subset, which has cleaner alignments
  steps/train_mono.sh --nj 3 --cmd "$train_cmd" \
    data/train_10k_nodup data/lang_nosp exp/mono0a
  
  steps/align_si.sh --nj 10 --cmd "$train_cmd" \
     data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1;
  
  steps/train_deltas.sh --cmd "$train_cmd" \
      3200 30000 data/train_30k_nodup data/lang_nosp exp/mono0a_ali exp/tri1a || exit 1;
  #used to be 2500 20000
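  # The two numeric arguments to train_deltas.sh are the number of tied
  # context-dependent states (tree leaves) and the total number of Gaussians;
  # they grow in the later stages as more data is used.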
  (
   graph_dir=exp/tri1a/graph_nosp_fsh_sw1_tg
   utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri1a $graph_dir
   steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri1a/decode_eval2000_nosp_fsh_sw1_tg
   steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri1a/decode_rt03_nosp_fsh_sw1_tg
  )&
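  # Decoding runs in a background subshell (the trailing "&") so training can
  # continue; the "wait" further down blocks until all decodes finish.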
  steps/align_si.sh --nj 10 --cmd "$train_cmd" \
     data/train_30k_nodup data/lang_nosp exp/tri1a exp/tri1a_ali || exit 1;
  
  steps/train_deltas.sh --cmd "$train_cmd" \
      3200 30000 data/train_30k_nodup data/lang_nosp exp/tri1a_ali exp/tri1b || exit 1;
  #used to be 2500 20000
  (
   graph_dir=exp/tri1b/graph_nosp_fsh_sw1_tg
   utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri1b $graph_dir
   steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri1b/decode_eval2000_nosp_fsh_sw1_tg
   steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri1b/decode_rt03_nosp_fsh_sw1_tg
  )&
  steps/align_si.sh --nj 50 --cmd "$train_cmd" \
     data/train_100k_nodup data/lang_nosp exp/tri1b exp/tri1b_ali || exit 1;
  
  steps/train_deltas.sh --cmd "$train_cmd" \
      5500 90000 data/train_100k_nodup data/lang_nosp exp/tri1b_ali exp/tri2 || exit 1;
   #used to be 2500 20000 on 30k
  (
    graph_dir=exp/tri2/graph_nosp_fsh_sw1_tg
    utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri2 $graph_dir || exit 1;
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri2/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri2/decode_rt03_nosp_fsh_sw1_tg || exit 1;
  )&
  
  # Train tri3a, the last speaker-independent triphone stage,
  # on the whole Switchboard training set
  steps/align_si.sh --nj 100 --cmd "$train_cmd" \
     data/train_swbd data/lang_nosp exp/tri2 exp/tri2_ali || exit 1;
  
  steps/train_deltas.sh --cmd "$train_cmd" \
      11500 200000 data/train_swbd data/lang_nosp exp/tri2_ali exp/tri3a || exit 1;
   #used to be 2500 20000
  
  (
    graph_dir=exp/tri3a/graph_nosp_fsh_sw1_tg
    utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3a $graph_dir || exit 1;
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri3a/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri3a/decode_rt03_nosp_fsh_sw1_tg || exit 1;
  )&
  
  # Train tri3b, which is LDA+MLLT on the whole Switchboard+Fisher training set
  steps/align_si.sh --nj 100 --cmd "$train_cmd" \
    data/train_nodup data/lang_nosp exp/tri3a exp/tri3a_ali || exit 1;
  
  steps/train_lda_mllt.sh --cmd "$train_cmd" \
     --splice-opts "--left-context=3 --right-context=3" \
     11500 400000 data/train_nodup data/lang_nosp exp/tri3a_ali exp/tri3b || exit 1;
  (
    graph_dir=exp/tri3b/graph_nosp_fsh_sw1_tg
    utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3b $graph_dir || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri3b/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri3b/decode_rt03_nosp_fsh_sw1_tg || exit 1;
  )&
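  # decode_fmllr.sh decodes in two passes: a first pass produces initial
  # hypotheses from which per-speaker fMLLR transforms are estimated, and a
  # second pass re-decodes on the adapted features.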
  
  steps/get_prons.sh --cmd "$train_cmd" data/train_nodup data/lang_nosp exp/tri3b
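  # get_prons.sh counts, from the tri3b alignments, how often each pronunciation
  # (and surrounding silence) was used; dict_dir_add_pronprobs.sh below folds
  # these counts into a new dictionary with pronunciation probabilities,
  # replacing the "nosp" (no pronunciation probabilities) setup used so far.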
  
  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_nosp exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \
    exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict
  
  utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
  
  LM=data/local/lm/3gram-mincount/lm_unpruned.gz
  srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
  utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
    data/lang $LM data/local/dict/lexicon.txt data/lang_fsh_sw1_tg
  
  LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz
  if $rescore; then
    utils/build_const_arpa_lm.sh $LM_fg data/lang data/lang_fsh_sw1_fg
  fi
  
  (
    graph_dir=exp/tri3b/graph_fsh_sw1_tg
    utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri3b $graph_dir || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri3b/decode_eval2000_fsh_sw1_tg || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri3b/decode_rt03_fsh_sw1_tg || exit 1;
  )&
  
  # Next we'll use fMLLR and train with SAT (i.e. on
  # fMLLR features)
  
  steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
    data/train_nodup data/lang exp/tri3b exp/tri3b_ali || exit 1;
  
  steps/train_sat.sh  --cmd "$train_cmd" \
    11500 800000 data/train_nodup data/lang exp/tri3b_ali  exp/tri4a || exit 1;
  
  (
    graph_dir=exp/tri4a/graph_fsh_sw1_tg
    utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri4a $graph_dir
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri4a/decode_eval2000_fsh_sw1_tg || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri4a/decode_rt03_fsh_sw1_tg || exit 1;
  )&
  wait
  
  if $rescore; then
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
      data/lang_fsh_sw1_{tg,fg} data/eval2000 \
      exp/tri4a/decode_eval2000_fsh_sw1_{tg,fg}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
      data/lang_fsh_sw1_{tg,fg} data/rt03 \
      exp/tri4a/decode_rt03_fsh_sw1_{tg,fg}
  fi
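  # The {tg,fg} brace expansions pass both the trigram and 4-gram directories,
  # e.g. data/lang_fsh_sw1_{tg,fg} expands to
  # "data/lang_fsh_sw1_tg data/lang_fsh_sw1_fg"; the trigram lattices are
  # rescored with the ConstArpaLm 4-gram built earlier.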
  
  steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
    data/train_nodup data/lang exp/tri4a exp/tri4a_ali || exit 1;
  
  steps/train_sat.sh  --cmd "$train_cmd" \
    11500 1600000 data/train_nodup data/lang exp/tri4a_ali  exp/tri5a || exit 1;
  
  (
    graph_dir=exp/tri5a/graph_fsh_sw1_tg
    utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri5a $graph_dir
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri5a/decode_eval2000_fsh_sw1_tg || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri5a/decode_rt03_fsh_sw1_tg || exit 1;
  )&
  wait
  
  if $rescore; then
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
      data/lang_fsh_sw1_{tg,fg} data/eval2000 \
      exp/tri5a/decode_eval2000_fsh_sw1_{tg,fg}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
      data/lang_fsh_sw1_{tg,fg} data/rt03 \
      exp/tri5a/decode_rt03_fsh_sw1_{tg,fg}
  fi
  
  # Sanity check on the Fisher data: it should amount to roughly 1915 hours.
  hours=$(awk '{x += $4 - $3;} END{print int(x/3600);}' <data/train_fisher/segments)
  ! [ $hours -eq 1915 ] && echo "$0: expected 1915 hours of data, got $hours hours, please check." && exit 1;
  
  steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
    data/train_nodup data/lang exp/tri5a exp/tri5a_ali || exit 1;
  
  
  steps/train_sat.sh  --cmd "$train_cmd" \
    11500 3200000 data/train_nodup data/lang exp/tri5a_ali  exp/tri6a || exit 1;
  
  (
    graph_dir=exp/tri6a/graph_fsh_sw1_tg
    utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri6a $graph_dir
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/eval2000 exp/tri6a/decode_eval2000_fsh_sw1_tg || exit 1;
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     $graph_dir data/rt03 exp/tri6a/decode_rt03_fsh_sw1_tg || exit 1;
  )&
  wait
  if $rescore; then
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
      data/lang_fsh_sw1_{tg,fg} data/eval2000 \
      exp/tri6a/decode_eval2000_fsh_sw1_{tg,fg}
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
      data/lang_fsh_sw1_{tg,fg} data/rt03 \
      exp/tri6a/decode_rt03_fsh_sw1_{tg,fg}
  fi
  
  # Optional tri6a alignment for further training purposes
  
  #steps/align_fmllr.sh --nj 200 --cmd "$train_cmd" \
  #  data/train_nodup data/lang exp/tri6a exp/tri6a_ali || exit 1;
  
  # The following is the current online-nnet2 recipe, with "multi-splice".
  local/online/run_nnet2_ms.sh