egs/librispeech/s5/run.sh
  #!/bin/bash
  
  
  # Set this to somewhere where you want to put your data, or where
  # someone else has already put it.  You'll want to change this
  # if you're not on the CLSP grid.
  data=/export/a15/vpanayotov/data
  
  # base url for downloads.
  data_url=www.openslr.org/resources/12
  lm_url=www.openslr.org/resources/11
  mfccdir=mfcc
  stage=1
  
  . ./cmd.sh
  . ./path.sh
  . parse_options.sh
  
  # you might not want to do this for interactive shells.
  set -e
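
  # All of the variables set above can be overridden from the command line
  # (parse_options.sh turns "--stage N", "--data DIR", etc. into assignments),
  # so a partially completed run can be resumed without redoing earlier stages,
  # e.g. (the path below is only a placeholder for your own download directory):
  #   ./run.sh --stage 8 --data /path/to/your/LibriSpeech/download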
  
  
  if [ $stage -le 1 ]; then
    # download the data.  Note: we're using the 100 hour setup for
    # now; later in the script we'll download more and use it to train neural
    # nets.
    for part in dev-clean test-clean dev-other test-other train-clean-100; do
      local/download_and_untar.sh $data $data_url $part
    done
  
  
    # download the LM resources
    local/download_lm.sh $lm_url data/local/lm
  fi
  
  if [ $stage -le 2 ]; then
    # format the data as Kaldi data directories
    for part in dev-clean test-clean dev-other test-other train-clean-100; do
      # use underscore-separated names in data directories.
      local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g)
    done
  fi
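
  # (optional) each data/<part> directory created above is a standard Kaldi data
  # directory (wav.scp, text, utt2spk, spk2utt, ...); a quick sanity check on
  # whichever part you like is:
  #   utils/validate_data_dir.sh --no-feats data/train_clean_100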
  
  ## Optional text corpus normalization and LM training
  ## These scripts are here primarily as documentation of the process that was
  ## used to build the LM. Most users of this recipe will NOT need/want to run
  ## this step. The pre-built language models and the pronunciation lexicon, as
  ## well as some intermediate data (e.g. the normalized text used for LM training),
  ## are available for download at http://www.openslr.org/11/
  #local/lm/train_lm.sh $LM_CORPUS_ROOT \
  #  data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm
  
  ## Optional G2P training scripts.
  ## Like the LM training scripts above, this script is intended primarily to
  ## document our G2P model creation process
  #local/g2p/train_g2p.sh data/local/dict/cmudict data/local/lm
  
  if [ $stage -le 3 ]; then
    # when the "--stage 3" option is used below we skip the G2P steps, and use the
    # lexicon we have already downloaded from openslr.org/11/
    local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
     data/local/lm data/local/lm data/local/dict_nosp
  
    utils/prepare_lang.sh data/local/dict_nosp \
     "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
  
    local/format_lms.sh --src-dir data/lang_nosp data/local/lm
  fi
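
  # (optional) if anything looks wrong with the lang directory prepared above,
  # Kaldi ships a consistency checker that can be run on it directly:
  #   utils/validate_lang.pl data/lang_nosp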
  
  if [ $stage -le 4 ]; then
    # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs
    utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \
      data/lang_nosp data/lang_nosp_test_tglarge
    utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \
      data/lang_nosp data/lang_nosp_test_fglarge
  fi
  
  if [ $stage -le 5 ]; then
    # spread the mfccs over various machines, as this data-set is quite large.
    if [[  $(hostname -f) ==  *.clsp.jhu.edu ]]; then
      mfcc=$(basename $mfccdir) # in case mfccdir was an absolute pathname (unlikely), get the basename.
      utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \
       $mfccdir/storage
    fi
  fi
  
  
  if [ $stage -le 6 ]; then
    for part in dev_clean test_clean dev_other test_other train_clean_100; do
      steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/$part exp/make_mfcc/$part $mfccdir
      steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir
    done
  fi
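
  # (optional) quick checks on the extracted features; with the default MFCC
  # configuration feat-to-dim should report 13-dimensional features:
  #   feat-to-dim scp:data/train_clean_100/feats.scp -
  #   utils/validate_data_dir.sh data/train_clean_100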
  
  if [ $stage -le 7 ]; then
    # Make some small data subsets for early system-build stages.  Note: there are
    # 29k utterances in the train_clean_100 directory, which has 100 hours of data.
    # For the monophone stages we select the shortest utterances, which should make it
    # easier to align the data from a flat start.
  
    utils/subset_data_dir.sh --shortest data/train_clean_100 2000 data/train_2kshort
    utils/subset_data_dir.sh data/train_clean_100 5000 data/train_5k
    utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k
  fi
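
  # each subset is itself a normal data directory, so its size can be confirmed
  # with a simple line count (one line per utterance in utt2spk), e.g.:
  #   wc -l data/train_2kshort/utt2spk   # expect 2000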
  
  if [ $stage -le 8 ]; then
    # train a monophone system
    steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
                        data/train_2kshort data/lang_nosp exp/mono
  
    # decode using the monophone model
    (
      utils/mkgraph.sh data/lang_nosp_test_tgsmall \
                       exp/mono exp/mono/graph_nosp_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \
                        data/$test exp/mono/decode_nosp_tgsmall_$test
      done
    )&
  fi
  
  if [ $stage -le 9 ]; then
    steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
                      data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k
  
    # train a first delta + delta-delta triphone system on a subset of 5000 utterances
    steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
                          2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
  
    # decode using the tri1 model
    (
      utils/mkgraph.sh data/lang_nosp_test_tgsmall \
                       exp/tri1 exp/tri1/graph_nosp_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \
                        data/$test exp/tri1/decode_nosp_tgsmall_$test
        steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
                           data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test
      done
    )&
  fi
  
  if [ $stage -le 10 ]; then
    steps/align_si.sh --nj 10 --cmd "$train_cmd" \
                      data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
  
  
    # train an LDA+MLLT system.
    steps/train_lda_mllt.sh --cmd "$train_cmd" \
                            --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
                            data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b
  
    # decode using the LDA+MLLT model
    (
      utils/mkgraph.sh data/lang_nosp_test_tgsmall \
                       exp/tri2b exp/tri2b/graph_nosp_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \
                        data/$test exp/tri2b/decode_nosp_tgsmall_$test
        steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
                           data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test
      done
    )&
  fi
  
  if [ $stage -le 11 ]; then
    # Align a 10k utts subset using the tri2b model
    steps/align_si.sh  --nj 10 --cmd "$train_cmd" --use-graphs true \
                       data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k
  
    # Train tri3b, which is LDA+MLLT+SAT on 10k utts
    steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
                       data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b
  
    # decode using the tri3b model
    (
      utils/mkgraph.sh data/lang_nosp_test_tgsmall \
                       exp/tri3b exp/tri3b/graph_nosp_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
                              exp/tri3b/graph_nosp_tgsmall data/$test \
                              exp/tri3b/decode_nosp_tgsmall_$test
        steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
                           data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test
      done
    )&
  fi
  
  if [ $stage -le 12 ]; then
    # align the entire train_clean_100 subset using the tri3b model
    steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
      data/train_clean_100 data/lang_nosp \
      exp/tri3b exp/tri3b_ali_clean_100
  
    # train another LDA+MLLT+SAT system on the entire 100 hour subset
    steps/train_sat.sh  --cmd "$train_cmd" 4200 40000 \
                        data/train_clean_100 data/lang_nosp \
                        exp/tri3b_ali_clean_100 exp/tri4b
  
    # decode using the tri4b model
    (
      utils/mkgraph.sh data/lang_nosp_test_tgsmall \
                       exp/tri4b exp/tri4b/graph_nosp_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
                              exp/tri4b/graph_nosp_tgsmall data/$test \
                              exp/tri4b/decode_nosp_tgsmall_$test
        steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \
                           data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
          data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test
      done
    )&
  fi
  
  if [ $stage -le 13 ]; then
    # Now we compute the pronunciation and silence probabilities from training data,
    # and re-create the lang directory.
    steps/get_prons.sh --cmd "$train_cmd" \
                       data/train_clean_100 data/lang_nosp exp/tri4b
    utils/dict_dir_add_pronprobs.sh --max-normalize true \
                                    data/local/dict_nosp \
                                    exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \
                                    exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict
  
    utils/prepare_lang.sh data/local/dict \
                          "<UNK>" data/local/lang_tmp data/lang
    local/format_lms.sh --src-dir data/lang data/local/lm
  
    utils/build_const_arpa_lm.sh \
      data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
    utils/build_const_arpa_lm.sh \
      data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
  
    # decode using the tri4b model with pronunciation and silence probabilities
    (
      utils/mkgraph.sh \
        data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
                              exp/tri4b/graph_tgsmall data/$test \
                              exp/tri4b/decode_tgsmall_$test
        steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
                           data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
          data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
          data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test
      done
    )&
  fi
  
  if [ $stage -le 14 ] && false; then
    # This stage was for nnet2 training on 100 hours; it is disabled
    # ("&& false" above) as it's deprecated.
    # align train_clean_100 using the tri4b model
    steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
      data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100
  
    # This nnet2 training script is deprecated.
    local/nnet2/run_5a_clean_100.sh
  fi
  
  if [ $stage -le 15 ]; then
    local/download_and_untar.sh $data $data_url train-clean-360
  
    # now add the "clean-360" subset to the mix ...
    local/data_prep.sh \
      $data/LibriSpeech/train-clean-360 data/train_clean_360
    steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \
                       exp/make_mfcc/train_clean_360 $mfccdir
    steps/compute_cmvn_stats.sh \
      data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir
  
    # ... and then combine the two sets into a 460 hour one
    utils/combine_data.sh \
      data/train_clean_460 data/train_clean_100 data/train_clean_360
  fi
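
  # (optional) to see roughly how much audio the combined set contains, the
  # standard duration helper can be used together with a small awk one-liner:
  #   utils/data/get_utt2dur.sh data/train_clean_460
  #   awk '{sum += $2} END {printf("%.1f hours\n", sum/3600)}' data/train_clean_460/utt2dur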
  
  if [ $stage -le 16 ]; then
    # align the new, combined set, using the tri4b model
    steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
                         data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460
  
    # create a larger SAT model, trained on the 460 hours of data.
    steps/train_sat.sh  --cmd "$train_cmd" 5000 100000 \
                        data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b
  
    # decode using the tri5b model
    (
      utils/mkgraph.sh data/lang_test_tgsmall \
                       exp/tri5b exp/tri5b/graph_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
                              exp/tri5b/graph_tgsmall data/$test \
                              exp/tri5b/decode_tgsmall_$test
        steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
                           data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
          data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
          data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test
      done
    )&
  fi
  
  
  # The following command trains an nnet2 model on the 460 hour setup.  This
  # is deprecated now.
  ## train a NN model on the 460 hour set
  #local/nnet2/run_6a_clean_460.sh
  
  if [ $stage -le 17 ]; then
    # prepare the remaining 500 hours of data
    local/download_and_untar.sh $data $data_url train-other-500
  
    # prepare the 500 hour subset.
    local/data_prep.sh \
      $data/LibriSpeech/train-other-500 data/train_other_500
    steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \
                       exp/make_mfcc/train_other_500 $mfccdir
    steps/compute_cmvn_stats.sh \
      data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir
  
    # combine all the data
    utils/combine_data.sh \
      data/train_960 data/train_clean_460 data/train_other_500
  fi
  
  if [ $stage -le 18 ]; then
    steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \
                         data/train_960 data/lang exp/tri5b exp/tri5b_ali_960
  
    # train a SAT model on the 960 hour mixed data.  Use the train_quick.sh script
    # as it is faster.
    steps/train_quick.sh --cmd "$train_cmd" \
                         7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b
  
    # decode using the tri6b model
    (
      utils/mkgraph.sh data/lang_test_tgsmall \
                       exp/tri6b exp/tri6b/graph_tgsmall
      for test in test_clean test_other dev_clean dev_other; do
        steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
                              exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test
        steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
                           data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
          data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
          data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test
      done
    )&
  fi
  
  
  if [ $stage -le 19 ]; then
    # this does some data-cleaning. The cleaned data should be useful when we add
    # the neural net and chain systems.  (although actually it was pretty clean already.)
    local/run_cleanup_segmentation.sh
  fi
  
  # steps/cleanup/debug_lexicon.sh --remove-stress true  --nj 200 --cmd "$train_cmd" data/train_clean_100 \
  #    data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h
  
  # #Perform rescoring of tri6b by means of faster-rnnlm
  # #Attention: with default settings this requires 4 GB of memory per rescoring job, so it is commented out by default
  # wait && local/run_rnnlm.sh \
  #     --rnnlm-ver "faster-rnnlm" \
  #     --rnnlm-options "-hidden 150 -direct 1000 -direct-order 5" \
  #     --rnnlm-tag "h150-me5-1000" $data data/local/lm
  
  # #Perform rescoring of tri6b by means of faster-rnnlm using noise contrastive estimation
  # #Note that this could be extremely slow without CUDA
  # #We use a smaller direct layer size so that it can be stored in GPU memory (~2 GB)
  # #Surprisingly, the bottleneck here is validation rather than learning
  # #Therefore you can use a smaller validation dataset to speed up training
  # wait && local/run_rnnlm.sh \
  #     --rnnlm-ver "faster-rnnlm" \
  #     --rnnlm-options "-hidden 150 -direct 400 -direct-order 3 --nce 20" \
  #     --rnnlm-tag "h150-me3-400-nce20" $data data/local/lm
  
  
  if [ $stage -le 20 ]; then
    # train and test nnet3 tdnn models on the entire data with data-cleaning.
    local/chain/run_tdnn.sh # set "--stage 11" if you have already run local/nnet3/run_tdnn.sh
  fi
  
  # The nnet3 TDNN recipe:
  # local/nnet3/run_tdnn.sh # set "--stage 11" if you have already run local/chain/run_tdnn.sh
  
  # # train models on cleaned-up data
  # # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh
  # local/run_data_cleaning.sh
  
  # # The following is the current online-nnet2 recipe, with "multi-splice".
  # local/online/run_nnet2_ms.sh
  
  # # The following is the discriminative-training continuation of the above.
  # local/online/run_nnet2_ms_disc.sh
  
  # ## The following is an older version of the online-nnet2 recipe, without "multi-splice".  It's faster
  # ## to train but slightly worse.
  # # local/online/run_nnet2.sh
  
  # Wait for decodings in the background
  wait
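
  # Once all of the background decoding jobs have finished, the word error rates
  # can be summarized with the usual Kaldi idiom (the glob only matches whatever
  # decode directories this particular run produced):
  # for x in exp/*/decode*; do
  #   [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh
  # done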