egs/tedlium/s5_r3/run.sh

  #!/bin/bash
  #
  # Based mostly on the Switchboard recipe.  The training database is TED-LIUM;
  # it consists of TED talks with cleaned automatic transcripts:
  #
  # https://lium.univ-lemans.fr/ted-lium3/
  # http://www.openslr.org/resources (Mirror).
  #
  # The data is distributed under the 'Creative Commons BY-NC-ND 3.0' license,
  # which allows free non-commercial use; only attribution is required.
  #
  # Copyright  2014  Nickolay V. Shmyrev
  #            2014  Brno University of Technology (Author: Karel Vesely)
  #            2016  Vincent Nguyen
  #            2016  Johns Hopkins University (Author: Daniel Povey)
  #            2018  François Hernandez
  #
  # Apache 2.0
  #
  
  . ./cmd.sh
  . ./path.sh
  
  
  set -e -o pipefail -u
  
  nj=35
  decode_nj=30   # note: should not be >38, which is the number of speakers in the dev set
                 # after applying --seconds-per-spk-max 180.  We decode with 4 threads, so
                 # this many jobs will be too much load if you're using run.pl on a single machine.
  stage=0
  train_rnnlm=false
  train_lm=false
  
  . utils/parse_options.sh # accept options
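
  # Example invocations (illustrative; adjust the job counts and the queue
  # settings in cmd.sh to your own machine or cluster):
  #   ./run.sh                                      # run everything from the start
  #   ./run.sh --stage 6 --nj 20                    # resume from feature extraction with 20 jobs
  #   ./run.sh --train-lm true --train-rnnlm true   # build the n-gram LM and RNNLM locally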
  
  # Data preparation
  if [ $stage -le 0 ]; then
    local/download_data.sh
  fi
  
  if [ $stage -le 1 ]; then
    local/prepare_data.sh
    # Split speakers up into 3-minute chunks.  This doesn't hurt adaptation, and
    # lets us use more jobs for decoding etc.
    # [we chose 3 minutes because that gives us 38 speakers for the dev data, which is
    #  more than our normal 30 jobs.]
    for dset in dev test train; do
      utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset}
    done
  fi
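  # A quick sanity check of the re-split data (spk2utt has one line per speaker,
  # so dev should show 38 after the 3-minute split described above):
  #   wc -l data/dev/spk2utt
  #   utils/validate_data_dir.sh --no-feats data/dev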
  
  
  if [ $stage -le 2 ]; then
    local/prepare_dict.sh
  fi
  
  if [ $stage -le 3 ]; then
    utils/prepare_lang.sh data/local/dict_nosp \
      "<unk>" data/local/lang_nosp data/lang_nosp
  fi
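  # The generated lang directory can be checked for internal consistency with:
  #   utils/validate_lang.pl data/lang_nosp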
  
  if [ $stage -le 4 ]; then
    # You can either build the LMs locally (--train-lm true) or, by default,
    # download the pre-built ones from openslr.org.
    if $train_lm; then
      local/ted_train_lm.sh
    else
      local/ted_download_lm.sh
    fi
  fi
  
  if [ $stage -le 5 ]; then
    local/format_lms.sh
  fi
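  # format_lms.sh leaves the first-pass decoding LM as data/lang_nosp/G.fst and
  # the rescoring LM as data/lang_nosp_rescore/G.carpa (both are used below).
  # The decoding-LM FST can be inspected with the usual OpenFst tools, e.g.:
  #   fstinfo data/lang_nosp/G.fst | head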
  
  # Feature extraction
  if [ $stage -le 6 ]; then
    for set in test dev train; do
      dir=data/$set
      steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir
      steps/compute_cmvn_stats.sh $dir
    done
  fi
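  # A quick check that feature extraction succeeded (the default MFCCs are
  # 13-dimensional, before any splicing or transforms):
  #   feat-to-dim scp:data/train/feats.scp -
  #   utils/validate_data_dir.sh data/train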
  
  # Now we have 452 hours of training data.
  # We'll create a subset with 10k short segments to make flat-start training easier:
  if [ $stage -le 7 ]; then
    utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort
    utils/data/remove_dup_utts.sh 10 data/train_10kshort data/train_10kshort_nodup
  fi
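  # The size of the resulting subset can be inspected with, e.g.:
  #   wc -l data/train_10kshort_nodup/utt2spk              # number of utterances kept
  #   utils/data/get_utt2dur.sh data/train_10kshort_nodup  # computes per-utterance durations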
  
  # Train
  if [ $stage -le 8 ]; then
    steps/train_mono.sh --nj 20 --cmd "$train_cmd" \
      data/train_10kshort_nodup data/lang_nosp exp/mono
  fi
  
  if [ $stage -le 9 ]; then
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang_nosp exp/mono exp/mono_ali
    steps/train_deltas.sh --cmd "$train_cmd" \
      2500 30000 data/train data/lang_nosp exp/mono_ali exp/tri1
  fi
  
  if [ $stage -le 10 ]; then
    utils/mkgraph.sh data/lang_nosp exp/tri1 exp/tri1/graph_nosp
  
    # The slowest part of this decoding is the scoring, which we can't really
    # control since the bottleneck is the NIST scoring tools.
    for dset in dev test; do
      steps/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
        exp/tri1/graph_nosp data/${dset} exp/tri1/decode_nosp_${dset}
      steps/lmrescore_const_arpa.sh  --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \
         data/${dset} exp/tri1/decode_nosp_${dset} exp/tri1/decode_nosp_${dset}_rescore
    done
  fi
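  # Once scoring has finished, WERs can be collected from the decode directories.
  # The exact file layout depends on how local/score.sh writes its results, but
  # one of these two patterns normally works:
  #   grep WER exp/tri1/decode_nosp_dev*/wer_* 2>/dev/null | utils/best_wer.sh
  #   grep Sum exp/tri1/decode_nosp_dev*/score_*/*.sys 2>/dev/null | utils/best_wer.sh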
  
  if [ $stage -le 11 ]; then
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang_nosp exp/tri1 exp/tri1_ali
  
    steps/train_lda_mllt.sh --cmd "$train_cmd" \
      4000 50000 data/train data/lang_nosp exp/tri1_ali exp/tri2
  fi
  
  if [ $stage -le 12 ]; then
    utils/mkgraph.sh data/lang_nosp exp/tri2 exp/tri2/graph_nosp
    for dset in dev test; do
      steps/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
        exp/tri2/graph_nosp data/${dset} exp/tri2/decode_nosp_${dset}
      steps/lmrescore_const_arpa.sh  --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \
         data/${dset} exp/tri2/decode_nosp_${dset} exp/tri2/decode_nosp_${dset}_rescore
    done
  fi
  
  if [ $stage -le 13 ]; then
    steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp exp/tri2
    utils/dict_dir_add_pronprobs.sh --max-normalize true \
      data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \
      exp/tri2/sil_counts_nowb.txt \
      exp/tri2/pron_bigram_counts_nowb.txt data/local/dict
  fi
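  # The new dictionary differs from dict_nosp mainly in that it carries
  # pronunciation and silence probabilities; e.g. lexiconp.txt stores lines of
  # the form "<word> <pron-prob> <phones ...>":
  #   head -n 3 data/local/dict/lexiconp.txt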
  
  if [ $stage -le 14 ]; then
    utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
    cp -rT data/lang data/lang_rescore
    cp data/lang_nosp/G.fst data/lang/
    cp data/lang_nosp_rescore/G.carpa data/lang_rescore/
  
    utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
  
    for dset in dev test; do
      steps/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
        exp/tri2/graph data/${dset} exp/tri2/decode_${dset}
      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
         data/${dset} exp/tri2/decode_${dset} exp/tri2/decode_${dset}_rescore
    done
  fi
  
  if [ $stage -le 15 ]; then
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang exp/tri2 exp/tri2_ali
  
    steps/train_sat.sh --cmd "$train_cmd" \
      5000 100000 data/train data/lang exp/tri2_ali exp/tri3
  
    utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph
  
    for dset in dev test; do
      steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
        exp/tri3/graph data/${dset} exp/tri3/decode_${dset}
      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
         data/${dset} exp/tri3/decode_${dset} exp/tri3/decode_${dset}_rescore
    done
  fi
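  # Note: decode_fmllr.sh also leaves a first-pass speaker-independent decode in
  # exp/tri3/decode_dev.si and decode_test.si, which shows how much the fMLLR
  # adaptation helps; these are scored the same way as the adapted directories:
  #   grep Sum exp/tri3/decode_dev.si/score_*/*.sys 2>/dev/null | utils/best_wer.sh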
  
  if [ $stage -le 16 ]; then
    # This does some data cleaning.  It actually degrades the GMM-level results
    # slightly, but the cleaned data is useful for the neural-net and chain
    # systems that follow.
    local/run_cleanup_segmentation.sh
  fi
  
  if [ $stage -le 17 ]; then
    # This will only work if you have GPUs on your system (and note that it
    # requires the queue to be set up the right way; see kaldi-asr.org/doc/queue.html).
    local/chain/run_tdnnf.sh
  fi
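  # The chain system's decode directories (exp/chain_cleaned/tdnnf_1a/decode_dev
  # and decode_test, as referenced by the rescoring stage below) are scored the
  # same way as the GMM decodes above, e.g.:
  #   grep Sum exp/chain_cleaned/tdnnf_1a/decode_dev/score_*/*.sys 2>/dev/null | utils/best_wer.sh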
  
  if [ $stage -le 18 ]; then
    # You can either train your own rnnlm or download a pre-trained one
    if $train_rnnlm; then
      local/rnnlm/tuning/run_lstm_tdnn_a.sh
      local/rnnlm/average_rnnlm.sh
    else
      local/ted_download_rnnlm.sh
    fi
  fi
  
  if [ $stage -le 19 ]; then
    # Here we rescore the lattices generated at stage 17
    rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged
    lang_dir=data/lang_chain
    ngram_order=4
  
    for dset in dev test; do
      data_dir=data/${dset}_hires
      decoding_dir=exp/chain_cleaned/tdnnf_1a/decode_${dset}
      suffix=$(basename $rnnlm_dir)
      output_dir=${decoding_dir}_$suffix
  
      rnnlm/lmrescore_pruned.sh \
        --cmd "$decode_cmd --mem 4G" \
        --weight 0.5 --max-ngram-order $ngram_order \
        $lang_dir $rnnlm_dir \
        $data_dir $decoding_dir \
        $output_dir
    done
  fi
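  # The rescored lattices end up in ${decoding_dir}_${suffix}, e.g.
  # exp/chain_cleaned/tdnnf_1a/decode_dev_rnnlm_lstm_tdnn_a_averaged; assuming
  # lmrescore_pruned.sh ran its scoring step, the WERs there can be collected
  # the same way as for the other decode directories.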
  
  echo "$0: success."
  exit 0