Blame view

egs/chime5/s5b/run.sh 10.3 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
  #!/bin/bash
  #
  # Based mostly on the TED-LIUM and Switchboard recipe
  #
  # Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
  # Apache 2.0
  #
  
  # Begin configuration section.
  # Defaults assigned before utils/parse_options.sh is sourced.
  # NOTE(review): presumably parse_options.sh lets each of them be overridden
  # from the command line as "--name value" -- confirm against that script.
  # nj: passed as --nj to the training, alignment and cleanup steps below.
  nj=96
  # decode_nj: passed as --nj to the decoding steps below.
  decode_nj=20
  # stage: every numbered block below runs only when stage <= its number.
  stage=0
  # nnet_stage: forwarded as --stage to local/chain/tuning/run_tdnn_1b.sh.
  nnet_stage=-10
  # num_data_reps: forwarded as --num-replications to reverberate_data_dir.py.
  num_data_reps=4
  # snrs: not referenced by any later line of this script.
  snrs="20:10:15:5:0"
  # foreground_snrs / background_snrs: forwarded to reverberate_data_dir.py as
  # --foreground-snrs / --background-snrs.
  foreground_snrs="20:10:15:5:0"
  background_snrs="20:10:15:5:0"
  # enhancement: becomes part of the enhanced data-directory names
  # (e.g. dev_${enhancement}_dereverb_ref).
  enhancement=beamformit # for a new enhancement method,
                         # change this variable and stage 4
  # End configuration section
  . ./utils/parse_options.sh
  
  # $train_cmd and $decode_cmd used below are not assigned in this script;
  # presumably cmd.sh defines them -- confirm.
  . ./cmd.sh
  . ./path.sh
  
  
  # Enabled only here, i.e. after the three files above have been sourced.
  set -e # exit on error
  
  # chime5 main directory path
  # please change the path accordingly
  # (assigned after parse_options.sh has been sourced, together with the two
  # directories derived from it)
  chime5_corpus=/export/corpora4/CHiME5
  json_dir=${chime5_corpus}/transcriptions
  audio_dir=${chime5_corpus}/audio
  
  # training and test data
  # train_set names the combined training directory data/${train_set} that
  # stage 7 creates; test_sets is a whitespace-separated list iterated by the
  # MFCC and decoding stages.
  train_set=train_worn_simu_u400k
  test_sets="dev_${enhancement}_dereverb_ref" #"dev_worn dev_addition_dereverb_ref"
  # Alternative test set definition, left commented out:
  #test_sets="dev_${enhancement}_ref" #"dev_worn dev_addition_dereverb_ref"
  
  # This script also needs the phonetisaurus g2p, srilm, beamformit
  # Abort right away when the tool check reports a failure.
  ./local/check_tools.sh || exit 1
  
  if [ "$stage" -le 1 ]; then
    # Training data: one local/prepare_data.sh call per microphone type, each
    # with data/train_<mic> as its last argument.
    # u03 is left out of the list as they are missing.
    train_mics=(worn u01 u02 u04 u05 u06)
    for mic in "${train_mics[@]}"; do
      local/prepare_data.sh --mictype $mic \
        ${audio_dir}/train ${json_dir}/train data/train_$mic
    done

    # Development data: worn microphones only.
    for split in dev; do
      for mic in worn; do
        local/prepare_data.sh --mictype $mic \
          ${audio_dir}/$split ${json_dir}/$split data/${split}_$mic
      done
    done
  fi
  
  if [ "$stage" -le 2 ]; then
    # Three steps in a fixed order: dictionary, lang directory, SRILM LMs.
    # The same OOV symbol is given to both prepare_lang.sh and the LM trainer.
    oov_sym="<unk>"

    local/prepare_dict.sh

    utils/prepare_lang.sh data/local/dict "$oov_sym" data/local/lang data/lang

    # LM text comes from the worn-microphone train and dev transcripts.
    local/train_lms_srilm.sh \
      --train-text data/train_worn/text \
      --dev-text data/dev_worn/text \
      --oov-symbol "$oov_sym" \
      --words-file data/lang/words.txt \
      data/ data/srilm
  fi
  
  # Trigram LM file handed to utils/format_lm.sh in stage 3.
  LM=data/srilm/best_3gram.gz
  if [ "$stage" -le 3 ]; then
    # Compile G for the chime5 trigram LM; data/lang is given to format_lm.sh
    # both as its first and as its last argument.
    lang_dir=data/lang
    utils/format_lm.sh $lang_dir $LM data/local/dict/lexicon.txt $lang_dir
  fi
  
  if [ "$stage" -le 4 ]; then
    # Front-end enhancement for dev and eval: WPE first, then beamforming
    # using reference arrays, then data-directory preparation.
    enhandir=enhan                # enhanced WAV directory
    dereverb_dir=${PWD}/wav/wpe/  # WPE output, one sub-directory per dataset
    enh_dsets=(dev eval)
    enh_arrays=(u01 u02 u03 u04 u06)

    # Pass 1: run WPE for every array of every dataset.
    for ds in "${enh_dsets[@]}"; do
      for array in "${enh_arrays[@]}"; do
        local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \
          ${audio_dir}/$ds ${dereverb_dir}/$ds $array
      done
    done

    # Pass 2: beamform the WPE output, again per dataset and per array.
    for ds in "${enh_dsets[@]}"; do
      for array in "${enh_arrays[@]}"; do
        local/run_beamformit.sh --cmd "$train_cmd" \
          ${dereverb_dir}/$ds ${enhandir}/${ds}_${enhancement}_$array $array
      done
    done

    # Pass 3: one data directory per dataset.  The u0* pattern sits inside
    # double quotes, so it reaches prepare_data.sh unexpanded.
    for ds in "${enh_dsets[@]}"; do
      local/prepare_data.sh --mictype ref \
        "$PWD/${enhandir}/${ds}_${enhancement}_u0*" \
        ${json_dir}/$ds data/${ds}_${enhancement}_dereverb_ref
    done
  fi
  
  if [ "$stage" -le 5 ]; then
    # Drop the possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) from
    # the worn-microphone training transcripts;
    # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details
    worn_train=data/train_worn
    worn_backup=${worn_train}_org

    utils/copy_data_dir.sh $worn_train $worn_backup # back up
    # Rewrite text from the backup without the lines of those sessions.
    grep -v \
      -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" \
      $worn_backup/text > $worn_train/text
    utils/fix_data_dir.sh $worn_train
  fi
  
  if [ "$stage" -le 6 ]; then
    # Data augmentation of the worn-microphone training data:
    #   1. extract noises from the training audio and list them,
    #   2. fetch the RIRS_NOISES package if it is not already present,
    #   3. run reverberate_data_dir.py: data/train_worn -> data/train_worn_rvb.
    local/extract_noises.py "$chime5_corpus/audio/train" "$chime5_corpus/transcriptions/train" \
      local/distant_audio_list distant_noises
    local/make_noise_list.py distant_noises > distant_noise_list

    noise_list=distant_noise_list

    if [ ! -d RIRS_NOISES/ ]; then
      # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
      wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
      unzip rirs_noises.zip
    fi

    # Start from an empty array: the options below are appended with +=, so
    # without this reset any rvb_opts value already present (inherited from
    # the environment or set by a sourced file) would be passed along as
    # extra leading arguments.
    rvb_opts=()
    # This is the config for the system using simulated RIRs and point-source noises
    rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
    rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
    rvb_opts+=(--noise-set-parameters "$noise_list")

    steps/data/reverberate_data_dir.py \
      "${rvb_opts[@]}" \
      --prefix "rev" \
      --foreground-snrs "$foreground_snrs" \
      --background-snrs "$background_snrs" \
      --speech-rvb-probability 1 \
      --pointsource-noise-addition-probability 1 \
      --isotropic-noise-addition-probability 1 \
      --num-replications "$num_data_reps" \
      --max-noises-per-minute 1 \
      --source-sampling-rate 16000 \
      data/train_worn data/train_worn_rvb
  fi
  
  if [ "$stage" -le 7 ]; then
    # Combine array and worn mics:
    #   - pool the array-microphone training directories into data/train_uall,
    #   - keep a 400000-utterance subset of the pool (data/train_u400k),
    #   - merge worn, reverberated worn and that subset into data/${train_set}.
    # To include more training data, raise the number of array mic utterances.
    array_dirs=(data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06)
    utils/combine_data.sh data/train_uall "${array_dirs[@]}"
    utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k
    utils/combine_data.sh data/${train_set} \
      data/train_worn data/train_worn_rvb data/train_u400k

    # Worn-mic recognition uses the left channel only: keep a *_stereo copy
    # and restrict the worn text to lines matching "\.L-".
    # (Both left and right channels can be used for training.)
    for part in train dev; do
      worn_mono=data/${part}_worn
      worn_stereo=${worn_mono}_stereo
      utils/copy_data_dir.sh $worn_mono $worn_stereo
      grep "\.L-" $worn_stereo/text > $worn_mono/text
      utils/fix_data_dir.sh $worn_mono
    done
  fi
  
  if [ "$stage" -le 8 ]; then
    # Speaker ID fix (thanks to Dr. Naoyuki Kanda): the array ID is appended
    # to the speaker ID to avoid the use of other array information, to meet
    # regulations.  Example utt2spk line
    #   before: P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01
    #   after:  P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02
    enh_sets=(dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref)
    for name in "${enh_sets[@]}"; do
      unsplit=data/${name}_nosplit
      fixed=data/${name}_nosplit_fix
      utils/copy_data_dir.sh data/${name} $unsplit
      mkdir -p $fixed
      cp $unsplit/{segments,text,wav.scp} $fixed/
      # With "_" as the field separator, field 3 of a line is the array ID
      # (U02 in the example); it is appended to the whole line.
      awk -F "_" '{print $0 "_" $3}' $unsplit/utt2spk > $fixed/utt2spk
      utils/utt2spk_to_spk2utt.pl $fixed/utt2spk > $fixed/spk2utt
    done

    # Split speakers up into 3-minute chunks.  This doesn't hurt adaptation, and
    # lets us use more jobs for decoding etc.
    # Training set and worn dev set: split from a plain _nosplit copy.
    for name in ${train_set} dev_worn; do
      utils/copy_data_dir.sh data/${name} data/${name}_nosplit
      utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 \
        data/${name}_nosplit data/${name}
    done
    # Enhanced sets: split from the speaker-ID-fixed copy made above.
    for name in "${enh_sets[@]}"; do
      utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 \
        data/${name}_nosplit_fix data/${name}
    done
  fi
  
  if [ "$stage" -le 8 ]; then
    # MFCC features and CMVN statistics for the training set and each test set.
    # Note: this block is gated on stage 8, the same number as the block
    # right before it.
    # mfccdir should be some place with a largish disk, as the MFCC features
    # are stored there.
    mfccdir=mfcc
    for set_name in ${train_set} ${test_sets}; do
      mfcc_exp=exp/make_mfcc/$set_name
      steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \
        data/$set_name $mfcc_exp $mfccdir
      steps/compute_cmvn_stats.sh data/$set_name $mfcc_exp $mfccdir
      utils/fix_data_dir.sh data/$set_name
    done
  fi
  
  if [ "$stage" -le 9 ]; then
    # Subsets for monophone training: 100000 utterances selected with
    # --shortest, then 30000 taken from those.
    short_100k=data/${train_set}_100kshort
    short_30k=data/${train_set}_30kshort
    utils/subset_data_dir.sh --shortest data/${train_set} 100000 $short_100k
    utils/subset_data_dir.sh $short_100k 30000 $short_30k
  fi
  
  if [ "$stage" -le 10 ]; then
    # Basic (monophone) training on MFCC features, using the 30kshort subset
    # of the training set; the model directory argument is exp/mono.
    mono_data=data/${train_set}_30kshort
    steps/train_mono.sh --nj $nj --cmd "$train_cmd" \
      $mono_data data/lang exp/mono
  fi
  
  if [ "$stage" -le 11 ]; then
    # Align the full training set with exp/mono, then train exp/tri1 with
    # train_deltas.sh on those alignments.
    full_train=data/${train_set}

    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      $full_train data/lang exp/mono exp/mono_ali

    steps/train_deltas.sh --cmd "$train_cmd" \
      2500 30000 $full_train data/lang exp/mono_ali exp/tri1
  fi
  
  if [ "$stage" -le 12 ]; then
    # Align the full training set with exp/tri1, then train exp/tri2 with
    # train_lda_mllt.sh on those alignments.
    full_train=data/${train_set}

    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      $full_train data/lang exp/tri1 exp/tri1_ali

    steps/train_lda_mllt.sh --cmd "$train_cmd" \
      4000 50000 $full_train data/lang exp/tri1_ali exp/tri2
  fi
  
  if [ "$stage" -le 13 ]; then
    # Build the tri2 decoding graph, then decode every test set with it.
    utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph

    # One background decoding job per test set; remember each PID together
    # with the name of the set it decodes.
    decode_pids=()
    decode_names=()
    for dset in ${test_sets}; do
      steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" --num-threads 4 \
        exp/tri2/graph "data/${dset}" "exp/tri2/decode_${dset}" &
      decode_pids+=("$!")
      decode_names+=("$dset")
    done

    # A bare `wait` returns 0 whatever the jobs did, so a failed decode went
    # unnoticed.  Wait for each job individually and report failures on
    # stderr.  Decoding stays best-effort: the script continues as before.
    for i in "${!decode_pids[@]}"; do
      wait "${decode_pids[$i]}" \
        || echo "$0: WARNING: tri2 decoding of ${decode_names[$i]} failed" >&2
    done
  fi
  
  if [ "$stage" -le 14 ]; then
    # Align the full training set with exp/tri2, then train exp/tri3 with
    # train_sat.sh on those alignments.
    full_train=data/${train_set}

    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      $full_train data/lang exp/tri2 exp/tri2_ali

    steps/train_sat.sh --cmd "$train_cmd" \
      5000 100000 $full_train data/lang exp/tri2_ali exp/tri3
  fi
  
  if [ "$stage" -le 15 ]; then
    # Build the tri3 decoding graph, then decode every test set with
    # decode_fmllr.sh.
    utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph

    # One background decoding job per test set; remember each PID together
    # with the name of the set it decodes.
    decode_pids=()
    decode_names=()
    for dset in ${test_sets}; do
      steps/decode_fmllr.sh --nj "$decode_nj" --cmd "$decode_cmd" --num-threads 4 \
        exp/tri3/graph "data/${dset}" "exp/tri3/decode_${dset}" &
      decode_pids+=("$!")
      decode_names+=("$dset")
    done

    # A bare `wait` returns 0 whatever the jobs did, so a failed decode went
    # unnoticed.  Wait for each job individually and report failures on
    # stderr.  Decoding stays best-effort: the script continues as before.
    for i in "${!decode_pids[@]}"; do
      wait "${decode_pids[$i]}" \
        || echo "$0: WARNING: tri3 decoding of ${decode_names[$i]} failed" >&2
    done
  fi
  
  if [ "$stage" -le 16 ]; then
    # Clean the training data with the tri3 model and produce the cleaned
    # data directory data/${train_set}_cleaned (work directory exp/tri3_cleaned).
    seg_opts="--min-segment-length 0.3 --min-new-segment-length 0.6"
    steps/cleanup/clean_and_segment_data.sh \
      --nj ${nj} --cmd "$train_cmd" \
      --segmentation-opts "$seg_opts" \
      data/${train_set} data/lang exp/tri3 exp/tri3_cleaned \
      data/${train_set}_cleaned
  fi
  
  if [ "$stage" -le 17 ]; then
    # chain TDNN, trained on the cleaned training set; nnet_stage is handed
    # over as the inner script's own --stage.
    cleaned_set=${train_set}_cleaned
    local/chain/tuning/run_tdnn_1b.sh \
      --nj ${nj} \
      --stage $nnet_stage \
      --train-set $cleaned_set \
      --test-sets "$test_sets" \
      --gmm tri3_cleaned \
      --nnet3-affix _${cleaned_set}_rvb
  fi
  
  if [ "$stage" -le 18 ]; then
    # 2-stage decoding of each test set with the chain model.
    chain_dir=exp/chain_${train_set}_cleaned_rvb
    ivec_dir=exp/nnet3_${train_set}_cleaned_rvb
    for tset in $test_sets; do
      local/nnet3/decode.sh \
        --affix 2stage \
        --pass2-decode-opts "--min-active 1000" \
        --acwt 1.0 --post-decode-acwt 10.0 \
        --frames-per-chunk 150 \
        --nj $decode_nj \
        --ivector-dir $ivec_dir \
        data/$tset data/lang_chain \
        $chain_dir/tree_sp/graph \
        $chain_dir/tdnn1b_sp
    done
  fi
  
  if [ "$stage" -le 19 ]; then
    # Final scoring to get the official challenge result.
    # Both the dev and the eval decoding directories are specified so that the
    # search parameters (insertion penalty and language model weight) will be
    # tuned using the dev set.
    model_dir=exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp
    local/score_for_submit.sh \
      --dev $model_dir/decode_dev_${enhancement}_dereverb_ref \
      --eval $model_dir/decode_eval_${enhancement}_dereverb_ref
  fi