#!/bin/bash
#
# Based mostly on the TED-LIUM and Switchboard recipe
#
# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
# Apache 2.0
#
# CHiME-5 baseline recipe: data preparation, LM training, GMM-HMM training,
# chain TDNN training and 2-stage decoding.  Execution resumes at --stage N;
# every block guarded by "[ $stage -le N ]" with index >= N is run.
#
# Begin configuration section.
# Parallel jobs for training and decoding respectively.
nj=96
decode_nj=20
# Resume point (see above); overridable via --stage on the command line.
stage=0
# Stage value forwarded to the chain TDNN script (stage 17 below).
nnet_stage=-10
# Number of reverberated copies of the worn-mic data created in stage 6.
num_data_reps=4
# NOTE(review): 'snrs' appears unused — only foreground_snrs/background_snrs
# are consumed (by reverberate_data_dir.py in stage 6); kept because
# parse_options.sh exposes it as a --snrs command-line option.
snrs="20:10:15:5:0"
foreground_snrs="20:10:15:5:0"
background_snrs="20:10:15:5:0"
enhancement=beamformit # for a new enhancement method,
# change this variable and stage 4
# End configuration section
# parse_options.sh rewrites the variables above from --name value arguments.
. ./utils/parse_options.sh
. ./cmd.sh
. ./path.sh
set -e # exit on error
# chime5 main directory path
# please change the path accordingly
chime5_corpus=/export/corpora4/CHiME5
json_dir=${chime5_corpus}/transcriptions
audio_dir=${chime5_corpus}/audio
# training and test data
train_set=train_worn_simu_u400k
test_sets="dev_${enhancement}_dereverb_ref" #"dev_worn dev_addition_dereverb_ref"
#test_sets="dev_${enhancement}_ref" #"dev_worn dev_addition_dereverb_ref"
# This script also needs the phonetisaurus g2p, srilm, beamformit
./local/check_tools.sh || exit 1
if [ $stage -le 1 ]; then
  # Stage 1: build per-microphone Kaldi data directories.
  # Training data for the worn mics and arrays u01/u02/u04/u05/u06;
  # u03 is skipped as its recordings are missing from the corpus.
  for mic in worn u01 u02 u04 u05 u06; do
    local/prepare_data.sh --mictype "${mic}" \
      "${audio_dir}/train" "${json_dir}/train" "data/train_${mic}"
  done
  # Development data: worn microphones only.
  local/prepare_data.sh --mictype worn \
    "${audio_dir}/dev" "${json_dir}/dev" \
    data/dev_worn
fi
if [ $stage -le 2 ]; then
# Stage 2: build the pronunciation dictionary and the Kaldi lang directory,
# then train n-gram LMs with SRILM on the worn-mic transcripts, using the
# dev worn-mic transcripts as the held-out set.
local/prepare_dict.sh
utils/prepare_lang.sh \
data/local/dict "<unk>" data/local/lang data/lang
local/train_lms_srilm.sh \
--train-text data/train_worn/text --dev-text data/dev_worn/text \
--oov-symbol "<unk>" --words-file data/lang/words.txt \
data/ data/srilm
fi
# Best trigram LM produced by train_lms_srilm.sh above.
LM=data/srilm/best_3gram.gz
if [ $stage -le 3 ]; then
# Compiles G for chime5 trigram LM
utils/format_lm.sh \
data/lang $LM data/local/dict/lexicon.txt data/lang
fi
if [ $stage -le 4 ]; then
  # Stage 4: enhance the reference-array audio for dev/eval in two passes —
  # WPE dereverberation first, then BeamformIt beamforming — and finally
  # prepare Kaldi data directories from the enhanced WAVs.
  enhan_root=enhan            # where enhanced (beamformed) WAVs are written
  wpe_root=${PWD}/wav/wpe/    # where dereverberated WAVs are written
  # Pass 1: WPE dereverberation per partition and array.
  for part in dev eval; do
    for array in u01 u02 u03 u04 u06; do
      local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \
        ${audio_dir}/${part} \
        ${wpe_root}/${part} \
        ${array}
    done
  done
  # Pass 2: BeamformIt on the dereverberated audio.
  for part in dev eval; do
    for array in u01 u02 u03 u04 u06; do
      local/run_beamformit.sh --cmd "$train_cmd" \
        ${wpe_root}/${part} \
        ${enhan_root}/${part}_${enhancement}_${array} \
        ${array}
    done
  done
  # Data-dir preparation; the quoted glob is passed to prepare_data.sh as a
  # pattern string covering all enhanced arrays.
  for part in dev eval; do
    local/prepare_data.sh --mictype ref "$PWD/${enhan_root}/${part}_${enhancement}_u0*" \
      ${json_dir}/${part} data/${part}_${enhancement}_dereverb_ref
  done
fi
if [ $stage -le 5 ]; then
  # Stage 5: drop utterances belonging to sessions flagged as possibly bad
  # (P11_S03, P52_S19, P53_S24, P54_S24); see
  # http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details
  utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up
  # Single extended-regex alternation, equivalent to four -e patterns.
  grep -v -E "^(P11_S03|P52_S19|P53_S24|P54_S24)" data/train_worn_org/text > data/train_worn/text
  utils/fix_data_dir.sh data/train_worn
fi
if [ $stage -le 6 ]; then
  # Stage 6: data augmentation — extract point-source noises from the
  # distant-array training audio, fetch the OpenSLR RIR/noise package, and
  # create $num_data_reps reverberated copies of the worn-mic training data.
  local/extract_noises.py "$chime5_corpus/audio/train" "$chime5_corpus/transcriptions/train" \
    local/distant_audio_list distant_noises
  local/make_noise_list.py distant_noises > distant_noise_list
  noise_list=distant_noise_list
  if [ ! -d RIRS_NOISES/ ]; then
    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
    unzip rirs_noises.zip
  fi
  # This is the config for the system using simulated RIRs and point-source noises.
  # Fix: initialize the array explicitly rather than relying on '+=' against
  # an unset variable, and quote all expansions so the commands stay correct
  # even if the configuration values ever contain whitespace.
  rvb_opts=()
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
  rvb_opts+=(--noise-set-parameters "$noise_list")
  steps/data/reverberate_data_dir.py \
    "${rvb_opts[@]}" \
    --prefix "rev" \
    --foreground-snrs "$foreground_snrs" \
    --background-snrs "$background_snrs" \
    --speech-rvb-probability 1 \
    --pointsource-noise-addition-probability 1 \
    --isotropic-noise-addition-probability 1 \
    --num-replications "$num_data_reps" \
    --max-noises-per-minute 1 \
    --source-sampling-rate 16000 \
    data/train_worn data/train_worn_rvb
fi
if [ $stage -le 7 ]; then
# combine mix array and worn mics
# extract a 400k-utterance subset from all array mics (the old comment said
# "100k" but utils/subset_data_dir.sh below is given 400000)
# if you want to include more training data, you can increase the number of array mic utterances
utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06
utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k
utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k
# only use left channel for worn mic recognition
# you can use both left and right channels for training
for dset in train dev; do
utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo
# keep only utterances whose IDs contain ".L-" (the left worn-mic channel)
grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text
utils/fix_data_dir.sh data/${dset}_worn
done
fi
if [ $stage -le 8 ]; then
# fix speaker ID issue (thanks to Dr. Naoyuki Kanda)
# add array ID to the speaker ID to avoid the use of other array information to meet regulations
# Before this fix
# $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk
# P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01
# P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01
# After this fix
# $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk
# P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02
# P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02
for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do
utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit
mkdir -p data/${dset}_nosplit_fix
# carry segments/text/wav.scp over unchanged; only utt2spk/spk2utt change
cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/
# append the 3rd '_'-separated field of each utt2spk line (the array ID,
# e.g. U02) to the whole line, turning "...  P01" into "...  P01_U02"
awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk
utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt
done
# Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and
# lets us use more jobs for decoding etc.
for dset in ${train_set} dev_worn; do
utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit
utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset}
done
for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do
utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset}
done
fi
# NOTE(review): this guard reuses stage index 8 — the previous block is also
# "[ $stage -le 8 ]" — so resuming with --stage 9 skips MFCC extraction even
# though later stages depend on it.  A proper fix would renumber this and all
# subsequent stages; left as-is here so existing --stage values keep working.
if [ $stage -le 8 ]; then
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
for x in ${train_set} ${test_sets}; do
steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \
data/$x exp/make_mfcc/$x $mfccdir
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
utils/fix_data_dir.sh data/$x
done
fi
if [ $stage -le 9 ]; then
# make a subset for monophone training
# first the 100k shortest utterances (short utterances align more easily),
# then a 30k subset of those for monophone training
utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort
utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort
fi
if [ $stage -le 10 ]; then
# Starting basic training on MFCC features
# Stage 10: monophone model on the 30k-shortest subset.
steps/train_mono.sh --nj $nj --cmd "$train_cmd" \
data/${train_set}_30kshort data/lang exp/mono
fi
if [ $stage -le 11 ]; then
# Stage 11: align full training set with mono, then train a delta-feature
# triphone model (2500 leaves, 30000 Gaussians).
steps/align_si.sh --nj $nj --cmd "$train_cmd" \
data/${train_set} data/lang exp/mono exp/mono_ali
steps/train_deltas.sh --cmd "$train_cmd" \
2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1
fi
if [ $stage -le 12 ]; then
# Stage 12: re-align with tri1, then train an LDA+MLLT model
# (4000 leaves, 50000 Gaussians).
steps/align_si.sh --nj $nj --cmd "$train_cmd" \
data/${train_set} data/lang exp/tri1 exp/tri1_ali
steps/train_lda_mllt.sh --cmd "$train_cmd" \
4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2
fi
if [ $stage -le 13 ]; then
# Stage 13: build the tri2 decoding graph and decode all test sets, one
# background job per set; 'wait' barriers before moving on.
utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
for dset in ${test_sets}; do
steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
exp/tri2/graph data/${dset} exp/tri2/decode_${dset} &
done
wait
fi
if [ $stage -le 14 ]; then
# Stage 14: re-align with tri2, then train a speaker-adapted (SAT) model
# (5000 leaves, 100000 Gaussians).
steps/align_si.sh --nj $nj --cmd "$train_cmd" \
data/${train_set} data/lang exp/tri2 exp/tri2_ali
steps/train_sat.sh --cmd "$train_cmd" \
5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3
fi
if [ $stage -le 15 ]; then
# Stage 15: decode with the SAT model using fMLLR adaptation, again one
# background job per test set.
utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph
for dset in ${test_sets}; do
steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
exp/tri3/graph data/${dset} exp/tri3/decode_${dset} &
done
wait
fi
if [ $stage -le 16 ]; then
# The following script cleans the data and produces cleaned data
# (bad-transcript detection and re-segmentation on top of tri3).
steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \
--segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \
data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned
fi
if [ $stage -le 17 ]; then
# chain TDNN
# Stage 17: train the chain TDNN on the cleaned set; $nnet_stage allows
# resuming inside the nnet script itself.
local/chain/tuning/run_tdnn_1b.sh --nj ${nj} \
--stage $nnet_stage \
--train-set ${train_set}_cleaned \
--test-sets "$test_sets" \
--gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb
fi
if [ $stage -le 18 ]; then
# 2-stage decoding
# Stage 18: two-pass nnet3 decoding of each test set with the chain model.
for test_set in $test_sets; do
local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \
--acwt 1.0 --post-decode-acwt 10.0 \
--frames-per-chunk 150 --nj $decode_nj \
--ivector-dir exp/nnet3_${train_set}_cleaned_rvb \
data/${test_set} data/lang_chain \
exp/chain_${train_set}_cleaned_rvb/tree_sp/graph \
exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp
done
fi
if [ $stage -le 19 ]; then
# final scoring to get the official challenge result
# please specify both dev and eval set directories so that the search parameters
# (insertion penalty and language model weight) will be tuned using the dev set
local/score_for_submit.sh \
--dev exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_${enhancement}_dereverb_ref \
--eval exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_${enhancement}_dereverb_ref
fi