#!/bin/bash
# It's best to run the commands in this script one by one.
. ./cmd.sh
. ./path.sh
mfccdir=mfcc
set -e
rescore=true
# check for kaldi_lm
if ! which get_word_map.pl >/dev/null; then
  echo "This recipe requires installation of tools/kaldi_lm. Please run extras/kaldi_lm.sh in tools/" && exit 1;
fi
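# (get_word_map.pl is part of tools/kaldi_lm, which local/fisher_train_lms.sh
# uses below to build the 3-gram and 4-gram "mincount" LMs.)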
# prepare fisher data and put it under data/train_fisher
local/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \
/export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13
# at BUT:
####local/fisher_data_prep.sh /mnt/matylda6/jhu09/qpovey/FISHER/LDC2005T19 /mnt/matylda2/data/FISHER/
local/swbd1_data_download.sh /export/corpora3/LDC/LDC97S62
# prepare dictionary and acronym mapping list
local/fisher_swbd_prepare_dict.sh
# prepare swbd data and put it under data/train_swbd
local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62
# local/swbd1_data_prep.sh /data/corpora0/LDC97S62
# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2
# local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1
utils/prepare_lang.sh data/local/dict_nosp \
"<unk>" data/local/lang_nosp data/lang_nosp
# LM for swbd could be used for decoding purposes
#fisher_opt="--fisher /scail/group/deeplearning/speech/datasets/LDC2004T19-Fisher-Transcripts"
#local/swbd1_train_lms.sh $fisher_opt \
# data/local/train_swbd/text data/local/dict/lexicon.txt data/local/lm
# merge two datasets into one
mkdir -p data/train_all
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
  cat data/train_fisher/$f data/train_swbd/$f > data/train_all/$f
done
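# For reference, the files concatenated above use the standard Kaldi
# data-directory formats, e.g.
#   utt2spk:   <utterance-id> <speaker-id>
#   wav.scp:   <recording-id> <wav-path-or-command>
#   segments:  <utterance-id> <recording-id> <start-time> <end-time>
# Plain concatenation leaves them unsorted; utils/fix_data_dir.sh is run on
# data/train_all below to re-sort and cross-validate them.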
# LM for train_all
local/fisher_train_lms.sh
#local/fisher_create_test_lang.sh
# Compiles G for trigram LM
LM=data/local/lm/3gram-mincount/lm_unpruned.gz
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_fsh_sw1_tg
LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz
[ -f $LM_fg ] || rescore=false
if $rescore; then
  utils/build_const_arpa_lm.sh $LM_fg data/lang_nosp data/lang_nosp_fsh_sw1_fg
fi
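# build_const_arpa_lm.sh converts the 4-gram ARPA LM into Kaldi's "const arpa"
# format, which is used later only for lattice rescoring with
# steps/lmrescore_const_arpa.sh; the 4-gram is never compiled into a decoding graph.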
# Prepare Eval2000 and RT-03 test sets
#local/eval2000_data_prep.sh /scail/group/deeplearning/speech/datasets/LDC2002S09/hub5e_00/ /scail/group/deeplearning/speech/datasets/LDC2002T43 || exit 1
local/eval2000_data_prep.sh /export/corpora/LDC/LDC2002S09/hub5e_00 /export/corpora/LDC/LDC2002T43 || exit 1
#local/rt03_data_prep.sh /scail/group/deeplearning/speech/datasets/rt_03 || exit 1
local/rt03_data_prep.sh /export/corpora/LDC/LDC2007S10 || exit 1
utils/fix_data_dir.sh data/train_all
# Make MFCCs for the training set
# spread the mfccs over various machines, as this data-set is quite large.
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
mfcc=$(basename $mfccdir) # in case was absolute pathname (unlikely), get basename.
utils/create_split_dir.pl /export/b{05,06,07,08}/$USER/kaldi-data/egs/fisher_swbd/s5/$mfcc/storage \
$mfccdir/storage
fi
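# create_split_dir.pl turns $mfccdir/storage into a set of symlinked
# subdirectories on different filesystems, so make_mfcc.sh spreads its feature
# archives across disks instead of writing everything to one place.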
steps/make_mfcc.sh --nj 100 --cmd "$train_cmd" data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
utils/fix_data_dir.sh data/train_all
utils/validate_data_dir.sh data/train_all
steps/compute_cmvn_stats.sh data/train_all exp/make_mfcc/train_all $mfccdir
# subset swbd features and put them back into train_swbd in case separate training is needed
awk '{print $1}' data/train_swbd/spk2utt > data/swbd_spklist
utils/subset_data_dir.sh --spk-list data/swbd_spklist data/train_all data/train_swbd
steps/compute_cmvn_stats.sh data/train_swbd exp/make_mfcc/train_all $mfccdir
# Make MFCCs for the test sets
utils/fix_data_dir.sh data/eval2000
steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/eval2000 exp/make_mfcc/eval2000 $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/eval2000 exp/make_mfcc/eval2000 $mfccdir
utils/fix_data_dir.sh data/rt03
steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/rt03 exp/make_mfcc/rt03 $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/rt03 exp/make_mfcc/rt03 $mfccdir
utils/fix_data_dir.sh data/eval2000
utils/validate_data_dir.sh data/eval2000
utils/fix_data_dir.sh data/rt03
utils/validate_data_dir.sh data/rt03
# Keep the whole of train_all as data/train (no held-out portion is carved off here).
n=$(wc -l < data/train_all/segments)
utils/subset_data_dir.sh --last data/train_all $n data/train
# Now-- there are 2.1 million utterances, and we want to start the monophone training
# on relatively short utterances (easier to align), but not only the very shortest
# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
# utterances from those. We also take these subsets from Switchboard, which has
# more carefully hand-labeled alignments
utils/subset_data_dir.sh --shortest data/train_swbd 100000 data/train_100kshort
utils/data/remove_dup_utts.sh 10 data/train_100kshort data/train_100kshort_nodup
utils/subset_data_dir.sh data/train_100kshort_nodup 10000 data/train_10k_nodup
utils/subset_data_dir.sh --speakers data/train_swbd 30000 data/train_30k
utils/subset_data_dir.sh --speakers data/train_swbd 100000 data/train_100k
utils/data/remove_dup_utts.sh 200 data/train_30k data/train_30k_nodup
utils/data/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup
utils/data/remove_dup_utts.sh 300 data/train data/train_nodup
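# The first argument to remove_dup_utts.sh is the maximum number of utterances
# with an identical transcript to keep, so that fillers like "uh-huh" and
# "yeah" do not dominate the subsets.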
# The next commands are not necessary for the scripts to run, but increase
# efficiency of data access by putting the mfcc's of the subset
# in a contiguous place in a file.
( . ./path.sh;
# make sure mfccdir is defined as above..
cp data/train_10k_nodup/feats.scp{,.bak}
copy-feats scp:data/train_10k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_10k_nodup.ark,$mfccdir/kaldi_fish_10k_nodup.scp \
&& cp $mfccdir/kaldi_fish_10k_nodup.scp data/train_10k_nodup/feats.scp
)
( . ./path.sh;
# make sure mfccdir is defined as above..
cp data/train_30k_nodup/feats.scp{,.bak}
copy-feats scp:data/train_30k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_30k_nodup.ark,$mfccdir/kaldi_fish_30k_nodup.scp \
&& cp $mfccdir/kaldi_fish_30k_nodup.scp data/train_30k_nodup/feats.scp
)
( . ./path.sh;
# make sure mfccdir is defined as above..
cp data/train_100k_nodup/feats.scp{,.bak}
copy-feats scp:data/train_100k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_100k_nodup.ark,$mfccdir/kaldi_fish_100k_nodup.scp \
&& cp $mfccdir/kaldi_fish_100k_nodup.scp data/train_100k_nodup/feats.scp
)
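# Optional, illustrative sanity check (not part of the recipe): the compacted
# feature lists written above can be inspected with feat-to-len, and the
# original list restored from the .bak copy if anything went wrong:
# feat-to-len scp:data/train_10k_nodup/feats.scp ark,t:- | head
# cp data/train_10k_nodup/feats.scp.bak data/train_10k_nodup/feats.scp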
# Start training on the Switchboard subset, which has cleaner alignments
steps/train_mono.sh --nj 3 --cmd "$train_cmd" \
data/train_10k_nodup data/lang_nosp exp/mono0a
steps/align_si.sh --nj 10 --cmd "$train_cmd" \
data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
3200 30000 data/train_30k_nodup data/lang_nosp exp/mono0a_ali exp/tri1a || exit 1;
#used to be 2500 20000
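# In steps/train_deltas.sh the two numeric arguments are the number of tied
# states (tree leaves) and the total number of Gaussians, so 3200 30000 is a
# larger model than the earlier 2500 20000 configuration.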
(
graph_dir=exp/tri1a/graph_nosp_fsh_sw1_tg
utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri1a $graph_dir
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri1a/decode_eval2000_nosp_fsh_sw1_tg
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri1a/decode_rt03_nosp_fsh_sw1_tg
)&
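# Optional, illustrative check (not part of the recipe): once a background
# decode above has finished, its best WER can usually be summarized with
# utils/best_wer.sh; the exact file to grep depends on how local/score.sh lays
# out the scoring directory, e.g. something like:
# grep Sum exp/tri1a/decode_eval2000_nosp_fsh_sw1_tg/score_*/*.sys 2>/dev/null | utils/best_wer.sh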
steps/align_si.sh --nj 10 --cmd "$train_cmd" \
data/train_30k_nodup data/lang_nosp exp/tri1a exp/tri1a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
3200 30000 data/train_30k_nodup data/lang_nosp exp/tri1a_ali exp/tri1b || exit 1;
#used to be 2500 20000
(
graph_dir=exp/tri1b/graph_nosp_fsh_sw1_tg
utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri1b $graph_dir
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri1b/decode_eval2000_nosp_fsh_sw1_tg
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri1b/decode_rt03_nosp_fsh_sw1_tg
)&
steps/align_si.sh --nj 50 --cmd "$train_cmd" \
data/train_100k_nodup data/lang_nosp exp/tri1b exp/tri1b_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
5500 90000 data/train_100k_nodup data/lang_nosp exp/tri1b_ali exp/tri2 || exit 1;
#used to be 2500 20000 on 30k
(
graph_dir=exp/tri2/graph_nosp_fsh_sw1_tg
utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri2 $graph_dir || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri2/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri2/decode_rt03_nosp_fsh_sw1_tg || exit 1;
)&
# Train tri3a, the last speaker-independent triphone stage,
# on the whole Switchboard training set
steps/align_si.sh --nj 100 --cmd "$train_cmd" \
data/train_swbd data/lang_nosp exp/tri2 exp/tri2_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
11500 200000 data/train_swbd data/lang_nosp exp/tri2_ali exp/tri3a || exit 1;
#used to be 2500 20000
(
graph_dir=exp/tri3a/graph_nosp_fsh_sw1_tg
utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3a $graph_dir || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri3a/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri3a/decode_rt03_nosp_fsh_sw1_tg || exit 1;
)&
# Train tri3b, which is LDA+MLLT on the whole Switchboard+Fisher training set
steps/align_si.sh --nj 100 --cmd "$train_cmd" \
data/train_nodup data/lang_nosp exp/tri3a exp/tri3a_ali || exit 1;
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
11500 400000 data/train_nodup data/lang_nosp exp/tri3a_ali exp/tri3b || exit 1;
(
graph_dir=exp/tri3b/graph_nosp_fsh_sw1_tg
utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3b $graph_dir || exit 1;
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri3b/decode_eval2000_nosp_fsh_sw1_tg || exit 1;
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri3b/decode_rt03_nosp_fsh_sw1_tg || exit 1;
)&
steps/get_prons.sh --cmd "$train_cmd" data/train_nodup data/lang_nosp exp/tri3b
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_nosp exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \
exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict
utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
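# The "_nosp" suffix above marked dictionary/lang directories built without
# pronunciation and silence probabilities; data/local/dict and data/lang now
# include the probabilities estimated from the tri3b alignments, and the
# corresponding lang/graph directories below drop the _nosp suffix.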
LM=data/local/lm/3gram-mincount/lm_unpruned.gz
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
data/lang $LM data/local/dict/lexicon.txt data/lang_fsh_sw1_tg
LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz
if $rescore; then
  utils/build_const_arpa_lm.sh $LM_fg data/lang data/lang_fsh_sw1_fg
fi
(
graph_dir=exp/tri3b/graph_fsh_sw1_tg
utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri3b $graph_dir || exit 1;
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri3b/decode_eval2000_fsh_sw1_tg || exit 1;
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri3b/decode_rt03_fsh_sw1_tg || exit 1;
)&
# Next we'll use fMLLR and train with SAT (i.e. on
# fMLLR features)
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri3b exp/tri3b_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
11500 800000 data/train_nodup data/lang exp/tri3b_ali exp/tri4a || exit 1;
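# (As in train_deltas.sh, 11500 and 800000 are the number of tree leaves and
# total Gaussians; train_sat.sh additionally estimates per-speaker fMLLR
# transforms and trains on the adapted features.)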
(
graph_dir=exp/tri4a/graph_fsh_sw1_tg
utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri4a $graph_dir
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri4a/decode_eval2000_fsh_sw1_tg || exit 1;
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri4a/decode_rt03_fsh_sw1_tg || exit 1;
)&
wait
if $rescore; then
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/eval2000 \
    exp/tri4a/decode_eval2000_fsh_sw1_{tg,fg}
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/rt03 \
    exp/tri4a/decode_rt03_fsh_sw1_{tg,fg}
fi
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri4a exp/tri4a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
11500 1600000 data/train_nodup data/lang exp/tri4a_ali exp/tri5a || exit 1;
(
graph_dir=exp/tri5a/graph_fsh_sw1_tg
utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri5a $graph_dir
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri5a/decode_eval2000_fsh_sw1_tg || exit 1;
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri5a/decode_rt03_fsh_sw1_tg || exit 1;
)&
wait
if $rescore; then
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/eval2000 \
    exp/tri5a/decode_eval2000_fsh_sw1_{tg,fg}
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/rt03 \
    exp/tri5a/decode_rt03_fsh_sw1_{tg,fg}
fi
hours=$(awk '{x += $4 - $3;} END{print x/3600;}' <data/train_fisher/segments)
! [ $hours == 1915 ] && echo "$0: expected 1915 hours of data, got $hours hours, please check." && exit 1;
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri5a exp/tri5a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
11500 3200000 data/train_nodup data/lang exp/tri5a_ali exp/tri6a || exit 1;
(
graph_dir=exp/tri6a/graph_fsh_sw1_tg
utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri6a $graph_dir
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/eval2000 exp/tri6a/decode_eval2000_fsh_sw1_tg || exit 1;
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/rt03 exp/tri6a/decode_rt03_fsh_sw1_tg || exit 1;
)&
wait
if $rescore; then
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/eval2000 \
    exp/tri6a/decode_eval2000_fsh_sw1_{tg,fg}
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    data/lang_fsh_sw1_{tg,fg} data/rt03 \
    exp/tri6a/decode_rt03_fsh_sw1_{tg,fg}
fi
# Optional tri6a alignment for further training purposes
#steps/align_fmllr.sh --nj 200 --cmd "$train_cmd" \
# data/train_nodup data/lang exp/tri6a exp/tri6a_ali || exit 1;
# The following is the current online-nnet2 recipe, with "multi-splice".
local/online/run_nnet2_ms.sh