run.sh
12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
#
# Copyright 2016 by Idiap Research Institute, http://www.idiap.ch
#
# See the file COPYING for the licence associated with this software.
#
# Author(s):
# Bogdan Vlasenko, February 2016
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script shows the steps needed to build a recognizer for certain languages
# of the GlobalPhone corpus.
# !!! NOTE: The current recipe assumes that you have pre-built LMs.
echo "This shell script may run as-is on your system, but it is recommended
that you run the commands one by one by copying and pasting into the shell."
#exit 1;

# Load the job-submission settings. The *_cmd variables used by the later
# stages ($train_cmd, $decode_cmd, $highmem_cmd) are not assigned anywhere in
# this script, so they are presumably provided by cmd.sh.
# An explicit if/else is used instead of `A && B || C`, so the "not found"
# warning can no longer be printed for a cmd.sh that exists but returns a
# non-zero status while being sourced.
if [ -f cmd.sh ]; then
  source ./cmd.sh || echo "cmd.sh could not be sourced cleanly. Jobs may not execute properly."
else
  echo "cmd.sh not found. Jobs may not execute properly."
fi

# CHECKING FOR AND INSTALLING REQUIRED TOOLS:
# This recipe requires shorten (3.6.1) and sox (14.3.2).
# If they are not found, the local/gp_install.sh script will install them.
#local/gp_check_tools.sh $PWD path.sh || exit 1;

# path.sh is mandatory: abort if it cannot be sourced.
. ./path.sh || { echo "Cannot source path.sh"; exit 1; }
# Set the locations of the GlobalPhone corpus and language models.
# Each value may be overridden from the environment; when it is not, the
# defaults below are the values this recipe has always used.
GP_CORPUS=${GP_CORPUS:-/idiap/resource/database/GLOBALPHONE}
GP_LM=${GP_LM:-$PWD/language_models}

# Set the languages that will actually be processed (space-separated codes).
# Exported so that the scripts started below can read it as well.
export GP_LANGUAGES="${GP_LANGUAGES:-FR GE RU}"

# The following data preparation step actually converts the audio files from
# shorten to WAV to take out the empty files and those with compression errors.
# Paths are quoted so that a working directory or corpus location containing
# whitespace is passed through as a single argument.
local/gp_data_prep.sh --config-dir="$PWD/conf" --corpus-dir="$GP_CORPUS" \
  --languages="$GP_LANGUAGES" || exit 1

# $GP_LANGUAGES is deliberately left unquoted here: every language code has to
# reach gp_dict_prep.sh as its own positional argument.
local/gp_dict_prep.sh --config-dir "$PWD/conf" "$GP_CORPUS" $GP_LANGUAGES || exit 1
# Run utils/prepare_lang.sh once per language, from data/<lang>/local/dict into
# data/<lang>/lang (scratch space: data/<lang>/local/lang_tmp). Both output
# streams go to data/<lang>/prepare_lang.log; the first failure aborts the
# whole recipe with status 1.
for L in $GP_LANGUAGES; do
  if ! utils/prepare_lang.sh --position-dependent-phones true \
    "data/$L/local/dict" "<unk>" "data/$L/local/lang_tmp" "data/$L/lang" \
    > "data/$L/prepare_lang.log" 2>&1; then
    exit 1
  fi
done
# Convert the different available language models to FSTs, and create separate
# decoding configurations for each.
# One background job is started per language; the bare `wait` afterwards is the
# barrier that lets all of them finish before feature extraction starts.
# ${GP_LM} is expanded unquoted, exactly as before.
for L in ${GP_LANGUAGES}; do
  local/gp_format_lm.sh --filter-vocab-sri true ${GP_LM} "${L}" &
done
wait
# Now make MFCC features.
# For every language and every data subset (train/dev/eval) a background
# subshell runs make_mfcc.sh and then compute_cmvn_stats.sh on the same
# data/log/feature directories. The CMVN step runs regardless of the exit
# status of the MFCC step. The final `wait` blocks until all jobs are done.
for L in $GP_LANGUAGES; do
  mfccdir=mfcc/$L
  for part in train dev eval; do
    (
      data_dir=data/$L/$part
      log_dir=exp/$L/make_mfcc/$part
      steps/make_mfcc.sh --nj 6 --cmd "$train_cmd" \
        "$data_dir" "$log_dir" "$mfccdir"
      steps/compute_cmvn_stats.sh "$data_dir" "$log_dir" "$mfccdir"
    ) &
  done
done
wait
# Train the monophone system of every language in parallel.
# Each training job runs in the background with stdout and stderr collected in
# exp/<lang>/mono/train.log; `wait` blocks until all of them have finished.
for L in $GP_LANGUAGES; do
  mkdir -p "exp/$L/mono"
  steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
    "data/$L/train" "data/$L/lang" "exp/$L/mono" \
    > "exp/$L/mono/train.log" 2>&1 &
done
wait
# Decode mono.
# For each language and LM suffix a background subshell builds the decoding
# graph and then decodes the dev and eval sets, in that order. There is no
# `wait` at the end of this stage, so the jobs keep running while the script
# moves on to the next stage.
for L in $GP_LANGUAGES; do
  for lm_suffix in tgpr_sri; do
    (
      model_dir=exp/$L/mono
      graph_dir=$model_dir/graph_${lm_suffix}
      mkdir -p "$graph_dir"
      utils/mkgraph.sh "data/$L/lang_test_${lm_suffix}" "$model_dir" "$graph_dir"
      for test_set in dev eval; do
        steps/decode.sh --nj 5 --cmd "$decode_cmd" "$graph_dir" \
          "data/$L/$test_set" "$model_dir/decode_${test_set}_${lm_suffix}"
      done
    ) &
  done
done
# Train tri1, which is the first triphone pass.
# Per language, in a background subshell:
#   1. align_si.sh with the mono model, output dir exp/<lang>/mono_ali;
#   2. read the two numeric settings for this language from tab-separated
#      columns 2 and 3 of conf/tri.conf;
#   3. train_deltas.sh on top of mono_ali, output dir exp/<lang>/tri1.
# The trailing `wait` is a barrier for all background jobs started so far.
for L in $GP_LANGUAGES; do
  (
    ali_dir=exp/$L/mono_ali
    tri_dir=exp/$L/tri1

    mkdir -p "$ali_dir"
    steps/align_si.sh --nj 10 --cmd "$train_cmd" \
      "data/$L/train" "data/$L/lang" "exp/$L/mono" "$ali_dir" \
      > "$ali_dir/align.log" 2>&1

    num_states=$(grep "^$L" conf/tri.conf | cut -f2)
    num_gauss=$(grep "^$L" conf/tri.conf | cut -f3)

    mkdir -p "$tri_dir"
    # The two counts stay unquoted, as before.
    steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \
      $num_states $num_gauss "data/$L/train" "data/$L/lang" \
      "$ali_dir" "$tri_dir" > "$tri_dir/train.log" 2>&1
  ) &
done
wait
# Decode tri1.
# Same layout as the other GMM decode stages: one background subshell per
# language/LM builds the graph under exp/<lang>/tri1 and decodes dev, then
# eval. No `wait` follows, so these jobs overlap with the next stage.
for L in $GP_LANGUAGES; do
  for lm_suffix in tgpr_sri; do
    (
      decode_opts=(--nj 5 --cmd "$decode_cmd")
      tri1_dir=exp/$L/tri1
      graph_dir=${tri1_dir}/graph_${lm_suffix}

      mkdir -p "$graph_dir"
      utils/mkgraph.sh "data/$L/lang_test_${lm_suffix}" "$tri1_dir" "$graph_dir"
      steps/decode.sh "${decode_opts[@]}" "$graph_dir" "data/$L/dev" \
        "${tri1_dir}/decode_dev_${lm_suffix}"
      steps/decode.sh "${decode_opts[@]}" "$graph_dir" "data/$L/eval" \
        "${tri1_dir}/decode_eval_${lm_suffix}"
    ) &
  done
done
# Train tri2a, which is deltas + delta-deltas.
# Per language, in a background subshell: align_si.sh with the tri1 model
# (output dir exp/<lang>/tri1_ali, log tri1_ali.log), then train_deltas.sh
# on those alignments into exp/<lang>/tri2a. The two numeric settings come from
# tab-separated columns 2 and 3 of the language's line in conf/tri.conf.
# `wait` blocks until every outstanding background job has finished.
for L in $GP_LANGUAGES; do
  (
    src_model=exp/$L/tri1
    ali_dir=${src_model}_ali
    out_dir=exp/$L/tri2a

    mkdir -p "$ali_dir"
    steps/align_si.sh --nj 10 --cmd "$train_cmd" \
      "data/$L/train" "data/$L/lang" "$src_model" "$ali_dir" \
      > "$ali_dir/tri1_ali.log" 2>&1

    num_states=$(grep "^$L" conf/tri.conf | cut -f2)
    num_gauss=$(grep "^$L" conf/tri.conf | cut -f3)

    mkdir -p "$out_dir"
    steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \
      $num_states $num_gauss "data/$L/train" "data/$L/lang" \
      "$ali_dir" "$out_dir" > "$out_dir/train.log" 2>&1
  ) &
done
wait
# Decode tri2a.
# One background subshell per language/LM: build the graph under
# exp/<lang>/tri2a, then decode dev and eval in that order. The stage does not
# wait for its jobs; a later `wait` in the script collects them.
for L in $GP_LANGUAGES; do
  for lm_suffix in tgpr_sri; do
    (
      am_dir=exp/$L/tri2a
      graph_dir=$am_dir/graph_${lm_suffix}
      mkdir -p "$graph_dir"
      utils/mkgraph.sh "data/$L/lang_test_${lm_suffix}" "$am_dir" "$graph_dir"
      for subset in dev eval; do
        steps/decode.sh --nj 5 --cmd "$decode_cmd" "$graph_dir" \
          "data/$L/$subset" "$am_dir/decode_${subset}_${lm_suffix}"
      done
    ) &
  done
done
# Train tri2b, which is LDA+MLLT.
# Per language, in a background subshell: train_lda_mllt.sh on the tri1
# alignments (exp/<lang>/tri1_ali) into exp/<lang>/tri2b, splicing 3 frames of
# left and right context. The two numeric settings come from tab-separated
# columns 2 and 3 of the language's line in conf/tri.conf.
# NOTE(review): the log is written to tri2_ali.log although this is a training
# step (the other training stages use train.log) -- looks like a copy-paste
# leftover; the name is kept as it is.
for L in $GP_LANGUAGES; do
  (
    out_dir=exp/$L/tri2b
    num_states=$(grep "^$L" conf/tri.conf | cut -f2)
    num_gauss=$(grep "^$L" conf/tri.conf | cut -f3)
    mkdir -p "$out_dir"
    steps/train_lda_mllt.sh --cmd "$train_cmd" \
      --splice-opts "--left-context=3 --right-context=3" \
      $num_states $num_gauss "data/$L/train" "data/$L/lang" \
      "exp/$L/tri1_ali" "$out_dir" > "$out_dir/tri2_ali.log" 2>&1
  ) &
done
wait
# for L in $GP_LANGUAGES; do
# mode=4
# # Doing this only for the LMs whose vocabs were limited using SRILM, since the
# # other approach didn't yield LMs for all languages.
# steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \
# data/$L/lang_test_tgpr_sri data/$L/lang_test_tg_sri data/$L/dev \
# exp/$L/tri2a/decode_dev_tgpr_sri exp/$L/tri2a/decode_dev_tg_sri$mode
# done
# Decode tri2b.
# One background subshell per language/LM: build the graph under
# exp/<lang>/tri2b, then decode dev and eval in that order. Unlike the earlier
# decode stages, this one ends with a `wait` barrier.
for L in $GP_LANGUAGES; do
  for lm_suffix in tgpr_sri; do
    (
      lda_dir=exp/$L/tri2b
      graph_dir=$lda_dir/graph_${lm_suffix}
      mkdir -p "$graph_dir"
      utils/mkgraph.sh "data/$L/lang_test_${lm_suffix}" "$lda_dir" "$graph_dir"
      for subset in dev eval; do
        steps/decode.sh --nj 5 --cmd "$decode_cmd" "$graph_dir" \
          "data/$L/$subset" "$lda_dir/decode_${subset}_${lm_suffix}"
      done
    ) &
  done
done
wait
# Train tri3b, which is LDA+MLLT+SAT.
# Per language, in a background subshell: align_si.sh with the tri2b model
# (with --use-graphs true; output dir exp/<lang>/tri2b_ali), then train_sat.sh
# on those alignments into exp/<lang>/tri3b. The two numeric settings come from
# tab-separated columns 2 and 3 of the language's line in conf/tri.conf.
for L in $GP_LANGUAGES; do
  (
    ali_dir=exp/$L/tri2b_ali
    sat_dir=exp/$L/tri3b

    mkdir -p "$ali_dir"
    steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
      "data/$L/train" "data/$L/lang" "exp/$L/tri2b" "$ali_dir" \
      > "$ali_dir/align.log" 2>&1

    num_states=$(grep "^$L" conf/tri.conf | cut -f2)
    num_gauss=$(grep "^$L" conf/tri.conf | cut -f3)

    mkdir -p "$sat_dir"
    steps/train_sat.sh --cmd "$train_cmd" --cluster-thresh 100 \
      $num_states $num_gauss "data/$L/train" "data/$L/lang" \
      "$ali_dir" "$sat_dir" > "$sat_dir/train.log" 2>&1
  ) &
done
wait
# Decode tri3b.
# One background subshell per language/LM: build the graph under
# exp/<lang>/tri3b, then run decode_fmllr.sh on dev and on eval. Only the dev
# decode directory is created up front, exactly as before. `wait` is the
# barrier at the end of the stage.
for L in $GP_LANGUAGES; do
  for lm_suffix in tgpr_sri; do
    (
      sat_dir=exp/$L/tri3b
      graph_dir=$sat_dir/graph_${lm_suffix}
      fmllr_opts=(--nj 5 --cmd "$decode_cmd")

      mkdir -p "$graph_dir"
      utils/mkgraph.sh "data/$L/lang_test_${lm_suffix}" "$sat_dir" "$graph_dir"

      mkdir -p "$sat_dir/decode_dev_${lm_suffix}"
      steps/decode_fmllr.sh "${fmllr_opts[@]}" \
        "$graph_dir" "data/$L/dev" "$sat_dir/decode_dev_${lm_suffix}"
      steps/decode_fmllr.sh "${fmllr_opts[@]}" \
        "$graph_dir" "data/$L/eval" "$sat_dir/decode_eval_${lm_suffix}"
    ) &
  done
done
wait
## Train sgmm2_4a, which is SGMM on top of LDA+MLLT+SAT features.
# Per language, in a background subshell:
#   1. align_fmllr.sh with the tri3b model, output dir exp/<lang>/tri3b_ali;
#   2. train_ubm.sh (first positional argument 600) into exp/<lang>/ubm4a;
#   3. train_sgmm2.sh starting from ubm4a/final.ubm into exp/<lang>/sgmm2_4a.
# The two numeric settings for step 3 come from tab-separated columns 2 and 3
# of the language's line in conf/sgmm.conf. None of the steps redirects its
# output to a log file here.
for L in $GP_LANGUAGES; do
  (
    train_data=data/$L/train
    lang_dir=data/$L/lang
    ali_dir=exp/$L/tri3b_ali
    ubm_dir=exp/$L/ubm4a
    sgmm_dir=exp/$L/sgmm2_4a

    mkdir -p "$ali_dir"
    steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
      "$train_data" "$lang_dir" "exp/$L/tri3b" "$ali_dir"

    num_states=$(grep "^$L" conf/sgmm.conf | cut -f2)
    num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3)

    mkdir -p "$ubm_dir"
    steps/train_ubm.sh --cmd "$train_cmd" \
      600 "$train_data" "$lang_dir" "$ali_dir" "$ubm_dir"

    mkdir -p "$sgmm_dir"
    steps/train_sgmm2.sh --cmd "$train_cmd" \
      $num_states $num_substates "$train_data" "$lang_dir" "$ali_dir" \
      "$ubm_dir/final.ubm" "$sgmm_dir"
  ) &
done
wait
## Decode sgmm2_4a.
# For each language/LM: build the graph under exp/<lang>/sgmm2_4a, then run
# decode_sgmm2.sh on dev and eval, passing the matching tri3b decode directory
# as --transform-dir.
# NOTE(review): unlike the other decode stages, the subshell is NOT sent to
# the background, so languages are decoded one after another and the trailing
# `wait` has nothing of this stage to wait for. Kept serial as in the
# original -- confirm whether that is intentional.
for L in $GP_LANGUAGES; do
  for lm_suffix in tgpr_sri; do
    (
      sgmm_dir=exp/$L/sgmm2_4a
      graph_dir=$sgmm_dir/graph_${lm_suffix}
      mkdir -p "$graph_dir"
      utils/mkgraph.sh "data/$L/lang_test_${lm_suffix}" "$sgmm_dir" "$graph_dir"
      for subset in dev eval; do
        decode_name=decode_${subset}_${lm_suffix}
        steps/decode_sgmm2.sh --use-fmllr true --nj 5 --cmd "$decode_cmd" \
          --transform-dir "exp/$L/tri3b/$decode_name" "$graph_dir" \
          "data/$L/$subset" "$sgmm_dir/$decode_name"
      done
    )
  done
done
wait
# Now we'll align the SGMM system to prepare for discriminative training (MMI).
# Per language, in a background subshell:
#   1. align_sgmm2.sh with the sgmm2_4a model -> exp/<lang>/sgmm2_4a_ali;
#   2. make_denlats_sgmm2.sh                  -> exp/<lang>/sgmm2_4a_denlats;
#   3. train_mmi_sgmm2.sh with --boost 0.1    -> exp/<lang>/sgmm2_4a_mmi_b0.1.
# All three steps take exp/<lang>/tri3b_ali as --transform-dir.
# The former inner `for lm_suffix in tgpr_sri` loop was dropped: nothing in
# this stage reads lm_suffix, and a second LM suffix would have started
# duplicate trainings writing concurrently into the same directories.
# NOTE(review): the MMI training is submitted with $decode_cmd rather than
# $train_cmd; left unchanged -- confirm that this is intended.
for L in $GP_LANGUAGES; do
  (
    train_data=data/$L/train
    lang_dir=data/$L/lang
    transform_dir=exp/$L/tri3b_ali
    ali_dir=exp/$L/sgmm2_4a_ali
    denlat_dir=exp/$L/sgmm2_4a_denlats
    mmi_dir=exp/$L/sgmm2_4a_mmi_b0.1

    mkdir -p "$ali_dir"
    steps/align_sgmm2.sh --nj 10 --cmd "$train_cmd" \
      --transform-dir "$transform_dir" --use-graphs true --use-gselect true \
      "$train_data" "$lang_dir" "exp/$L/sgmm2_4a" "$ali_dir"

    mkdir -p "$denlat_dir"
    steps/make_denlats_sgmm2.sh --nj 10 --sub-split 10 --cmd "$decode_cmd" \
      --transform-dir "$transform_dir" "$train_data" "$lang_dir" \
      "$ali_dir" "$denlat_dir"

    mkdir -p "$mmi_dir"
    steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" \
      --transform-dir "$transform_dir" --boost 0.1 "$train_data" "$lang_dir" \
      "$ali_dir" "$denlat_dir" "$mmi_dir"
  ) &
done
wait
# Decode sgmm2_4a_mmi_b0.1.
# For each language/LM a background subshell runs decode_sgmm2_rescore.sh for
# MMI iterations 1-4 on dev and eval. Each call is given the existing
# sgmm2_4a decode directory of that test set and writes to
# exp/<lang>/sgmm2_4a_mmi_b0.1/decode_<set>_<lm>_it<iter>; the tri3b decode
# directory of the same test set is passed as --transform-dir.
# The graph_dir assignment that used to sit here was removed: nothing in this
# stage ever read it.
for L in $GP_LANGUAGES; do
  for lm_suffix in tgpr_sri; do
    (
      for iter in 1 2 3 4; do
        for test_set in dev eval; do
          decode_name=decode_${test_set}_${lm_suffix}
          steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter "$iter" \
            --transform-dir "exp/$L/tri3b/$decode_name" \
            "data/$L/lang_test_${lm_suffix}" "data/$L/$test_set" \
            "exp/$L/sgmm2_4a/$decode_name" \
            "exp/$L/sgmm2_4a_mmi_b0.1/${decode_name}_it$iter"
        done
      done
    ) &
  done
done
wait
# SGMMs starting from non-SAT triphone system, both with and without
# speaker vectors.
# Per language, in a background subshell:
#   1. train_ubm.sh (first positional argument 400) on the tri1 alignments
#      into exp/<lang>/ubm2a;
#   2. train_sgmm2.sh with --spk-dim 0 into exp/<lang>/sgmm2a;
#   3. train_sgmm2.sh without --spk-dim into exp/<lang>/sgmm2b.
# Both SGMM trainings start from ubm2a/final.ubm and take their two numeric
# settings from tab-separated columns 2 and 3 of conf/sgmm.conf.
for L in $GP_LANGUAGES; do
  (
    mkdir -p exp/$L/ubm2a
    steps/train_ubm.sh --cmd "$train_cmd" \
      400 data/$L/train data/$L/lang exp/$L/tri1_ali exp/$L/ubm2a \
      >& exp/$L/ubm2a/train.log

    num_states=$(grep "^$L" conf/sgmm.conf | cut -f2)
    num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3)

    mkdir -p exp/$L/sgmm2a
    steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 --spk-dim 0 \
      $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri1_ali \
      exp/$L/ubm2a/final.ubm exp/$L/sgmm2a >& exp/$L/sgmm2a/train.log

    mkdir -p exp/$L/sgmm2b
    # This call used to pass $num_gauss, which is never assigned inside this
    # subshell (assignments made in the subshells of earlier stages do not
    # survive them). Under `bash -u` that aborted the subshell with an
    # "unbound variable" error before sgmm2b was trained. It now passes
    # $num_substates, like the sgmm2a call above.
    steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 \
      $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri1_ali \
      exp/$L/ubm2a/final.ubm exp/$L/sgmm2b >& exp/$L/sgmm2b/train.log
  ) &
done
wait
# Decode sgmm2a and sgmm2b (dev set only).
for L in $GP_LANGUAGES; do
  # Need separate decoding graphs for models with and without speaker vectors,
  # since the trees may be different.
  for sgmm in sgmm2a sgmm2b; do
    for lm_suffix in tgpr_sri; do
      (
        graph_dir=exp/$L/$sgmm/graph_${lm_suffix}
        mkdir -p $graph_dir
        # $highmem_cmd must stay unquoted so that a multi-word job command is
        # split into command + options. It is not set in this script --
        # presumably cmd.sh defines it; under `bash -u` this line aborts the
        # subshell if it does not.
        $highmem_cmd $graph_dir/mkgraph.log \
          utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/$sgmm $graph_dir
        steps/decode_sgmm2.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \
          exp/$L/$sgmm/decode_dev_${lm_suffix}
      ) &
    done # loop over LMs
  done # loop over model with and without speaker vecs
done # loop over languages

# This is the last stage of the script: without this barrier the script would
# exit while the background decodes started above were still running.
wait