#!/bin/bash
# run_tdnn_multilingual.sh
#
# Copyright 2016 Pegah Ghahremani

# This script trains a multilingual setup over several languages
# (specifically Babel languages) with no shared phones.
# It generates a separate egs directory for each language and combines
# them during training: mini-batches of data corresponding to different
# languages are randomly combined into egs.*.scp files using
# steps/nnet3/multilingual/combine_egs.sh, and the generated egs.*.scp
# files are used for multilingual training.
#
# All hidden layers except the last are shared across languages; there is
# a separate final layer per language.
# A bottleneck layer can be added to the network using the --bnf-dim option.
#
# The script requires baseline PLP features and alignments (e.g. tri5_ali)
# for all languages, and it generates 40-dim MFCC + pitch features for all
# languages.
#
# If --ivector-extractor is not supplied, a global iVector extractor is
# trained on data pooled from all languages, and iVectors for all languages
# are extracted with it.
#
# local.conf must exist (see README.txt); it contains configuration for
# multilingual training, such as lang_list, an array of the languages used
# for multilingual training.
#
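# A minimal local.conf sketch (these Babel language codes are placeholders;
# set them to the languages you are actually using):
#
#   lang_list=(101-cantonese 102-assamese 103-bengali)
#   decode_lang_list=(101-cantonese)
#   # Optional: per-language weights passed to combine_egs.sh as
#   # --lang2weight, one comma-separated weight per language:
#   # lang2weight="0.8,1.0,1.0"
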
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
set -e
remove_egs=false
cmd=queue.pl
srand=0
stage=0
train_stage=-10
get_egs_stage=-10
decode_stage=-10
num_jobs_initial=2
num_jobs_final=8
speed_perturb=true
use_pitch=true          # if true, pitch features are used to train the multilingual setup
use_pitch_ivector=false # if true, pitch features are used in iVector extraction
use_ivector=true
megs_dir=  # combined multilingual egs dir; defaults to $dir/egs if empty
alidir=tri5_ali
ivector_extractor=  # If empty, a global iVector extractor is trained on data
                    # pooled from all languages, iVectors are extracted using
                    # it, and ivector_suffix='_gb'.
                    # Otherwise this extractor is used to extract iVectors and
                    # ivector_suffix=''.
bnf_dim=            # If non-empty, a bottleneck layer with this dimension is
                    # added two layers before the softmax.
dir=exp/nnet3/multi_bnf
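
# Example invocation (a sketch; any of the variables above can be
# overridden as command-line flags via utils/parse_options.sh):
#   ./run_tdnn_multilingual.sh --speed-perturb true --use-ivector true \
#     --bnf-dim 42 --dir exp/nnet3/multi_bnf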
. ./path.sh
. ./cmd.sh
. ./utils/parse_options.sh
[ ! -f local.conf ] && echo 'the file local.conf does not exist! Read README.txt for more details.' && exit 1;
. local.conf || exit 1;
num_langs=${#lang_list[@]}
feat_suffix=_hires  # The feature suffix describing the features used in
                    # multilingual training:
                    #   _hires           -> 40-dim MFCC
                    #   _hires_pitch     -> 40-dim MFCC + pitch
                    #   _hires_pitch_bnf -> 40-dim MFCC + pitch + BNF
if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi
with CUDA. If you want to use GPUs (and have them), go to src/, and configure
and make on a machine where "nvcc" is installed.
EOF
fi
for lang_index in `seq 0 $[$num_langs-1]`; do
  for f in data/${lang_list[$lang_index]}/train/{feats.scp,text} \
           exp/${lang_list[$lang_index]}/$alidir/ali.1.gz \
           exp/${lang_list[$lang_index]}/$alidir/tree; do
    [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  done
done
if [ "$speed_perturb" == "true" ]; then suffix=_sp; fi
dir=${dir}${suffix}
ivec_feat_suffix=${feat_suffix}
if $use_pitch; then feat_suffix=${feat_suffix}_pitch ; fi
if $use_pitch_ivector; then nnet3_affix=_pitch; ivec_feat_suffix=${feat_suffix}_pitch ; fi
for lang_index in `seq 0 $[$num_langs-1]`; do
  echo "$0: extract high-resolution 40-dim MFCC + pitch for speed-perturbed data"
  echo "and extract alignments."
  local/nnet3/run_common_langs.sh --stage $stage \
    --feat-suffix $feat_suffix \
    --use-pitch $use_pitch \
    --speed-perturb $speed_perturb ${lang_list[$lang_index]} || exit 1;

  if $use_pitch && ! $use_pitch_ivector; then
    echo "$0: select MFCC-only features for iVector extraction."
    featdir=data/${lang_list[$lang_index]}/train${suffix}${feat_suffix}
    ivec_featdir=data/${lang_list[$lang_index]}/train${suffix}${ivec_feat_suffix}
    # drop the last 3 dims (pitch) to keep the MFCCs only
    mfcc_only_dim=`feat-to-dim scp:$featdir/feats.scp - | awk '{print $1-3}'`
    if [ ! -f $ivec_featdir/.done ]; then
      steps/select_feats.sh --cmd "$train_cmd" --nj 70 0-$[$mfcc_only_dim-1] \
        $featdir ${ivec_featdir} || exit 1;
      steps/compute_cmvn_stats.sh ${ivec_featdir} || exit 1;
      touch ${ivec_featdir}/.done || exit 1;
    fi
  fi
done
if $use_ivector; then
  ivector_suffix=""
  if [ -z "$ivector_extractor" ]; then
    mkdir -p data/multi
    global_extractor=exp/multi/nnet3${nnet3_affix}
    mkdir -p $global_extractor
    ivector_extractor=$global_extractor/extractor
    multi_data_dir_for_ivec=data/multi/train${suffix}${ivec_feat_suffix}
    ivector_suffix=_gb
    echo "$0: combine training data from all languages to train the global i-vector extractor."
    if [ ! -f $multi_data_dir_for_ivec/.done ]; then
      echo ---------------------------------------------------------------------
      echo "Pooling training data in $multi_data_dir_for_ivec on" `date`
      echo ---------------------------------------------------------------------
      mkdir -p $multi_data_dir_for_ivec
      combine_lang_list=""
      for lang_index in `seq 0 $[$num_langs-1]`; do
        combine_lang_list="$combine_lang_list data/${lang_list[$lang_index]}/train${suffix}${ivec_feat_suffix}"
      done
      utils/combine_data.sh $multi_data_dir_for_ivec $combine_lang_list
      utils/validate_data_dir.sh --no-feats $multi_data_dir_for_ivec
      touch $multi_data_dir_for_ivec/.done
    fi
  fi

  if [ ! -f $global_extractor/extractor/.done ]; then
    if [ -z "$lda_mllt_lang" ]; then lda_mllt_lang=${lang_list[0]}; fi
    echo "$0: train a global i-vector extractor on pooled data from all"
    echo "languages in $multi_data_dir_for_ivec, using an LDA+MLLT transform trained"
    echo "on ${lda_mllt_lang}."
    local/nnet3/run_shared_ivector_extractor.sh \
      --suffix "$suffix" --nnet3-affix "$nnet3_affix" \
      --feat-suffix "$ivec_feat_suffix" \
      --stage $stage $lda_mllt_lang \
      $multi_data_dir_for_ivec $global_extractor || exit 1;
    touch $global_extractor/extractor/.done
  fi

  echo "$0: extract iVectors for all languages using $global_extractor/extractor."
  for lang_index in `seq 0 $[$num_langs-1]`; do
    local/nnet3/extract_ivector_lang.sh --stage $stage \
      --train-set train${suffix}${ivec_feat_suffix} \
      --ivector-suffix "$ivector_suffix" \
      --nnet3-affix "$nnet3_affix" \
      ${lang_list[$lang_index]} \
      $ivector_extractor || exit 1;
  done
fi
for lang_index in `seq 0 $[$num_langs-1]`; do
  multi_data_dirs[$lang_index]=data/${lang_list[$lang_index]}/train${suffix}${feat_suffix}
  multi_egs_dirs[$lang_index]=exp/${lang_list[$lang_index]}/nnet3${nnet3_affix}/egs${feat_suffix}${ivector_suffix}
  multi_ali_dirs[$lang_index]=exp/${lang_list[$lang_index]}/${alidir}${suffix}
  multi_ivector_dirs[$lang_index]=exp/${lang_list[$lang_index]}/nnet3${nnet3_affix}/ivectors_train${suffix}${ivec_feat_suffix}${ivector_suffix}
done
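
# As an illustration (hypothetical language code): for lang 101-cantonese
# with suffix=_sp, feat_suffix=_hires_pitch, ivec_feat_suffix=_hires and
# ivector_suffix=_gb, these resolve to:
#   multi_data_dirs[i]    = data/101-cantonese/train_sp_hires_pitch
#   multi_egs_dirs[i]     = exp/101-cantonese/nnet3/egs_hires_pitch_gb
#   multi_ali_dirs[i]     = exp/101-cantonese/tri5_ali_sp
#   multi_ivector_dirs[i] = exp/101-cantonese/nnet3/ivectors_train_sp_hires_gb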
if $use_ivector; then
  ivector_dim=$(feat-to-dim scp:${multi_ivector_dirs[0]}/ivector_online.scp -) || exit 1;
else
  echo "$0: not using iVectors in multilingual training."
  ivector_dim=0
fi
feat_dim=`feat-to-dim scp:${multi_data_dirs[0]}/feats.scp -`
if [ $stage -le 8 ]; then
  echo "$0: creating multilingual neural net configs using the xconfig parser";
  if [ -z $bnf_dim ]; then
    bnf_dim=1024
  fi
  mkdir -p $dir/configs
  ivector_node_xconfig=""
  ivector_to_append=""
  if $use_ivector; then
    ivector_node_xconfig="input dim=$ivector_dim name=ivector"
    ivector_to_append=", ReplaceIndex(ivector, t, 0)"
  fi
  cat <<EOF > $dir/configs/network.xconfig
$ivector_node_xconfig
input dim=$feat_dim name=input

# Please note that it is important to have an input layer with the name=input
# as the layer immediately preceding the fixed-affine-layer, to enable
# the use of short notation for the descriptor.
# The first splicing is moved before the lda layer, so there is no splicing here.
relu-renorm-layer name=tdnn1 input=Append(input@-2,input@-1,input,input@1,input@2$ivector_to_append) dim=1024
relu-renorm-layer name=tdnn2 dim=1024
relu-renorm-layer name=tdnn3 input=Append(-1,2) dim=1024
relu-renorm-layer name=tdnn4 input=Append(-3,3) dim=1024
relu-renorm-layer name=tdnn5 input=Append(-3,3) dim=1024
relu-renorm-layer name=tdnn6 input=Append(-7,2) dim=1024
relu-renorm-layer name=tdnn_bn dim=$bnf_dim
# layers for the different languages' outputs are appended below
EOF
  # Append a separate pre-final layer and output layer (with softmax) for
  # each language.
  for lang_index in `seq 0 $[$num_langs-1]`; do
    num_targets=`tree-info ${multi_ali_dirs[$lang_index]}/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1;
    echo " relu-renorm-layer name=prefinal-affine-lang-${lang_index} input=tdnn_bn dim=1024"
    echo " output-layer name=output-${lang_index} dim=$num_targets max-change=1.5"
  done >> $dir/configs/network.xconfig
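
  # For example, with two languages whose trees have (say) 3000 and 3500
  # pdfs, the loop above appends lines like:
  #   relu-renorm-layer name=prefinal-affine-lang-0 input=tdnn_bn dim=1024
  #   output-layer name=output-0 dim=3000 max-change=1.5
  #   relu-renorm-layer name=prefinal-affine-lang-1 input=tdnn_bn dim=1024
  #   output-layer name=output-1 dim=3500 max-change=1.5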

  # Rename output-0 to 'output', since some downstream tools expect a node
  # with that name to exist.
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \
    --config-dir $dir/configs/ \
    --nnet-edits="rename-node old-name=output-0 new-name=output"
fi
if [ $stage -le 9 ]; then
  echo "$0: generating a separate egs dir per language for multilingual training."
  # Sourcing configs/vars below sets model_left_context,
  # model_right_context and num_hidden_layers.
  . $dir/configs/vars || exit 1;
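  # For the TDNN above, the left context is 2+1+3+3+7 = 16 frames and the
  # right context is 2+2+3+3+2 = 12 frames (summing the Append() offsets
  # over the layers), so vars should contain something like:
  #   model_left_context=16
  #   model_right_context=12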
  ivec="${multi_ivector_dirs[@]}"
  if $use_ivector; then
    ivector_opts=(--online-multi-ivector-dirs "$ivec")
  fi
  local/nnet3/prepare_multilingual_egs.sh --cmd "$decode_cmd" \
    "${ivector_opts[@]}" \
    --cmvn-opts "--norm-means=false --norm-vars=false" \
    --left-context $model_left_context --right-context $model_right_context \
    $num_langs ${multi_data_dirs[@]} ${multi_ali_dirs[@]} ${multi_egs_dirs[@]} || exit 1;
fi
if [ -z "$megs_dir" ]; then
  megs_dir=$dir/egs
fi
if [ $stage -le 10 ] && [ ! -z "$megs_dir" ]; then
  echo "$0: generate the multilingual egs dir by combining the"
  echo "separate per-language egs dirs."
  egs_opts=()
  if [ ! -z "$lang2weight" ]; then
    # pass the per-language weights through as a single argument
    egs_opts+=(--lang2weight "$lang2weight")
  fi
  common_egs_dir="${multi_egs_dirs[@]} $megs_dir"
  steps/nnet3/multilingual/combine_egs.sh "${egs_opts[@]}" \
    --cmd "$decode_cmd" \
    $num_langs ${common_egs_dir[@]} || exit 1;
fi
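
# At this point $megs_dir contains combined egs.*.scp files whose entries
# point into the per-language egs dirs, with mini-batches randomly
# interleaved across languages (see the header comments above).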
if [ $stage -le 11 ]; then
  common_ivec_dir=
  if $use_ivector; then
    common_ivec_dir=${multi_ivector_dirs[0]}
  fi
  steps/nnet3/train_raw_dnn.py --stage=$train_stage \
    --cmd="$decode_cmd" \
    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
    --trainer.num-epochs=2 \
    --trainer.optimization.num-jobs-initial=2 \
    --trainer.optimization.num-jobs-final=12 \
    --trainer.optimization.initial-effective-lrate=0.0015 \
    --trainer.optimization.final-effective-lrate=0.00015 \
    --trainer.optimization.minibatch-size=256,128 \
    --trainer.samples-per-iter=400000 \
    --trainer.max-param-change=2.0 \
    --trainer.srand=$srand \
    --feat-dir=${multi_data_dirs[0]} \
    --feat.online-ivector-dir="$common_ivec_dir" \
    --egs.dir=$megs_dir \
    --use-dense-targets=false \
    --targets-scp=${multi_ali_dirs[0]} \
    --cleanup.remove-egs=$remove_egs \
    --cleanup.preserve-model-interval=50 \
    --use-gpu=true \
    --dir=$dir || exit 1;
fi
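
# train_raw_dnn.py produces $dir/final.raw, a "raw" nnet3 model without a
# transition model; the next stage attaches each language's transition model
# to create per-language final.mdl files.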
if [ $stage -le 12 ]; then
  for lang_index in `seq 0 $[$num_langs-1]`; do
    lang_dir=$dir/${lang_list[$lang_index]}
    mkdir -p $lang_dir
    echo "$0: rename the output node for ${lang_list[$lang_index]} to 'output' and"
    echo "add the transition model."
    nnet3-copy --edits="rename-node old-name=output-$lang_index new-name=output" \
      $dir/final.raw - | \
      nnet3-am-init ${multi_ali_dirs[$lang_index]}/final.mdl - \
      $lang_dir/final.mdl || exit 1;
    cp $dir/cmvn_opts $lang_dir/cmvn_opts || exit 1;
    echo "$0: compute average posteriors and readjust priors for language ${lang_list[$lang_index]}."
    steps/nnet3/adjust_priors.sh --cmd "$decode_cmd" \
      --iter final --use-raw-nnet false --use-gpu true \
      $lang_dir ${multi_egs_dirs[$lang_index]} || exit 1;
  done
fi
# Decode each of the specified languages with its per-language model.
if [ $stage -le 13 ]; then
  num_decode_lang=${#decode_lang_list[@]}
  for lang_index in `seq 0 $[$num_decode_lang-1]`; do
    if [ ! -f $dir/${decode_lang_list[$lang_index]}/decode_dev10h.pem/.done ]; then
      echo "$0: decoding lang ${decode_lang_list[$lang_index]} using multilingual hybrid model $dir"
      local/nnet3/run_decode_lang.sh --use-ivector $use_ivector \
        --use-pitch-ivector $use_pitch_ivector --iter final_adj \
        --nnet3-affix "$nnet3_affix" \
        ${decode_lang_list[$lang_index]} $dir || exit 1;
      touch $dir/${decode_lang_list[$lang_index]}/decode_dev10h.pem/.done
    fi
  done
fi