#!/bin/bash
# Copyright 2015 University of Illinois (Author: Amit Das)
# Copyright 2012-2015 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# This example script trains a multilingual DNN with a <BlockSoftmax> output, using FBANK features.
# The network is trained on several languages simultaneously: a separate softmax layer is created
# per language, while the hidden layers are shared across all the languages.
# The script supports an arbitrary number of languages.
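#
# Sketch of the <BlockSoftmax> output layout (dimensions are illustrative): the
# output layer is a concatenation of per-language softmax blocks, and each frame
# back-propagates only through the block of its own language, e.g. for 2 languages:
#
#   [ 'rm' block: ali_dim[0] posteriors | 'wsj' block: ali_dim[1] posteriors ]
#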
. ./cmd.sh
. ./path.sh
# Example setup; the options are in 'csl' (comma-separated list) format and must have the same number of elements,
lang_code_csl="rm,wsj" # One label per language,
lang_weight_csl="1.0,0.1" # Per-language weights, scaling the loss-function and gradients (1.0 for each language is a good default),
ali_dir_csl="exp/tri3b_ali,../../wsj/s5/exp/tri4b_ali_si284" # One ali-dir per language,
data_dir_csl="data/train,../../wsj/s5/data/train_si284" # One train-data-dir per language (features will be re-computed),
nnet_type=dnn_small # dnn_small | dnn | bn | sbn
stage=0
. utils/parse_options.sh || exit 1;
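# Example invocation overriding the defaults from the command line;
# 'utils/parse_options.sh' turns '--lang-code-csl <val>' into $lang_code_csl, etc.
# (the values below just repeat the defaults, so this is purely illustrative):
#
#   ./run_multilingual.sh --nnet-type dnn_small \
#     --lang-code-csl "rm,wsj" --lang-weight-csl "1.0,0.1" \
#     --ali-dir-csl "exp/tri3b_ali,../../wsj/s5/exp/tri4b_ali_si284" \
#     --data-dir-csl "data/train,../../wsj/s5/data/train_si284"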
set -euxo pipefail
# Convert 'csl' to bash array (accept separators ',' ':'),
lang_code=($(echo $lang_code_csl | tr ',:' ' '))
ali_dir=($(echo $ali_dir_csl | tr ',:' ' '))
data_dir=($(echo $data_dir_csl | tr ',:' ' '))
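# (e.g. lang_code_csl="rm,wsj" expands to the bash array lang_code=(rm wsj),
#  so ${lang_code[0]}="rm" and ${#lang_code[@]}=2; same for ali_dir, data_dir)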
# Make sure we have the same number of items in the lists,
! [ ${#lang_code[@]} -eq ${#ali_dir[@]} -a ${#lang_code[@]} -eq ${#data_dir[@]} ] && \
echo "Non-matching number of 'csl' items: lang_code ${#lang_code[@]}, ali_dir ${#ali_dir[@]}, data_dir ${#data_dir[@]}" && \
exit 1
num_langs=${#lang_code[@]}
# Check if all the input directories exist,
for i in $(seq 0 $((num_langs-1))); do
echo "lang = ${lang_code[$i]}, alidir = ${ali_dir[$i]}, datadir = ${data_dir[$i]}"
[ ! -d ${ali_dir[$i]} ] && echo "Missing ${ali_dir[$i]}" && exit 1
[ ! -d ${data_dir[$i]} ] && echo "Missing ${data_dir[$i]}" && exit 1
done
# Make the features,
data=data-fbank-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')
data_tr90=$data/combined_tr90
data_cv10=$data/combined_cv10
if [ $stage -le 0 ]; then
# Make a local copy of each data-dir (adding the language-code as a suffix),
tr90=""
cv10=""
for i in $(seq 0 $((num_langs-1))); do
code=${lang_code[$i]}
dir=${data_dir[$i]}
tgt_dir=$data/${code}_$(basename $dir)
utils/copy_data_dir.sh --utt-suffix _$code --spk-suffix _$code $dir $tgt_dir
rm $tgt_dir/{feats,cmvn}.scp || true # remove the old features, they will be re-computed,
# extract features, get cmvn stats,
steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd --max-jobs-run 10" $tgt_dir{,/log,/data}
steps/compute_cmvn_stats.sh $tgt_dir{,/log,/data}
# split lists 90% train / 10% held-out,
utils/subset_data_dir_tr_cv.sh $tgt_dir ${tgt_dir}_tr90 ${tgt_dir}_cv10
tr90="$tr90 ${tgt_dir}_tr90"
cv10="$cv10 ${tgt_dir}_cv10"
done
# Merge the datasets,
utils/combine_data.sh $data_tr90 $tr90
utils/combine_data.sh $data_cv10 $cv10
# Validate,
utils/validate_data_dir.sh $data_tr90
utils/validate_data_dir.sh $data_cv10
fi
# Extract the tied-state numbers from transition models,
for i in $(seq 0 $((num_langs-1))); do
ali_dim[i]=$(hmm-info ${ali_dir[i]}/final.mdl | grep pdfs | awk '{ print $NF }')
done
ali_dim_csl=$(echo ${ali_dim[@]} | tr ' ' ',')
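# ('hmm-info' is assumed to print a line like 'number of pdfs 1483', so the
#  'grep pdfs | awk' pipe above picks the last field, the per-language pdf-count;
#  the count itself is illustrative)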
# Total number of DNN outputs (sum of all per-language blocks),
output_dim=$(echo ${ali_dim[@]} | tr ' ' '\n' | awk '{ sum += $1; } END{ print sum; }')
echo "Total number of DNN outputs: $output_dim = $(echo ${ali_dim[@]} | sed 's: : + :g')"
# Objective function string (per-language weights are imported from '$lang_weight_csl'),
objective_function="multitask$(echo ${ali_dim[@]} | tr ' ' '\n' | \
awk -v w=$lang_weight_csl 'BEGIN{ split(w,w_arr,/[,:]/); } { printf(",xent,%d,%s", $1, w_arr[NR]); }')"
echo "Multitask objective function: $objective_function"
# DNN training will be in $dir, the alignments are prepared beforehand,
dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')-${nnet_type}
[ ! -e $dir ] && mkdir -p $dir
echo "$lang_code_csl" >$dir/lang_code_csl
echo "$ali_dir_csl" >$dir/ali_dir_csl
echo "$data_dir_csl" >$dir/data_dir_csl
echo "$ali_dim_csl" >$dir/ali_dim_csl
echo "$objective_function" >$dir/objective_function
# Prepare the merged targets,
if [ $stage -le 1 ]; then
[ ! -e $dir/ali-post ] && mkdir -p $dir/ali-post
# re-saving the ali in posterior format, indexed by 'scp',
for i in $(seq 0 $((num_langs-1))); do
code=${lang_code[$i]}
ali=${ali_dir[$i]}
# utt suffix added by 'awk',
ali-to-pdf $ali/final.mdl "ark:gunzip -c ${ali}/ali.*.gz |" ark,t:- | awk -v c=$code '{ $1=$1"_"c; print $0; }' | \
ali-to-post ark:- ark,scp:$dir/ali-post/$code.ark,$dir/ali-post/$code.scp
done
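# (e.g. an 'rm' utterance 'spk1_utt1' is re-keyed to 'spk1_utt1_rm' by the 'awk' above,
#  matching the '--utt-suffix _$code' applied to the data-dirs earlier; ids are illustrative)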
# pasting the ali's, adding language-specific offsets to the posteriors,
featlen="ark:feat-to-len 'scp:cat $data_tr90/feats.scp $data_cv10/feats.scp |' ark,t:- |" # get number of frames for every utterance,
post_scp_list=$(echo ${lang_code[@]} | tr ' ' '\n' | awk -v d=$dir '{ printf(" scp:%s/ali-post/%s.scp", d, $1); }')
paste-post --allow-partial=true "$featlen" "${ali_dim_csl}" ${post_scp_list} \
ark,scp:$dir/ali-post/combined.ark,$dir/ali-post/combined.scp
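# ('paste-post' shifts the pdf-indices of each language by the sum of the preceding
#  block-dims, so with illustrative dims ali_dim=(1500 3000) a 'wsj' pdf-id 7 becomes
#  index 1507 in the combined 4500-dim targets, one-hot within its own block)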
fi
# Train the <BlockSoftmax> system (the network type is selected by '--nnet-type'),
if [ $stage -le 2 ]; then
case $nnet_type in
bn)
# Bottleneck network (40 dimensional bottleneck is good for fMLLR),
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --learn-rate 0.008 \
--hid-layers 2 --hid-dim 1500 --bn-dim 40 \
--cmvn-opts "--norm-means=true --norm-vars=false" \
--feat-type "traps" --splice 5 --traps-dct-basis 6 \
--labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
--proto-opts "--block-softmax-dims=${ali_dim_csl}" \
--train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
;;
sbn)
# Stacked Bottleneck Network, no fMLLR in between,
bn1_dim=80
bn2_dim=30
# Train 1st part,
dir_part1=${dir}_part1
$cuda_cmd ${dir}_part1/log/train_nnet.log \
steps/nnet/train.sh --learn-rate 0.008 \
--hid-layers 2 --hid-dim 1500 --bn-dim $bn1_dim \
--cmvn-opts "--norm-means=true --norm-vars=false" \
--feat-type "traps" --splice 5 --traps-dct-basis 6 \
--labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
--proto-opts "--block-softmax-dims=${ali_dim_csl}" \
--train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir_part1
# Compose feature_transform for 2nd part,
nnet-initialize <(echo "<Splice> <InputDim> $bn1_dim <OutputDim> $((13*bn1_dim)) <BuildVector> -10 -5:5 10 </BuildVector>") \
$dir_part1/splice_for_bottleneck.nnet
nnet-concat $dir_part1/final.feature_transform "nnet-copy --remove-last-components=4 $dir_part1/final.nnet - |" \
$dir_part1/splice_for_bottleneck.nnet $dir_part1/final.feature_transform.part1
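# ('--remove-last-components=4' is assumed to strip the components above the linear
#  bottleneck, i.e. <AffineTransform>,<Sigmoid>,<AffineTransform>,<BlockSoftmax>, so the
#  composed transform outputs $bn1_dim features, spliced at offsets -10,-5..5,10 above)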
# Train 2nd part,
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --learn-rate 0.008 \
--feature-transform $dir_part1/final.feature_transform.part1 \
--hid-layers 2 --hid-dim 1500 --bn-dim $bn2_dim \
--labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
--proto-opts "--block-softmax-dims=${ali_dim_csl}" \
--train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
;;
dnn_small)
# 4 hidden layers, 1024 sigmoid neurons,
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --learn-rate 0.008 \
--cmvn-opts "--norm-means=true --norm-vars=true" \
--delta-opts "--delta-order=2" --splice 5 \
--labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
--proto-opts "--block-softmax-dims=${ali_dim_csl}" \
--train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
;;
dnn)
# 6 hidden layers, 2048 sigmoid neurons,
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --learn-rate 0.008 \
--hid-layers 6 --hid-dim 2048 \
--cmvn-opts "--norm-means=true --norm-vars=false" \
--delta-opts "--delta-order=2" --splice 5 \
--labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
--proto-opts "--block-softmax-dims=${ali_dim_csl}" \
--train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
;;
*)
echo "Unknown --nnet-type $nnet_type"; exit 1;
;;
esac
fi
exit 0