#!/bin/bash
# note: see the newer, better script run_nnet2_wsj_joint.sh
# This script assumes you have previously run the WSJ example script including
# the optional part local/online/run_online_decoding_nnet2.sh. It builds a
# neural net for online decoding on top of the network we previously trained on
# WSJ, by keeping everything but the last layer of that network and then
# training just the last layer on our data; after that we continue training the whole network.
stage=0
set -e
train_stage=-10
use_gpu=true
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
  if ! cuda-compiled; then
    cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.  Otherwise, call this script with --use-gpu false
EOF
  fi
  parallel_opts="--gpu 1"
  num_threads=1
  minibatch_size=512
  dir=exp/nnet2_online_wsj/nnet_a
  trainfeats=exp/nnet2_online_wsj/wsj_activations_train
  # later we'll change the script to download the trained model from kaldi-asr.org.
  srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
else
  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
  # almost the same, but this may be a little bit slow.
  num_threads=16
  minibatch_size=128
  parallel_opts="--num-threads $num_threads"
  dir=exp/nnet2_online_wsj/nnet_a
  trainfeats=exp/nnet2_online_wsj/wsj_activations_train
  srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
fi
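# Stage 0 below dumps, for each frame of our training data, the activations at
# the last hidden layer of the pre-trained WSJ model; these serve as the input
# features for training a new final layer in stage 1.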
if [ $stage -le 0 ]; then
  echo "$0: dumping activations from WSJ model"
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $trainfeats/feats/storage ]; then
    # this shows how you can split the data across multiple file-systems; it's optional.
    date=$(date +'%m_%d_%H_%M')
    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$date/s5/$trainfeats/feats/storage \
      $trainfeats/feats/storage
  fi
  steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
    data/train $srcdir $trainfeats
fi
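# Stage 1 trains a model with no hidden layers (just a final affine + softmax
# layer) on top of the dumped WSJ activations, supervised by the existing
# tri3b alignments (--mix-up 4000 should add mixture components at the output
# layer, roughly analogous to GMM mixing-up).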
if [ $stage -le 1 ]; then
  echo "$0: training 0-hidden-layer model on top of WSJ activations"
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
  fi
  steps/nnet2/retrain_fast.sh --stage $train_stage \
    --num-threads "$num_threads" \
    --minibatch-size "$minibatch_size" \
    --parallel-opts "$parallel_opts" \
    --cmd "$decode_cmd" \
    --num-jobs-nnet 4 \
    --mix-up 4000 \
    --initial-learning-rate 0.02 --final-learning-rate 0.004 \
    $trainfeats/data data/lang exp/tri3b_ali $dir
fi
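# Stage 2 stitches the WSJ model's hidden layers together with the newly
# trained final layer and sets up the configuration needed for online decoding;
# the result goes in ${dir}_online.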
if [ $stage -le 2 ]; then
  echo "$0: formatting combined model for online decoding."
  steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
fi
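# Stages 3 and 4 decode with the model prepared above.  The default decoding
# carries adaptation state (the iVector estimate) across the utterances of each
# speaker, while --per-utt true treats every utterance independently; both
# variants are reported in the results at the bottom of this script.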
if [ $stage -le 3 ]; then
  # do online decoding with the combined model.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph data/test ${dir}_online/decode &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
  wait
fi
if [ $stage -le 4 ]; then
  # do online per-utterance decoding with the combined model.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph data/test ${dir}_online/decode_utt &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_utt || exit 1;
  wait
fi
## From this point on we try something else: we try training all the layers of
## the model on this dataset. First we need to create a combined version of the
## model.
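# Stage 5 uses create_appended_model.sh to append the last layer trained in
# stage 1 onto the original WSJ network, giving a single model whose layers can
# all be updated, and then resets its learning rate with nnet-am-copy.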
if [ $stage -le 5 ]; then
  steps/nnet2/create_appended_model.sh $srcdir $dir ${dir}_combined_init
  # Set the learning rate in this initial model to our guess of a suitable value.
  # note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
  # (1.32, 1.38, 7.20, 7.44) with a learning rate of 0.01.
  initial_learning_rate=0.01
  nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
fi
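# (Optional) To verify the learning rates that were just set, something like
# the following should work, since nnet-am-info prints per-component learning
# rates:
#   nnet-am-info ${dir}_combined_init/final.mdl | grep -i learning
# Stage 6 then dumps training examples ("egs") for the combined model: frames
# of online-style features (including iVectors) paired with the tri3b alignments.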
if [ $stage -le 6 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${dir}_combined/egs/storage ]; then
    utils/create_split_dir.pl \
      /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/${dir}_combined/egs/storage \
      ${dir}_combined/egs/storage
  fi
  # This version of the get_egs.sh script does the feature extraction and iVector
  # extraction in a single binary, reading the config, as part of the script.
  steps/online/nnet2/get_egs.sh --cmd "$train_cmd" --num-jobs-nnet 4 \
    data/train exp/tri3b_ali ${dir}_online ${dir}_combined
fi
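# Stage 7 continues training all layers of the combined model on the egs dumped
# above; the --learning-rate-factor 0.1 option is presumably there to let the
# learning rate decay during this continued training.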
if [ $stage -le 7 ]; then
  steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
    --num-threads "$num_threads" \
    --minibatch-size "$minibatch_size" \
    --parallel-opts "$parallel_opts" \
    ${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
fi
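# Stage 8 builds an online-decoding directory for the retrained combined model,
# reusing the iVector extractor from the original WSJ source directory
# ($srcdir/ivector_extractor), since the input features must match what the
# network was trained on.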
if [ $stage -le 8 ]; then
  # Create an online-decoding dir corresponding to what we just trained above.
  # If this setup used PLP features, we'd have to give the option --feature-type plp
  # to the script below.
  steps/online/nnet2/prepare_online_decoding.sh data/lang $srcdir/ivector_extractor \
    ${dir}_combined ${dir}_combined_online || exit 1;
fi
if [ $stage -le 9 ]; then
  # do the online decoding on top of the retrained _combined_online model, and
  # also the per-utterance version of the online decoding.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph data/test ${dir}_combined_online/decode &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true exp/tri3b/graph data/test ${dir}_combined_online/decode_per_utt &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true exp/tri3b/graph_ug data/test ${dir}_combined_online/decode_ug_per_utt || exit 1;
  wait
fi
exit 0;
# Here are the results when we just retrain the last layer:
# grep WER exp/nnet2_online_wsj/nnet_a_online/decode/wer_* | utils/best_wer.sh
#%WER 1.60 [ 201 / 12533, 22 ins, 46 del, 133 sub ] exp/nnet2_online_wsj/nnet_a_online/decode/wer_3
#a11:s5: grep WER exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 8.02 [ 1005 / 12533, 74 ins, 155 del, 776 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug/wer_6
# and with per-utterance decoding:
# %WER 8.47 [ 1061 / 12533, 88 ins, 157 del, 816 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_ug_utt/wer_6
# %WER 1.70 [ 213 / 12533, 24 ins, 46 del, 143 sub ] exp/nnet2_online_wsj/nnet_a_online/decode_utt/wer_3
# and here are the results when we retrain the whole thing:
#%WER 1.42 [ 178 / 12533, 16 ins, 44 del, 118 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode/wer_4
#%WER 7.08 [ 887 / 12533, 74 ins, 133 del, 680 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug/wer_6
# and the same with per-utterance decoding:
# %WER 1.56 [ 196 / 12533, 31 ins, 26 del, 139 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_per_utt/wer_2
# %WER 7.86 [ 985 / 12533, 59 ins, 171 del, 755 sub ] exp/nnet2_online_wsj/nnet_a_combined_online/decode_ug_per_utt/wer_8
# And this is a suitable baseline: a system trained on RM only.
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode/wer_* | utils/best_wer.sh
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_a_online/decode/wer_8
#a11:s5: grep WER exp/nnet2_online/nnet_a_online/decode_ug/wer_* | utils/best_wer.sh
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_a_online/decode_ug/wer_11