#!/bin/bash
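# run_nnet2_perturbed.sh: trains an online-nnet2 pnorm network for the RM setup on
# perturbed copies of the training features (via steps/nnet2/get_perturbed_feats.sh),
# using online iVector adaptation, and then decodes online.  Results at the bottom of
# this file compare it against the unperturbed baseline from ./run_nnet2.sh.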
. ./cmd.sh
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_perturbed
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
  if ! cuda-compiled; then
    cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.  Otherwise, call this script with --use-gpu false.
EOF
  fi
  parallel_opts="--gpu 1"
  num_threads=1
  minibatch_size=512
else
  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
  # almost the same, but this may be a little slower.
  num_threads=16
  minibatch_size=128
  parallel_opts="--num-threads $num_threads"
fi
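# Example invocation on a machine without a GPU (the script path below is an assumption;
# adjust it to wherever this script is checked in, and run it from the egs/rm/s5 directory):
#   ./run_nnet2_perturbed.sh --use-gpu false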
if [ $stage -le 1 ]; then
  # Note: if you've already run run_online_decoding_nnet2.sh, you can
  # skip this stage.
  mkdir -p exp/nnet2_online
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
    data/train 256 exp/tri3b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 2 ]; then
  # Note: if you've already run run_online_decoding_nnet2.sh, you can
  # skip this stage.
  # Use a smaller iVector dim (50) than the default (100) because RM has a very
  # small amount of data.
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
    --ivector-dim 50 \
    data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
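# For example, if the diag-UBM and iVector extractor above already exist from
# run_online_decoding_nnet2.sh, you could resume at the feature-perturbation stage
# (again, the script path is an assumption):
#   ./run_nnet2_perturbed.sh --stage 3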
if [ $stage -le 3 ]; then
  # Dump perturbed versions of the features and store them in a
  # sub-directory of the experiment directory.
  featdir=exp/perturbed_mfcc/feats; mkdir -p $featdir
  if [ $USER == dpovey ]; then  # this shows how you can split data across multiple filesystems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$featdir $featdir/storage
  fi
  # We can afford to run 80 jobs since we have 4 separate machines for storage.
  steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc --nj 80 \
    conf/mfcc.conf "$featdir" exp/perturbed_mfcc data/train data/train_perturbed_mfcc
fi
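# Note on utils/create_split_dir.pl (added explanation; this is my understanding of the
# tool): it creates the real directories named by its initial arguments and fills the
# final "storage" directory with numbered symlinks pointing at them, so jobs that write
# under $featdir/storage/ get spread across the filesystems.  A hypothetical two-disk
# example:
#   utils/create_split_dir.pl /mnt/disk1/$USER/feats /mnt/disk2/$USER/feats $featdir/storage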
if [ $stage -le 4 ]; then
  # Align the perturbed features.
  steps/align_fmllr.sh --nj 80 --cmd "$train_cmd" \
    data/train_perturbed_mfcc data/lang exp/tri3b exp/tri3b_ali_perturbed_mfcc
fi
ivectordir=exp/nnet2_online/ivectors_perturbed_mfcc
if [ $stage -le 5 ]; then
  # Extract iVectors for the perturbed features.
  if [ $USER == dpovey ]; then  # this shows how you can split data across multiple filesystems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$ivectordir $ivectordir/storage
  fi
  # Below, setting --utts-per-spk-max to a non-integer helps to randomize the division
  # of speakers into "fake speakers" of about 2 utterances each, by randomly making
  # some have 2 and some 3 utterances; this randomness will be different in different
  # copies of the data (see the worked example after this block).
  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
    data/train_perturbed_mfcc_max2.5
  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
    data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
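# Worked example of the --utts-per-spk-max 2.5 setting above (illustrative, not part of
# the original recipe): a real speaker with 10 utterances would be split into "fake
# speakers" of 2 or 3 utterances each, e.g. 3+2+3+2 in one perturbed copy of the data and
# 2+3+3+2 in another, whereas an integer value like 2 would presumably give the same
# 2+2+2+2+2 split every time.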
if [ $stage -le 6 ]; then
  if [ $USER == dpovey ]; then  # this shows how you can split data across multiple filesystems.
    # $dir is the neural-net training dir.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$dir/egs $dir/egs/storage
  fi
  # The --max-jobs-run 15 allows more of the dump-egs jobs than the default (5), since we
  # have 4 filesystems to access.  We reduce the number of epochs since we have more data
  # and don't want to slow down the training too much, and we also reduce the final
  # learning rate (when we have a lot of data we like a ratio of 10 between the initial
  # and final learning rates).  We also set --add-layers-period 2, which is typical when
  # we have enough data, and increase the number of hidden layers and pnorm dimensions
  # vs. run_online_decoding_nnet2.sh since we have more data.
  steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
    --splice-width 7 \
    --feat-type raw \
    --online-ivector-dir exp/nnet2_online/ivectors_perturbed_mfcc \
    --cmvn-opts "--norm-means=false --norm-vars=false" \
    --num-threads "$num_threads" \
    --minibatch-size "$minibatch_size" \
    --parallel-opts "$parallel_opts" \
    --io-opts "--max-jobs-run 15" \
    --num-jobs-nnet 4 \
    --num-epochs 5 --num-epochs-extra 2 \
    --add-layers-period 2 \
    --num-hidden-layers 3 \
    --mix-up 4000 \
    --initial-learning-rate 0.02 --final-learning-rate 0.002 \
    --cmd "$decode_cmd" \
    --pnorm-input-dim 1200 \
    --pnorm-output-dim 200 \
    data/train_perturbed_mfcc data/lang exp/tri3b_ali_perturbed_mfcc $dir || exit 1;
fi
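# Sanity check on the learning-rate schedule mentioned above: 0.02 / 0.002 = 10, i.e. the
# initial-to-final learning-rate ratio is exactly the factor of 10 we aim for when there
# is a lot of data.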
# This time we don't bother testing with offline decoding, only with online.
if [ $stage -le 7 ]; then
  # If this setup used PLP features, we'd have to give the option --feature-type plp
  # to the script below.
  steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
    "$dir" ${dir}_online || exit 1;
fi
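# For a PLP-based setup the call above would, as the comment notes, need --feature-type
# plp, i.e. something along the lines of:
#   steps/online/nnet2/prepare_online_decoding.sh --feature-type plp data/lang \
#     exp/nnet2_online/extractor "$dir" ${dir}_online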
if [ $stage -le 8 ]; then
  # Do the actual online decoding with iVectors.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph data/test ${dir}_online/decode &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
  wait
fi
if [ $stage -le 9 ]; then
  # This version of the decoding treats each utterance separately,
  # without carrying forward speaker information.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph data/test ${dir}_online/decode_per_utt &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_per_utt || exit 1;
  wait
fi
exit 0;
# Results of the experiment (with GPU).
# Per-speaker decoding (carrying adaptation info forward):
#for x in exp/nnet2_online/nnet_gpu_perturbed_online/decode*; do grep WER $x/wer_* | utils/best_wer.sh ; done
#%WER 1.62 [ 203 / 12533, 20 ins, 41 del, 142 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode/wer_5
#%WER 8.97 [ 1124 / 12533, 87 ins, 204 del, 833 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug/wer_11
# Note: this is the baseline with no feature perturbation, from ./run_nnet2.sh
# (with a different hidden-layer configuration, though):
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
# Per-utterance decoding:
#%WER 1.85 [ 232 / 12533, 23 ins, 45 del, 164 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_per_utt/wer_5
#%WER 9.17 [ 1149 / 12533, 118 ins, 174 del, 857 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug_per_utt/wer_9
# This is the per-utterance baseline with no feature perturbation, from ./run_nnet2.sh
# (with a different hidden-layer configuration, though):
#%WER 2.21 [ 277 / 12533, 45 ins, 48 del, 184 sub ] exp/nnet2_online/nnet_gpu_online/decode_per_utt/wer_4
#%WER 10.27 [ 1287 / 12533, 142 ins, 186 del, 959 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug_per_utt/wer_10
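# Summary of the comparison above (arithmetic added here): per-speaker decoding improves
# from %WER 2.20 to 1.62 (about 26% relative) on the main graph and from 10.14 to 8.97
# (about 12% relative) on the unigram graph; per-utterance decoding improves from 2.21
# to 1.85 and from 10.27 to 9.17.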