#!/bin/bash
# Copyright 2014 Vimal Manohar
# This is the online neural net build for the Gale system.
. ./cmd.sh
stage=-1
train_stage=-10
use_gpu=true
mfccdir=mfcc
train_nj=120
decode_nj=30
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
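
# All of the variables above (stage, train_stage, use_gpu, etc.) can be
# overridden on the command line via utils/parse_options.sh.  For example, to
# resume from the iVector-extraction stage on a machine without a GPU you
# could run something like the following (a hypothetical invocation; adjust
# the path to wherever this script actually lives in your setup):
#   local/online/run_nnet2.sh --stage 5 --use-gpu false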
if $use_gpu; then
  if ! cuda-compiled; then
    cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.  Otherwise, call this script with --use-gpu false.
EOF
  fi
  parallel_opts="--gpu 1"
  num_threads=1
  minibatch_size=512
  # the _a is in case I want to change the parameters.
  dir=exp/nnet2_online/nnet_a_gpu
else
  # On CPU we train multi-threaded with a smaller minibatch; the results
  # should be almost the same as the GPU build, but this may be a little slow.
  num_threads=16
  minibatch_size=128
  parallel_opts="--num-threads $num_threads"
  dir=exp/nnet2_online/nnet_a
fi
if [ $stage -le 0 ]; then
  # This shows how you can split data across multiple file-systems; we'll
  # split the MFCC dir across multiple locations.  Be careful here: if you
  # have multiple copies of Kaldi checked out and run the same recipe, don't
  # let them overwrite each other.
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
    date=$(date +'%m_%d_%H_%M')
    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/gale-$date/s5/$mfccdir/storage $mfccdir/storage || exit 1
  fi
  utils/copy_data_dir.sh data/train data/train_hires || exit 1
  steps/make_mfcc_pitch_online.sh --nj $train_nj --mfcc-config conf/mfcc_hires.conf \
    --cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
  steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
fi
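
# Note on utils/create_split_dir.pl (used above and again in later stages):
# it creates the listed real directories (here on /export/b01..b04) and turns
# the final argument (e.g. $mfccdir/storage) into a directory of numbered
# symlinks pointing to them, so feature-extraction jobs spread their output
# across several file-systems.  If you don't have the CLSP grid, a local
# variant might look like the following (hypothetical paths):
#   utils/create_split_dir.pl /data/disk{1,2}/$USER/gale-mfcc/storage $mfccdir/storage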
if [ $stage -le 1 ]; then
  # We'll train the iVector extractor on features with just MFCCs, no pitch.
  # Check that we're using 40-dim MFCCs so that the select_feats command line
  # below is correct.
  ! grep 'num-ceps=40' conf/mfcc_hires.conf >/dev/null && \
    echo "Change the script if you change conf/mfcc_hires.conf" && exit 1;
  steps/select_feats.sh --nj 5 --cmd "$train_cmd" 0-39 data/train_hires \
    data/train_hires_mfcconly exp/nnet2_online/select_hires_train $mfccdir || exit 1
  steps/compute_cmvn_stats.sh data/train_hires_mfcconly exp/nnet2_online/select_hires_train $mfccdir || exit 1
  # Make a subset of about 1/3 of the data.
  utils/subset_data_dir.sh data/train_hires_mfcconly 100000 \
    data/train_hires_mfcconly_100k || exit 1
  # Make a corresponding subset of the normal-dimensional-MFCC training data.
  utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/train_hires_mfcconly_100k/utt2spk) \
    data/train data/train_100k || exit 1
fi
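
# In the select_feats step above, dimensions 0-39 are the 40 high-resolution
# MFCCs; the remaining dimensions produced by make_mfcc_pitch_online.sh
# (normally 3 online-pitch features, an assumption based on the default pitch
# setup) are dropped, since the iVector extractor is trained on MFCCs only.
# If in doubt, check the feature dimension with something like:
#   feat-to-dim scp:data/train_hires/feats.scp -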
if [ $stage -le 2 ]; then
  # We need to build a small system just because we need the LDA+MLLT transform
  # to train the diag-UBM on top of.  First align the 100k subset using the
  # tri3b system and the normal MFCC features, so that we have alignments on
  # which to build a system using the hires MFCCs.
  steps/align_fmllr.sh --nj $train_nj --cmd "$train_cmd" \
    data/train_100k data/lang exp/tri3b exp/tri3b_ali_100k || exit 1;
  # Build a small LDA+MLLT system on top of the hires MFCC features, just
  # because we need the transform.  We use --num-iters 13 because after we get
  # the transform (the 12th iteration is the last to estimate it), any further
  # training is pointless.
  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 --realign-iters "" \
    --splice-opts "--left-context=3 --right-context=3" \
    5000 10000 data/train_hires_mfcconly_100k data/lang exp/tri3b_ali_100k exp/nnet2_online/tri4a || exit 1
fi
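
# For reference, the positional arguments "5000 10000" to train_lda_mllt.sh
# are the number of tree leaves (context-dependent states) and the total
# number of Gaussians; they are kept small because this model only exists to
# provide the LDA+MLLT transform, not to be decoded with.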
if [ $stage -le 3 ]; then
  # Train a diagonal UBM.  The input directory exp/nnet2_online/tri4a is only
  # needed for the splice-opts and the LDA+MLLT transform.
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj $train_nj --num-frames 400000 \
    data/train_hires_mfcconly_100k 512 exp/nnet2_online/tri4a exp/nnet2_online/diag_ubm || exit 1
fi
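
# The "512" above is the number of Gaussians in the diagonal UBM; this UBM is
# used to compute the frame posteriors on which the iVector extractor is
# trained in the next stage.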
if [ $stage -le 4 ]; then
  # Train an iVector extractor on all the mfcconly data.  Note: although we
  # use only 10 jobs, each job uses 16 processes in total.
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
    data/train_hires_mfcconly exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
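
# The "16 processes per job" figure comes from the script's own internal
# parallelization (by default roughly --num-processes 4 times --num-threads 4;
# this is an assumption, so check steps/online/nnet2/train_ivector_extractor.sh
# if your queue limits matter).  The iVector dimension is left at the script's
# default (typically 100).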
if [ $stage -le 5 ]; then
  # Extract iVectors for the training data.
  ivectordir=exp/nnet2_online/ivectors_train
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then # this shows how you can split across multiple file-systems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/gale/s5/$ivectordir/storage $ivectordir/storage || exit 1
  fi
  # Having a larger number of speakers is helpful for generalization, and for
  # handling per-utterance decoding well (the iVector estimate starts at zero
  # for each new speaker).
  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires_mfcconly data/train_hires_mfcconly_max2 || exit 1
  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $train_nj \
    data/train_hires_mfcconly_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
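
# The copy_data_dir.sh step above splits each real speaker into pseudo-speakers
# of at most 2 utterances, so many of the training iVectors are estimated from
# very little data, which matches the per-utterance decoding condition.
# extract_ivectors_online.sh writes its output to $ivectordir as
# ivector_online.scp, with one iVector every --ivector-period frames (10 by
# default, an assumption; check the script if this matters for you).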
if [ $stage -le 6 ]; then
  # This shows how you can split the egs across multiple file-systems.
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/gale/s5/$dir/egs/storage $dir/egs/storage || exit 1
  fi
  # Because we have a lot of data here and we don't want the training to take
  # too long, we reduce the number of epochs from the default (15) to 8.
  # The option --io-opts "--max-jobs-run 12" allows more than the default
  # number (5) of jobs to dump the egs to disk at once; this is OK since we're
  # splitting our data across four filesystems for speed.
  steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
    --num-epochs 8 \
    --samples-per-iter 400000 \
    --splice-width 7 --feat-type raw \
    --online-ivector-dir exp/nnet2_online/ivectors_train \
    --cmvn-opts "--norm-means=false --norm-vars=false" \
    --num-threads "$num_threads" \
    --minibatch-size "$minibatch_size" \
    --parallel-opts "$parallel_opts" \
    --io-opts "--max-jobs-run 12" \
    --num-jobs-nnet 6 \
    --num-hidden-layers 4 \
    --mix-up 12000 \
    --initial-learning-rate 0.06 --final-learning-rate 0.006 \
    --cmd "$decode_cmd" \
    --pnorm-input-dim 3000 \
    --pnorm-output-dim 300 \
    data/train_hires data/lang exp/tri3b $dir || exit 1;
fi
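
# A few notes on the training options above: --pnorm-input-dim 3000 with
# --pnorm-output-dim 300 gives p-norm groups of size 10 (3000/300);
# --feat-type raw with --cmvn-opts "--norm-means=false --norm-vars=false"
# means the raw MFCC+pitch features are used without cepstral mean or variance
# normalization (normally 43 dims = 40 MFCC + 3 pitch, an assumption based on
# the configs used in stage 0); and --online-ivector-dir appends the iVectors
# from stage 5 to the network input.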
if [ $stage -le 7 ]; then
  steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
    --add-pitch true data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi
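
# prepare_online_decoding.sh bundles the acoustic model, the iVector extractor
# and the feature configuration into ${dir}_online, so the online decoding
# binaries can recompute all features (including pitch, hence --add-pitch true)
# directly from the waveform.  The exact set of config files it writes (e.g.
# conf/online_nnet2_decoding.conf) may vary with your Kaldi version.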
if [ $stage -le 8 ]; then
  # Do the actual online decoding with iVectors, carrying info forward from
  # previous utterances of the same speaker.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj $decode_nj \
    exp/tri3b/graph data/test ${dir}_online/decode_test || exit 1;
fi
if [ $stage -le 9 ]; then
  # This version of the decoding treats each utterance separately,
  # without carrying forward speaker information.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj $decode_nj \
    --per-utt true \
    exp/tri3b/graph data/test ${dir}_online/decode_test_utt || exit 1;
fi
if [ $stage -le 10 ]; then
  # This version of the decoding treats each utterance separately, without
  # carrying forward speaker information, but it looks to the end of the
  # utterance while computing the iVector (--online false).
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj $decode_nj \
    --per-utt true --online false \
    exp/tri3b/graph data/test ${dir}_online/decode_test_utt_offline || exit 1;
fi
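
# To compare the three decoding setups after scoring, you could run something
# like the following (assuming the standard Kaldi scoring has produced wer_*
# files in each decode directory):
#   for d in ${dir}_online/decode_test*; do
#     grep WER $d/wer_* | utils/best_wer.sh
#   done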
exit 0;