#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# This script is for training networks with tanh nonlinearities; it starts with
# a given model and supports increasing the hidden-layer dimension. It is
# otherwise similar to train_tanh.sh
# Begin configuration section.
cmd=run.pl
num_epochs=15 # Number of epochs during which we reduce
# the learning rate; the number of iterations is worked out from this.
num_epochs_extra=5 # Number of epochs after we stop reducing
# the learning rate.
num_iters_final=20 # Maximum number of final iterations to give to the
# optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
softmax_learning_rate_factor=0.5 # Train this layer half as fast as the other layers.
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update. Note: it also
# interacts with the "preconditioned" update which generally
# works better with larger minibatch size, so it's not
# completely cost free.
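# (Illustration only, using the defaults above: minibatch_size * num_threads
# = 128 * 16 = 2048, which is right around the ~2k upper bound suggested in
# the usage message below; if you raise --num-threads you would typically
# lower --minibatch-size to compensate.)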
shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
# on each iter. You could set it to 0 or to a large value for complete
# randomization, but this would both consume memory and cause spikes in
# disk I/O. Smaller is easier on disk and memory but less random. It's
# not a huge deal though, as samples are anyway randomized right at the start.
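# (Hypothetical illustration: if an archive holds on the order of 100k samples,
# a 5000-sample buffer only re-orders them locally on each pass; that is
# generally sufficient here because the egs were already randomized when they
# were dumped.)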
stage=-5
mix_up=0 # Number of components to mix up to (if specified, should be greater than the
# number of tree leaves). Will do this at the start.
widen=0 # If specified, it will increase the hidden-layer dimension
# to this value. Will do this at the start.
bias_stddev=0.5 # will be used for widen
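# (Hypothetical example: for a tree with ~4000 leaves you might pass
# --mix-up 12000, i.e. a few times the number of leaves, and --widen 1024
# to enlarge the hidden layers, as in the usage example below.)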
num_threads=16
parallel_opts="--num-threads $num_threads" # using a smallish #threads by default, out of stability concerns.
# note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=true
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [opts] <egs-dir> <old-nnet-dir> <exp-dir>"
echo " e.g.: $0 --widen 1024 exp/tri4_nnet/egs exp/tri4_nnet exp/tri5_nnet"
echo ""
echo "Main options (for others, see top of script file)"
echo " --config <config-file> # config file containing options"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --num-epochs <#epochs|15> # Number of epochs of main training"
echo " # while reducing learning rate (determines #iterations, together"
echo " # with --samples-per-iter and --num-jobs-nnet)"
echo " --num-epochs-extra <#epochs-extra|5> # Number of extra epochs of training"
echo " # after learning rate fully reduced"
echo " --initial-learning-rate <initial-learning-rate|0.02> # Learning rate at start of training, e.g. 0.02 for small"
echo " # data, 0.01 for large data"
echo " --final-learning-rate <final-learning-rate|0.004> # Learning rate at end of training, e.g. 0.004 for small"
echo " # data, 0.001 for large data"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
echo " --num-threads <num-threads|16> # Number of parallel threads per job (will affect results"
echo " # as well as speed; may interact with batch size; if you increase"
echo " # this, you may want to decrease the batch size."
echo " --parallel-opts <opts|\"--num-threads 16\"> # extra options to pass to e.g. queue.pl for processes that"
echo " # use multiple threads."
echo " --minibatch-size <minibatch-size|128> # Size of minibatch to process (note: product with --num-threads"
echo " # should not get too large, e.g. >2k)."
echo " --num-iters-final <#iters|10> # Number of final iterations to give to nnet-combine-fast to "
echo " # interpolate parameters (the weights are learned with a validation set)"
echo " --stage <stage|-5> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
exit 1;
fi
egs_dir=$1
nnet_dir=$2
dir=$3
# Check some files.
for f in $egs_dir/egs.1.0.ark $nnet_dir/final.mdl; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` || exit 1;
iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1;
mkdir -p $dir/log
cp $nnet_dir/phones.txt $dir 2>/dev/null
cp $nnet_dir/splice_opts $dir 2>/dev/null
cp $nnet_dir/final.mat $dir 2>/dev/null # any LDA matrix...
cp $nnet_dir/tree $dir
if [ $stage -le -2 ] && [ $mix_up -gt 0 ]; then
echo Mixing up to $mix_up components
$cmd $dir/log/mix_up.log \
nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \
$nnet_dir/final.mdl $dir/0.mdl || exit 1;
else
cp $nnet_dir/final.mdl $dir/0.mdl || exit 1;
fi
if [ $stage -le -1 ] && [ $widen -gt 0 ]; then
echo "$0: Widening nnet to hidden-layer-dim=$widen"
$cmd $dir/log/widen.log \
nnet-am-widen --hidden-layer-dim=$widen $dir/0.mdl $dir/0.mdl || exit 1;
fi
num_iters_reduce=$[$num_epochs * $iters_per_epoch];
num_iters_extra=$[$num_epochs_extra * $iters_per_epoch];
num_iters=$[$num_iters_reduce+$num_iters_extra]
echo "$0: Will train for $num_epochs + $num_epochs_extra epochs, equalling "
echo "$0: $num_iters_reduce + $num_iters_extra = $num_iters iterations, "
echo "$0: (while reducing learning rate) + (with constant learning rate)."
x=0
while [ $x -lt $num_iters ]; do
if [ $x -ge 0 ] && [ $stage -le $x ]; then
# Set off jobs doing some diagnostics, in the background.
$cmd $dir/log/compute_prob_valid.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.$x.log \
nnet-compute-prob $dir/$x.mdl ark:$egs_dir/train_diagnostic.egs &
echo "Training neural net (pass $x)"
$cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x \
ark:$egs_dir/egs.JOB.$[$x%$iters_per_epoch].ark ark:- \| \
nnet-train-parallel --num-threads=$num_threads \
--minibatch-size=$minibatch_size --srand=$x $dir/$x.mdl \
ark:- $dir/$[$x+1].JOB.mdl \
|| exit 1;
nnets_list=
for n in `seq 1 $num_jobs_nnet`; do
nnets_list="$nnets_list $dir/$[$x+1].$n.mdl"
done
learning_rate=`perl -e '($x,$n,$i,$f)=@ARGV; print ($x >= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`;
softmax_learning_rate=`perl -e "print $learning_rate * $softmax_learning_rate_factor;"`;
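# (The perl one-liner above is a geometric decay: for iteration i <= num_iters_reduce,
# lr(i) = initial_learning_rate * (final_learning_rate/initial_learning_rate)^(i/num_iters_reduce),
# and lr stays at final_learning_rate afterwards.  E.g. with the defaults
# 0.04 -> 0.004 and a hypothetical num_iters_reduce=60, the rate at i=30 is
# 0.04 * 0.1^0.5 =~ 0.0126, and the softmax layer gets half of that, =~ 0.0063.)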
nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo 2>/dev/null || exit 1
nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'`
na=`cat $dir/foo | grep AffineComponent | wc -l` # index (one-based) of the last AffineComponent layer, i.e. the one before the softmax
lr_string="$learning_rate"
for n in `seq 2 $nu`; do
if [ $n -eq $na ]; then lr=$softmax_learning_rate;
else lr=$learning_rate; fi
lr_string="$lr_string:$lr"
done
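# (Hypothetical example: if nnet-am-info reports nu=5 updatable components and
# the 5th is the last AffineComponent, lr_string becomes
# "lr:lr:lr:lr:softmax_lr", e.g. "0.0126:0.0126:0.0126:0.0126:0.0063", so the
# final layer trains at half the rate of the others.)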
$cmd $dir/log/average.$x.log \
nnet-am-average $nnets_list - \| \
nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1;
rm $nnets_list
fi
x=$[$x+1]
done
# Now do combination.
# At the end, final.mdl will be a combination of the last few models (up to --num-iters-final of them).
if [ $num_iters_final -gt $num_iters_extra ]; then
echo "Setting num_iters_final=$num_iters_extra"
num_iters_final=$num_iters_extra
fi
start=$[$num_iters-$num_iters_final+1]
nnets_list=
for x in `seq $start $num_iters`; do
nnets_list="$nnets_list $dir/$x.mdl"
done
if [ $stage -le $num_iters ]; then
num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'`
mb=$[($num_egs+$num_threads-1)/$num_threads]
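# (Worked example with a hypothetical count: if combine.egs holds num_egs=4000
# examples and num_threads=16, then mb = (4000 + 16 - 1) / 16 = 250 in integer
# arithmetic, i.e. the combination set is split into roughly num_threads
# minibatches, one per thread.)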
$cmd $parallel_opts $dir/log/combine.log \
nnet-combine-fast --use-gpu=no --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \
$nnets_list ark:$egs_dir/combine.egs $dir/final.mdl || exit 1;
fi
sleep 2; # make sure final.mdl exists.
# Compute the probability of the final, combined model with
# the same subset we used for the previous compute_probs, as the
# different subsets will lead to different probs.
$cmd $dir/log/compute_prob_valid.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs &
$cmd $dir/log/compute_prob_train.final.log \
nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs &
echo Done
if $cleanup; then
echo Removing most of the models
for x in `seq 0 $num_iters`; do
if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then
# delete all but every 10th model; don't delete the ones which combine to form the final model.
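# (Hypothetical example: with num_iters=80 and num_iters_final=20 this keeps
# 0.mdl, 10.mdl, ..., 60.mdl plus 61.mdl through 80.mdl, the models that were
# combined into final.mdl above, and removes everything else.)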
rm $dir/$x.mdl
fi
done
fi