train_scheduler.sh
#!/bin/bash
# Copyright 2012-2017 Brno University of Technology (author: Karel Vesely)
# Apache 2.0
# Schedules epochs and controls the learning rate during neural-network training.
# Begin configuration.
# training options,
learn_rate=0.008
momentum=0
l1_penalty=0
l2_penalty=0
# data processing,
train_tool="nnet-train-frmshuff"
train_tool_opts="--minibatch-size=256 --randomizer-size=32768 --randomizer-seed=777"
feature_transform=
split_feats= # int -> number of splits: 'train.scp' in <feats-tr> is replaced by 'train.${i}.scp',
# starting from train.1.scp (the data are already shuffled and split into N parts),
# empty -> no splitting,
# learn rate scheduling,
max_iters=20
min_iters=0 # train for at least N epochs: keep training, disable weight rejection, start learn-rate halving as usual,
keep_lr_iters=0 # fix learning rate for N initial epochs, disable weight rejection,
dropout_schedule= # dropout-rates for N initial epochs, for example: 0.1,0.1,0.1,0.1,0.1,0.0
start_halving_impr=0.01
end_halving_impr=0.001
halving_factor=0.5
# misc,
verbose=0 # 0 -> no GPU time-stats, 1 -> with GPU time-stats (slower),
frame_weights=
utt_weights=
# End configuration.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
set -euo pipefail
if [ $# != 6 ]; then
echo "Usage: $0 <mlp-init> <feats-tr> <feats-cv> <labels-tr> <labels-cv> <exp-dir>"
echo " e.g.: $0 0.nnet scp:train.scp scp:cv.scp ark:labels_tr.ark ark:labels_cv.ark exp/dnn1"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
exit 1;
fi
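# Example invocation (hypothetical paths; options can also be overridden on the
# command line, assuming the usual Kaldi parse_options.sh behaviour):
#   steps/nnet/train_scheduler.sh --learn-rate 0.004 --max-iters 25 \
#     0.nnet "scp:train.scp" "scp:cv.scp" "ark:labels_tr.ark" "ark:labels_cv.ark" exp/dnn1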
mlp_init=$1
feats_tr=$2
feats_cv=$3
labels_tr=$4
labels_cv=$5
dir=$6
[ ! -d $dir ] && mkdir $dir
[ ! -d $dir/log ] && mkdir $dir/log
[ ! -d $dir/nnet ] && mkdir $dir/nnet
dropout_array=($(echo ${dropout_schedule} | tr ',' ' '))
# Skip training
[ -e $dir/final.nnet ] && echo "'$dir/final.nnet' exists, skipping training" && exit 0
##############################
# start training
# choose mlp to start with,
mlp_best=$mlp_init
mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}
# optionally resume training from the best epoch, using saved learning-rate,
[ -e $dir/.mlp_best ] && mlp_best=$(cat $dir/.mlp_best)
[ -e $dir/.learn_rate ] && learn_rate=$(cat $dir/.learn_rate)
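# Checkpoint files written during training: '.mlp_best' stores the path of the last
# accepted network, '.learn_rate' the current learning rate, '.halving' whether
# rate-halving is active, and '.done_iterNN' marks completed epochs.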
# cross-validation on original network,
log=$dir/log/iter00.initial.log; hostname>$log
$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \
${feature_transform:+ --feature-transform=$feature_transform} \
${frame_weights:+ "--frame-weights=$frame_weights"} \
${utt_weights:+ "--utt-weights=$utt_weights"} \
"$feats_cv" "$labels_cv" $mlp_best \
2>> $log
loss=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }')
loss_type=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $5; }')
echo "CROSSVAL PRERUN AVG.LOSS $(printf "%.4f" $loss) $loss_type"
# resume lr-halving,
halving=0
[ -e $dir/.halving ] && halving=$(cat $dir/.halving)
# training,
for iter in $(seq -w $max_iters); do
echo -n "ITERATION $iter: "
mlp_next=$dir/nnet/${mlp_base}_iter${iter}
# skip iteration (epoch) if already done,
[ -e $dir/.done_iter$iter ] && echo -n "skipping... " && ls $mlp_next* && continue
# set dropout-rate from the schedule,
if [ -n "${dropout_array[$((${iter#0}-1))]-}" ]; then
dropout_rate=${dropout_array[$((${iter#0}-1))]}
nnet-copy --dropout-rate=$dropout_rate $mlp_best ${mlp_best}.dropout_rate${dropout_rate}
mlp_best=${mlp_best}.dropout_rate${dropout_rate}
fi
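# Example: dropout_schedule=0.1,0.1,0.0 applies dropout-rate 0.1 in epochs 1-2,
# resets it to 0.0 in epoch 3, and leaves the network untouched from epoch 4 on.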
# select the split,
feats_tr_portion="$feats_tr" # default: use the whole training set (no splitting),
if [ -n "$split_feats" ]; then
portion=$((1 + ${iter#0} % split_feats)) # strip zero-padding from 'seq -w' before arithmetic,
feats_tr_portion="${feats_tr/train.scp/train.${portion}.scp}"
fi
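# Example: with split_feats=4, epoch 6 selects portion 1 + (6 % 4) = 3, i.e. the
# 'train.scp' inside "$feats_tr" is rewritten to 'train.3.scp'.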
# training,
log=$dir/log/iter${iter}.tr.log; hostname>$log
$train_tool --cross-validate=false --randomize=true --verbose=$verbose $train_tool_opts \
--learn-rate=$learn_rate --momentum=$momentum \
--l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \
${feature_transform:+ --feature-transform=$feature_transform} \
${frame_weights:+ "--frame-weights=$frame_weights"} \
${utt_weights:+ "--utt-weights=$utt_weights"} \
"$feats_tr_portion" "$labels_tr" $mlp_best $mlp_next \
2>> $log || exit 1;
tr_loss=$(cat $dir/log/iter${iter}.tr.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }')
echo -n "TRAIN AVG.LOSS $(printf "%.4f" $tr_loss), (lrate$(printf "%.6g" $learn_rate)), "
# cross-validation,
log=$dir/log/iter${iter}.cv.log; hostname>$log
$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \
${feature_transform:+ --feature-transform=$feature_transform} \
${frame_weights:+ "--frame-weights=$frame_weights"} \
${utt_weights:+ "--utt-weights=$utt_weights"} \
"$feats_cv" "$labels_cv" $mlp_next \
2>>$log || exit 1;
loss_new=$(cat $dir/log/iter${iter}.cv.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }')
echo -n "CROSSVAL AVG.LOSS $(printf "%.4f" $loss_new), "
# accept or reject?
loss_prev=$loss
if [ 1 == $(awk "BEGIN{print($loss_new < $loss ? 1:0);}") -o $iter -le $keep_lr_iters -o $iter -le $min_iters ]; then
# accepting: the loss was better, or we had fixed learn-rate, or we had fixed epoch-number,
loss=$loss_new
mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)
[ $iter -le $min_iters ] && mlp_best=${mlp_best}_min-iters-$min_iters
[ $iter -le $keep_lr_iters ] && mlp_best=${mlp_best}_keep-lr-iters-$keep_lr_iters
mv $mlp_next $mlp_best
echo "nnet accepted ($(basename $mlp_best))"
echo $mlp_best > $dir/.mlp_best
else
# rejecting,
mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)_rejected
mv $mlp_next $mlp_reject
echo "nnet rejected ($(basename $mlp_reject))"
fi
# create .done file, the iteration (epoch) is completed,
touch $dir/.done_iter$iter
# continue with original learn-rate,
[ $iter -le $keep_lr_iters ] && continue
# stopping criterion,
rel_impr=$(awk "BEGIN{print(($loss_prev-$loss)/$loss_prev);}")
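# Example with the defaults: loss_prev=1.800 and loss=1.785 give
# rel_impr = 0.015/1.800 ~ 0.0083, which is below start_halving_impr=0.01
# (halving starts) but above end_halving_impr=0.001 (training continues).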
if [ 1 == $halving -a 1 == $(awk "BEGIN{print($rel_impr < $end_halving_impr ? 1:0);}") ]; then
if [ $iter -le $min_iters ]; then
echo "we were supposed to finish, but we keep training until min_iters=$min_iters is reached"
continue
fi
echo "finished, too small relative improvement $rel_impr"
break
fi
# start learning-rate fade-out when improvement is low,
if [ 1 == $(awk "BEGIN{print($rel_impr < $start_halving_impr ? 1:0);}") ]; then
halving=1
echo $halving >$dir/.halving
fi
# reduce the learning-rate,
if [ 1 == $halving ]; then
learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}")
echo $learn_rate >$dir/.learn_rate
fi
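# With the defaults (learn_rate=0.008, halving_factor=0.5) the rate decays
# 0.008 -> 0.004 -> 0.002 -> ... in each epoch once halving is active.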
done
# select the best network,
if [ $mlp_best != $mlp_init ]; then
mlp_final=${mlp_best}_final_
( cd $dir/nnet; ln -s $(basename $mlp_best) $(basename $mlp_final); )
( cd $dir; ln -s nnet/$(basename $mlp_final) final.nnet; )
echo "$0: Succeeded training the Neural Network : '$dir/final.nnet'"
else
echo "$0: Error training neural network..."
exit 1
fi