#!/usr/bin/env bash
# This script does the RNNLM training. It assumes you have already run
# 'prepare_rnnlm_dir.sh' to prepare the directory.
# The main configurable options are: num-jobs-initial, num-jobs-final,
# max-change and embedding-max-change, num-samples, chunk-length,
# initial-effective-learning-rate, final-effective-learning-rate,
# embedding-learning-rate-factor and num-epochs; see the configuration
# variables below.
stage=0
num_jobs_initial=1
num_jobs_final=1
rnnlm_max_change=0.5
embedding_max_change=0.5
chunk_length=32
num_epochs=100 # maximum number of epochs to train for; later we
# may add an early-stopping criterion.
initial_effective_lrate=0.001
final_effective_lrate=0.0001
embedding_l2=0.005
embedding_lrate_factor=0.1 # the embedding learning rate is the
# nnet learning rate times this factor.
backstitch_training_scale=0.0 # backstitch training scale
backstitch_training_interval=1 # backstitch training interval
cmd=run.pl # you might want to set this to queue.pl
# some options passed into rnnlm-get-egs, relating to sampling.
num_samples=512
sample_group_size=2 # see rnnlm-get-egs
num_egs_threads=10 # number of threads used for sampling, if we're using
# sampling. The actual number of threads running at any one time will be
# whatever is needed to keep the sampling in balance with the training;
# this is just the maximum number allowed to run.
use_gpu=true # use GPU for training
use_gpu_for_diagnostics=false # set true to use GPU for compute_prob_*.log
# optional cleanup options
cleanup=false # add option --cleanup true to enable automatic cleanup of old models
cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best"
cleanup_keep_iters=3 # number of iterations that will have their models retained
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
. utils/parse_options.sh
if [ $# != 1 ]; then
echo "Usage: $0 [options] <rnnlm-dir>"
echo "Trains an RNNLM, assuming the things needed for training have already been"
echo "set up by prepare_rnnlm_dir.sh."
exit 1
fi
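# Illustrative invocation (the directory name and option values here are only
# examples, assuming the script lives under rnnlm/ as in typical Kaldi setups):
#   rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
#     --num-epochs 20 --cmd "queue.pl" exp/rnnlm_lstm_1a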
dir=$1
set -e
. ./path.sh
for f in $dir/config/{words,data_weights,oov}.txt \
$dir/text/1.txt $dir/text/dev.txt $dir/0.raw \
$dir/text/info/num_splits $dir/text/info/num_repeats \
$dir/special_symbol_opts.txt; do
[ ! -f $f ] && echo "$0: expected $f to exist" && exit 1
done
# set some variables and check more files.
num_splits=$(cat $dir/text/info/num_splits)
num_repeats=$(cat $dir/text/info/num_repeats)
text_files=$(for n in $(seq $num_splits); do echo $dir/text/$n.txt; done)
vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
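# Worked example (hypothetical entry): if the last line of
# $dir/config/words.txt were "<brk> 152217", the highest word-id would be
# 152217 and vocab_size would be set to 152218, since word-ids are zero-based.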
embedding_type=
if [ -f $dir/feat_embedding.0.mat ]; then
sparse_features=true
embedding_type=feat
if [ -f $dir/word_embedding.0.mat ]; then
echo "$0: error: $dir/feat_embedding.0.mat and $dir/word_embedding.0.mat both exist."
exit 1;
fi
! [ -f $dir/word_feats.txt ] && echo "$0: expected $dir/word_feats.txt to exist" && exit 1;
else
sparse_features=false
embedding_type=word
! [ -f $dir/word_embedding.0.mat ] && \
echo "$0: expected $dir/word_embedding.0.mat to exist" && exit 1
fi
if [ $num_jobs_initial -gt $num_splits ] || [ $num_jobs_final -gt $num_splits ]; then
echo -n "$0: number of initial or final jobs $num_jobs_initial/$num_jobs_final"
echo "exceeds num-splits=$num_splits; reduce number of jobs"
exit 1
fi
num_splits_to_process=$[($num_epochs*$num_splits)/$num_repeats]
num_splits_processed=0
num_iters=$[($num_splits_to_process*2)/($num_jobs_initial+$num_jobs_final)]
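# Worked example with hypothetical values: if num_splits=50, num_repeats=1,
# num_epochs=10, num_jobs_initial=1 and num_jobs_final=3, then
# num_splits_to_process = (10*50)/1 = 500; each iteration consumes one split
# per job, i.e. on average (1+3)/2 = 2 splits, so num_iters = (500*2)/(1+3) = 250.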
# this string will combine options and arguments.
train_egs_args="--vocab-size=$vocab_size $(cat $dir/special_symbol_opts.txt)"
if [ -f $dir/sampling.lm ]; then
# we are doing sampling.
train_egs_args="$train_egs_args --num-samples=$num_samples --sample-group-size=$sample_group_size --num-threads=$num_egs_threads $dir/sampling.lm"
fi
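# Note on sampling: if $dir/sampling.lm exists, the egs will contain sampling
# information and training uses importance sampling on the output layer, i.e.
# the normalization term is estimated from roughly $num_samples sampled words
# rather than over the whole vocabulary, which is what makes large vocabularies
# affordable; see rnnlm-get-egs for the exact meaning of --num-samples and
# --sample-group-size.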
echo "$0: will train for $num_iters iterations"
# recording some configuration information
cat >$dir/info.txt <<EOF
num_iters=$num_iters
num_epochs=$num_epochs
num_jobs_initial=$num_jobs_initial
num_jobs_final=$num_jobs_final
rnnlm_max_change=$rnnlm_max_change
embedding_max_change=$embedding_max_change
chunk_length=$chunk_length
initial_effective_lrate=$initial_effective_lrate
final_effective_lrate=$final_effective_lrate
embedding_lrate_factor=$embedding_lrate_factor
sample_group_size=$sample_group_size
num_samples=$num_samples
backstitch_training_scale=$backstitch_training_scale
backstitch_training_interval=$backstitch_training_interval
EOF
x=0
num_splits_processed=0
while [ $x -lt $num_iters ]; do
this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")
ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_splits_processed; nt=$num_splits_to_process;
this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
embedding_lrate=$(perl -e "print ($this_learning_rate*$embedding_lrate_factor);")
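# Note on the lines above: the learning rate decays exponentially from
# initial_effective_lrate to final_effective_lrate as a function of how much
# data has been processed, and is multiplied by this_num_jobs to compensate for
# the model averaging over parallel jobs.  Worked example with the default
# rates: halfway through (np/nt = 0.5) the effective rate is
# 0.001 * exp(0.5 * ln(0.1)) ~= 0.000316, and with this_num_jobs=2 the value
# passed to nnet3-copy below would be about 0.00063.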
if [ $stage -le $x ]; then
# Set off the diagnostic job in the background.
if $sparse_features; then
word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.$x.mat -|"
else
word_embedding="$dir/word_embedding.$x.mat"
fi
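# Note on the sparse-features case above: there is no stored word-embedding
# matrix, so rnnlm-get-word-embedding builds one on the fly as the sparse
# word-feature matrix (word_feats.txt) times the feature embedding
# (feat_embedding.$x.mat); the trailing "-|" makes this a pipe that
# rnnlm-compute-prob reads as its word-embedding argument.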
if $use_gpu_for_diagnostics; then queue_gpu_opt="--gpu 1"; gpu_opt="--use-gpu=yes";
else gpu_opt=''; queue_gpu_opt=''; fi
backstitch_opt="--rnnlm.backstitch-training-scale=$backstitch_training_scale \
--rnnlm.backstitch-training-interval=$backstitch_training_interval \
--embedding.backstitch-training-scale=$backstitch_training_scale \
--embedding.backstitch-training-interval=$backstitch_training_interval"
[ -f $dir/.error ] && rm $dir/.error
$cmd $queue_gpu_opt $dir/log/compute_prob.$x.log \
rnnlm-get-egs $(cat $dir/special_symbol_opts.txt) \
--vocab-size=$vocab_size $dir/text/dev.txt ark:- \| \
rnnlm-compute-prob $gpu_opt $dir/$x.raw "$word_embedding" ark:- || touch $dir/.error &
if [ $x -gt 0 ]; then
$cmd $dir/log/progress.$x.log \
nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw '&&' \
nnet3-info $dir/$x.raw &
fi
echo "Training neural net (pass $x)"
( # this sub-shell is so that when we "wait" below,
# we only wait for the training jobs that we just spawned,
# not the diagnostic jobs that we spawned above.
# We can't easily use a single parallel SGE job to do the main training,
# because the computation of which archive and which --frame option
# to use for each job is a little complex, so we spawn each one separately.
[ -f $dir/.train_error ] && rm $dir/.train_error
for n in $(seq $this_num_jobs); do
k=$[$num_splits_processed + $n - 1]; # k is a zero-based index that we'll derive
# the other indexes from.
split=$[($k%$num_splits)+1]; # work out the 1-based split index.
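# Worked example with hypothetical values: if num_splits=5 and this job has
# k=7, then split=(7%5)+1=3, i.e. this job trains on $dir/text/3.txt.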
src_rnnlm="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|"
if $sparse_features; then
sparse_opt="--read-sparse-word-features=$dir/word_feats.txt";
embedding_type=feat
else
sparse_opt=''; embedding_type=word
fi
if $use_gpu; then gpu_opt="--use-gpu=yes"; queue_gpu_opt="--gpu 1";
else gpu_opt="--use-gpu=no"; queue_gpu_opt=""; fi
if [ $this_num_jobs -gt 1 ]; then dest_number=$[x+1].$n
else dest_number=$[x+1]; fi
# in the normal case $repeated_data will be just one copy.
repeated_data=$(for n in $(seq $num_repeats); do echo -n $dir/text/$split.txt ''; done)
rnnlm_l2_factor=$(perl -e "print (1.0/$this_num_jobs);")
embedding_l2_regularize=$(perl -e "print ($embedding_l2/$this_num_jobs);")
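# The l2 terms are divided by the number of parallel jobs because the models
# and embeddings from the jobs are averaged afterwards, so the total amount of
# l2 regularization applied per iteration stays roughly constant regardless of
# this_num_jobs.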
# if we're doing sampling, allocate extra queue slots for the sampling threads
# (we request roughly 2/3 of num_egs_threads).
num_threads_=$[$num_egs_threads*2/3]
[ -f $dir/sampling.lm ] && queue_thread_opt="--num-threads $num_threads_" || queue_thread_opt=
# Run the training job or jobs.
$cmd $queue_gpu_opt $queue_thread_opt $dir/log/train.$x.$n.log \
rnnlm-train \
--rnnlm.max-param-change=$rnnlm_max_change \
--rnnlm.l2-regularize-factor=$rnnlm_l2_factor \
--embedding.max-param-change=$embedding_max_change \
--embedding.learning-rate=$embedding_lrate \
--embedding.l2-regularize=$embedding_l2_regularize \
$sparse_opt $gpu_opt $backstitch_opt \
--read-rnnlm="$src_rnnlm" --write-rnnlm=$dir/$dest_number.raw \
--read-embedding=$dir/${embedding_type}_embedding.$x.mat \
--write-embedding=$dir/${embedding_type}_embedding.$dest_number.mat \
"ark,bg:cat $repeated_data | rnnlm-get-egs --chunk-length=$chunk_length --srand=$num_splits_processed $train_egs_args - ark:- |" || touch $dir/.train_error &
done
wait # wait for just the training jobs.
[ -f $dir/.train_error ] && \
echo "$0: failure on iteration $x of training, see $dir/log/train.$x.*.log for details." && exit 1
if [ $this_num_jobs -gt 1 ]; then
# average the models and the embedding matrices. Use run.pl as we don't
# want this to wait on the queue (if there is a queue).
src_models=$(for n in $(seq $this_num_jobs); do echo $dir/$[x+1].$n.raw; done)
src_matrices=$(for n in $(seq $this_num_jobs); do echo $dir/${embedding_type}_embedding.$[x+1].$n.mat; done)
run.pl $dir/log/average.$[x+1].log \
nnet3-average $src_models $dir/$[x+1].raw '&&' \
matrix-sum --average=true $src_matrices $dir/${embedding_type}_embedding.$[x+1].mat
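# The per-job models are combined by simple parameter averaging
# (nnet3-average for the network, matrix-sum --average=true for the embedding);
# together with the learning-rate scaling by the number of jobs above, this is
# the usual Kaldi parallel-SGD-with-model-averaging scheme.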
fi
# optionally, perform cleanup after training
if [ "$cleanup" = true ] ; then
python3 rnnlm/rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
fi
)
# the error message below is not that informative, but $cmd will
# have printed a more specific one.
[ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1;
fi
x=$[x+1]
num_splits_processed=$[num_splits_processed+this_num_jobs]
done
wait # wait for diagnostic jobs in the background.
if [ $stage -le $num_iters ]; then
# link the best model we encountered during training (based on
# dev-set probability) as the final model.
best_iter=$(rnnlm/get_best_model.py $dir)
echo "$0: best iteration (out of $num_iters) was $best_iter, linking it to final iteration."
train_best_log=$dir/log/train.$best_iter.1.log
ppl_train=$(grep 'Overall objf' $train_best_log | awk '{printf("%.1f",exp(-$10))}')
dev_best_log=$dir/log/compute_prob.$best_iter.log
ppl_dev=$(grep 'Overall objf' $dev_best_log | awk '{printf("%.1f",exp(-$NF))}')
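# The 'Overall objf' in these logs is the average log-probability per word, so
# perplexity is exp(-objf); e.g. an objf of -4.61 corresponds to a perplexity
# of about 100, since exp(4.61) ~= 100.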
echo "$0: train/dev perplexity was $ppl_train / $ppl_dev."
ln -sf ${embedding_type}_embedding.$best_iter.mat $dir/${embedding_type}_embedding.final.mat
ln -sf $best_iter.raw $dir/final.raw
fi
# Now get some diagnostics about the evolution of the objective function.
if [ $stage -le $[num_iters+1] ]; then
(
logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/train.$iter.1.log ''; done)
# in the non-sampling case the exact objf is printed and we plot that
# in the sampling case we print the approximated objf for training.
grep 'Overall objf' $logs | awk 'BEGIN{printf("Train objf: ")} /exact/{printf("%.2f ", $NF);next} {printf("%.2f ", $10)} END{print "";}'
logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/compute_prob.$iter.log ''; done)
grep 'Overall objf' $logs | awk 'BEGIN{printf("Dev objf: ")} {printf("%.2f ", $NF)} END{print "";}'
) > $dir/report.txt
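# report.txt will contain two lines, e.g. (illustrative numbers only):
#   Train objf: -5.94 -5.23 -4.87 ...
#   Dev objf: -6.02 -5.41 -5.10 ...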
cat $dir/report.txt
fi