scripts/rnnlm/train_rnnlm.sh
#!/usr/bin/env bash

# This script does the RNNLM training.  It assumes you have already run
# 'prepare_rnnlm_dir.sh' to prepare the directory.

#num-jobs-initial, num-jobs-final, max-change, embedding-max-change [initial,final?],
#num-samples, minibatch-size, chunk-length, [and the same for dev data]...
#initial-effective-learning-rate, final-effective-learning-rate, ...
#embedding-learning-rate-factor, num-epochs

stage=0
num_jobs_initial=1
num_jobs_final=1
rnnlm_max_change=0.5
embedding_max_change=0.5
chunk_length=32
num_epochs=100  # maximum number of epochs to train.  later we
                # may find a stopping criterion.
initial_effective_lrate=0.001
final_effective_lrate=0.0001
embedding_l2=0.005
embedding_lrate_factor=0.1  # the embedding learning rate is the
                            # nnet learning rate times this factor.
backstitch_training_scale=0.0  # backstitch training scale
backstitch_training_interval=1 # backstitch training interval

cmd=run.pl  # you might want to set this to queue.pl

# some options passed into rnnlm-get-egs, relating to sampling.
num_samples=512
sample_group_size=2  # see rnnlm-get-egs
num_egs_threads=10  # number of threads used for sampling, if we're using
                    # sampling.  the actual number of threads that runs at one
                    # time, will be however many is needed to balance the
                    # sampling and the actual training, this is just the maximum
                    # possible number that are allowed to run
use_gpu=true  # use GPU for training
use_gpu_for_diagnostics=false  # set true to use GPU for compute_prob_*.log

# optional cleanup options
cleanup=false  # add option --cleanup true to enable automatic cleanup of old models
cleanup_strategy="keep_latest"  # determines cleanup strategy, use either "keep_latest" or "keep_best"
cleanup_keep_iters=3  # number of iterations that will have their models retained

trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM

. utils/parse_options.sh

if [ $# != 1 ]; then
  echo "Usage: $0 [options] <rnnlm-dir>"
  echo "Trains an RNNLM, assuming the things needed for training have already been"
  echo "set up by prepare_rnnlm_dir.sh."
  exit 1
fi

dir=$1

set -e
. ./path.sh

for f in $dir/config/{words,data_weights,oov}.txt \
         $dir/text/1.txt $dir/text/dev.txt $dir/0.raw \
         $dir/text/info/num_splits $dir/text/info/num_repeats \
         $dir/special_symbol_opts.txt; do
  [ ! -f $f ] && echo "$0: expected $f to exist" && exit 1
done

# set some variables and check more files.
num_splits=$(cat $dir/text/info/num_splits)
num_repeats=$(cat $dir/text/info/num_repeats)
text_files=$(for n in $(seq $num_splits); do echo $dir/text/$n.txt; done)
vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
embedding_type=

if [ -f $dir/feat_embedding.0.mat ]; then
  sparse_features=true
  embedding_type=feat
  if [ -f $dir/word_embedding.0.mat ]; then
    echo "$0: error: $dir/feat_embedding.0.mat and $dir/word_embedding.0.mat both exist."
    exit 1;
  fi
  ! [ -f $dir/word_feats.txt ] && echo "$0: expected $dir/word_feats.txt to exist" && exit 1;
else
  sparse_features=false
  embedding_type=word
  ! [ -f $dir/word_embedding.0.mat ] && \
    echo "$0: expected $dir/word_embedding.0.mat to exist" && exit 1
fi

if [ $num_jobs_initial -gt $num_splits ] || [ $num_jobs_final -gt $num_splits ]; then
  echo -n "$0: number of initial or final jobs $num_jobs_initial/$num_jobs_final "
  echo "exceeds num-splits=$num_splits; reduce number of jobs"
  exit 1
fi

num_splits_to_process=$[($num_epochs*$num_splits)/$num_repeats]
num_splits_processed=0
num_iters=$[($num_splits_to_process*2)/($num_jobs_initial+$num_jobs_final)]
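
# A worked example of the iteration arithmetic above (purely illustrative
# numbers, not from any particular recipe): with num_epochs=10, num_splits=50
# and num_repeats=1 we get num_splits_to_process=500; with num_jobs_initial=1
# and num_jobs_final=3 the average number of parallel jobs is 2, so
# num_iters=(500*2)/(1+3)=250 iterations, each consuming about 2 data splits.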

# this string will combine options and arguments.
train_egs_args="--vocab-size=$vocab_size $(cat $dir/special_symbol_opts.txt)"

if [ -f $dir/sampling.lm ]; then
  # we are doing sampling.
  train_egs_args="$train_egs_args --num-samples=$num_samples --sample-group-size=$sample_group_size --num-threads=$num_egs_threads $dir/sampling.lm"
fi

echo "$0: will train for $num_iters iterations"

# recording some configuration information
cat >$dir/info.txt <<EOF
num_iters=$num_iters
num_epochs=$num_epochs
num_jobs_initial=$num_jobs_initial
num_jobs_final=$num_jobs_final
rnnlm_max_change=$rnnlm_max_change
embedding_max_change=$embedding_max_change
chunk_length=$chunk_length
initial_effective_lrate=$initial_effective_lrate
final_effective_lrate=$final_effective_lrate
embedding_lrate_factor=$embedding_lrate_factor
sample_group_size=$sample_group_size
num_samples=$num_samples
backstitch_training_scale=$backstitch_training_scale
backstitch_training_interval=$backstitch_training_interval
EOF

x=0
num_splits_processed=0
while [ $x -lt $num_iters ]; do

  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")

  ilr=$initial_effective_lrate; flr=$final_effective_lrate;
  np=$num_splits_processed; nt=$num_splits_to_process;
  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
  embedding_lrate=$(perl -e "print ($this_learning_rate*$embedding_lrate_factor);")

  if [ $stage -le $x ]; then

    # Set off the diagnostic job in the background.
    if $sparse_features; then
      word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.$x.mat -|"
    else
      word_embedding="$dir/word_embedding.$x.mat"
    fi

    if $use_gpu_for_diagnostics; then queue_gpu_opt="--gpu 1"; gpu_opt="--use-gpu=yes";
    else gpu_opt=''; queue_gpu_opt=''; fi

    backstitch_opt="--rnnlm.backstitch-training-scale=$backstitch_training_scale \
      --rnnlm.backstitch-training-interval=$backstitch_training_interval \
      --embedding.backstitch-training-scale=$backstitch_training_scale \
      --embedding.backstitch-training-interval=$backstitch_training_interval"

    [ -f $dir/.error ] && rm $dir/.error
    $cmd $queue_gpu_opt $dir/log/compute_prob.$x.log \
      rnnlm-get-egs $(cat $dir/special_symbol_opts.txt) \
        --vocab-size=$vocab_size $dir/text/dev.txt ark:- \| \
      rnnlm-compute-prob $gpu_opt $dir/$x.raw "$word_embedding" ark:- || touch $dir/.error &

    if [ $x -gt 0 ]; then
      $cmd $dir/log/progress.$x.log \
        nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw '&&' \
        nnet3-info $dir/$x.raw &
    fi

    echo "Training neural net (pass $x)"

    (  # this sub-shell is so that when we "wait" below,
       # we only wait for the training jobs that we just spawned,
       # not the diagnostic jobs that we spawned above.

      # We can't easily use a single parallel SGE job to do the main training,
      # because the computation of which archive and which --frame option
      # to use for each job is a little complex, so we spawn each one separately.
      [ -f $dir/.train_error ] && rm $dir/.train_error
      for n in $(seq $this_num_jobs); do
        k=$[$num_splits_processed + $n - 1];  # k is a zero-based index that we'll derive
                                              # the other indexes from.
        split=$[($k%$num_splits)+1];  # work out the 1-based split index.
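        # For example (hypothetical values): with num_splits=50,
        # num_splits_processed=120 and job n=3, we get k=122 and
        # split=(122%50)+1=23, i.e. this job trains on $dir/text/23.txt,
        # cycling through the splits as the iterations proceed.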
        src_rnnlm="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|"
        if $sparse_features; then
          sparse_opt="--read-sparse-word-features=$dir/word_feats.txt"; embedding_type=feat
        else
          sparse_opt=''; embedding_type=word
        fi
        if $use_gpu; then
          gpu_opt="--use-gpu=yes"; queue_gpu_opt="--gpu 1";
        else
          gpu_opt="--use-gpu=no"; queue_gpu_opt="";
        fi
        if [ $this_num_jobs -gt 1 ]; then dest_number=$[x+1].$n
        else dest_number=$[x+1]; fi

        # in the normal case $repeated_data will be just one copy.
        repeated_data=$(for n in $(seq $num_repeats); do echo -n $dir/text/$split.txt ''; done)

        rnnlm_l2_factor=$(perl -e "print (1.0/$this_num_jobs);")
        embedding_l2_regularize=$(perl -e "print ($embedding_l2/$this_num_jobs);")

        # allocate queue-slots for threads doing sampling,
        num_threads_=$[$num_egs_threads*2/3]
        [ -f $dir/sampling.lm ] && queue_thread_opt="--num-threads $num_threads_" || queue_thread_opt=

        # Run the training job or jobs.
        $cmd $queue_gpu_opt $queue_thread_opt $dir/log/train.$x.$n.log \
          rnnlm-train \
            --rnnlm.max-param-change=$rnnlm_max_change \
            --rnnlm.l2_regularize_factor=$rnnlm_l2_factor \
            --embedding.max-param-change=$embedding_max_change \
            --embedding.learning-rate=$embedding_lrate \
            --embedding.l2_regularize=$embedding_l2_regularize \
            $sparse_opt $gpu_opt $backstitch_opt \
            --read-rnnlm="$src_rnnlm" --write-rnnlm=$dir/$dest_number.raw \
            --read-embedding=$dir/${embedding_type}_embedding.$x.mat \
            --write-embedding=$dir/${embedding_type}_embedding.$dest_number.mat \
            "ark,bg:cat $repeated_data | rnnlm-get-egs --chunk-length=$chunk_length --srand=$num_splits_processed $train_egs_args - ark:- |" || touch $dir/.train_error &
      done
      wait  # wait for just the training jobs.
      [ -f $dir/.train_error ] && \
        echo "$0: failure on iteration $x of training, see $dir/log/train.$x.*.log for details." && exit 1

      if [ $this_num_jobs -gt 1 ]; then
        # average the models and the embedding matrices.  Use run.pl as we don't
        # want this to wait on the queue (if there is a queue).
        src_models=$(for n in $(seq $this_num_jobs); do echo $dir/$[x+1].$n.raw; done)
        src_matrices=$(for n in $(seq $this_num_jobs); do echo $dir/${embedding_type}_embedding.$[x+1].$n.mat; done)
        run.pl $dir/log/average.$[x+1].log \
          nnet3-average $src_models $dir/$[x+1].raw '&&' \
          matrix-sum --average=true $src_matrices $dir/${embedding_type}_embedding.$[x+1].mat
      fi

      # optionally, perform cleanup after training
      if [ "$cleanup" = true ]; then
        python3 rnnlm/rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
      fi
    )
    # the error message below is not that informative, but $cmd will
    # have printed a more specific one.
    [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1;
  fi
  x=$[x+1]
  num_splits_processed=$[num_splits_processed+this_num_jobs]
done

wait  # wait for diagnostic jobs in the background.

if [ $stage -le $num_iters ]; then
  # link the best model we encountered during training (based on
  # dev-set probability) as the final model.
  best_iter=$(rnnlm/get_best_model.py $dir)
  echo "$0: best iteration (out of $num_iters) was $best_iter, linking it to final iteration."
  train_best_log=$dir/log/train.$best_iter.1.log
  ppl_train=`grep 'Overall objf' $train_best_log | awk '{printf("%.1f",exp(-$10))}'`
  dev_best_log=$dir/log/compute_prob.$best_iter.log
  ppl_dev=`grep 'Overall objf' $dev_best_log | awk '{printf("%.1f",exp(-$NF))}'`
  echo "$0: train/dev perplexity was $ppl_train / $ppl_dev."
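  # The 'Overall objf' value grepped from those logs is an average
  # log-probability per word, so the awk expressions above turn it into a
  # perplexity via exp(-objf); e.g. an objf around -4.6 corresponds to a
  # perplexity of roughly 100.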
  ln -sf ${embedding_type}_embedding.$best_iter.mat $dir/${embedding_type}_embedding.final.mat
  ln -sf $best_iter.raw $dir/final.raw
fi

# Now get some diagnostics about the evolution of the objective function.
if [ $stage -le $[num_iters+1] ]; then
  (
    logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/train.$iter.1.log ''; done)
    # in the non-sampling case the exact objf is printed and we plot that;
    # in the sampling case we print the approximated objf for training.
    grep 'Overall objf' $logs | awk 'BEGIN{printf("Train objf: ")} /exact/{printf("%.2f ", $NF);next} {printf("%.2f ", $10)} END{print "";}'
    logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/compute_prob.$iter.log ''; done)
    grep 'Overall objf' $logs | awk 'BEGIN{printf("Dev objf: ")} {printf("%.2f ", $NF)} END{print "";}'
  ) > $dir/report.txt
  cat $dir/report.txt
fi
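
# Example invocation (hypothetical paths and option values; the flags map onto
# the variables defined at the top of this script via utils/parse_options.sh),
# run after prepare_rnnlm_dir.sh has set up the <rnnlm-dir>:
#
#   rnnlm/train_rnnlm.sh --num-epochs 20 --num-jobs-initial 1 \
#     --num-jobs-final 2 --cmd "queue.pl" exp/rnnlm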