egs/wsj/s5/steps/nnet/train_scheduler.sh
#!/bin/bash

# Copyright 2012-2017 Brno University of Technology (author: Karel Vesely)
# Apache 2.0

# Schedules epochs and controls learning rate during the neural network training

# Begin configuration.

# training options,
learn_rate=0.008
momentum=0
l1_penalty=0
l2_penalty=0

# data processing,
train_tool="nnet-train-frmshuff"
train_tool_opts="--minibatch-size=256 --randomizer-size=32768 --randomizer-seed=777"
feature_transform=

split_feats= # int -> number of splits 'feats.scp -> feats.${i}.scp', starting from feats.1.scp,
             # (data are already shuffled and split to N parts),
             # empty -> no splitting,

# learn rate scheduling,
max_iters=20
min_iters=0 # keep training, disable weight rejection, start learn-rate halving as usual,
keep_lr_iters=0 # fix learning rate for N initial epochs, disable weight rejection,
dropout_schedule= # dropout-rates for N initial epochs, for example: 0.1,0.1,0.1,0.1,0.1,0.0
start_halving_impr=0.01
end_halving_impr=0.001
halving_factor=0.5

# misc,
verbose=0 # 0 No GPU time-stats, 1 with GPU time-stats (slower),
frame_weights=
utt_weights=

# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;

. parse_options.sh || exit 1;

set -euo pipefail

if [ $# != 6 ]; then
  echo "Usage: $0 <mlp-init> <feats-tr> <feats-cv> <labels-tr> <labels-cv> <exp-dir>"
  echo " e.g.: $0 0.nnet scp:train.scp scp:cv.scp ark:labels_tr.ark ark:labels_cv.ark exp/dnn1"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>  # config containing options"
  exit 1;
fi

mlp_init=$1
feats_tr=$2
feats_cv=$3
labels_tr=$4
labels_cv=$5
dir=$6

[ ! -d $dir ] && mkdir $dir
[ ! -d $dir/log ] && mkdir $dir/log
[ ! -d $dir/nnet ] && mkdir $dir/nnet

dropout_array=($(echo ${dropout_schedule} | tr ',' ' '))

# Skip training
[ -e $dir/final.nnet ] && echo "'$dir/final.nnet' exists, skipping training" && exit 0

##############################
# start training

# choose mlp to start with,
mlp_best=$mlp_init
mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*}

# optionally resume training from the best epoch, using saved learning-rate,
[ -e $dir/.mlp_best ] && mlp_best=$(cat $dir/.mlp_best)
[ -e $dir/.learn_rate ] && learn_rate=$(cat $dir/.learn_rate)

# cross-validation on original network,
log=$dir/log/iter00.initial.log; hostname>$log
$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \
  ${feature_transform:+ --feature-transform=$feature_transform} \
  ${frame_weights:+ "--frame-weights=$frame_weights"} \
  ${utt_weights:+ "--utt-weights=$utt_weights"} \
  "$feats_cv" "$labels_cv" $mlp_best \
  2>> $log

loss=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }')
loss_type=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $5; }')
echo "CROSSVAL PRERUN AVG.LOSS $(printf "%.4f" $loss) $loss_type"

# resume lr-halving,
halving=0
[ -e $dir/.halving ] && halving=$(cat $dir/.halving)

# training,
for iter in $(seq -w $max_iters); do
  echo -n "ITERATION $iter: "
  mlp_next=$dir/nnet/${mlp_base}_iter${iter}

  # skip iteration (epoch) if already done,
  [ -e $dir/.done_iter$iter ] && echo -n "skipping... " && ls $mlp_next* && continue

  # set dropout-rate from the schedule,
  # (quoted with an empty default, so the branch is skipped when no schedule is given),
  if [ -n "${dropout_array[$((${iter#0}-1))]-}" ]; then
    dropout_rate=${dropout_array[$((${iter#0}-1))]}
    nnet-copy --dropout-rate=$dropout_rate $mlp_best ${mlp_best}.dropout_rate${dropout_rate}
    mlp_best=${mlp_best}.dropout_rate${dropout_rate}
  fi

  # select the split,
  feats_tr_portion="$feats_tr" # no split?
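  # Illustration (editor's note, not original logic): with --split-feats N each epoch
  # trains on one pre-shuffled portion of the data. E.g. for N=4 and feats_tr ending
  # in 'train.scp', epoch 01 reads train.2.scp, epoch 02 reads train.3.scp, epoch 04
  # wraps around to train.1.scp (portion = 1 + epoch % N), assuming the scp files
  # follow the 'train.scp -> train.${i}.scp' naming used by the substitution below.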
if [ -n "$split_feats" ]; then portion=$((1 + iter % split_feats)) feats_tr_portion="${feats_tr/train.scp/train.${portion}.scp}" fi # training, log=$dir/log/iter${iter}.tr.log; hostname>$log $train_tool --cross-validate=false --randomize=true --verbose=$verbose $train_tool_opts \ --learn-rate=$learn_rate --momentum=$momentum \ --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \ ${feature_transform:+ --feature-transform=$feature_transform} \ ${frame_weights:+ "--frame-weights=$frame_weights"} \ ${utt_weights:+ "--utt-weights=$utt_weights"} \ "$feats_tr_portion" "$labels_tr" $mlp_best $mlp_next \ 2>> $log || exit 1; tr_loss=$(cat $dir/log/iter${iter}.tr.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "TRAIN AVG.LOSS $(printf "%.4f" $tr_loss), (lrate$(printf "%.6g" $learn_rate)), " # cross-validation, log=$dir/log/iter${iter}.cv.log; hostname>$log $train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \ ${feature_transform:+ --feature-transform=$feature_transform} \ ${frame_weights:+ "--frame-weights=$frame_weights"} \ ${utt_weights:+ "--utt-weights=$utt_weights"} \ "$feats_cv" "$labels_cv" $mlp_next \ 2>>$log || exit 1; loss_new=$(cat $dir/log/iter${iter}.cv.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "CROSSVAL AVG.LOSS $(printf "%.4f" $loss_new), " # accept or reject? loss_prev=$loss if [ 1 == $(awk "BEGIN{print($loss_new < $loss ? 1:0);}") -o $iter -le $keep_lr_iters -o $iter -le $min_iters ]; then # accepting: the loss was better, or we had fixed learn-rate, or we had fixed epoch-number, loss=$loss_new mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new) [ $iter -le $min_iters ] && mlp_best=${mlp_best}_min-iters-$min_iters [ $iter -le $keep_lr_iters ] && mlp_best=${mlp_best}_keep-lr-iters-$keep_lr_iters mv $mlp_next $mlp_best echo "nnet accepted ($(basename $mlp_best))" echo $mlp_best > $dir/.mlp_best else # rejecting, mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)_rejected mv $mlp_next $mlp_reject echo "nnet rejected ($(basename $mlp_reject))" fi # create .done file, the iteration (epoch) is completed, touch $dir/.done_iter$iter # continue with original learn-rate, [ $iter -le $keep_lr_iters ] && continue # stopping criterion, rel_impr=$(awk "BEGIN{print(($loss_prev-$loss)/$loss_prev);}") if [ 1 == $halving -a 1 == $(awk "BEGIN{print($rel_impr < $end_halving_impr ? 1:0);}") ]; then if [ $iter -le $min_iters ]; then echo we were supposed to finish, but we continue as min_iters : $min_iters continue fi echo finished, too small rel. improvement $rel_impr break fi # start learning-rate fade-out when improvement is low, if [ 1 == $(awk "BEGIN{print($rel_impr < $start_halving_impr ? 1:0);}") ]; then halving=1 echo $halving >$dir/.halving fi # reduce the learning-rate, if [ 1 == $halving ]; then learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") echo $learn_rate >$dir/.learn_rate fi done # select the best network, if [ $mlp_best != $mlp_init ]; then mlp_final=${mlp_best}_final_ ( cd $dir/nnet; ln -s $(basename $mlp_best) $(basename $mlp_final); ) ( cd $dir; ln -s nnet/$(basename $mlp_final) final.nnet; ) echo "$0: Succeeded training the Neural Network : '$dir/final.nnet'" else echo "$0: Error training neural network..." exit 1 fi |