  #!/usr/bin/env bash
  
  # scripts/rnnlm/train_rnnlm.sh
  #
  # This script does the RNNLM training.  It assumes you have already run
  # 'prepare_rnnlm_dir.sh' to prepare the directory.
  
  
  #num-jobs-initial, num-jobs-final, max-change, embedding-max-change [initial,final?],
  #num-samples, minibatch-size, chunk-length, [and the same for dev data]...
  #initial-effective-learning-rate, final-effective-learning-rate, ...
  #embedding-learning-rate-factor, num-epochs
  
  
  stage=0
  num_jobs_initial=1
  num_jobs_final=1
  rnnlm_max_change=0.5
  embedding_max_change=0.5
  chunk_length=32
  num_epochs=100  # maximum number of epochs to train.  later we
                  # may find a stopping criterion.
  initial_effective_lrate=0.001
  final_effective_lrate=0.0001
  embedding_l2=0.005
  embedding_lrate_factor=0.1  # the embedding learning rate is the
                              # nnet learning rate times this factor.
  backstitch_training_scale=0.0    # backstitch training scale
  backstitch_training_interval=1   # backstitch training interval
  cmd=run.pl  # you might want to set this to queue.pl
  
  # some options passed into rnnlm-get-egs, relating to sampling.
  num_samples=512
  sample_group_size=2  # see rnnlm-get-egs
  num_egs_threads=10  # number of threads used for sampling, if we're using
                      # sampling.  The actual number of threads running at any
                      # one time will be whatever is needed to balance the
                      # sampling against the actual training; this is just the
                      # maximum number allowed to run.
  use_gpu=true  # use GPU for training
  use_gpu_for_diagnostics=false  # set true to use GPU for compute_prob_*.log
  
  # optional cleanup options
  cleanup=false  # add option --cleanup true to enable automatic cleanup of old models
  cleanup_strategy="keep_latest"  # determines cleanup strategy, use either "keep_latest" or "keep_best"
  cleanup_keep_iters=3  # number of iterations that will have their models retained
  
  trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
  . utils/parse_options.sh
  
  if [ $# != 1 ]; then
    echo "Usage: $0 [options] <rnnlm-dir>"
    echo "Trains an RNNLM, assuming the things needed for training have already been"
    echo "set up by prepare_rnnlm_dir.sh."
    exit 1
  fi
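  # As a purely hypothetical example (the directory name and option values are
  # illustrative, not recommendations), an invocation might look like:
  #
  #   rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \
  #     --num-epochs 20 --cmd "queue.pl" exp/rnnlm_tdnn
  #
  # utils/parse_options.sh maps a command-line option such as --num-jobs-final
  # onto the shell variable num_jobs_final defined above.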
  
  
  dir=$1
  
  
  set -e
  . ./path.sh
  
  
  for f in $dir/config/{words,data_weights,oov}.txt \
                $dir/text/1.txt $dir/text/dev.txt $dir/0.raw \
                $dir/text/info/num_splits $dir/text/info/num_repeats \
                $dir/special_symbol_opts.txt; do
    [ ! -f $f ] && echo "$0: expected $f to exist" && exit 1
  done
  
  # set some variables and check more files.
  num_splits=$(cat $dir/text/info/num_splits)
  num_repeats=$(cat $dir/text/info/num_repeats)
  text_files=$(for n in $(seq $num_splits); do echo $dir/text/$n.txt; done)
  vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
  embedding_type=
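  # To illustrate the vocab_size computation above: words.txt is assumed to be
  # a standard Kaldi symbol table with one "<word> <integer-id>" pair per line
  # and ids starting from 0, so if (hypothetically) its last line were
  #   zyzzyva 9999
  # then vocab_size would be 9999 + 1 = 10000.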
  
  if [ -f $dir/feat_embedding.0.mat ]; then
    sparse_features=true
    embedding_type=feat
    if [ -f $dir/word_embedding.0.mat ]; then
      echo "$0: error: $dir/feat_embedding.0.mat and $dir/word_embedding.0.mat both exist."
      exit 1;
    fi
    ! [ -f $dir/word_feats.txt ] && echo "$0: expected $dir/word_feats.txt to exist" && exit 1;
  else
    sparse_features=false
    embedding_type=word
    ! [ -f $dir/word_embedding.0.mat ] && \
      echo "$0: expected $dir/word_embedding.0.mat to exist" && exit 1
  fi
  
  if [ $num_jobs_initial -gt $num_splits ] || [ $num_jobs_final -gt $num_splits ]; then
    echo -n "$0: number of initial or final jobs $num_jobs_initial/$num_jobs_final"
    echo "exceeds num-splits=$num_splits; reduce number of jobs"
    exit 1
  fi
  
  num_splits_to_process=$[($num_epochs*$num_splits)/$num_repeats]
  num_iters=$[($num_splits_to_process*2)/($num_jobs_initial+$num_jobs_final)]
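  # As a hypothetical worked example of the two quantities above: with
  # num_epochs=100, num_splits=20 and num_repeats=1 we would process
  # 100*20/1 = 2000 splits in total, and with num_jobs_initial=num_jobs_final=1
  # (one split per iteration on average) that gives
  # num_iters = 2000*2/(1+1) = 2000.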
  
  
  # this string will combine options and arguments.
  train_egs_args="--vocab-size=$vocab_size $(cat $dir/special_symbol_opts.txt)"
  if [ -f $dir/sampling.lm ]; then
    # we are doing sampling.
    train_egs_args="$train_egs_args --num-samples=$num_samples --sample-group-size=$sample_group_size --num-threads=$num_egs_threads $dir/sampling.lm"
  fi
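  # For illustration only (all values below are hypothetical, and the special
  # symbol options are whatever prepare_rnnlm_dir.sh wrote to
  # special_symbol_opts.txt), in the sampling case train_egs_args might end up
  # looking something like:
  #   --vocab-size=10000 <special-symbol-opts> --num-samples=512 \
  #     --sample-group-size=2 --num-threads=10 exp/rnnlm/sampling.lm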
  
  echo "$0: will train for $num_iters iterations"
  
  # recording some configuration information
  cat >$dir/info.txt <<EOF
  num_iters=$num_iters
  num_epochs=$num_epochs
  num_jobs_initial=$num_jobs_initial
  num_jobs_final=$num_jobs_final
  rnnlm_max_change=$rnnlm_max_change
  embedding_max_change=$embedding_max_change
  chunk_length=$chunk_length
  initial_effective_lrate=$initial_effective_lrate
  final_effective_lrate=$final_effective_lrate
  embedding_lrate_factor=$embedding_lrate_factor
  sample_group_size=$sample_group_size
  num_samples=$num_samples
  backstitch_training_scale=$backstitch_training_scale
  backstitch_training_interval=$backstitch_training_interval
  EOF
  
  
  x=0
  num_splits_processed=0
  while [ $x -lt $num_iters ]; do
  
    this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")
    ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_splits_processed; nt=$num_splits_to_process;
    this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
    embedding_lrate=$(perl -e "print ($this_learning_rate*$embedding_lrate_factor);")
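    # To spell out the schedule above: the learning rate decays geometrically
    # from initial_effective_lrate to final_effective_lrate as a function of
    # the fraction p = num_splits_processed / num_splits_to_process, i.e.
    #   lrate = initial_effective_lrate * exp(p * log(final/initial)) * num_jobs
    # (with the last iteration pinned to final_effective_lrate); it is
    # multiplied by the number of jobs so the *effective* learning rate per
    # sample stays as configured.  As a hypothetical example, with the default
    # rates 0.001 and 0.0001, p=0.5 and a single job, this gives
    # 0.001 * sqrt(0.1), i.e. about 3.2e-4.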
  
    if [ $stage -le $x ]; then
  
      # Set off the diagnostic job in the background.
      if $sparse_features; then
        word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.$x.mat -|"
      else
        word_embedding="$dir/word_embedding.$x.mat"
      fi
      if $use_gpu_for_diagnostics; then queue_gpu_opt="--gpu 1"; gpu_opt="--use-gpu=yes";
      else gpu_opt=''; queue_gpu_opt=''; fi
      backstitch_opt="--rnnlm.backstitch-training-scale=$backstitch_training_scale \
        --rnnlm.backstitch-training-interval=$backstitch_training_interval \
        --embedding.backstitch-training-scale=$backstitch_training_scale \
        --embedding.backstitch-training-interval=$backstitch_training_interval"
      [ -f $dir/.error ] && rm $dir/.error
      $cmd $queue_gpu_opt $dir/log/compute_prob.$x.log \
         rnnlm-get-egs $(cat $dir/special_symbol_opts.txt) \
                       --vocab-size=$vocab_size $dir/text/dev.txt ark:- \| \
         rnnlm-compute-prob $gpu_opt $dir/$x.raw "$word_embedding" ark:- || touch $dir/.error &
  
      if [ $x -gt 0 ]; then
        $cmd $dir/log/progress.$x.log \
          nnet3-show-progress --use-gpu=no $dir/$[$x-1].raw $dir/$x.raw '&&' \
            nnet3-info $dir/$x.raw &
      fi
  
      echo "Training neural net (pass $x)"
  
  
      ( # this sub-shell is so that when we "wait" below,
        # we only wait for the training jobs that we just spawned,
        # not the diagnostic jobs that we spawned above.
  
        # We can't easily use a single parallel SGE job to do the main training,
        # because working out which data split and which options each job should
        # use is a little complex, so we spawn each job separately.
        [ -f $dir/.train_error ] && rm $dir/.train_error
        for n in $(seq $this_num_jobs); do
          k=$[$num_splits_processed + $n - 1]; # k is a zero-based index that we'll derive
                                                 # the other indexes from.
          split=$[($k%$num_splits)+1]; # work out the 1-based split index.
  
          src_rnnlm="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|"
          if $sparse_features; then
            sparse_opt="--read-sparse-word-features=$dir/word_feats.txt";
            embedding_type=feat
          else
            sparse_opt=''; embedding_type=word
          fi
          if $use_gpu; then gpu_opt="--use-gpu=yes"; queue_gpu_opt="--gpu 1";
          else gpu_opt="--use-gpu=no"; queue_gpu_opt=""; fi
          if [ $this_num_jobs -gt 1 ]; then dest_number=$[x+1].$n
          else dest_number=$[x+1]; fi
          # in the normal case $repeated_data will be just one copy of the split.
          repeated_data=$(for n in $(seq $num_repeats); do echo -n $dir/text/$split.txt ''; done)
  
          rnnlm_l2_factor=$(perl -e "print (1.0/$this_num_jobs);")
          embedding_l2_regularize=$(perl -e "print ($embedding_l2/$this_num_jobs);")
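          # (The division by $this_num_jobs above is because the models from the
          # parallel jobs get averaged afterwards; scaling each job's l2 term
          # down keeps the overall amount of regularization roughly independent
          # of the number of jobs.)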
  
          # allocate queue slots for the threads used for sampling.
          num_threads_=$[$num_egs_threads*2/3]
          [ -f $dir/sampling.lm ] && queue_thread_opt="--num-threads $num_threads_" || queue_thread_opt=
  
          # Run the training job or jobs.
          $cmd $queue_gpu_opt $queue_thread_opt $dir/log/train.$x.$n.log \
             rnnlm-train \
               --rnnlm.max-param-change=$rnnlm_max_change \
               --rnnlm.l2_regularize_factor=$rnnlm_l2_factor \
               --embedding.max-param-change=$embedding_max_change \
               --embedding.learning-rate=$embedding_lrate \
               --embedding.l2_regularize=$embedding_l2_regularize \
               $sparse_opt $gpu_opt $backstitch_opt \
               --read-rnnlm="$src_rnnlm" --write-rnnlm=$dir/$dest_number.raw \
               --read-embedding=$dir/${embedding_type}_embedding.$x.mat \
               --write-embedding=$dir/${embedding_type}_embedding.$dest_number.mat \
               "ark,bg:cat $repeated_data | rnnlm-get-egs --chunk-length=$chunk_length --srand=$num_splits_processed $train_egs_args - ark:- |" || touch $dir/.train_error &
        done
        wait # wait for just the training jobs.
        [ -f $dir/.train_error ] && \
          echo "$0: failure on iteration $x of training, see $dir/log/train.$x.*.log for details." && exit 1
        if [ $this_num_jobs -gt 1 ]; then
          # average the models and the embedding matrices.  Use run.pl as we don't
          # want this to wait on the queue (if there is a queue).
          src_models=$(for n in $(seq $this_num_jobs); do echo $dir/$[x+1].$n.raw; done)
          src_matrices=$(for n in $(seq $this_num_jobs); do echo $dir/${embedding_type}_embedding.$[x+1].$n.mat; done)
          run.pl $dir/log/average.$[x+1].log \
            nnet3-average $src_models $dir/$[x+1].raw '&&' \
            matrix-sum --average=true $src_matrices $dir/${embedding_type}_embedding.$[x+1].mat
        fi
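        # (This is plain data-parallel training with model averaging: each job
        # starts from the same model, trains on its own split, and the
        # parameters and embeddings are averaged with equal weight to give the
        # next iteration's model.)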
        # optionally, perform cleanup after training
        if [ "$cleanup" = true ] ; then
          python3 rnnlm/rnnlm_cleanup.py $dir --$cleanup_strategy --iters_to_keep $cleanup_keep_iters
        fi
      )
      # the error message below is not that informative, but $cmd will
      # have printed a more specific one.
      [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1;
    fi
  
    x=$[x+1]
    num_splits_processed=$[num_splits_processed+this_num_jobs]
  done
  
  wait # wait for diagnostic jobs in the background.
  
  if [ $stage -le $num_iters ]; then
    # link the best model we encountered during training (based on
    # dev-set probability) as the final model.
    best_iter=$(rnnlm/get_best_model.py $dir)
    echo "$0: best iteration (out of $num_iters) was $best_iter, linking it to final iteration."
    train_best_log=$dir/log/train.$best_iter.1.log
    ppl_train=$(grep 'Overall objf' $train_best_log | awk '{printf("%.1f",exp(-$10))}')
    dev_best_log=$dir/log/compute_prob.$best_iter.log
    ppl_dev=$(grep 'Overall objf' $dev_best_log | awk '{printf("%.1f",exp(-$NF))}')
    echo "$0: train/dev perplexity was $ppl_train / $ppl_dev."
    ln -sf ${embedding_type}_embedding.$best_iter.mat $dir/${embedding_type}_embedding.final.mat
    ln -sf $best_iter.raw $dir/final.raw
  fi
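  # (The perplexities above are exp(-objf), where objf is the average per-word
  # log-probability taken from the logs; as a hypothetical example, an
  # objective of -4.6 nats per word corresponds to a perplexity of about
  # exp(4.6), i.e. roughly 99.5.)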
  
  # Now get some diagnostics about the evolution of the objective function.
  if [ $stage -le $[num_iters+1] ]; then
    (
      logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/train.$iter.1.log ''; done)
      # in the non-sampling case the exact objf is printed and we plot that
      # in the sampling case we print the approximated objf for training.
      grep 'Overall objf' $logs | awk 'BEGIN{printf("Train objf: ")} /exact/{printf("%.2f ", $NF);next} {printf("%.2f ", $10)} END{print "";}'
      logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/compute_prob.$iter.log ''; done)
      grep 'Overall objf' $logs | awk 'BEGIN{printf("Dev objf:   ")} {printf("%.2f ", $NF)} END{print "";}'
    ) > $dir/report.txt
    cat $dir/report.txt
  fi
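  # For reference, report.txt produced above contains two lines in the format
  # written by the awk commands, e.g. (with purely hypothetical numbers):
  #   Train objf: -5.91 -5.42 -5.10 -4.95 ...
  #   Dev objf:   -6.02 -5.61 -5.35 -5.24 ...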