#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# MMI (or boosted MMI) training (A.K.A. sequence training) of a neural net based
# system as trained by train_nnet_cpu.sh
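# Example invocation (illustrative paths only; this assumes an ML-trained
# system in exp/tri4_nnet, alignments in exp/tri4_nnet_ali and denominator
# lattices in exp/tri4_denlats -- substitute your own directories):
#
#   steps/train_nnet_cpu_mmi.sh --boost 0.1 --initial-learning-rate 0.001 \
#     data/train data/lang exp/tri4_nnet exp/tri4_nnet_ali exp/tri4_denlats \
#     exp/tri4_nnet_mmi_b0.1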
# Begin configuration section.
cmd=run.pl
epochs_per_ebw_iter=1 # Number of times we iterate over the whole
                      # data each time we do an "EBW" iteration.
num_ebw_iters=4 # Number of "EBW" iterations.
initial_learning_rate=0.001 # learning rate we start with.
learning_rate_factor=1.0 # factor by which we change the learning
                         # rate each iteration (should be <= 1.0)
E=2.0  # this is slightly analogous to the constant E used in
       # Extended Baum-Welch updates of GMMs.  It slows down (and
       # somewhat regularizes) the update.
minibatch_size=256 # since the learning rate is always quite low compared with
                   # what we have at the start of ML training, we can probably
                   # afford a somewhat higher minibatch size than there, as
                   # there is less risk of instability.
samples_per_iter=400000 # each phase of training, see this many samples
                        # per job.  Note: this is a kind of suggestion; we
                        # will actually find a number that will make the
                        # #iters per epoch a whole number.
num_jobs_nnet=8 # Number of neural net training jobs to run in parallel.
                # Not the same as the num-jobs (nj), which will be the same as the
                # alignment and denlat directories.
stage=0
sub_stage=-3 # this can be used to start from a particular sub-iteration of an
             # iteration.
acwt=0.1
boost=0.0 # boosting for BMMI (you can try 0.1); this is applied per frame.
transform_dir= # Note: by default any transforms in $alidir will be used.
parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
io_opts="-tc 10" # max 10 jobs running at one time (a lot of I/O.)
num_threads=16 # number of threads for neural net trainer.
mkl_num_threads=1
random_copy=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 6 ]; then
  echo "Usage: steps/train_nnet_cpu_mmi.sh [opts] <data> <lang> <src-dir> <ali-dir> <denlat-dir> <exp-dir>"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "Note, the terminology is: for each EBW iteration we do multiple epochs; for each epoch"
  echo " we have multiple iterations of training (not the same as the EBW iters)."
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-ebw-iters <#iters|4>                       # number of pseudo-Extended-Baum-Welch iterations (default: 4)"
  echo "  --epochs-per-ebw-iter <#epochs|1>                # number of times to see all the data per EBW iter."
  echo "  --initial-learning-rate <initial-lrate|0.001>    # learning rate to use on the first iteration"
  echo "  --learning-rate-factor <lrate-factor|1.0>        # Factor by which to change the learning rate on each"
  echo "                                                   # EBW iteration (should be <= 1.0)"
  echo "  --num-jobs-nnet <num-jobs|8>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)."
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size)."
  echo "  --parallel-opts <opts|\"-pe smp 16\">              # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads."
  echo "  --io-opts <opts|\"-tc 10\">                        # Options given to e.g. queue.pl for any especially I/O intensive jobs"
  echo "  --minibatch-size <minibatch-size|256>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, for each"
  echo "                                                   # process.  Note: this will get modified to a number that will"
  echo "                                                   # divide the data into a whole number of pieces."
  echo "  --transform-dir <dir>                            # Directory to find fMLLR transforms; if not specified,"
  echo "                                                   # $alidir will be used if it has transforms"
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --sub-stage <sub-stage|-3>                       # In conjunction with --stage, can be used to start a partially-completed"
  echo "                                                   # training process (refers to the sub-iteration number)"
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
alidir=$4 # Also used for transforms by default, if transform-dir not specified.
denlatdir=$5
dir=$6 # experimental directory

# Check that some files exist, mostly to verify correct directory arguments.
for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $alidir/ali.1.gz $denlatdir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log
cp $srcdir/tree $dir

learning_rate=$initial_learning_rate

if [ $stage -ge -1 ]; then
  $cmd $dir/log/copy_initial.log \
    nnet-am-copy --learning-rate=$learning_rate $srcdir/final.mdl $dir/0.1.mdl
fi

nnet_context_opts="--left-context=`nnet-am-info $dir/0.1.mdl 2>/dev/null | grep -w left-context | awk '{print $2}'` --right-context=`nnet-am-info $dir/0.1.mdl 2>/dev/null | grep -w right-context | awk '{print $2}'`" || exit 1;

silphonelist=`cat $lang/phones/silence.csl` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
nj2=`cat $denlatdir/num_jobs` || exit 1;  # number of jobs in denlat dir
[ "$nj" != "$nj2" ] && echo "Mismatch in #jobs $nj vs $nj2" && exit 1;

sdata=$data/split$nj
# Make sure the per-job data split exists (no-op if already split into $nj
# pieces and up to date).
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/final.mat $dir 2>/dev/null # any LDA matrix...
cp $alidir/tree $dir

## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) all_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
    feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
    ;;
  lda) all_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$data/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

if [ -z "$transform_dir" ] && [ -f "$alidir/trans.1" ]; then
  # --transform-dir option not set and $alidir has transforms in it.
  transform_dir=$alidir
fi

if [ ! -z "$transform_dir" ] && [ -f $transform_dir/trans.1 ]; then
  echo "$0: using transforms from $transform_dir"
  all_feats="$all_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
  echo "$0: not using fMLLR transforms (assuming unadapted system)"
fi
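# For reference, with feat_type=lda and fMLLR transforms present, the per-job
# rspecifier "$feats" built above expands (for a given JOB) to a pipeline of
# the form:
#   ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk \
#       scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | \
#     splice-feats $splice_opts ark:- ark:- | \
#     transform-feats $dir/final.mat ark:- ark:- | \
#     transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |
# i.e. per-speaker CMVN, frame splicing, the global final.mat transform, and
# then the speaker-specific fMLLR transforms.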
echo "$0: working out number of frames of training data"
num_frames=`feat-to-len scp:$data/feats.scp ark,t:- | awk '{x += $2;} END{print x;}'` || exit 1;

# round to closest int
iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]

echo "Every EBW iteration, splitting the data up into $iters_per_epoch iterations,"
echo "giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."

mkdir -p $dir/post $dir/egs

num_epochs=$[$num_ebw_iters*$epochs_per_ebw_iter]
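# Worked example with illustrative numbers: for about 100 hours of data
# (roughly 36,000,000 frames at 100 frames per second), with num_jobs_nnet=8
# and samples_per_iter=400000 we would get
#   iters_per_epoch       = int(36000000 / (400000 * 8) + 0.5) = 11
#   samples_per_iter_real = 36000000 / (8 * 11)                = 409090
# and with the default num_ebw_iters=4 and epochs_per_ebw_iter=1 we run
# num_epochs=4 epochs, each split into 11 sub-iterations per training job.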
x=0
while [ $x -lt $num_epochs ]; do
  z=$[$x / $epochs_per_ebw_iter]; # z is the (generally) smaller iteration number that identifies the EBW pass.
  if [ $x -eq $[$z * $epochs_per_ebw_iter] ]; then
    first_iter_of_epoch=true  # true when $x is the first epoch of EBW pass $z.
    echo "Starting pass $z of EBW"
  else
    first_iter_of_epoch=false
  fi
  echo "Epoch $x of $num_epochs"

  if [ $stage -le $x ] && $first_iter_of_epoch; then
    if [ $stage -lt $x ] || [ $sub_stage -le -3 ]; then
      # First get the per-frame posteriors, by rescoring the lattices; this
      # process also gives us at the same time the posteriors of each state for
      # each frame (by default, pruned to 0.01 with a randomized algorithm).
      # The matrix-logprob stage produces a diagnostic and passes the pseudo-log-like
      # matrix through unchanged.  (Note: nnet-logprob2-parallel can use up to
      # $num_threads threads, but in practice it may be limited by the speed of
      # the other elements of the pipe.)
      $cmd $parallel_opts JOB=1:$nj $dir/log/post.$z.JOB.log \
        nnet-logprob2-parallel --num-threads=$num_threads $dir/$x.1.mdl "$feats" \
          "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \
        matrix-logprob ark:- "ark:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $dir/$x.1.mdl ark:- ark:-|" ark:- \| \
        lattice-rescore-mapped $dir/$x.1.mdl "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark:- ark:- \| \
        lattice-boost-ali --b=$boost --silence-phones=$silphonelist $dir/$x.1.mdl ark:- "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
        post-to-pdf-post $dir/$x.1.mdl ark:- "ark:|gzip -c >$dir/post/den_post.$z.JOB.gz" || exit 1;
    fi

    if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then
      # Run nnet-get-egs for all files, to get the training examples for each frame--
      # this combines the feature and label/posterior information.  The posterior
      # information consists of three things: the numerator posteriors from the
      # alignments, the denominator posteriors from the lattices (times -1), and
      # the smoothing posteriors from the neural net log-probs (times E).
      # We copy the examples for each job round-robin to multiple archives, one for each
      # of 1...$num_jobs_nnet.
      egs_out=""
      for n in `seq 1 $num_jobs_nnet`; do
        # indexes are egs_orig.$z.$num_jobs_nnet.$nj
        egs_out="$egs_out ark:$dir/egs/egs_orig.$z.$n.JOB.ark"
      done
      $cmd JOB=1:$nj $dir/log/get_egs.$z.JOB.log \
        ali-to-pdf $dir/$x.1.mdl "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
        ali-to-post ark:- ark:- \| \
        sum-post --scale2=$E ark:- "ark:gunzip -c $dir/post/smooth_post.$z.JOB.gz|" ark:- \| \
        sum-post --scale2=-1.0 ark:- "ark:gunzip -c $dir/post/den_post.$z.JOB.gz|" ark:- \| \
        nnet-get-egs $nnet_context_opts "$feats" ark:- ark:- \| \
        nnet-copy-egs ark:- $egs_out || exit 1;
      rm $dir/post/smooth_post.$z.*.gz $dir/post/den_post.$z.*.gz
    fi
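    # To make the combination above explicit: for each frame and each pdf j,
    # the posterior weight written into the training examples is
    #   post(j) = num_post(j) + E * smooth_post(j) - den_post(j)
    # where num_post comes from the numerator alignment, den_post from the
    # (boosted) denominator lattice, and smooth_post from the net's own output;
    # the E term plays the smoothing/slowing-down role described for E in the
    # configuration section above.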
    if $first_iter_of_epoch; then
      # Diagnostics-- work out an extra term in the objf that we have to add to
      # what we get from the nnet training.
      tail -n 50 $dir/log/post.$z.*.log | \
        perl -e '$acwt = shift @ARGV; $acwt > 0.0 || die "bad acwt";
          while(<STDIN>) {
            if (m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) {
              $tot_den_lat_like += $1*$2;
              $tot_frames += $2;
            }
            if (m|matrix-logprob.+Average log-prob per frame is (\S+) over (\S+) frames|) {
              $tot_num_like += $1*$2;
              $tot_num_frames += $2;
            }
          }
          if (abs($tot_frames - $tot_num_frames) > 0.01*($tot_frames + $tot_num_frames)) {
            print STDERR "#frames differ $tot_frames vs $tot_num_frames ";
          }
          $tot_den_lat_like /= $tot_frames;
          $tot_num_like /= $tot_num_frames;
          $objf = $acwt * $tot_num_like - $tot_den_lat_like;
          print $objf." "; ' $acwt > $dir/log/objf.$z.log
      echo "Objf on EBW iter $z is `cat $dir/log/objf.$z.log`"
    fi

    if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then
      echo "Merging training examples across original #jobs ($nj), and"
      echo "splitting across number of nnet jobs $num_jobs_nnet"
      egs_out2=""
      for n in `seq 1 $iters_per_epoch`; do
        # indexes of egs_merged are: egs_merged.$z.$iters_per_epoch.$num_jobs_nnet
        egs_out2="$egs_out2 ark:$dir/egs/egs_merged.$z.$n.JOB.ark"
      done
      # Note: in the following command, JOB goes from 1 to $num_jobs_nnet, so one
      # job per parallel training job (different from the previous command).
      # We sum up over the index JOB in the previous $cmd, and write to multiple
      # archives, this time one for each "sub-iter".
      # indexes of egs_orig are: egs_orig.$z.$num_jobs_nnet.$nj
      $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/merge_and_split.$x.JOB.log \
        cat $dir/egs/egs_orig.$z.JOB.*.ark \| \
        nnet-copy-egs --random=$random_copy "--srand=\$[JOB+($x*$num_jobs_nnet)]" \
          ark:- $egs_out2 '&&' rm $dir/egs/egs_orig.$z.JOB.*.ark || exit 1;
    fi

    if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then
      echo "Randomizing order of examples in each job"
      for n in `seq 1 $iters_per_epoch`; do
        s=$[$num_jobs_nnet*($n+($iters_per_epoch*$z))] # for srand
        $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$z.$n.JOB.log \
          nnet-shuffle-egs "--srand=\$[JOB+$s]" \
            ark:$dir/egs/egs_merged.$z.$n.JOB.ark ark:$dir/egs/egs.$z.$n.JOB.ark '&&' \
            rm $dir/egs/egs_merged.$z.$n.JOB.ark || exit 1;
      done
    fi
  fi

  if [ $stage -le $x ]; then
    # This block does the $iters_per_epoch iters of training.
    y=1; # y is the "sub-iteration" number.
    while [ $y -le $iters_per_epoch ]; do
      echo "Iteration $x, sub-iteration $y"
      if [ $stage -lt $x ] || [ $sub_stage -le $y ]; then
        $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.$y.JOB.log \
          nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
            $dir/$x.$y.mdl ark:$dir/egs/egs.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \
          || exit 1;
        nnets_list=
        for n in `seq 1 $num_jobs_nnet`; do
          nnets_list="$nnets_list $dir/$x.$y.$n.mdl"
        done
        if [ $y -eq $iters_per_epoch ]; then
          next_mdl=$dir/$[$x+1].1.mdl
        else
          next_mdl=$dir/$x.$[$y+1].mdl;
        fi
        # Average the parameters of all the parallel jobs.
        $cmd $dir/log/average.$x.$y.log \
          nnet-am-average $nnets_list $next_mdl || exit 1;
        rm $nnets_list
      fi
      y=$[$y+1]
    done
  fi

  if [ $learning_rate_factor != 1.0 ]; then
    learning_rate=`perl -e "print $learning_rate * $learning_rate_factor;"`;
    ! nnet-am-copy --print-args=false --learning-rate=$learning_rate \
        $dir/$[$x+1].1.mdl $dir/$[$x+1].1.mdl && \
      echo "Error changing learning rate of neural net" && exit 1;
  fi

  x=$[$x+1]
done

rm $dir/final.mdl 2>/dev/null
ln -s $x.1.mdl $dir/final.mdl

echo Done
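# Note: $dir/final.mdl is a symlink to the model from the last EBW iteration
# ($num_epochs.1.mdl).  The averaged models from earlier EBW iterations
# ($x.1.mdl) are left in place, so you can also decode with each of them
# (using the same decoding setup as for the original train_nnet_cpu.sh system)
# and pick the best EBW iteration on a development set.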