  #!/bin/bash
  
  # Copyright 2012 Johns Hopkins University (Author: Daniel Povey).  
  #           2015 David Snyder
  # Apache 2.0.
  #
  # This script is based on get_lda.sh in ../../steps/nnet2/, but has been
  # modified for language recognition purposes to use sliding-window CMN.
  #
  # This script, which will generally be called from other neural-net training
  # scripts, computes the LDA-like feature transform that is applied to the
  # spliced features when training the neural net.
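  #
  # A typical invocation from a higher-level training script might look like
  # this (arguments are illustrative, mirroring the usage example below):
  #   lid/nnet2/get_lda.sh --cmd "$train_cmd" --splice-width 4 \
  #     data/train data/lang exp/tri3_ali exp/tri4_nnet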
  
  # Begin configuration section.
  cmd=run.pl
  
  feat_type=
  stage=0
  splice_width=4 # meaning +- 4 frames on each side for second LDA
  left_context= # left context for second LDA
  right_context= # right context for second LDA
  rand_prune=4.0 # Relates to a speedup we do for LDA.
  within_class_factor=0.0001 # This affects the scaling of the transform rows...
                             # sorry for no explanation, you'll have to see the code.
  transform_dir=     # If supplied, overrides alidir
  num_feats=10000 # maximum number of feature files to use.  Beyond a certain point it just
                  # gets silly to use more data.
  lda_dim=  # This defaults to no dimension reduction.
  online_ivector_dir=
  ivector_randomize_prob=0.0 # if >0.0, randomizes iVectors during training with
                             # this prob per iVector.
  ivector_dir=
  
  echo "$0 $@"  # Print the command line for logging
  
  if [ -f path.sh ]; then . ./path.sh; fi
  . parse_options.sh || exit 1;
  
  
  if [ $# != 4 ]; then
    echo "Usage: lid/nnet2/get_lda.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
    echo " e.g.: lid/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
    echo " As well as extracting the examples, this script will also do the LDA computation,"
    echo " if --est-lda=true (default:true)"
    echo ""
    echo "Main options (for others, see top of script file)"
    echo "  --config <config-file>                           # config file containing options"
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
    echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
    echo "                                                   # (note: we splice processed, typically 40-dimensional frames"
    echo "  --left-context <width;4>                         # Number of frames on left side to append for feature input, overrides splice-width"
    echo "  --right-context <width;4>                        # Number of frames on right side to append for feature input, overrides splice-width"
    echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
    echo "                                                   # the middle."
    echo "  --online-vector-dir <dir|none>                   # Directory produced by"
    echo "                                                   # steps/online/nnet2/extract_ivectors_online.sh"
    exit 1;
  fi
  
  data=$1
  lang=$2
  alidir=$3
  dir=$4
  
  [ -z "$left_context" ] && left_context=$splice_width
  [ -z "$right_context" ] && right_context=$splice_width
  
  [ ! -z "$online_ivector_dir" ] && \
    extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
  
  # Check some files.
  for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
    [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  done
  
  
  # Set some variables.
  oov=`cat $lang/oov.int`
  num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
  silphonelist=`cat $lang/phones/silence.csl` || exit 1;
  
  nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
  # in this dir we'll have just one job.
  sdata=$data/split$nj
  utils/split_data.sh $data $nj
  
  mkdir -p $dir/log
  echo $nj > $dir/num_jobs
  cp $alidir/tree $dir
  
  [ -z "$transform_dir" ] && transform_dir=$alidir
  
  ## Set up features.  Note: these are different from the normal features
  ## because we have one rspecifier that has the features for the entire
  ## training set, not separate ones for each batch.
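  # Auto-detect the feature type if not given: a final.mat without raw-fMLLR
  # transforms implies LDA(+MLLT) features; otherwise we assume raw features.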
  if [ -z "$feat_type" ]; then
    if [ -f $alidir/final.mat ] && ! [ -f $alidir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
  fi
  echo "$0: feature type is $feat_type"
  
  
  # If we have more than $num_feats feature files (default: 10k),
  # we use a random subset.  This won't affect the transform much, and will
  # spare us an unnecessary pass over the data.  Probably 10k is
  # way too much, but for small datasets this phase is quite fast.
  N=$((num_feats/nj))
  
  case $feat_type in
    raw) feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
     ;;
    lda) 
      splice_opts=`cat $alidir/splice_opts 2>/dev/null`
      cp $alidir/{splice_opts,final.mat} $dir || exit 1;
       feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
      ;;
    *) echo "$0: invalid feature type $feat_type" && exit 1;
  esac
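  # As an illustration (numbers hypothetical): with feat_type=raw, nj=4 and
  # num_feats=10000, the job-1 pipeline expands to
  #   utils/subset_scp.pl --quiet 2500 data/train/split4/1/feats.scp | \
  #     apply-cmvn-sliding --center=true scp:- ark:-
  # i.e. a subset of the feature files, each normalized with sliding-window CMN.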
  
  if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
    echo "$0: using transforms from $transform_dir"
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
  fi
  if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
    echo "$0: using raw-fMLLR transforms from $transform_dir"
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
  fi
  
  
  feats_one="$(echo "$feats" | sed s:JOB:1:g)"
  # note: feat_dim is the raw, un-spliced feature dim without the iVectors.
  feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;
  # by default: no dim reduction.
  
  spliced_feats="$feats splice-feats --left-context=$left_context --right-context=$right_context ark:- ark:- |"
  
  if [ ! -z "$online_ivector_dir" ]; then
    ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
    # note: subsample-feats, with a negative value of --n, repeats each feature
    # -n times (here, ivector_period times), matching the iVector rate to the frame rate.
    spliced_feats="$spliced_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
    ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
  else
    ivector_dim=0
  fi
  
  if [ -z "$lda_dim" ]; then
    spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)"  
    lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1;
  fi
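  # When --lda-dim is not supplied, lda_dim equals the full spliced (plus
  # iVector) dimension, so the transform estimated below preserves dimension
  # and serves to decorrelate and rescale rather than reduce.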
  
  if [ $stage -le 0 ]; then
    echo "$0: Accumulating LDA statistics."
    rm $dir/lda.*.acc 2>/dev/null # in case any left over from before.
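    # The pipeline below: ali-to-post converts the alignments to posterior
    # format, weight-silence-post zeroes the weights of silence frames, and
    # acc-lda accumulates the class statistics (one class per pdf) from the
    # spliced features.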
    $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
        weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
        acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$spliced_feats" ark,s,cs:- \
         $dir/lda.JOB.acc || exit 1;
  fi
  
  echo $feat_dim > $dir/feat_dim
  echo $lda_dim > $dir/lda_dim
  echo $ivector_dim > $dir/ivector_dim
  
  if [ $stage -le 1 ]; then
    sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
    rm $dir/lda.*.acc
  fi
  
  if [ $stage -le 2 ]; then
    # There are various things that we sometimes (but not always) need
    # the within-class covariance and its Cholesky factor for, and we
    # write these to disk just in case.
    nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
       --write-within-covar=$dir/within_covar.spmat \
       --within-class-factor=$within_class_factor --dim=$lda_dim \
        $dir/lda.mat $dir/lda.acc \
        2>$dir/log/lda_est.log || exit 1;
  fi
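  # The main product is $dir/lda.mat, which the calling training script applies
  # to the spliced features; cholesky.tpmat and within_covar.spmat are optional
  # by-products kept in case they are needed later.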
  
  echo "$0: Finished estimating LDA"