Yannick Estève / ONTRAC-Kaldi

Blame view

egs/lre/v1/lid/train_lvtln_model.sh 11.3 KB
  #!/bin/bash
  
  # Copyright       2014  Daniel Povey
  # Apache 2.0
  
  #
  # This training script computes some things you will need in order to
  # extract VTLN-warped features.  It takes as input the data directory
  # and an already-trained diagonal-covariance UBM.  Note: although this
  # script is in the lid/ directory, because it is intended to be
  # used in language identification, but it uses features of the
  # same type as those used in the speaker-id scripts (see ../sid/),
  # i.e. double-delta features, rather than the "shifted delta cepstra"
  # features commonly used in language id.
  #
  # This script works with either mfcc or plp features; for plp features, you will
  # need to set the --base-feat-type option.  Regardless, you will need to set the
  # --mfcc-config or --plp-config option if your feature-extraction config is not
  # called conf/${base_feat_type}.conf.  The output of this script will be in
  # $dir/final.lvtln and $dir/final.dubm and $dir/final.ali_dubm; the directory
  # can be passed to ./get_vtln_warps.sh to get VTLN warps for a data directory,
  # or (for data passed to this script) you can use the warping factors this
  # script outputs in $dir/final.warp
  #
  
  # Begin configuration.
  stage=-4 #  This allows restarting after partway, when something when wrong.
  config=
  cmd=run.pl
  num_iters=15    # Number of iterations of training.
  num_utt_lvtln_init=400; # number of utterances (subset) to initialize
                          # LVTLN transform.  Not too critical.
  min_warp=0.85
  max_warp=1.25
  warp_step=0.01
  base_feat_type=mfcc # or could be PLP.
  mfcc_config=conf/mfcc.conf  # default, can be overridden.
  plp_config=conf/plp.conf  # default, can be overridden.
  logdet_scale=0.0
  subsample=5 # We use every 5th frame by default; this is more
              # CPU-efficient.
  min_gaussian_weight=0.0001 # does not matter; inherited from diag-ubm training script.
  nj=4
  cleanup=true
  num_gselect=15
  # End configuration.
  
  echo "$0 $@"  # Print the command line for logging
  
  [ -f path.sh ] && . ./path.sh;
  . parse_options.sh || exit 1;
  
  num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1;
  default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1;
  
  if [ $# != 3 ]; then
     echo "Usage: $0 <data-dir> <diag-ubm-dir> <exp-dir>"
     echo "e.g.: $0 data/train_vtln exp/diag_ubm_vtln exp/vtln"
     echo "main options (for others, see top of script file)"
     echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
     echo "  --nj <num-jobs>                                  # number of jobs to use (default 4)"
     echo "  --config <config-file>                           # config containing options"
     echo "  --stage <stage>                                  # stage to do partial re-run from."
     echo "  --num-iters <num-iters>                          # number of iterations of training"
     echo "  --base-feat-type <feat-type>                     # mfcc or plp, mfcc is default"
     echo "  --mfcc-config <config>                           # config for MFCC extraction, default is"
     echo "                                                   # conf/mfcc.conf"
     echo "  --plp-config <config>                            # config for PLP extraction, default is"
     echo "                                                   # conf/plp.conf"
     exit 1;
  fi
  
  data=$1
  ubmdir=$2
  dir=$3
  
  for f in $data/feats.scp $ubmdir/final.dubm; do
    [ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
  done
  
  
  mkdir -p $dir/log
  echo $nj > $dir/num_jobs
  
  sdata=$data/split$nj;
  split_data.sh $data $nj || exit 1;
  
  cmvn_sliding_opts="--norm-vars=false --center=true --cmn-window=300"
  # don't change $cmvn_sliding_opts, it should probably match the
  # options used in ../sid/train_diag_ubm.sh.
  sifeats="ark,s,cs:add-deltas scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding $cmvn_sliding_opts ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
  
  
  # for the subsets of features that we use to estimate the linear transforms, we
  # don't bother with CMN.  This will give us wrong offsets on the transforms,
  # but it won't matter because we will allow an arbitrary bias term when we apply
  # these transforms.
  
  # you need to define CLASS when invoking $cmd on featsub_warped.
  featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
  featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- | select-voiced-frames ark:- scp,s,cs:$data/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"
  
  
  if [ -f $data/utt2warp ]; then
    echo "$0: source data directory $data appears to already have VTLN.";
    exit 1;
  fi
  
  # create a small subset of utterances for purposes of initializing the LVTLN transform
  # utils/shuffle_list.pl is deterministic, unlike sort -R.
  cat $data/utt2spk | awk '{print $1}' | utils/shuffle_list.pl | \
    head -n $num_utt_lvtln_init > $dir/utt_subset
  
  if [ $stage -le -4 ]; then
    echo "$0: computing warped subset of features"
    if [ -f $data/segments ]; then
      echo "$0 [info]: segments file exists: using that."
      subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- "
    else
      echo "$0 [info]: no segments file exists: using wav.scp directly."
      subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- "
    fi
    rm $dir/.error 2>/dev/null
    for c in $(seq 0 $[$num_classes-1]); do
      this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
      config_name=${base_feat_type}_config # e.g. mfcc_config or plp_config
      this_config=$(eval echo \$$config_name)  #  e.g. conf/mfcc.conf or conf/plp.conf by default.
      $cmd $dir/log/compute_warped_feats.$c.log \
        $subset_feats \| compute-${base_feat_type}-feats --verbose=2 \
        --config=$this_config --vtln-warp=$this_warp ark:- ark:- \| \
        copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error &
    done
    wait;
    if [ -f $dir/.error ]; then
      echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log"
      exit 1;
    fi
  fi
  
  if ! utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \
    compare-feats --threshold=0.98 scp:-  ark:$dir/feats.$default_class.ark >&/dev/null; then
    echo "$0: features stored on disk differ from those computed with no warping."
    echo "    Possibly your feature type is wrong (--base-feat-type option)"
    exit 1;
  fi
    
  if [ -f $data/segments ]; then
    subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
  else
    echo "$0 [info]: no segments file exists: using wav.scp directly."
    subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |"
  fi
  
  if [ $stage -le -3 ]; then
    echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)"
    dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1;
  
    $cmd $dir/log/init_lvtln.log \
      gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \
        $dir/0.lvtln || exit 1;
  
    for c in $(seq 0 $[$num_classes-1]); do
      this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
      orig_feats=ark:$dir/feats.$default_class.ark
      warped_feats=ark:$dir/feats.$c.ark
      logfile=$dir/log/train_special.$c.log
      this_featsub_warped="$(echo $featsub_warped | sed s/CLASS/$c/)"
      if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \
        $c $dir/0.lvtln $dir/0.lvtln \
        "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then
        echo "$0: Error training LVTLN transform, see $logfile";
        exit 1;
      fi
    done  
  fi
  
  cp $ubmdir/final.dubm $dir/0.dubm
  
  if [ $stage -le -2 ]; then
    echo "$0: computing Gaussian selection info."
  
    $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
      gmm-gselect --n=$num_gselect $ubmdir/final.dubm "$sifeats" \
        "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
  fi
    
  
  if [ $stage -le -1 ]; then
    echo "$0: computing initial LVTLN transforms"  # do this per-utt.
  
    $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
      gmm-global-gselect-to-post $dir/0.dubm "$sifeats" \
        "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
      gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
        $dir/0.dubm $dir/0.lvtln "$sifeats" ark,s,cs:- ark:$dir/trans.0.JOB ark,t:$dir/warp.0.JOB || exit 1
    
    # consolidate the warps into one file.
    for j in $(seq $nj); do cat $dir/warp.0.$j; done > $dir/warp.0
    rm $dir/warp.0.*
  fi
  
  
  x=0
  while [ $x -lt $num_iters ]; do
    feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"
  
    # First update the model.
    if [ $stage -le $x ]; then
      echo "$0: Updating model on pass $x"
      # Accumulate stats.
      $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
        gmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
         $dir/$x.dubm "$feats" $dir/$x.JOB.acc || exit 1;
  
      $cmd $dir/log/update.$x.log \
        gmm-global-est --remove-low-count-gaussians=false --min-gaussian-weight=$min_gaussian_weight \
          $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \
        $dir/$[$x+1].dubm || exit 1;
      $cleanup && rm $dir/$x.*.acc $dir/$x.dubm
    fi
  
    # Now update the LVTLN transforms (and warps.)
    if [ $stage -le $x ]; then
      echo "$0: re-estimating LVTLN transforms on pass $x"
      $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
        gmm-global-gselect-to-post $dir/$[$x+1].dubm "$feats" \
          "ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" ark:- \| \
        gmm-global-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
          $dir/$[$x+1].dubm $dir/0.lvtln "$sifeats" ark,s,cs:- \
          ark:$dir/trans.$[$x+1].JOB ark,t:$dir/warp.$[$x+1].JOB || exit 1
  
      # consolidate the warps into one file.
      for j in $(seq $nj); do cat $dir/warp.$[$x+1].$j; done > $dir/warp.$[$x+1]
      rm $dir/warp.$[$x+1].*
      $cleanup && rm $dir/trans.$x.*
    fi
    x=$[$x+1]
  done
  
  feats="$sifeats transform-feats ark:$dir/trans.$x.JOB ark:- ark:- |"
  
  if [ $stage -le $x ]; then
    # Accumulate stats for "alignment model"-- this model is computed with the
    # speaker-independent features, but matches Gaussian-for-Gaussian with the
    # final speaker-adapted model.
    $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
      gmm-global-acc-stats-twofeats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
        $dir/$x.dubm "$feats" "$sifeats" $dir/$x.JOB.acc || exit 1
    [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
    # Update model.
    $cmd $dir/log/est_alimdl.log \
      gmm-global-est --min-gaussian-weight=$min_gaussian_weight \
        --remove-low-count-gaussians=false $dir/$x.dubm \
       "gmm-global-sum-accs - $dir/$x.*.acc|" $dir/$x.ali_dubm  || exit 1;
    $cleanup && rm $dir/$x.*.acc
  fi
  
  if true; then # Diagnostics
    ln -sf warp.$x $dir/final.warp
    if [ -f $data/spk2gender ]; then 
      # To make it easier to eyeball the male and female speakers' warps
      # separately, separate them out.
      for g in m f; do # means: for gender in male female
        cat $dir/final.warp | \
          utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') > $dir/final.warp.$g
        echo -n "The last few warp factors for gender $g are: "
        tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}'; 
        echo
      done
    fi
  fi
  
  ln -sf $x.dubm $dir/final.dubm
  ln -sf $x.ali_dubm $dir/final.ali_dubm
  ln -sf 0.lvtln $dir/final.lvtln
  
  # Summarize warning messages...
  utils/summarize_warnings.pl  $dir/log
  
  echo "$0: Done training LVTLN model in $dir"