Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/train_lvtln.sh 14.9 KB
  #!/bin/bash
  
  # Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
  # Copyright 2014       Vimal Manohar
  # This training script trains linear-VTLN models starting from an existing
  # system based on either LDA+MLLT or delta+delta-delta features.
  # Works with either mfcc or plp features, but you need to set the 
  # --base-feat-type option.
  # The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh
  # to get VTLN warping factors for data, for warped data extraction, or (for
  # the training data) you can use the warping factors this script outputs
  # in $dir/final.warp
  #
  # Apache 2.0
  
  # Begin configuration.
  stage=-6 #  This allows restarting partway through, when something went wrong.
  config=
  cmd=run.pl
  scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
  realign_iters="10 20 30";
  num_iters=35    # Number of iterations of training
  max_iter_inc=25 # Last iter to increase #Gauss on.
  beam=10         # --beam passed to gmm-align-compiled when realigning
  retry_beam=40   # --retry-beam passed to gmm-align-compiled when realigning
  boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
  power=0.25 # Exponent for number of gaussians according to occurrence counts
  cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
  cmvn_opts=  # you can supply e.g. --cmvn-opts "--norm-vars=true" to turn on variance
              # normalization, but only if base system is the delta type, not LDA.
  lvtln_iters="2 4 6 8 10 12 14 16 20"; # iters on which to recompute LVTLN transform
  num_utt_lvtln_init=200; # number of utterances (subset) to initialize
                          # LVTLN transform.  Not too critical.
  # The warp factors tried are min_warp, min_warp+warp_step, ..., up to max_warp;
  # each one is a separate LVTLN "class".
  min_warp=0.85
  max_warp=1.25
  warp_step=0.01
  base_feat_type=mfcc # or plp; selects compute-<type>-feats and conf/<type>.conf
  logdet_scale=0.0    # --logdet-scale passed to gmm-est-lvtln-trans
  
  # End configuration.
  
  # Log the exact invocation (script name followed by all arguments).
  echo "$0 $@"

  # Pick up path.sh when one exists in the current directory, then source
  # parse_options.sh (presumably the handler for the --option arguments listed
  # in the usage message); give up if that fails.
  if [ -f path.sh ]; then
    . ./path.sh
  fi
  . parse_options.sh || exit 1

  # Number of warp classes on the grid min_warp .. max_warp with spacing
  # warp_step, and the index of the class whose warp factor is 1.0.
  num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") \
    || exit 1
  default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") \
    || exit 1
  
  # Exactly six positional arguments are required; anything else prints the
  # usage summary and stops with status 1.
  if [ $# -ne 6 ]; then
    printf '%s\n' \
      "Usage: $0 <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>" \
      "e.g.: $0 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1" \
      "main options (for others, see top of script file)" \
      "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
      "  --config <config-file>                           # config containing options" \
      "  --stage <stage>                                  # stage to do partial re-run from."
    exit 1
  fi
  
  # Positional arguments, in the order given by the usage message.
  numleaves=$1
  totgauss=$2
  data=$3
  lang=$4
  alidir=$5
  dir=$6

  # Refuse to start unless every required input file is present.
  for required in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp \
                  $lang/phones.txt $data/wav.scp; do
    if [ ! -f "$required" ]; then
      echo "$0: no such file $required"
      exit 1
    fi
  done
  
  # Start with one Gaussian per leaf; the training loop adds $incgauss per
  # iteration while the iteration number is <= $max_iter_inc.
  numgauss=$numleaves
  incgauss=$(( ($totgauss - $numgauss) / $max_iter_inc ))

  # Values read from the lang and alignment directories.  Only splice_opts
  # is allowed to be missing.
  oov=$(cat $lang/oov.int) || exit 1
  silphonelist=$(cat $lang/phones/silence.csl) || exit 1
  ciphonelist=$(cat $lang/phones/context_indep.csl) || exit 1
  nj=$(cat $alidir/num_jobs) || exit 1
  splice_opts=$(cat $alidir/splice_opts 2>/dev/null)

  # Create the output directory and record the job count used.
  mkdir -p $dir/log
  echo $nj > $dir/num_jobs

  # The phone set must agree with the alignment directory's.
  utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1
  cp $lang/phones.txt $dir || exit 1

  # Per-job data is read from $sdata/JOB/... below; split_data.sh is
  # presumably what creates those directories.
  sdata=$data/split$nj
  split_data.sh $data $nj || exit 1

  # Carry splice_opts over when the alignment directory has one.
  cp $alidir/splice_opts $dir 2>/dev/null
  
  
  # Choose the feature pipeline according to whether the alignment system has
  # an LDA matrix ($alidir/final.mat).  Variables defined here:
  #   sifeats          - features for job JOB without the per-speaker transform
  #   feats            - sifeats followed by the transforms in $dir/trans.JOB
  #   featsub_warped   - subset features for one warp class; the literal CLASS
  #                      is replaced by the class index before use
  #   featsub_unwarped - subset features for the default class
  if [ -f $alidir/final.mat ]; then
    echo "$0: Using LDA features"
    # NOTE(review): this only prints a message and carries on; the supplied
    # value is then replaced by the contents of $dir/cmvn_opts just below.
    if [ -n "$cmvn_opts" ]; then
      echo  "$0: you cannot supply --cmvn-opts if base system is LDA."
    fi
    cp $alidir/final.mat $alidir/full.mat $alidir/splice_opts $alidir/cmvn_opts $dir 2>/dev/null
    cmvn_opts=$(cat $dir/cmvn_opts 2>/dev/null)
    sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    featsub_warped="ark:splice-feats $splice_opts ark:$dir/feats.CLASS.ark ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    featsub_unwarped="ark:splice-feats $splice_opts ark:$dir/feats.$default_class.ark ark:- | transform-feats $dir/final.mat ark:- ark:- |"
  else
    # Delta features: only the --cmvn-opts given on the command line are used,
    # so warn if the alignment directory had options and none were supplied.
    if [ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ]; then
      echo "$0: warning: ignoring CMVN options from $alidir."
    fi
    echo $cmvn_opts > $dir/cmvn_opts

    echo "$0: Using delta+delta-delta features since $alidir/final.mat does not exist"
    sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
    # For the subsets of features used to estimate the linear transforms we do
    # not bother with CMVN.  That gives wrong offsets on the transforms, but it
    # ends up not mattering because an arbitrary offset (bias) term is allowed
    # when these transforms are applied.
    featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- |"
    featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- |"
  fi
  # Same for both feature types: apply the per-speaker transforms on top.
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
  
  # A data directory that already has an utt2warp file is rejected.
  [ ! -f $data/utt2warp ] || {
    echo "$0: source data directory $data appears to already have VTLN."
    exit 1
  }

  # Pick the small set of utterances used to initialize the LVTLN transform.
  # utils/shuffle_list.pl is deterministic, unlike sort -R.
  awk '{print $1}' $data/utt2spk \
    | utils/shuffle_list.pl \
    | head -n $num_utt_lvtln_init > $dir/utt_subset
  
  if [ $stage -le -6 ]; then
    echo "$0: computing warped subset of features"
    # Command prefix that produces the audio of the subset utterances, either
    # straight from wav.scp or cut out according to the segments file.
    if [ ! -f $data/segments ]; then
      echo "$0 [info]: no segments file exists: using wav.scp directly."
      subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- "
    else
      echo "$0 [info]: segments file exists: using that."
      subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- "
    fi

    # One background job per warp class, writing $dir/feats.<class>.ark.
    # Any job that fails leaves the marker file $dir/.error behind.
    rm $dir/.error 2>/dev/null
    for ((c = 0; c < num_classes; c++)); do
      this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
      $cmd $dir/log/compute_warped_feats.$c.log \
        $subset_feats \| \
        compute-${base_feat_type}-feats --verbose=2 \
          --config=conf/${base_feat_type}.conf --vtln-warp=$this_warp ark:- ark:- \| \
        copy-feats --compress=true ark:- ark:$dir/feats.$c.ark \
        || touch $dir/.error &
    done
    wait

    if [ -f $dir/.error ]; then
      echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log"
      exit 1
    fi
  fi
  
  # Sanity check: the features already on disk for the subset must match the
  # ones just computed for the default (no-warp) class.
  if ! utils/filter_scp.pl $dir/utt_subset $data/feats.scp \
       | compare-feats --threshold=0.98 scp:- ark:$dir/feats.$default_class.ark >/dev/null 2>&1; then
    echo "$0: features stored on disk differ from those computed with no warping."
    echo "    Possibly your feature type is wrong (--base-feat-type option)"
    exit 1
  fi

  # Per-job audio source, from segments when available and otherwise wav.scp.
  # NOTE(review): subset_utts is not referenced anywhere else in this script.
  if [ ! -f $data/segments ]; then
    echo "$0 [info]: no segments file exists: using wav.scp directly."
    subset_utts="ark:wav-copy scp:$sdata/JOB/wav.scp ark:- |"
  else
    subset_utts="ark:extract-segments scp:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
  fi
  
  # Stage -5: create $dir/0.lvtln and train one linear transform per warp
  # class on the utterance subset, using the warped features computed for
  # that class against the default-class (unwarped) features.
  if [ $stage -le -5 ]; then
    echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)"
    dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1;

    $cmd $dir/log/init_lvtln.log \
      gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \
        $dir/0.lvtln || exit 1;

    # Per-frame weights from the source alignments, with silence posteriors
    # scaled by 0.0.
    $cmd JOB=1:$nj $dir/log/get_weights.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz |" ark:- \| \
      weight-silence-post 0.0 "$silphonelist" $alidir/final.mdl ark:- ark:- \| \
      post-to-weights ark:- "ark,scp:$dir/weights.JOB.ark,$dir/weights.JOB.scp" || exit 1

    # Merge the per-job weight indexes into a single scp.
    for n in $(seq 1 $nj); do
      cat $dir/weights.$n.scp
    done > $dir/weights.scp

    for c in $(seq 0 $(($num_classes-1))); do
      this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
      logfile=$dir/log/train_special.$c.log
      # Substitute the class index for the CLASS placeholder.  Parameter
      # expansion is used so the rspecifier is never word-split or
      # glob-expanded on the way.
      this_featsub_warped=${featsub_warped/CLASS/$c}
      # Updates class $c of 0.lvtln in place (same file is input and output).
      if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \
        --weights-in="scp:$dir/weights.scp" \
        $c $dir/0.lvtln $dir/0.lvtln \
        "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then
        echo "$0: Error training LVTLN transform, see $logfile";
        exit 1;
      fi
    done
    # final.lvtln always points at 0.lvtln.
    ln -sf 0.lvtln $dir/final.lvtln
  fi
  
  if [ $stage -le -4 ]; then
    echo "$0: computing initial LVTLN transforms for speakers"

    # Use the alignment system's final.mdl unless it also has a final.alimdl.
    # If the base system was trained with SAT, it is probably better to use
    # the speaker-independent .alimdl to get the LVTLN transforms (LVTLN may
    # be closer to an unadapted system).
    init_mdl=$alidir/final.mdl
    if [ -f $alidir/final.alimdl ]; then
      echo "$0: to get initial LVTLN transforms, using $alidir/final.alimdl"
      init_mdl=$alidir/final.alimdl
    fi

    # Writes the per-speaker transforms to $dir/trans.JOB and the chosen
    # warps (text) to $dir/warp.0.JOB.
    $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 "$silphonelist" $alidir/final.mdl ark:- ark:- \| \
      gmm-post-to-gpost $init_mdl "$sifeats" ark:- ark:- \| \
      gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
        --spk2utt=ark:$sdata/JOB/spk2utt $init_mdl \
        $dir/0.lvtln "$sifeats" ark:- ark:$dir/trans.JOB ark,t:$dir/warp.0.JOB \
      || exit 1

    # Concatenate the per-job warp files into $dir/warp.0, then drop the pieces.
    for ((j = 1; j <= nj; j++)); do
      cat $dir/warp.0.$j
    done > $dir/warp.0
    rm $dir/warp.0.*
  fi
  
  if [ $stage -le -3 ]; then
    echo "$0: accumulating tree stats"
    # Each job writes $dir/JOB.treeacc from the transformed features ($feats)
    # and the source alignments.
    $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
      acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
        "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc \
      || exit 1
    # Sum the per-job files into $dir/treeacc and remove them.
    sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log \
      || exit 1
    rm $dir/*.treeacc
  fi
  
  if [ $stage -le -2 ]; then
    echo "$0: getting questions for tree-building, via clustering"
    # Questions come from clustering the phone sets, with the extra questions
    # from the lang directory appended, and are then compiled.
    cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int \
      2> $dir/log/questions.log || exit 1
    cat $lang/phones/extra_questions.int >> $dir/questions.int
    compile-questions $lang/topo $dir/questions.int $dir/questions.qst \
      2>$dir/log/compile_questions.log || exit 1

    echo "$0: building the tree"
    $cmd $dir/log/build_tree.log \
      build-tree --verbose=1 --max-leaves=$numleaves \
        --cluster-thresh=$cluster_thresh \
        $dir/treeacc $lang/phones/roots.int $dir/questions.qst $lang/topo $dir/tree \
      || exit 1

    # Initial model (1.mdl) and occupancies (1.occs) from the tree and stats.
    gmm-init-model --write-occs=$dir/1.occs \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl \
      2> $dir/log/init_model.log || exit 1
    # Show any 'no stats' lines from the log and flag them.
    if grep 'no stats' $dir/log/init_model.log; then
      echo "This is a bad warning."
    fi

    # Mix up 1.mdl in place to $numgauss Gaussians; the tree stats are no
    # longer needed afterwards.
    gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \
      2>$dir/log/mixup.log || exit 1
    rm $dir/treeacc
  fi
  
  if [ $stage -le -1 ]; then
    # Re-express the source alignments in terms of the new tree and model,
    # writing them to $dir/ali.JOB.gz.
    echo "$0: converting alignments from $alidir to use current tree"
    $cmd JOB=1:$nj $dir/log/convert.JOB.log \
      convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
        "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" \
      || exit 1
  fi

  if [ $stage -le 0 ]; then
    # Training graphs for each job's transcripts (words mapped to integers,
    # out-of-vocabulary words mapped to $oov), written to $dir/fsts.JOB.gz.
    echo "$0: compiling graphs of transcripts"
    $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
      compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int \
        $dir/tree $dir/1.mdl $lang/L.fst \
        "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
        "ark:|gzip -c >$dir/fsts.JOB.gz" \
      || exit 1
  fi
  
  # Main training loop over iterations x = 1 .. num_iters-1.  Each pass
  # optionally realigns (iterations in $realign_iters), optionally
  # re-estimates the per-speaker LVTLN transforms (iterations in
  # $lvtln_iters), then accumulates stats and updates $x.mdl -> $((x+1)).mdl.
  # Work for an iteration is skipped when $stage is greater than it, but the
  # Gaussian-count schedule is still advanced so restarts stay consistent.
  x=1
  while [ $x -lt $num_iters ]; do
    echo "$0: training pass $x"
    if echo $realign_iters | grep -w $x >/dev/null; then
      if [ $stage -le $x ]; then
        echo "$0: aligning data"
        mdl="gmm-boost-silence --boost=$boost_silence $(cat $lang/phones/optional_silence.csl) $dir/$x.mdl - |"
        $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
          gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
           "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
           "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
      fi
    fi
    if echo $lvtln_iters | grep -w $x >/dev/null; then
      if [ $stage -le $x ]; then
        echo "Re-estimating LVTLN transforms"
        # Gaussian-level posteriors are computed on the currently transformed
        # features ($feats); the new transforms are estimated on $sifeats.
        # "$silphonelist" is quoted, as in the earlier stages, so that an
        # empty list cannot silently shift the remaining arguments.
        $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
          ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
          weight-silence-post 0.0 "$silphonelist" $dir/$x.mdl ark:- ark:- \| \
          gmm-post-to-gpost $dir/$x.mdl "$feats" ark:- ark:- \| \
          gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
            --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \
            $dir/0.lvtln "$sifeats" ark:- ark:$dir/new_trans.JOB ark,t:$dir/warp.$x.JOB || exit 1
        # Only now, with every job finished, replace the transforms that
        # $feats reads.
        for j in $(seq $nj); do mv $dir/new_trans.$j $dir/trans.$j; done
        # consolidate the warps into one file.
        for j in $(seq $nj); do cat $dir/warp.$x.$j; done > $dir/warp.$x
        rm $dir/warp.$x.*
      fi
    fi

    if [ $stage -le $x ]; then
      $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
        gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
        "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
      $cmd $dir/log/update.$x.log \
        gmm-est --mix-up=$numgauss --power=$power \
        --write-occs=$dir/$(($x+1)).occs $dir/$x.mdl \
        "gmm-sum-accs - $dir/$x.*.acc |" $dir/$(($x+1)).mdl || exit 1;
      # The previous iteration's model, accumulators and occupancies are no
      # longer needed.
      rm $dir/$x.mdl $dir/$x.*.acc
      rm $dir/$x.occs
    fi
    [ $x -le $max_iter_inc ] && numgauss=$(($numgauss+$incgauss));
    x=$(($x+1));
  done
  
  
  if [ $stage -le $x ]; then
    # Accumulate stats for the "alignment model": it is computed with the
    # speaker-independent features ($sifeats) but matches the final
    # speaker-adapted model Gaussian-for-Gaussian.
    $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
      ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:-  \| \
      gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
        ark,s,cs:- $dir/$x.JOB.acc \
      || exit 1

    # Every job must have produced an accumulator file.
    num_accs=$(ls $dir/$x.*.acc | wc -w)
    if [ $num_accs -ne "$nj" ]; then
      echo "$0: Wrong #accs"
      exit 1
    fi

    # Estimate the alignment model from the summed stats, then clean up.
    $cmd $dir/log/est_alimdl.log \
      gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
        "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl \
      || exit 1
    rm $dir/$x.*.acc
  fi
  
  # Print the largest iteration number in the whitespace-separated list $2
  # that is strictly smaller than $1, or 0 when there is none.
  last_lvtln_iter() {
    local limit=$1 best=0 it
    for it in $2; do
      if [ "$it" -lt "$limit" ] && [ "$it" -gt "$best" ]; then
        best=$it
      fi
    done
    echo "$best"
  }

  # Diagnostics.
  # The training loop only runs iterations below $num_iters, so an entry of
  # $lvtln_iters at or beyond that never produces a warp.<iter> file.  Link
  # final.warp to the last LVTLN iteration that can actually have run; warp.0
  # (written by stage -4) is the fallback.
  last_iter=$(last_lvtln_iter "$num_iters" "$lvtln_iters")
  ln -sf warp.$last_iter $dir/final.warp
  if [ -f $data/spk2gender ]; then
    # To make it easier to eyeball the male and female speakers' warps
    # separately, separate them out.
    for g in m f; do # means: for gender in male female
      utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') \
        < $dir/final.warp > $dir/final.warp.$g
      echo -n "The last few warp factors for gender $g are: "
      tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}'
      echo
    done
  fi
  
  # Expose the last iteration's model, occupancies and alignment model under
  # the fixed names final.mdl, final.occs and final.alimdl.
  for ext in mdl occs alimdl; do
    ln -sf $x.$ext $dir/final.$ext
  done

  # Summarize the warning messages found in the log directory.
  utils/summarize_warnings.pl $dir/log

  echo "$0: Done training LVTLN system in $dir"