Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/nnet3/chain/get_phone_post.sh 10.7 KB
  #!/bin/bash
  # Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
  #  Apache 2.0.
  
  
  
  # This script obtains phone posteriors from a trained chain model, using either
  # the xent output or the forward-backward posteriors from the denominator fst.
  # The phone posteriors will be in matrices where the column index can be
  # interpreted as phone-index - 1.
  
  # You may want to mess with the compression options.  Be careful: with the current
  # settings, you might sometimes get exact zeros as the posterior values.
  
  # CAUTION!  This script isn't very suitable for dumping features from recurrent
  # architectures such as LSTMs, because it doesn't support setting the chunk size
  # and left and right context.  (Those would have to be passed into nnet3-compute
  # or nnet3-chain-compute-post).
  
  # Begin configuration section.
  # Every variable below is a default for the matching command-line option.

  # Job control.
  cmd=run.pl   # How to run jobs.
  nj=1         # Number of jobs to run.
  stage=0      # Stage to do a partial re-run from.

  # What to dump, and how.
  use_xent_output=false                  # If true, use the cross-entropy output of the network
                                         # instead of the chain denominator FST.
  remove_word_position_dependency=false  # If true, remove word-position-dependency info
                                         # when dumping posteriors.
  online_ivector_dir=                    # Directory of online-computed ivectors (empty: none).
  use_gpu=false                          # Set to true to use GPUs.

  # Smoothing added to the raw transition-id occupation counts.  It should be
  # some small number and is probably not critical; it mainly affects the
  # probability assigned to phones that were never seen in training.  A value
  # of 1.0 means: add a single frame's count to each transition-id's counts.
  count_smoothing=1.0

  # End configuration section.
  
  # Stop at the first failing command and on any reference to an unset variable.
  set -eu

  # Print the command line for logging.
  echo "$0 $*"

  # Load the local environment when the working directory provides one, then
  # let parse_options.sh process the command-line options.
  if [ -f path.sh ]; then
    . ./path.sh
  fi
  . parse_options.sh || exit 1
  
  # Exactly five positional arguments are required; otherwise print the usage
  # text (one printf argument per output line) and stop.
  if [ $# -ne 5 ]; then
    printf '%s\n' \
      "Usage: $0 <chain-tree-dir> <chain-model-dir> <lang-dir> <data-dir> <phone-post-dir>" \
      " e.g.: $0 --remove-word-position-dependency true --online-ivector-dir exp/nnet3/ivectors_test_eval92_hires \\" \
      "       exp/chain/tree_a_sp exp/chain/tdnn1a_sp data/lang data/test_eval92_hires exp/chain/tdnn1a_sp_post_eval92" \
      " ... you'll normally want to set the --nj and --cmd options as well." \
      "" \
      "Main options (for others, see top of script file)" \
      "  --cmd (run.pl|queue.pl|... <queue opts>)    # how to run jobs." \
      "  --config <config-file>                      # config containing options" \
      "  --stage <stage>                             # stage to do partial re-run from." \
      "  --nj <N>                                    # Number of parallel jobs to run, default:1" \
      "  --remove-word-position-dependency <bool>    # If true, remove word-position-dependency" \
      "                                              # info when dumping posteriors (default: false)" \
      "  --use-xent-output <bool>                    # If true, use the cross-entropy output of the" \
      "                                              # neural network when dumping posteriors" \
      "                                              # (default: false, will use chain denominator FST)" \
      "  --online-ivector-dir <dir>                  # Directory where we dumped online-computed" \
      "                                              # ivectors corresponding to the data in <data>" \
      "  --use-gpu <bool>                            # Set to true to use GPUs (not recommended as the" \
      "                                              # binary is very poorly optimized for GPU use)."
    exit 1
  fi
  
  
  # Positional arguments, in the order given in the usage message.
  tree_dir=$1    # <chain-tree-dir>
  model_dir=$2   # <chain-model-dir>
  lang=$3        # <lang-dir>
  data=$4        # <data-dir>
  dir=$5         # <phone-post-dir>: output directory


  # Fail early, naming this script, if any required input file is missing.
  # $model_dir/cmvn_opts is included because it is read unconditionally further
  # down; checking it here gives a clear message instead of a bare 'cat' error.
  for f in "$tree_dir/tree" "$tree_dir/final.mdl" "$tree_dir/ali.1.gz" "$tree_dir/num_jobs" \
           "$model_dir/final.mdl" "$model_dir/frame_subsampling_factor" "$model_dir/den.fst" \
           "$model_dir/cmvn_opts" "$data/feats.scp" "$lang/phones.txt"; do
    if [ ! -f "$f" ]; then
      echo "$0: no such file $f"
      exit 1
    fi
  done
  
  # Per-utterance split of the data directory, one piece per job.  Re-split
  # unless the split directory exists and is newer than feats.scp.
  sdata=$data/split${nj}utt
  if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
    split_data.sh --per-utt $data $nj || exit 1
  fi

  # NOTE(review): use_ivector is not referenced anywhere else in the visible
  # script; kept as-is.
  use_ivector=false

  # Feature rspecifier: CMVN is applied using the options stored with the
  # model.  The literal JOB is left in the string for $cmd to deal with.
  cmvn_opts=$(cat $model_dir/cmvn_opts)
  feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

  # Online ivector options: empty unless --online-ivector-dir was supplied.
  ivector_opts=
  if [ -n "$online_ivector_dir" ]; then
    steps/nnet2/check_ivectors_compatible.sh $model_dir $online_ivector_dir || exit 1
    ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1
    # Restrict the ivector scp to the utterances of the current job.
    ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp |"
    ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'"
  fi
  
  # Choose the options for GPU vs. CPU operation:
  #   gpu_queue_opts - extra options handed to $cmd when launching the jobs
  #   gpu_opt        - the --use-gpu flag handed to the nnet3 binaries
  # Both branches must set the same two names: the script runs under 'set -u',
  # and the stage-2 commands expand $gpu_queue_opts unconditionally.
  if $use_gpu; then
    gpu_queue_opts="--gpu 1"
    gpu_opt="--use-gpu=yes"
    # Refuse to go on if the binaries were built without CUDA support.
    if ! cuda-compiled; then
      echo "$0: WARNING: you are running with one thread but you have not compiled"
      echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
      echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
      exit 1
    fi
  else
    gpu_queue_opts=
    gpu_opt="--use-gpu=no"
  fi
  frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor)

  # Create the output directory (with its log/ subdirectory) and keep a copy of
  # the model's frame_subsampling_factor next to the posteriors.
  mkdir -p $dir/log
  cp $model_dir/frame_subsampling_factor $dir/
  
  # Stage 0: accumulate transition-id occupation counts into $dir/tacc, unless
  # an up-to-date $dir/tacc (not older than the first alignment archive) exists.
  if [ $stage -le 0 ]; then
    if [ -f $dir/tacc ] && ! [ $dir/tacc -ot $tree_dir/ali.1.gz ]; then
      echo "$0: skipping creation of $dir/tacc since it already exists."
    else
      echo "$0: obtaining transition-id counts in $dir/tacc"
      # The alignments were written by this many jobs, so read them back with
      # the same number of jobs.
      this_nj=$(cat $tree_dir/num_jobs)

      # One partial count vector per alignment archive.
      $cmd JOB=1:$this_nj $dir/log/acc_taccs.JOB.log \
        ali-to-post "ark:gunzip -c $tree_dir/ali.JOB.gz|" ark:- \| \
        post-to-tacc $tree_dir/final.mdl ark:- $dir/tacc.JOB

      # List of all the partial vectors; expanded unquoted below so that each
      # file becomes a separate argument.
      input_taccs=
      for n in $(seq $this_nj); do
        input_taccs="$input_taccs $dir/tacc.$n"
      done

      # Sum the partial vectors into one text-format vector.
      $cmd $dir/log/sum_taccs.log \
        vector-sum --binary=false $input_taccs $dir/tacc

      rm $dir/tacc.*
    fi
  fi
  
  
  # Either way, $dir/phones.txt will be a symbol table for the phones that
  # we are dumping (although the matrices we dump won't contain anything
  # for symbol 0 which is <eps>).
  #
  # The stage test sits inside the word-position branch on purpose.  With a
  # combined "stage <= 1 && remove_word_position_dependency" test, a re-run
  # from a later stage fell through to the 'else' branch and overwrote
  # $dir/phones.txt with the word-position-dependent table.
  if $remove_word_position_dependency; then
    if [ $stage -le 1 ]; then
      echo "$0: creating $dir/phone_map.int"
      # Expected to write $dir/phone_map.int together with the
      # word-position-independent $dir/phones.txt.
      utils/lang/get_word_position_phone_map.pl $lang $dir
    fi
  else
    # No mapping: use the lang directory's phones, minus the '#'-prefixed
    # symbols.
    grep -v '^#' $lang/phones.txt > $dir/phones.txt
  fi
  
  # Reads 'show-transitions' output on stdin and prints one line per
  # transition-id, in input order, of the form
  #   <phone-id> <pdf-id>
  # Self-loop transitions get the state's self-loop pdf and all others get its
  # forward pdf.  States printed with a single "pdf = N" use N for both.
  _phones_and_pdfs_from_transitions() {
    perl -ne '
      if (m/Transition-state.* phone = (\d+) pdf = (\d+)/) {
        $phone = $1; $forward_pdf = $2; $self_loop_pdf = $2;
      }
      if (m/Transition-state.* phone = (\d+) .* forward-pdf = (\d+) self-loop-pdf = (\d+)/) {
        $phone = $1; $forward_pdf = $2; $self_loop_pdf = $3;
      }
      if (m/Transition-id/) {
        if (m/self-loop/) { print "$phone $self_loop_pdf\n"; }
        else { print "$phone $forward_pdf\n"; }
      }'
  }

  # Reads "<phone-id> <pdf-id> <count>" lines on stdin and prints a text-format
  # matrix with one row per phone (1 .. largest phone-id) and one column per
  # pdf (0 .. largest pdf-id).  Each entry is the share of that pdf's total
  # count that belongs to that phone, printed with three decimals.
  _transform_mat_from_counts() {
    awk 'BEGIN { num_phones = 1; num_pdfs = 1; }
         { phone = $1; pdf = $2; count = $3;
           pdf_count[pdf] += count; counts[pdf,phone] += count;
           if (phone > num_phones) num_phones = phone;
           if (pdf >= num_pdfs) num_pdfs = pdf + 1; }
         END { print "[ ";
               for (phone = 1; phone <= num_phones; phone++) {
                 for (pdf = 0; pdf < num_pdfs; pdf++) printf("%.3f ", counts[pdf,phone] / pdf_count[pdf]);
                 print "";
               }
               print "]"; }'
  }

  # Stage 1: build $dir/transform.mat, the matrix used in stage 2 to turn
  # pdf-level posteriors into phone-level posteriors.
  if [ "$stage" -le 1 ]; then
    # We want the phones in integer form as it's safer for processing by script.
    # $dir/fake_phones.txt will just contain e.g. "0 0\n1 1\n....", it's used
    # to force show-transitions to print the phones as integers.
    awk '{print $2,$2}' <$lang/phones.txt >$dir/fake_phones.txt


    # The output of the 'show-transitions' command below is like the following:
    #Transition-state 1: phone = 1 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
    # Transition-id = 1 p = 0.5 [self-loop]
    # Transition-id = 2 p = 0.5 [0 -> 1]
    #Transition-state 2: phone = 10 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
    # Transition-id = 3 p = 0.5 [self-loop]
    # Transition-id = 4 p = 0.5 [0 -> 1]

    # The helper processes that info about the transition model into the file
    # $dir/phones_and_pdfs.txt, which has a line for each transition-id
    # (starting from number 1), and the format of each line is
    # <phone-id> <pdf-id>
    show-transitions $dir/fake_phones.txt $tree_dir/final.mdl | \
      _phones_and_pdfs_from_transitions > $dir/phones_and_pdfs.txt


    # The following command just separates the 'tacc' file into a similar format
    # to $dir/phones_and_pdfs.txt, with one count per line, and a line per transition-id
    # starting from number 1.  We skip the first two fields which are "[ 0" (the 0 is
    # for transition-id=0, since transition-ids are 1-based), and the last field which is "]".
    awk '{ for (n=3;n<NF;n++) print $n; }' <$dir/tacc  >$dir/transition_counts.txt

    # Both files must describe the same set of transition-ids; a mismatch means
    # the alignments and the model do not belong together (or a step failed).
    num_lines1=$(wc -l <$dir/phones_and_pdfs.txt)
    num_lines2=$(wc -l <$dir/transition_counts.txt)
    if [ $num_lines1 -ne $num_lines2 ]; then
      echo "$0: mismatch in num-lines between phones_and_pdfs.txt and transition_counts.txt: $num_lines1 vs $num_lines2"
      exit 1
    fi

    # after 'paste', the format of the data will be
    # <phone-id> <pdf-id> <data-count>
    # we add the count smoothing at this point.
    paste $dir/phones_and_pdfs.txt $dir/transition_counts.txt | \
      awk -v s="$count_smoothing" '{print $1, $2, (s+$3);}' > $dir/combined_info.txt

    if $remove_word_position_dependency; then
      # map the phones to word-position-independent phones; you can see $dir/phones.txt
      # to interpret the final output.
      utils/apply_map.pl -f 1 $dir/phone_map.int <$dir/combined_info.txt > $dir/temp.txt
      mv $dir/temp.txt $dir/combined_info.txt
    fi

    _transform_mat_from_counts <$dir/combined_info.txt >$dir/transform.mat

  fi
  
  
  # Stage 2: run the network over the data and write compressed phone
  # posteriors, one archive/scp pair per job plus a combined scp.
  if [ $stage -le 2 ]; then

    # --compression-method=3 is kTwoByteAuto: every element is stored in two
    # bytes as a uint16, and the representable range is chosen automatically
    # from the minimum and maximum elements of the matrix.
    compress_opts="--compress=true --compression-method=3"

    # Where each job writes its posteriors; shared by both branches below.
    post_wspecifier="ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp"

    if ! $use_xent_output; then
      # Default: use the 'chain' output (recommended as the posteriors will be
      # much more accurate).  The transform matrix is applied by the binary.
      $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
        nnet3-chain-compute-post $gpu_opt $ivector_opts --transform-mat=$dir/transform.mat \
        --frame-subsampling-factor=$frame_subsampling_factor \
        $model_dir/final.mdl $model_dir/den.fst "$feats" ark:- \| \
        copy-feats $compress_opts ark:- $post_wspecifier
    else
      # Use the 'output-xent' output of the nnet: drop the node called 'output'
      # and rename 'output-xent' to take its place, then apply the transform
      # matrix as a separate pipeline step.
      model="nnet3-copy '--edits-config=echo remove-output-nodes name=output; echo rename-node old-name=output-xent new-name=output|' $model_dir/final.mdl -|"

      $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
        nnet3-compute $gpu_opt $ivector_opts \
        --frame-subsampling-factor=$frame_subsampling_factor --apply-exp=true \
        "$model" "$feats" ark:- \| \
        transform-feats $dir/transform.mat ark:- ark:- \| \
        copy-feats $compress_opts ark:- $post_wspecifier
    fi

    # NOTE(review): presumably a pause so that files written by the jobs become
    # visible on a shared filesystem before they are read back - confirm.
    sleep 5
    # Make a single .scp file, for convenience.
    for n in $(seq $nj); do
      cat $dir/phone_post.$n.scp
    done > $dir/phone_post.scp

  fi