Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/cleanup/make_utterance_graph.sh 5.68 KB
  #!/bin/bash
  
  # Copyright 2014  Guoguo Chen
  # Apache 2.0
  
  # Begin configuration section.
  # Defaults for the options listed in the usage message.
  tscale="1.0"                  # Given to make-h-transducer as --transition-scale.
  loopscale="0.1"               # Given to add-self-loops as --self-loop-scale.
  cleanup="true"                # When true, the per-utterance sub_graphs directory is deleted at the end.
  ngram_order="1"               # 1 selects the unigram grammar script; larger values go through SRILM.
  srilm_options='-wbdiscount'   # Handed to ngram-count; Witten-Bell discounting unless overridden.
  # End configuration section.
  
  # Stop at the first command that fails.
  set -e

  # Log how this script was invoked.
  printf '%s\n' "$0 $*"

  # Source the local environment setup when a path.sh sits in the working
  # directory, then hand the command line to parse_options.sh (presumably the
  # place where the --option flags from the usage message are consumed).
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi
  . parse_options.sh || exit 1
  
  # Exactly four positional arguments must remain at this point; anything else
  # prints the help text and terminates with status 1.
  if [[ $# -ne 4 ]]; then
    printf '%s\n' \
      "This script builds one decoding graph for each utterance using the" \
      "corresponding text in the given <text> file. If --ngram-order is 1," \
      "then utils/make_unigram_grammar.pl will be used to build the unigram" \
      "language model. Otherwise SRILM will be used instead. You are supposed" \
      "to have SRILM installed if --ngram-order is larger than 1. The format" \
      "of the given <text> file is same as the transcript text files in data" \
      "directory." \
      "" \
      "Usage: $0 [options] <text> <lang-dir> <model-dir> <graph-dir>" \
      " e.g.: $0 data/train_si284_split/text \\" \
      "                data/lang exp/tri2b/ exp/tri2b/graph_train_si284_split" \
      "" \
      "Options:" \
      "    --ngram-order           # order of n-gram language model" \
      "    --srilm-options         # options for ngram-count in SRILM tool" \
      "    --tscale                # transition scale" \
      "    --loopscale             # scale for self-loops" \
      "    --cleanup               # if true, removes the intermediate files"
    exit 1
  fi
  
  # Positional arguments (see the usage message).
  text=$1        # transcripts, in the same format as the data-directory text files
  lang=$2        # lang directory
  model_dir=$3   # directory holding final.mdl and tree
  graph_dir=$4   # output directory

  # Refuse to start unless the required lang and model files are present.
  # The expansions are quoted on purpose: with an unquoted path containing
  # whitespace, "[ ! -f ... ]" gets too many arguments, returns status 2, and
  # the "if" would treat that as "file exists", silently skipping the check.
  for f in "$lang/L_disambig.fst" "$lang/words.txt" "$lang/oov.int" \
    "$model_dir/final.mdl" "$model_dir/tree"; do
    if [ ! -f "$f" ]; then
      echo "$0: expected $f to exist"
      exit 1;
    fi
  done

  # Working area for the per-utterance graphs.
  mkdir -p "$graph_dir/sub_graphs"

  # Compare the phone tables of the lang directory and the model directory;
  # a non-zero exit status from the check aborts the script (set -e is active).
  utils/lang/check_phones_compatible.sh "$lang/phones.txt" "$model_dir/phones.txt"
  
  # If --ngram-order is larger than 1, we will have to use SRILM.
  # ngram-count is looked up on PATH first; when it is not there, the SRILM
  # binary directory under $KALDI_ROOT/tools is appended to PATH, and the
  # script stops with installation instructions if that fails as well.
  if [ "$ngram_order" -gt 1 ]; then
    # "command -v" is a shell builtin, so the lookup does not depend on an
    # external "which" being installed. "|| true" keeps set -e from aborting
    # when nothing is found; the empty result is handled right below.
    ngram_count=$(command -v ngram-count) || true
    if [ -z "$ngram_count" ]; then
      # Only the machine type is inspected: "uname -a" also prints the host
      # name and kernel release, where a stray "64" would be a false positive.
      if uname -m | grep -q 64; then # some kind of 64 bit...
        sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
      else
        sdir=$KALDI_ROOT/tools/srilm/bin/i686
      fi
      if [ -f "$sdir/ngram-count" ]; then
        echo "Using SRILM tools from $sdir"
        export PATH=$PATH:$sdir
      else
        echo "You appear to not have SRILM tools installed, either on your path,"
        echo "or installed in $sdir. See tools/install_srilm.sh for installation"
        echo "instructions."
        exit 1
      fi
    fi
  fi
  
  # Maps OOV words to the oov symbol: this is the integer id that is passed
  # to utils/sym2int.pl --map-oov further down.
  oov=$(cat "$lang/oov.int")

  # Context width (N) and central position (P) as reported by tree-info for
  # the model's tree. tree-info is run once; because its output is captured
  # by a plain assignment, a failing tree-info aborts the script under set -e
  # instead of being hidden behind the exit status of a pipeline.
  tree_info=$(tree-info --print-args=false "$model_dir/tree")
  N=$(printf '%s\n' "$tree_info" | awk '/context-width/ {print $NF}')
  P=$(printf '%s\n' "$tree_info" | awk '/central-position/ {print $NF}')
  if [ -z "$N" ] || [ -z "$P" ]; then
    echo "$0: could not read context-width/central-position from $model_dir/tree"
    exit 1
  fi
  
  # Loops over all utterances.
  # Every iteration appends one line to HCLG.fsts.scp, so remove any list left
  # behind by an earlier run before starting.
  rm -f "$graph_dir/sub_graphs/HCLG.fsts.scp"

  # Split --srilm-options on whitespace so that several ngram-count options
  # can be supplied in one string. The array is only used when
  # --ngram-order is larger than 1.
  read -r -a srilm_opts <<< "$srilm_options"

  # The transcripts go through sym2int.pl (with --map-oov) and back through
  # int2sym.pl, which leaves lines of the form "<utt-id> <word> <word> ...".
  # The while loop is the last stage of the pipeline and therefore runs in a
  # subshell; its exit status is the status of the whole pipeline.
  utils/sym2int.pl --map-oov "$oov" -f 2- "$lang/words.txt" < "$text" | \
    utils/int2sym.pl -f 2- "$lang/words.txt" | \
    while read -r line; do
    # First field and remaining fields of the line. Like the "cut -d ' '"
    # calls this replaces, both expansions yield the whole line when it holds
    # no space. Nothing is expanded unquoted, so words such as "[noise]" are
    # never treated as glob patterns.
    uttid=${line%% *}
    words=${line#* }

    echo "$0: processing utterance $uttid."

    wdir=$graph_dir/sub_graphs/$uttid
    mkdir -p "$wdir"

    # Compiles G.fst
    if [ "$ngram_order" -eq 1 ]; then
      # Unigram grammar over the words of this utterance.
      printf '%s\n' "$words" > "$wdir/text"
      utils/sym2int.pl --map-oov "$oov" -f 1- "$lang/words.txt" < "$wdir/text" | \
        utils/make_unigram_grammar.pl | fstcompile |\
        fstarcsort --sort_type=ilabel > "$wdir/G.fst" || exit 1;
    else
      # Re-emit the words with a line break after every 30000 words and after
      # the last one (presumably to keep lines within what ngram-count can
      # read), then estimate the n-gram model with SRILM.
      printf '%s\n' "$words" | \
        perl -ane '@A = split; for ($n=0;$n<@A;$n++) { print "$A[$n] "; if(($n+1)%30000 == 0 || $n+1==@A) {print "\n";} }' \
        > "$wdir/text"
      ngram-count -text "$wdir/text" -order "$ngram_order" "${srilm_opts[@]}" -lm - | \
        arpa2fst --disambig-symbol=#0 \
          --read-symbol-table="$lang/words.txt" - "$wdir/G.fst" || exit 1;
    fi
    # The stochasticity checks in this loop only print a notice on failure.
    fstisstochastic "$wdir/G.fst" || echo "$0: $uttid/G.fst not stochastic."

    # Builds LG.fst
    fsttablecompose "$lang/L_disambig.fst" "$wdir/G.fst" |\
      fstdeterminizestar --use-log=true | fstminimizeencoded |\
      fstarcsort --sort_type=ilabel > "$wdir/LG.fst" || exit 1;
    fstisstochastic "$wdir/LG.fst" || echo "$0: $uttid/LG.fst not stochastic."

    # Builds CLG.fst
    fstcomposecontext --context-size="$N" --central-position="$P" \
      --read-disambig-syms="$lang/phones/disambig.int" \
      --write-disambig-syms="$wdir/disambig_ilabels_${N}_${P}.int" \
      "$wdir/ilabels_${N}_${P}" < "$wdir/LG.fst" | fstdeterminize > "$wdir/CLG.fst"
    fstisstochastic "$wdir/CLG.fst"  || echo "$0: $uttid/CLG.fst not stochastic."

    # Builds Ha.fst from the ilabels file written by fstcomposecontext above.
    make-h-transducer --disambig-syms-out="$wdir/disambig_tid.int" \
      --transition-scale="$tscale" "$wdir/ilabels_${N}_${P}" \
      "$model_dir/tree" "$model_dir/final.mdl" > "$wdir/Ha.fst"

    # Builds HCLGa.fst
    fsttablecompose "$wdir/Ha.fst" "$wdir/CLG.fst" | \
      fstdeterminizestar --use-log=true | \
      fstrmsymbols "$wdir/disambig_tid.int" | fstrmepslocal | \
      fstminimizeencoded > "$wdir/HCLGa.fst"
    fstisstochastic "$wdir/HCLGa.fst" ||\
      echo "$0: $uttid/HCLGa.fst is not stochastic"

    # Builds the final HCLG.fst by adding the self-loops.
    add-self-loops --self-loop-scale="$loopscale" --reorder=true \
      "$model_dir/final.mdl" < "$wdir/HCLGa.fst" > "$wdir/HCLG.fst"

    # The final graph is only checked when both scales are exactly 1.0.
    if [ "$tscale" == 1.0 ] && [ "$loopscale" == 1.0 ]; then
      fstisstochastic "$wdir/HCLG.fst" ||\
        echo "$0: $uttid/HCLG.fst is not stochastic."
    fi

    # Record the finished graph for the fstcopy step after the loop.
    echo "$uttid $wdir/HCLG.fst" >> "$graph_dir/sub_graphs/HCLG.fsts.scp"
    echo
    done
  
  # Copies files from lang directory.
  mkdir -p "$graph_dir"
  cp -r "$lang"/* "$graph_dir"

  # Store the last field of the "pdfs" line(s) printed by am-info.
  am-info --print-args=false "$model_dir/final.mdl" |\
    grep pdfs | awk '{print $NF}' > "$graph_dir/num_pdfs"

  # Creates the graph table.
  # The list is written one line per utterance by the loop above; when it is
  # missing or empty there is nothing to copy, so say so instead of failing
  # inside fstcopy.
  if [ ! -s "$graph_dir/sub_graphs/HCLG.fsts.scp" ]; then
    echo "$0: no utterance graphs were built; is $text empty?"
    exit 1
  fi
  fstcopy "scp:$graph_dir/sub_graphs/HCLG.fsts.scp" \
    "ark,scp:$graph_dir/HCLG.fsts,$graph_dir/HCLG.fsts.scp"

  # $cleanup holds "true" or "false" and is executed as a command.
  if $cleanup; then
    rm -r "$graph_dir/sub_graphs"
  fi

  exit 0;