Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh 8.17 KB
  #! /bin/bash
  
  # Copyright 2016  Vimal Manohar
  #           2016  Johns Hopkins University (author: Daniel Povey)
  # Apache 2.0
  
  # Abort on the first failing command, and treat a pipeline as failed if any
  # of its stages fails.
  set -eo pipefail

  # Default values of the command-line options described in the usage message.
  # NOTE(review): presumably these are overridden from the command line by
  # utils/parse_options.sh, which is sourced below -- confirm against that script.
  stage=0
  cmd=run.pl
  cleanup=true
  frame_shift=0.01
  print_silence=true      # True if we want the silences in the ctm.  We do.
  special_symbol="***"    # Special symbol to be aligned with the inserted or
                          # deleted words. Your sentences should not contain this
                          # symbol.

  . ./path.sh
  . utils/parse_options.sh
  
  # Exactly four positional arguments are required (<data> <lang> <latdir> <dir>);
  # otherwise print the description and usage, and exit with status 1.
  if [ $# -ne 4 ]; then
    echo "This script computes oracle paths for lattices (against a reference "
    echo "transcript) and does various kinds of processing of that, for use by "
    echo "steps/cleanup/cleanup_with_segmentation.sh."
    echo "Its main input is <latdir>/lat.*.gz."
    echo "This script outputs a human-readable word alignment of the oracle path"
    echo "through the lattice in <dir>/oracle_hyp.txt, and a time-aligned ctm version of"
    echo "the same in <dir>/ctm."
    echo "It also creates <dir>/edits.txt (the number of edits per utterance),"
    echo "<dir>/text (which is <data>/text but filtering out any utterances that"
    echo "were not decoded for some reason), and <dir>/length.txt, which is the length"
    echo "of the reference transcript, and <dir>/all_info.txt and <dir>/all_info.sorted.txt"
    echo "which contain all the info in a way that's easier to scan for humans."
    echo "Note: most of this is the same as is done in steps/cleanup/find_bad_utts.sh,"
    echo "except it runs from pre-existing lattices."
    echo ""
    echo "Usage: $0 <data> <lang> <latdir> <dir>"
    echo " e.g.: $0 data/train_si284 data/lang exp/tri4_bad_utts/lats exp/tri4_bad_utts/lattice_oracle"
    echo "Main options (for others, see top of script file)"
    echo "  --config <config-file>            # config containing options"
    echo "  --cleanup <true|false>            # set this to false to disable cleanup of "
    echo "                                    # temporary files (default: true)"
    echo "  --cmd <command-string>            # how to run jobs (default: run.pl)."
    echo "  --special-symbol <special-symbol> #  Symbol to pad with in insertions and deletions in the"
    echo "                                    # output produced in <dir>/analysis/ (default: '***')"
    echo "  --print-silence <true|false>      # Affects ctm generation; default is true (recommended)"
    echo "  --frame-shift <frame-shift>       # Frame shift in seconds; default: 0.01.  Affects ctm generation."
    exit 1
  fi
  
  # Positional arguments, in the order given in the usage message.
  data=$1
  lang=$2
  latdir=$3
  dir=$4

  # Stop early, naming the offender, if any required input file is missing.
  for required in $lang/oov.int $lang/words.txt $data/text $latdir/lat.1.gz $latdir/num_jobs; do
    if [ ! -f $required ]; then
      echo "$0: expected file $required to exist"
      exit 1
    fi
  done
  
  mkdir -p $dir/log

  # Locate final.mdl: first try <dir> itself, then fall back to the parent
  # directory of <dir>; give up if neither has it.
  model=$dir/final.mdl
  if ! [ -e $model ]; then
    model=$dir/../final.mdl
    if ! [ -e $model ]; then
      echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist"
      exit 1
    fi
  fi

  # The OOV word id comes from the lang directory; the number of jobs is taken
  # from the lattice directory so that it matches the lat.*.gz files there.
  oov=$(cat $lang/oov.int)
  nj=$(cat $latdir/num_jobs)

  # Split the data directory into $nj parts; the per-job pieces are then read
  # from $sdata/JOB/.
  utils/split_data.sh $data $nj
  sdata=$data/split${nj}
  
  # Stage 1: compute the oracle path through each lattice with respect to the
  # reference transcript.
  if [ $stage -le 1 ]; then
    # Per job: read $latdir/lat.JOB.gz and the integerized reference text (words
    # not in words.txt are mapped to $oov), write the resulting lattices to
    # $dir/lat.JOB.gz, and convert the text-form word-id output back to words in
    # $dir/oracle_hyp.JOB.txt.
    # NOTE(review): the JOB placeholder is presumably substituted with the job
    # index by $cmd -- confirm against run.pl/queue.pl.
    $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \
      lattice-oracle --write-lattices="ark:|gzip -c > $dir/lat.JOB.gz" \
      "ark:gunzip -c $latdir/lat.JOB.gz |" \
      "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
      ark,t:- \| utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/oracle_hyp.JOB.txt || exit 1;

    # Sum the "<errors> / <words>" counts from the 'Overall %WER' line of every
    # job log and report the pooled percentage, on screen and in a log file.
    # The format string uses an explicit \n escape so that it stays on one line
    # and no stray whitespace ends up in the output.
    echo -n "lattice_oracle_align.sh: overall oracle %WER is: "
    grep 'Overall %WER'  $dir/log/get_oracle.*.log  | \
      perl -e 'while (<>){ if (m: (\d+) / (\d+):) { $x += $1; $y += $2}}  printf("%.2f%%\n", $x*100.0/$y); ' | \
      tee $dir/log/oracle_overall_wer.log

    # the awk commands below are to ensure that partially-written files don't confuse us.
    # (Empty lines are dropped while the per-job files are concatenated in job order.)
    for x in $(seq $nj); do cat $dir/oracle_hyp.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/oracle_hyp.txt
    if $cleanup; then
      rm $dir/oracle_hyp.*.txt
    fi
  fi

  # Record the number of jobs alongside the lat.*.gz files written to $dir.
  echo $nj > $dir/num_jobs
  
  
  # Stage 2: turn the oracle lattices in $dir/lat.JOB.gz into a time-aligned ctm.
  if [ $stage -le 2 ]; then
    # Word alignment needs one of two files from the lang directory; bail out
    # up front if neither is available.
    if ! [ -f $lang/phones/word_boundary.int ] && ! [ -f $lang/phones/align_lexicon.int ]; then
      echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
      exit 1;
    fi

    # Each job writes its piece of the ctm to $dir/ctm.JOB; word_boundary.int is
    # preferred when both files exist.
    if [ -f $lang/phones/word_boundary.int ]; then
      $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
        set -o pipefail '&&' \
        lattice-align-words $lang/phones/word_boundary.int $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
        nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
        utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
    else
      # Lexicon-based alignment; this path additionally pipes the aligned
      # lattice through lattice-1best before the ctm conversion.
      $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
        set -o pipefail '&&' \
        lattice-align-words-lexicon $lang/phones/align_lexicon.int $model  "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
        lattice-1best ark:- ark:- \| \
        nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
        utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
    fi

    # Stitch the per-job pieces together in job order, then optionally remove them.
    for job in $(seq $nj); do
      cat $dir/ctm.$job
    done > $dir/ctm
    if $cleanup; then
      rm $dir/ctm.*
    fi
    echo "$0: oracle ctm is in $dir/ctm"
  fi
  
  
  # Stages below are really just to satisfy your curiosity; the output is the same
  # as that of find_bad_utts.sh.
  
  # Stage 3: per-utterance comparison of the reference text with the oracle
  # hypothesis (edits.txt, length.txt, all_info.txt, all_info.sorted.txt and
  # analysis/per_utt_details.txt).
  if [ $stage -le 3 ]; then
    # in case any utterances failed to align, get filtered copy of $data/text
    utils/filter_scp.pl $dir/oracle_hyp.txt < $data/text  > $dir/text
    # length.txt: <utterance-id> <number of words in the reference>
    awk '{print $1, (NF-1);}' $dir/text > $dir/length.txt

    mkdir -p $dir/analysis

    # The same special symbol must be given to both tools: align-text pads
    # insertions/deletions with it, and wer_per_utt_details.pl has to be told
    # which symbol that is.  Pass the configurable $special_symbol rather than
    # a hard-coded '***', so that --special-symbol is honoured consistently.
    align-text --special-symbol="$special_symbol"  ark:$dir/text ark:$dir/oracle_hyp.txt  ark,t:- | \
      utils/scoring/wer_per_utt_details.pl --special-symbol "$special_symbol" > $dir/analysis/per_utt_details.txt

    echo "$0: human-readable alignments are in $dir/analysis/per_utt_details.txt"

    # edits.txt: <utterance-id> <number of edits>, taken from the "#csid" lines
    # as the sum of fields 4-6.
    # NOTE(review): fields 4-6 are presumably the sub/ins/del counts -- confirm
    # against the output format of wer_per_utt_details.pl.
    awk '{if ($2 == "#csid") print $1" "($4+$5+$6)}' $dir/analysis/per_utt_details.txt > $dir/edits.txt

    # All four files must have exactly one line per utterance, otherwise the
    # line-by-line paste below would silently mix up utterances.
    n1=$(wc -l < $dir/edits.txt)
    n2=$(wc -l < $dir/oracle_hyp.txt)
    n3=$(wc -l < $dir/text)
    n4=$(wc -l < $dir/length.txt)
    if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
      echo "$0: mismatch in lengths of files:"
      wc $dir/edits.txt $dir/oracle_hyp.txt $dir/text $dir/length.txt
      exit 1;
    fi

    # note: the format of all_info.txt is:
    # <utterance-id> <number of errors>  <reference-length>  <decoded-output>   <reference>
    # The utterance-id and the number of errors are separated by a space (they
    # come from edits.txt); the remaining fields are separated by tabs, e.g.
    # adg04_sr009_trn 1 	12	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED

    paste $dir/edits.txt \
        <(awk '{print $2}' $dir/length.txt) \
        <(awk '{$1="";print;}' <$dir/oracle_hyp.txt) \
        <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

    # Sort by the number of errors (second field), largest first.
    sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

    echo "$0: per-utterance details sorted from worst to best utts are in $dir/all_info.sorted.txt"
    echo "$0: format is: utt-id num-errs ref-length decoded-output (tab) reference"
  fi
  
  # Stage 4: summary statistics derived from analysis/per_utt_details.txt.
  if [ $stage -le 4 ]; then
    ###
    # Diagnostics that may help to work out what is wrong with the data:
    # a) per_utt_details.txt itself: an alignment that is both human-friendly
    #    and machine-parsable
    # b) per-speaker performance, to possibly spot speakers with distinctive
    #    accents, speech disorders and the like
    # c) a global breakdown of the Ins/Del/Sub operations, which may point to
    #    systematic issues with the lexicon, pronunciations or phonetic
    #    confusability

    utils/scoring/wer_per_spk_details.pl $data/utt2spk \
      < $dir/analysis/per_utt_details.txt > $dir/analysis/per_spk_details.txt

    echo "$0: per-speaker details are in $dir/analysis/per_spk_details.txt"

    utils/scoring/wer_ops_details.pl --special-symbol "$special_symbol" \
      < $dir/analysis/per_utt_details.txt \
      | sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt

    echo "$0: per-word statistics [corr,sub,ins,del] are in $dir/analysis/ops_details.txt"
  fi
  
  # Stage 5: combine the oracle ctm with the hypothesis/reference alignment.
  if [ $stage -le 5 ]; then
    echo "$0: obtaining ctm edits"

    # Align the oracle hypothesis with the filtered reference text and feed the
    # alignment (via stdin) together with $dir/ctm to get_ctm_edits.py, which
    # writes $dir/ctm_edits.
    if ! $cmd $dir/log/get_ctm_edits.log \
        align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:- \| \
        steps/cleanup/internal/get_ctm_edits.py \
          --oov=$oov --symbol-table=$lang/words.txt \
          /dev/stdin $dir/ctm $dir/ctm_edits; then
      exit 1
    fi

    echo "$0: ctm with edits information appended is in $dir/ctm_edits"
  fi