Blame view

egs/wsj/s5/steps/oracle_wer.sh 4.29 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
  #!/bin/bash
  
  # Copyright Johns Hopkins University (Author: Daniel Povey)  2013
  # Apache 2.0.
  
  # Begin configuration section.
  # Defaults for the command-line options described in the usage message.
  # NOTE(review): presumably utils/parse_options.sh (sourced just below) maps
  # "--some-name value" onto the variable some_name -- confirm against that file.
  stage=0            # compared against 0/1/2 to decide which steps run
  cmd=run.pl         # launcher used to run the per-job commands
  cleanup=true       # if true, pruned lattices are removed at the end
  beam=              # lattice pruning beam; empty means no pruning
  acwt=0.08333       # acoustic scale handed to lattice-prune
  wildcard_symbols=  # colon-separated integer list handed to lattice-oracle
  # End configuration section.

  . utils/parse_options.sh

  echo "$0 $*"  # Log the command line this script was invoked with
  
  # Exactly three positional arguments are required; anything else prints the
  # help text on stdout and terminates with status 1.
  if [[ $# -ne 3 ]]; then
    printf '%s\n' \
      "Compute lattice oracle WER and depth, optionally pruning and minimizing the lattice" \
      "beforehand.  To produce oracle WER, requires there to be a file 'text' in data dir" \
      "(not usable if only stm is present)" \
      "" \
      "Usage: $0 [options] <data-dir> <lang-dir> <decode-dir>" \
      "e.g.: $0 --wildcard-symbols=1:3:4 data/test data/lang exp/tri5/test_tg" \
      "Options:" \
      "  --wildcard-symbols <colon-separated-integer-list>  # Allows you to specify words" \
      "                                                     # to be removed from both reference" \
      "                                                     # and hypothesis before computing oracle." \
      "  --cmd <cmd>                                        # How to run the jobs (default: run.pl)" \
      "  --acwt <acwt>                                      # Acoustic scale, default $acwt: only" \
      "                                                     # has an effect if --prune option used." \
      "  --beam <prune-beam, e.g. 6.0>                      # Lattice pruning beam (optional; can" \
      "                                                     # be used to compute oracle and depth at" \
      "                                                     # various beams." \
      "  --stage <stage>                                    # Used to control partial re-runs" \
      "  --cleanup <true|false>                             # If true, remove pruned lattices."
    exit 1
  fi
  
  . ./path.sh || exit 1;

  # Positional arguments (see the usage message).
  data=$1   # data dir; its 'text' file supplies the reference transcripts
  lang=$2   # lang dir; words.txt and oov.int are read from here
  dir=$3    # decode dir; holds lat.*.gz and num_jobs


  # Fail early if any input read further down is missing.  num_jobs and oov.int
  # are checked here too: an empty $oov_sym would otherwise leave --map-oov
  # without its argument in the sym2int.pl call of the oracle stage.
  for f in "$data/text" "$lang/words.txt" "$lang/oov.int" \
           "$dir/num_jobs" "$dir/lat.1.gz"; do
    if [ ! -f "$f" ]; then
      echo "$0: expected file $f to exist"
      exit 1
    fi
  done

  nj=$(cat "$dir/num_jobs") || exit 1
  oov_sym=$(cat "$lang/oov.int") || exit 1
  sdata=$data/split$nj
  split_data.sh "$data" "$nj" || exit 1

  # Count the transcript lines that actually take part in scoring.
  nl=$(grep -vc IGNORE_TIME_SEGMENT_IN_SCORING "$data/text")
  if [ "${nl:-0}" -eq 0 ]; then
    echo "$0: error: $data/text only contains IGNORE_TIME_SEGMENT_IN_SCORING, or is empty."
    exit 1
  fi
  
  # Pick the directory whose lattices are measured: the decode dir itself when
  # no beam was given, otherwise a beam-specific subdirectory of it.
  if [ -z "$beam" ]; then
    prunedir=$dir
  else
    prunedir=$dir/lats_beam$beam
  fi

  # Logs of every stage below are written under $prunedir/log.
  mkdir -p $prunedir/log

  # Stage 0 (only when a beam was given): write pruned copies of the lattices.
  if [ -n "$beam" ] && [ $stage -le 0 ]; then
    echo "$0: creating pruned lattices"
    $cmd JOB=1:$nj $prunedir/log/prune.JOB.log \
      lattice-prune --acoustic-scale=$acwt --beam=$beam  \
        "ark:gunzip -c $dir/lat.JOB.gz|" "ark:|gzip -c >$prunedir/lat.JOB.gz" || exit 1;
  fi
  
  
  #######################################
  # Summarize the "Overall density" lines of the lattice-depth logs.
  # Reads `grep -w Overall` output on stdin, i.e. lines like
  #   <file>:LOG (blah:blah.cc:95) Overall density is 153.3 over 164361 frames
  # (field 6 = density, field 8 = frame count) and prints the frame-weighted
  # average as "<avg> ( <density*frames summed> / <frames summed> )".
  # Arguments: $1 - number of lines expected (one per job)
  # Outputs:   summary on stdout; diagnostics on stderr
  # Returns:   non-zero if the total frame count is zero (nothing to average)
  #######################################
  summarize_lattice_depth() {
    awk -v nj="$1" '{ num += $6 * $8; den += $8; nl++ } END {
      if (nl != nj) { print "Error: expected " nj " lines, got " (nl + 0) | "cat 1>&2"; }
      if (den == 0) { print "Error: no frames found in lattice-depth logs" | "cat 1>&2"; exit 1; }
      printf("%.2f ( %d / %d )\n", num/den, num, den); }'
  }

  # Stage 1: measure lattice depth per job, then pool the per-job figures.
  if [ "$stage" -le 1 ]; then
    echo "$0: measuring lattice depth"
    $cmd JOB=1:$nj $prunedir/log/lattice_depth.JOB.log \
      lattice-depth "ark:gunzip -c $prunedir/lat.JOB.gz|" ark:/dev/null || exit 1;

    # look for lines like: LOG (blah:blah.cc:95) Overall density is 153.3 over 164361 frames
    grep -w Overall "$prunedir"/log/lattice_depth.*.log | \
      summarize_lattice_depth "$nj" > "$prunedir/depth" || exit 1;
    printf 'Depth is: '
    cat "$prunedir/depth"
  fi
  
  
  #######################################
  # Summarize the "Overall %WER" lines of the lattice-oracle logs.
  # Reads `grep -w Overall` output on stdin, i.e. lines like
  #   <file>:LOG (blah:blah.cc:95) Overall %WER 25.6 [ 1243 / 6331, ... ]
  # (field 7 = errors, 9 = reference words, 10/12/14 = insertions, deletions,
  # substitutions) and prints the pooled figures as
  #   "<wer>% [ <err> / <words>, <i> insertions, <d> deletions, <s> substitutions ]".
  # Arguments: $1 - number of lines expected (one per job)
  # Outputs:   summary on stdout; diagnostics on stderr
  # Returns:   non-zero if the total word count is zero (WER undefined)
  #######################################
  summarize_oracle_wer() {
    awk -v nj="$1" '{ num += $7; den += $9; ins += $10; del += $12; sb += $14; nl++ } END {
      if (nl != nj) { print "Error: expected " nj " lines, got " (nl + 0) | "cat 1>&2"; }
      if (den == 0) { print "Error: no reference words found in lattice-oracle logs" | "cat 1>&2"; exit 1; }
      printf("%.2f%% [ %d / %d, %d insertions, %d deletions, %d substitutions ]\n",
             (100.0 * num/den), num, den, ins, del, sb); }'
  }

  # Stage 2: compute the oracle WER per job, then pool the per-job counts.
  if [ "$stage" -le 2 ]; then
    echo "$0: measuring lattice oracle WER"
    $cmd JOB=1:$nj $prunedir/log/lattice_oracle.JOB.log \
      lattice-oracle --wildcard-symbols=$wildcard_symbols  \
      "ark:gunzip -c $prunedir/lat.JOB.gz|" \
     "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt $sdata/JOB/text | grep -v IGNORE_TIME_SEGMENT_IN_SCORING |"  \
     ark:/dev/null || exit 1;

    # look for lines like: LOG (blah:blah.cc:95) Overall %WER 25.6 [ 1243 / 6331, ... ]
    grep -w Overall "$prunedir"/log/lattice_oracle.*.log | \
      summarize_oracle_wer "$nj" > "$prunedir/oracle_wer" || exit 1;
    printf 'Oracle WER is: '
    cat "$prunedir/oracle_wer"
  fi
  
  # Remove the pruned lattice copies, but only when pruning was requested (a
  # beam was given); with no beam, $prunedir is the decode dir itself and its
  # lattices must be kept.
  # $cleanup is run as a command, so it is expected to be `true` or `false`.
  # "$beam" is quoted so the emptiness test is well-formed for any value.
  if $cleanup && [ -n "$beam" ]; then
    echo "$0: removing pruned lattices in $prunedir"
    rm -- "$prunedir"/lat.*.gz
  fi

  exit 0;