Blame view
egs/wsj/s5/steps/oracle_wer.sh
4.29 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
#!/bin/bash

# Copyright Johns Hopkins University (Author: Daniel Povey) 2013
# Apache 2.0.

# Computes the average lattice depth and the lattice oracle WER for the
# lattices found in a decode directory ($dir/lat.JOB.gz), optionally pruning
# the lattices to a given beam first.
#
# Results are written to <prunedir>/depth and <prunedir>/oracle_wer, where
# <prunedir> is the decode directory itself, or <decode-dir>/lats_beam<beam>
# when --beam is given.  Per-job logs go to <prunedir>/log.
#
# Stages (see --stage):
#   0: prune lattices (only when --beam is set)
#   1: measure lattice depth
#   2: measure lattice oracle WER

# Begin configuration section.
wildcard_symbols=   # colon-separated integer list handed to lattice-oracle
cmd=run.pl          # job runner
acwt=0.08333        # acoustic scale, only used while pruning
beam=               # pruning beam; empty means "do not prune"
stage=0             # first stage to run
cleanup=true        # if true, remove the pruned lattices at the end
# End configuration section.

# Fail early if the option parser is missing; otherwise every option would be
# silently ignored and then rejected by the positional-argument check below.
. utils/parse_options.sh || exit 1

echo "$0 $*"  # Print the command line for logging

if [ $# != 3 ]; then
  echo "Compute lattice oracle WER and depth, optionally pruning and minimizing the lattice"
  echo "beforehand. To produce oracle WER, requires there to be a file 'text' in data dir"
  echo "(not usable if only stm is present)"
  echo ""
  echo "Usage: $0 [options] <data-dir> <lang-dir> <decode-dir>"
  echo "e.g.: $0 --wildcard-symbols=1:3:4 data/test data/lang exp/tri5/test_tg"
  echo "Options:"
  echo " --wildcard-symbols <colon-separated-integer-list> # Allows you to specify words"
  echo " # to be removed from both reference"
  echo " # and hypothesis before computing oracle."
  echo " --cmd <cmd> # How to run the jobs (default: run.pl)"
  echo " --acwt <acwt> # Acoustic scale, default $acwt: only"
  echo " # has an effect if --beam option used."
  echo " --beam <prune-beam, e.g. 6.0> # Lattice pruning beam (optional; can"
  echo " # be used to compute oracle and depth at"
  echo " # various beams."
  echo " --stage <stage> # Used to control partial re-runs"
  echo " --cleanup <true|false> # If true, remove pruned lattices."
  exit 1
fi

. ./path.sh || exit 1

data=$1
lang=$2
dir=$3

# All of these inputs are required before any stage can run.
for f in "$data/text" "$lang/words.txt" "$dir/lat.1.gz"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

nj=$(cat "$dir/num_jobs") || exit 1
# An unreadable oov.int would otherwise yield an empty --map-oov argument and
# a confusing failure deep inside the oracle job.
oov_sym=$(cat "$lang/oov.int") || exit 1

sdata=$data/split$nj
split_data.sh "$data" "$nj" || exit 1

# Number of reference lines that actually take part in scoring.
nl=$(grep -v IGNORE_TIME_SEGMENT_IN_SCORING "$data/text" | wc -l)
if [ "$nl" -eq 0 ]; then
  echo "$0: error: $data/text only contains IGNORE_TIME_SEGMENT_IN_SCORING, or is empty."
  exit 1
fi

# Lattices are read from (and results written to) $prunedir: a dedicated
# sub-directory when pruning, the decode directory itself otherwise.
if [ -n "$beam" ]; then
  prunedir=${dir}/lats_beam${beam}
else
  prunedir=$dir
fi
mkdir -p "$prunedir/log"

# NOTE: $cmd is deliberately left unquoted below, because it may carry its own
# options (e.g. a queue script followed by flags) and must be word-split.

if [ -n "$beam" ] && [ "$stage" -le 0 ]; then
  echo "$0: creating pruned lattices"
  $cmd "JOB=1:$nj" "$prunedir/log/prune.JOB.log" \
    lattice-prune "--acoustic-scale=$acwt" "--beam=$beam" \
    "ark:gunzip -c $dir/lat.JOB.gz|" "ark:|gzip -c >$prunedir/lat.JOB.gz" || exit 1
fi

if [ "$stage" -le 1 ]; then
  echo "$0: measuring lattice depth"
  $cmd "JOB=1:$nj" "$prunedir/log/lattice_depth.JOB.log" \
    lattice-depth "ark:gunzip -c $prunedir/lat.JOB.gz|" ark:/dev/null || exit 1

  # look for lines like: LOG (blah:blah.cc:95) Overall density is 153.3 over 164361 frames
  # Field 6 is the per-job density and field 8 the per-job frame count, so
  # the result is the frame-weighted average over all jobs.  A mismatch in
  # the number of matched lines is reported on stderr but is not fatal.
  grep -w Overall "$prunedir"/log/lattice_depth.*.log \
    | awk -v nj="$nj" '
        { num += $6 * $8; den += $8; nl++ }
        END {
          if (nl != nj) { print "Error: expected " nj " lines, got " nl | "cat 1>&2"; }
          printf("%.2f ( %d / %d )\n", num/den, num, den);
        }' > "$prunedir/depth" || exit 1
  echo -n "Depth is: "
  cat "$prunedir/depth"
fi

if [ "$stage" -le 2 ]; then
  echo "$0: measuring lattice oracle WER"
  $cmd "JOB=1:$nj" "$prunedir/log/lattice_oracle.JOB.log" \
    lattice-oracle "--wildcard-symbols=$wildcard_symbols" \
    "ark:gunzip -c $prunedir/lat.JOB.gz|" \
    "ark:sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt $sdata/JOB/text | grep -v IGNORE_TIME_SEGMENT_IN_SCORING |" \
    ark:/dev/null || exit 1

  # look for lines like: LOG (blah:blah.cc:95) Overall %WER 25.6 [ 1243 / 6331, ... ]
  # Per-job error, word, insertion, deletion and substitution counts are
  # summed (fields 7, 9, 10, 12, 14) and the overall percentage is recomputed
  # from the totals.
  grep -w Overall "$prunedir"/log/lattice_oracle.*.log \
    | awk -v nj="$nj" '
        { num += $7; den += $9; ins += $10; del += $12; sb += $14; nl++ }
        END {
          if (nl != nj) { print "Error: expected " nj " lines, got " nl | "cat 1>&2"; }
          printf("%.2f%% [ %d / %d, %d insertions, %d deletions, %d substitutions ]\n",
                 (100.0 * num/den), num, den, ins, del, sb);
        }' > "$prunedir/oracle_wer" || exit 1
  echo -n "Oracle WER is: "
  cat "$prunedir/oracle_wer"
fi

# Pruned lattices are only an intermediate product; the original lattices in
# the decode directory are never removed (prunedir == dir only when no beam).
if $cleanup && [ -n "$beam" ]; then
  echo "$0: removing pruned lattices in $prunedir"
  rm -- "$prunedir"/lat.*.gz
fi

exit 0