oracle_wer.sh
4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2013
# Apache 2.0.
# Begin configuration section.
# Defaults for the command-line options described in the usage message.
wildcard_symbols=   # --wildcard-symbols: colon-separated integer list, empty by default
cmd=run.pl          # --cmd: program used to launch the parallel jobs
acwt=0.08333        # --acwt: acoustic scale handed to lattice-prune
beam=               # --beam: pruning beam; when empty the lattices are not pruned
stage=0             # --stage: steps numbered below this value are skipped
cleanup=true        # --cleanup: when true, pruned lattices are removed at the end
# End configuration section.

# Stop right away if the option parser cannot be loaded; otherwise the script
# would carry on with the command-line options left unparsed.
. utils/parse_options.sh || exit 1

echo "$0 $*" # Print the command line for logging
# Exactly three positional arguments are required (data dir, lang dir, decode
# dir). Anything else prints the help text and stops with a non-zero status.
if [ $# -ne 3 ]; then
  echo "Compute lattice oracle WER and depth, optionally pruning and minimizing the lattice"
  echo "beforehand. To produce oracle WER, requires there to be a file 'text' in data dir"
  echo "(not usable if only stm is present)"
  echo ""
  echo "Usage: $0 [options] <data-dir> <lang-dir> <decode-dir>"
  echo "e.g.: $0 --wildcard-symbols=1:3:4 data/test data/lang exp/tri5/test_tg"
  echo "Options:"
  echo " --wildcard-symbols <colon-separated-integer-list> # Allows you to specify words"
  echo " # to be removed from both reference"
  echo " # and hypothesis before computing oracle."
  echo " --cmd <cmd> # How to run the jobs (default: run.pl)"
  echo " --acwt <acwt> # Acoustic scale, default $acwt: only"
  # The pruning option of this script is --beam (there is no --prune option).
  echo " # has an effect if --beam option used."
  echo " --beam <prune-beam, e.g. 6.0> # Lattice pruning beam (optional; can"
  echo " # be used to compute oracle and depth at"
  echo " # various beams."
  echo " --stage <stage> # Used to control partial re-runs"
  echo " --cleanup <true|false> # If true, remove pruned lattices."
  exit 1
fi
# Load the environment set-up from the current directory.
. ./path.sh || exit 1

data=$1   # data directory; its 'text' file holds the reference transcripts
lang=$2   # lang directory; words.txt and oov.int are read from it
dir=$3    # decode directory; lat.*.gz and num_jobs are read from it

# Check every input that is read further down. oov.int is included because an
# empty OOV id would otherwise end up silently in the sym2int.pl command line.
for f in "$data/text" "$lang/words.txt" "$lang/oov.int" "$dir/lat.1.gz"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

nj=$(cat "$dir/num_jobs") || exit 1
oov_sym=$(cat "$lang/oov.int") || exit 1

# Per-job pieces of the data directory are expected under split$nj.
sdata=$data/split$nj
split_data.sh "$data" "$nj" || exit 1

# Count the reference lines that are not marked to be ignored for scoring;
# there is nothing to score if none remain.
nl=$(grep -c -v IGNORE_TIME_SEGMENT_IN_SCORING "$data/text")
if [ "$nl" -eq 0 ]; then
  echo "$0: error: $data/text only contains IGNORE_TIME_SEGMENT_IN_SCORING, or is empty."
  exit 1
fi
# Work directory: the decode directory itself, or a beam-specific
# sub-directory of it when a pruning beam was requested.
prunedir=$dir
if [ -n "$beam" ]; then
  prunedir=${dir}/lats_beam${beam}
fi

# The log directory is needed by every later step, pruned or not.
mkdir -p $prunedir/log

# Stage 0 (only with --beam): write pruned copies of the lattices.
if [ -n "$beam" ] && [ $stage -le 0 ]; then
  echo "$0: creating pruned lattices"
  $cmd JOB=1:$nj $prunedir/log/prune.JOB.log \
    lattice-prune --acoustic-scale=$acwt --beam=$beam \
    "ark:gunzip -c $dir/lat.JOB.gz|" "ark:|gzip -c >$prunedir/lat.JOB.gz" || exit 1;
fi
# Stage 1: run lattice-depth on every job's lattices and combine the per-job
# figures from the logs into one frame-weighted average, written to
# $prunedir/depth.
if [ $stage -le 1 ]; then
  echo "$0: measuring lattice depth"
  $cmd JOB=1:$nj $prunedir/log/lattice_depth.JOB.log \
    lattice-depth "ark:gunzip -c $prunedir/lat.JOB.gz|" ark:/dev/null || exit 1;
  # look for lines like: LOG (blah:blah.cc:95) Overall density is 153.3 over 164361 frames
  # In such a line field 6 is the density and field 8 the number of frames.
  grep -w Overall $prunedir/log/lattice_depth.*.log | \
    awk -v nj="$nj" '{num+=$6*$8; den+=$8; nl++} END{
      if (nl != nj) { print "Error: expected " nj " lines, got " (nl+0) | "cat 1>&2"; }
      # Without any frames the average is undefined; fail instead of dividing by zero.
      if (den == 0) { print "Error: no frames found in the lattice-depth logs" | "cat 1>&2"; exit 1; }
      printf("%.2f ( %d / %d )\n", num/den, num, den); }' > $prunedir/depth || exit 1;
  echo -n "Depth is: "
  cat $prunedir/depth
fi
# Stage 2: run lattice-oracle on every job's lattices against the integerized
# reference text and sum the per-job error counts from the logs into
# $prunedir/oracle_wer.
if [ $stage -le 2 ]; then
  echo "$0: measuring lattice oracle WER"
  # The IGNORE_TIME_SEGMENT_IN_SCORING lines are dropped BEFORE sym2int.pl runs:
  # with --map-oov the words are replaced by integer ids, so a filter placed
  # after the mapping would no longer find the marker in the text.
  $cmd JOB=1:$nj $prunedir/log/lattice_oracle.JOB.log \
    lattice-oracle --wildcard-symbols=$wildcard_symbols \
    "ark:gunzip -c $prunedir/lat.JOB.gz|" \
    "ark:grep -v IGNORE_TIME_SEGMENT_IN_SCORING $sdata/JOB/text | sym2int.pl --map-oov $oov_sym -f 2- $lang/words.txt |" \
    ark:/dev/null || exit 1;
  # look for lines like: LOG (blah:blah.cc:95) Overall %WER 25.6 [ 1243 / 6331, ... ]
  # Field 7 is the error count and field 9 the word count; fields 10, 12 and 14
  # are summed as insertions, deletions and substitutions.
  grep -w Overall $prunedir/log/lattice_oracle.*.log | \
    awk -v nj="$nj" '{num+=$7; den+=$9; ins+=$10; del+=$12; sb+=$14; nl++} END{
      if (nl != nj) { print "Error: expected " nj " lines, got " (nl+0) | "cat 1>&2"; }
      # Without any reference words the WER is undefined; fail instead of dividing by zero.
      if (den == 0) { print "Error: no reference words found in the lattice-oracle logs" | "cat 1>&2"; exit 1; }
      printf("%.2f%% [ %d / %d, %d insertions, %d deletions, %d substitutions ]\n", (100.0 * num/den), num, den, ins, del, sb); }' > \
    $prunedir/oracle_wer || exit 1;
  echo -n "Oracle WER is: "
  cat $prunedir/oracle_wer
fi
# Remove the pruned lattices, but only when a beam was given: without one,
# $prunedir is the decode directory and its lattices must be kept.
# $beam is quoted so the test is well-formed for an empty value, and $cleanup
# is compared as a string rather than executed as a command.
if [ "$cleanup" = true ] && [ -n "$beam" ]; then
  echo "$0: removing pruned lattices in $prunedir"
  # ${prunedir:?} aborts instead of expanding to /lat.*.gz if the variable is empty.
  rm -- "${prunedir:?}"/lat.*.gz
fi

exit 0