lattice_oracle_align.sh
8.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#! /bin/bash
# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# Stop at the first failing command, and treat a pipeline as failed when any
# of its stages fails.
set -e -o pipefail

# Default values for the options documented in the usage message. They are
# assigned before utils/parse_options.sh is sourced below (presumably that
# script is what maps --option flags onto these variables).
stage=0
cmd=run.pl
cleanup=true
frame_shift=0.01
# True if we want the silences in the ctm. We do.
print_silence=true
# Special symbol to be aligned with the inserted or deleted words. Your
# sentences should not contain this symbol.
special_symbol="***"

. ./path.sh
. utils/parse_options.sh
# Exactly four positional arguments are required (<data> <lang> <latdir> <dir>).
# With any other count, print the description and usage below and exit with
# status 1.
if [ $# -ne 4 ]; then
  echo "This script computes oracle paths for lattices (against a reference "
  echo "transcript) and does various kinds of processing of that, for use by "
  echo "steps/cleanup/cleanup_with_segmentation.sh."
  echo "Its main input is <latdir>/lat.*.gz."
  echo "This script outputs a human-readable word alignment of the oracle path"
  echo "through the lattice in <dir>/oracle_hyp.txt, and a time-aligned ctm version of"
  echo "the same in <dir>/ctm."
  echo "It also creates <dir>/edits.txt (the number of edits per utterance),"
  echo "<dir>/text (which is <data>/text but filtering out any utterances that"
  echo "were not decoded for some reason), and <dir>/length.txt, which is the length"
  echo "of the reference transcript, and <dir>/all_info.txt and <dir>/all_info.sorted.txt"
  echo "which contain all the info in a way that's easier to scan for humans."
  echo "Note: most of this is the same as is done in steps/cleanup/find_bad_utts.sh,"
  echo "except it runs from pre-existing lattices."
  echo ""
  echo "Usage: $0 <data> <lang> <latdir> <dir>"
  echo " e.g.: $0 data/train_si284 data/lang exp/tri4_bad_utts/lats exp/tri4_bad_utts/lattice_oracle"
  echo "Main options (for others, see top of script file)"
  echo " --config <config-file> # config containing options"
  echo " --cleanup <true|false> # set this to false to disable cleanup of "
  echo " # temporary files (default: true)"
  echo " --cmd <command-string> # how to run jobs (default: run.pl)."
  echo " --special-symbol <special-symbol> # Symbol to pad with in insertions and deletions in the"
  # The closing parenthesis after the default value was missing in the help text.
  echo " # output produced in <dir>/analysis/ (default: '***')"
  echo " --print-silence <true|false> # Affects ctm generation; default is true (recommended)"
  echo " --frame-shift <frame-shift> # Frame shift in seconds; default: 0.01. Affects ctm generation."
  exit 1
fi
# Positional arguments, in the order given by the usage message:
# <data> <lang> <latdir> <dir>.
data=$1
lang=$2
latdir=$3
dir=$4

# Every input read later in the script must already exist as a regular file;
# stop with a message naming the first one that does not.
for required in $lang/oov.int $lang/words.txt $data/text \
                $latdir/lat.1.gz $latdir/num_jobs; do
  if [ ! -f $required ]; then
    echo "$0: expected file $required to exist"
    exit 1
  fi
done

# Output directory, including the log directory used by the stages below.
mkdir -p $dir/log
# Find the model used for word alignment in the ctm stage: prefer
# $dir/final.mdl and fall back to the copy one directory above $dir.
model=
for candidate in $dir/final.mdl $dir/../final.mdl; do
  if [ -e $candidate ]; then
    model=$candidate
    break
  fi
done
if [ -z "$model" ]; then
  echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist"
  exit 1
fi
# Use as many jobs as the lattice directory records in num_jobs.
nj=$(< $latdir/num_jobs)
# Integer id read from $lang/oov.int; passed to sym2int.pl --map-oov and to
# get_ctm_edits.py --oov in later stages.
oov=$(< $lang/oov.int)
# Per-job data directory; stage 1 reads $sdata/JOB/text. Presumably populated
# by utils/split_data.sh below -- confirm against that script.
sdata=$data/split${nj}
utils/split_data.sh $data $nj
if [ $stage -le 1 ]; then
  # Run lattice-oracle once per lattice archive through $cmd. Each job reads
  # $latdir/lat.JOB.gz and the job's reference text (mapped to integers, with
  # the id from oov.int given to --map-oov), writes gzipped lattices to
  # $dir/lat.JOB.gz, and sends its text output through int2sym.pl into
  # $dir/oracle_hyp.JOB.txt. The escaped \| and quoted '>' are handed to $cmd
  # as arguments instead of being interpreted by this shell.
  $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \
    lattice-oracle --write-lattices="ark:|gzip -c > $dir/lat.JOB.gz" \
    "ark:gunzip -c $latdir/lat.JOB.gz |" \
    "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
    ark,t:- \| utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/oracle_hyp.JOB.txt || exit 1;

  # Sum the "<errors> / <words>" pairs from the per-job 'Overall %WER' log
  # lines into one percentage; print it and keep a copy in the log directory.
  echo -n "lattice_oracle_align.sh: overall oracle %WER is: "
  grep 'Overall %WER' $dir/log/get_oracle.*.log \
    | perl -e 'while (<>){ if (m: (\d+) / (\d+):) { $x += $1; $y += $2}} printf("%.2f%%\n", $x*100.0/$y); ' \
    | tee $dir/log/oracle_overall_wer.log

  # Concatenate the per-job hypotheses in job order. The awk filter drops
  # empty lines so that partially-written files don't confuse us.
  for job in $(seq $nj); do
    cat $dir/oracle_hyp.$job.txt
  done | awk 'NF >= 1' > $dir/oracle_hyp.txt

  if $cleanup; then
    rm $dir/oracle_hyp.*.txt
  fi
fi

# Record how many lat.JOB.gz archives belong to $dir.
echo $nj > $dir/num_jobs
if [ $stage -le 2 ]; then
  # Word alignment needs one of these two files from the lang directory.
  if [ ! -f $lang/phones/word_boundary.int ] && [ ! -f $lang/phones/align_lexicon.int ]; then
    echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
    exit 1;
  fi

  # The following commands get the time-aligned ctm as $dir/ctm.JOB, one file
  # per job. word_boundary.int is preferred when both files are present.
  if [ -f $lang/phones/word_boundary.int ]; then
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' \
      lattice-align-words $lang/phones/word_boundary.int $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
  else
    # Lexicon-based alignment; this branch additionally pipes through
    # lattice-1best before nbest-to-ctm.
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' \
      lattice-align-words-lexicon $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
      lattice-1best ark:- ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
  fi

  # Merge the per-job ctm files in job order, then optionally remove them.
  for job in $(seq $nj); do
    cat $dir/ctm.$job
  done > $dir/ctm
  if $cleanup; then
    rm $dir/ctm.*
  fi
  echo "$0: oracle ctm is in $dir/ctm"
fi
# Stages below are really just to satisfy your curiosity; the output is the same
# as that of find_bad_utts.sh.
if [ $stage -le 3 ]; then
  # in case any utterances failed to align, get filtered copy of $data/text
  utils/filter_scp.pl $dir/oracle_hyp.txt < $data/text > $dir/text
  # length.txt: <utterance-id> <number of words in the reference>
  awk '{print $1, (NF-1);}' $dir/text > $dir/length.txt

  mkdir -p $dir/analysis

  # Align reference against oracle hypothesis. The same padding symbol must be
  # given to both tools: align-text writes $special_symbol into the alignment,
  # and wer_per_utt_details.pl has to be told that same symbol, otherwise a
  # non-default --special-symbol makes the two disagree.
  align-text --special-symbol="$special_symbol" ark:$dir/text ark:$dir/oracle_hyp.txt ark,t:- | \
    utils/scoring/wer_per_utt_details.pl --special-symbol "$special_symbol" > $dir/analysis/per_utt_details.txt
  echo "$0: human-readable alignments are in $dir/analysis/per_utt_details.txt"

  # edits.txt: <utterance-id> <sum of fields 4, 5 and 6 of the "#csid" line>
  awk '{if ($2 == "#csid") print $1" "($4+$5+$6)}' $dir/analysis/per_utt_details.txt > $dir/edits.txt

  # All four files must have one line per utterance, otherwise the paste
  # below would pair up lines from different utterances.
  n1=$(wc -l < $dir/edits.txt)
  n2=$(wc -l < $dir/oracle_hyp.txt)
  n3=$(wc -l < $dir/text)
  n4=$(wc -l < $dir/length.txt)
  if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
    echo "$0: mismatch in lengths of files:"
    wc $dir/edits.txt $dir/oracle_hyp.txt $dir/text $dir/length.txt
    exit 1;
  fi

  # note: the format of all_info.txt is:
  # <utterance-id> <number of errors> <reference-length> <decoded-output> <reference>
  # with the fields separated by tabs, e.g.
  # adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED
  paste $dir/edits.txt \
    <(awk '{print $2}' $dir/length.txt) \
    <(awk '{$1="";print;}' <$dir/oracle_hyp.txt) \
    <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

  # Highest error count first.
  sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

  echo "$0: per-utterance details sorted from worst to best utts are in $dir/all_info.sorted.txt"
  echo "$0: format is: utt-id num-errs ref-length decoded-output (tab) reference"
fi
if [ $stage -le 4 ]; then
  ###
  # These stats might help people figure out what is wrong with the data
  # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt
  # b)evaluation of per-speaker performance to possibly find speakers with
  #   distinctive accents/speech disorders and similar
  # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure
  #   out if there is systematic issue with lexicon, pronunciation or phonetic confusability

  # (b) per-speaker summary, computed from the per-utterance details and the
  # utt2spk map of the data directory.
  utils/scoring/wer_per_spk_details.pl $data/utt2spk \
    < $dir/analysis/per_utt_details.txt > $dir/analysis/per_spk_details.txt
  echo "$0: per-speaker details are in $dir/analysis/per_spk_details.txt"

  # (c) per-word operation counts, sorted on field 1, then field 4 numerically
  # descending, then fields 2 and 3.
  utils/scoring/wer_ops_details.pl --special-symbol "$special_symbol" \
    < $dir/analysis/per_utt_details.txt \
    | sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt
  echo "$0: per-word statistics [corr,sub,ins,del] are in $dir/analysis/ops_details.txt"
fi
if [ $stage -le 5 ]; then
  echo "$0: obtaining ctm edits"
  # Single job run through $cmd: align-text aligns the oracle hypothesis with
  # the filtered reference text, and its output is piped (as /dev/stdin) into
  # get_ctm_edits.py together with $dir/ctm; the result goes to $dir/ctm_edits.
  # The escaped \| is passed to $cmd rather than interpreted by this shell.
  if ! $cmd $dir/log/get_ctm_edits.log \
      align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:- \| \
      steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
      /dev/stdin $dir/ctm $dir/ctm_edits; then
    exit 1
  fi
  echo "$0: ctm with edits information appended is in $dir/ctm_edits"
fi