get_ctm.sh
3.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Copyright 2015 Johns Hopkins University (Authors: Vijayaditya Peddinti). Apache 2.0.
# Abort on the first command that fails.
set -e

# Default values for the options listed in the usage message below.
# NOTE(review): presumably parse_options.sh overrides these from
# "--name value" command-line arguments -- confirm against that script.
beam=7                    # passed to lattice-prune as --beam
decode_mbr=true           # passed to lattice-to-ctm-conf as --decode-mbr
filter_ctm_command=cp     # invoked as: <command> <ctm> <ctm.temp>
glm=                      # glm file; required whenever stm is set
stm=                      # stm file; scoring only runs when this is non-empty
resolve_overlaps=true     # run utils/ctm/resolve_ctm_overlaps.py on the ctm

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Log the remaining (positional) arguments exactly as received; the quoted
# form avoids word splitting and pathname expansion of the arguments.
printf '%s\n' "$*"
# Exactly six positional arguments are required; otherwise print the usage
# text and exit with status 1.
if [ $# -ne 6 ]; then
  echo "Usage: $0 [options] <LMWT> <word-ins-penalty> <lang-dir> <data-dir> <model> <decode-dir>"
  echo " e.g.: $0 --decode-mbr true --beam 7 --glm data/dev_aspire/glm \\"
  echo " --stm data/dev_aspire/stm\\"
  # The example must list all six positional arguments, including <lang-dir>.
  echo " 12 1.5 data/lang data/dev_aspire_hires \\"
  echo " exp/nnet2_multicondition/nnet_ms_a/final.mdl \\"
  echo " exp/nnet2_multicondition/nnet_ms_a/decode_dev_aspire"
  echo "main options (for others, see top of script file)"
  echo " --beam <beam> # Decoding beam; default 7.0"
  echo " --decode-mbr <true|false> # do mbr decoding; default true"
  echo " --filter_ctm_command <string> # command for ctm filtering ;default cp"
  echo " --stm <stm-file> # stm file, will score if provided"
  echo " --glm <glm-file> # glm file, needs to be specified along with stm"
  exit 1;
fi
# Positional arguments, in the order given by the usage message.
LMWT=$1         # used as --inv-acoustic-scale and in the score_<LMWT> dir name
wip=$2          # word insertion penalty; also names the penalty_<wip> dir
lang=$3         # lang dir; words.txt and phones/align_lexicon.int are read from it
data_dir=$4     # data dir; segments and reco2file_and_channel are read from it
model=$5        # model file handed to lattice-align-words-lexicon
decode_dir=$6   # decode dir holding num_jobs and lat.*.gz; outputs go below it

# The number of lattice archives is read from <decode-dir>/num_jobs; fail
# with a clear message if the file is missing.
if [ ! -f "$decode_dir/num_jobs" ]; then
  echo "$0: expected file $decode_dir/num_jobs to exist" >&2
  exit 1
fi
nj=$(cat "$decode_dir/num_jobs")

# From here on a pipeline fails if any of its stages fails.
set -o pipefail

# All outputs of this script are written under this directory.
mkdir -p "$decode_dir/score_$LMWT/penalty_$wip"
# Sets the global frame_shift_opt (later passed to lattice-to-ctm-conf).
# Arguments: $1 - directory that may contain 'frame_shift' or
#                 'frame_subsampling_factor'
# Result:    frame_shift_opt is "--frame-shift=<value>" or empty if neither
#            file exists. 'frame_shift' takes precedence and is used verbatim.
#            Otherwise the subsampling factor is converted as factor/100,
#            which matches the former "0.0$factor" string for factors 1-9 and
#            stays correct for factors >= 10 (e.g. 12 -> 0.12, not 0.012).
# Exits with status 1 if the factor file cannot be read or converted.
set_frame_shift_opt() {
  local shift_secs
  # Start from empty so a value inherited from the environment is not used.
  frame_shift_opt=
  if [ -f "$1/frame_shift" ]; then
    frame_shift_opt="--frame-shift=$(cat "$1/frame_shift")"
    echo "$0: $1/frame_shift exists, using $frame_shift_opt"
  elif [ -f "$1/frame_subsampling_factor" ]; then
    factor=$(cat "$1/frame_subsampling_factor") || exit 1
    # LC_ALL=C keeps '.' as the decimal separator regardless of locale.
    shift_secs=$(LC_ALL=C awk -v f="$factor" 'BEGIN { printf "%g", f / 100 }') || exit 1
    frame_shift_opt="--frame-shift=$shift_secs"
    echo "$0: $1/frame_subsampling_factor exists, using $frame_shift_opt"
  fi
}

# The files are looked up in the parent of the decode directory.
set_frame_shift_opt "$decode_dir/.."
# Build the space-separated list lat.1.gz ... lat.<nj>.gz without eval, so
# the contents of $decode_dir and $nj are never re-parsed as shell code.
case "$nj" in
  '' | *[!0-9]* | 0*)
    echo "$0: invalid number of jobs '$nj' (read from $decode_dir/num_jobs)" >&2
    exit 1
    ;;
esac
lat_files=
job=1
while [ "$job" -le "$nj" ]; do
  lat_files="${lat_files:+$lat_files }$decode_dir/lat.$job.gz"
  job=$((job + 1))
done

# Lattice pipeline: scale with inverse acoustic scale LMWT, add the word
# insertion penalty, prune to $beam, word-align with the lexicon, and write
# the CTM with confidences to ctm.overlapping.
# $frame_shift_opt is deliberately unquoted: when empty it must vanish
# instead of becoming an empty argument.
lattice-scale --inv-acoustic-scale="$LMWT" "ark:gunzip -c $lat_files|" ark:- \
  | lattice-add-penalty --word-ins-penalty="$wip" ark:- ark:- \
  | lattice-prune --beam="$beam" ark:- ark:- \
  | lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true \
      --max-expand=10.0 --test=false \
      "$lang/phones/align_lexicon.int" "$model" ark:- ark:- \
  | lattice-to-ctm-conf $frame_shift_opt --decode-mbr="$decode_mbr" ark:- \
      "$decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping" || exit 1;

# Downstream steps read the CTM through $ctm.
ctm=$decode_dir/score_$LMWT/penalty_$wip/ctm.overlapping
# Combine the segment-wise ctm entries, resolving overlaps when requested.
# resolve_ctm_overlaps.py receives the segments file, ctm.overlapping and
# ctm.merged (presumably input and output, in that order); afterwards $ctm
# points at ctm.merged.
score_dir=$decode_dir/score_$LMWT/penalty_$wip
if $resolve_overlaps; then
  ctm=$score_dir/ctm.merged
  utils/ctm/resolve_ctm_overlaps.py $data_dir/segments \
    $score_dir/ctm.overlapping \
    $ctm || exit 1;
fi

# Run the selected ctm through int2sym.pl (field 5, using words.txt) and
# convert_ctm.pl (segments + reco2file_and_channel), then sort by field 1,
# field 2 and numerically by field 3 into the final 'ctm' file.
cat $ctm \
  | utils/int2sym.pl -f 5 $lang/words.txt \
  | utils/convert_ctm.pl $data_dir/segments $data_dir/reco2file_and_channel \
  | sort -k1,1 -k2,2 -k3,3nb > $score_dir/ctm || exit 1;
# Drop entries we do not want to score and normalise some word spellings.
score_dir=$decode_dir/score_${LMWT}/penalty_$wip

# Step 1: the user-supplied filter command (default: cp) turns ctm into
# ctm.temp. It is expanded unquoted so it may carry its own arguments.
$filter_ctm_command $score_dir/ctm $score_dir/ctm.temp

# Step 2: awk keeps only lines whose 4th field is below
# 0.75 + 0.2 * (length of the 5th field).
# Step 3: perl strips every "._" from the 5th field; if any was removed it
# also drops a trailing "." and rewrites the first "<any char>s" to "s"
# (NOTE(review): the '.' in s/.s/s/ is unescaped -- confirm that is intended).
# Fields are re-joined with tabs.
awk '$4 < 0.75 + 0.2*length($5)' < $score_dir/ctm.temp \
  | perl -ane '@A = split; $word = $A[4]; if ($word =~ s/\._//g) { $word =~ s/\.$//; $word =~ s/.s/s/; } $A[4] = $word; print join("\t", @A), "\n"; ' \
  > $score_dir/ctm.filt || exit 1;

# The intermediate file is no longer needed.
rm $score_dir/ctm.temp
# Optional scoring: only runs when an stm file was given, and then a glm
# file is mandatory. Operands are quoted so values containing whitespace do
# not break the tests or the commands.
if [ -n "$stm" ]; then
  if [ -z "$glm" ]; then
    echo "glm file needs to be specified "
    exit 1
  fi
  echo "Scoring the ctm file locally as we have the transcripts."

  score_dir="$decode_dir/score_$LMWT/penalty_$wip"

  # Score against a copy of the stm placed next to the ctm.
  cp "$stm" "$score_dir/"
  stm="$score_dir/$(basename "$stm")"

  # hubscr.pl is looked up under $KALDI_ROOT/tools/sctk/bin; its directory
  # is also passed to it via -p.
  hubscr="$KALDI_ROOT/tools/sctk/bin/hubscr.pl"
  if [ ! -f "$hubscr" ]; then
    echo "Cannot find scoring program at $hubscr"
    exit 1
  fi
  hubdir=$(dirname "$hubscr")

  "$hubscr" -p "$hubdir" -V -l english -h hub5 -g "$glm" -r "$stm" \
    "$score_dir/ctm.filt" || exit 1;
fi