lmrescore.sh
4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash
# Do language model rescoring of lattices (remove old LM, add new LM).

# Begin configuration section.
# Each of these can be overridden with a leading "--<name> <value>" pair
# (--mode, --cmd, --skip-scoring) placed before the positional arguments.
mode=4              # rescoring method (1|2|3|4); see the case on $mode below
cmd=run.pl          # job dispatcher; may carry its own options (e.g. queue.pl ...)
skip_scoring=false  # "true" skips the local/score.sh call at the end
# End configuration section.

echo "$0 $*" # Print the command line for logging

# Consume leading "--option value" pairs in any order.  Requiring at least two
# remaining words guarantees that "shift 2" always succeeds, so a dangling
# option without a value simply falls through to the usage check.
while [ $# -ge 2 ]; do
  case "$1" in
    --cmd)          cmd=$2 ;;
    --mode)         mode=$2 ;;
    --skip-scoring) skip_scoring=$2 ;;
    *)              break ;;
  esac
  shift 2
done

# $skip_scoring is later executed as a command ("if ! $skip_scoring"), so only
# the two shell builtins that make sense there are accepted.
case "$skip_scoring" in
  true|false) ;;
  *)
    echo "$0: --skip-scoring must be 'true' or 'false', got '$skip_scoring'"
    exit 1
    ;;
esac
# Exactly five positional arguments must remain once the options have been
# consumed; otherwise print the help text and stop with a failure status.
if [ $# -ne 5 ]; then
  printf '%s\n' \
    "Do language model rescoring of lattices (remove old LM, add new LM)" \
    "Usage: steps/lmrescore.sh [options] <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>" \
    "options: [--cmd (run.pl|queue.pl [queue opts])] [--mode (1|2|3|4)]"
  exit 1
fi
# Source the environment setup from the current directory when it exists.
[ -f path.sh ] && . ./path.sh;

# Positional arguments (their count was verified above).
oldlang=$1   # lang dir of the LM to be removed from the lattices
newlang=$2   # lang dir of the LM to be added
data=$3      # data dir, handed to local/score.sh
indir=$4     # decode dir containing lat.*.gz and num_jobs
outdir=$5    # decode dir that receives the rescored lattices

oldlm=$oldlang/G.fst
newlm=$newlang/G.fst

# Differing word lists are reported but deliberately not treated as fatal.
cmp "$oldlang/words.txt" "$newlang/words.txt" \
  || echo "Warning: vocabularies may be incompatible."

for lm in "$oldlm" "$newlm"; do
  if [ ! -f "$lm" ]; then
    echo "Missing file $lm"
    exit 1
  fi
done

if ! ls "$indir"/lat.*.gz >/dev/null; then
  echo "No lattices input directory $indir"
  exit 1
fi

# Piped inputs (trailing "|") that present each G.fst projected on its output
# labels; used by the mode-1 lattice-lmrescore calls.
oldlmcommand="fstproject --project_output=true $oldlm |"
newlmcommand="fstproject --project_output=true $newlm |"

mkdir -p "$outdir/log" || exit 1

# Integer id of the backoff symbol "#0" in the new vocabulary.  Matching the
# first column exactly (and stopping at the first hit) guarantees a single
# value even if some other entry merely contains "#0".
phi=$(awk '$1 == "#0" {print $2; exit}' "$newlang/words.txt")

# Modes 3 and 4 pass $phi as --phi-label (and mode 4 also filters the lexicon
# with it), so fail early with a clear message instead of deep inside a job.
case "$mode" in
  3|4)
    if [ -z "$phi" ]; then
      echo "$0: no '#0' entry in $newlang/words.txt; it is required for --mode $mode"
      exit 1
    fi
    ;;
esac
if [ "$mode" == 4 ]; then
  # we have to prepare $outdir/Ldet.fst in this case: determinized
  # lexicon (determinized on phones), with disambig syms removed.
  # take L_disambig.fst; get rid of transition with "#0 #0" on it; determinize
  # with epsilon removal; remove disambiguation symbols.
  if [ -z "$phi" ]; then
    # With an empty id the filter below would compare against 0 and silently
    # drop the wrong arcs, so refuse to continue.
    echo "$0: no id for '#0' found in $newlang/words.txt; cannot build $outdir/Ldet.fst"
    exit 1
  fi
  # The subshell confines pipefail to this pipeline, so a failure in any stage
  # (not only the last one) aborts the script.
  # awk: $4 is the output label of an arc in fstprint's text format; the value
  # is passed with -v rather than spliced into the program text, and "+ 0"
  # forces the same numeric comparison the literal id used to give.
  (
    set -o pipefail
    fstprint "$newlang/L_disambig.fst" \
      | awk -v phi="$phi" '($4 != phi + 0) { print }' \
      | fstcompile \
      | fstdeterminizestar \
      | fstrmsymbols "$newlang/phones/disambig.int" >"$outdir/Ldet.fst"
  ) || exit 1
fi
# The job count is taken from the input decode dir (one lat.JOB.gz is processed
# per job) and copied to the output dir alongside the new lattices.
nj=$(cat "$indir/num_jobs") || exit 1
cp "$indir/num_jobs" "$outdir" || exit 1

# NOTE(review): the three commented lines below look like a leftover from a
# sequential per-lattice loop; kept as found.
#for lat in $indir/lat.*.gz; do
#  number=`basename $lat | cut -d. -f2`;
#  newlat=$outdir/`basename $lat`

# In every arm, $cmd is intentionally left unquoted: it may hold a dispatcher
# plus its own options and has to be word-split.  The escaped "\|" and "\>"
# are passed through to $cmd so that the pipeline runs inside each job.
case "$mode" in
  1) # 1 is inexact, it's the original way of doing it.
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      lattice-lmrescore --lm-scale=-1.0 "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \
      lattice-lmrescore --lm-scale=1.0 ark:- "$newlmcommand" "ark,t:|gzip -c>$outdir/lat.JOB.gz" \
      || exit 1;
    ;;
  2) # 2 is equivalent to 1, but using more basic operations, combined.
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      gunzip -c $indir/lat.JOB.gz \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
      lattice-determinize ark:- ark:- \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \| \
      lattice-determinize ark:- ark:- \| \
      gzip -c \>$outdir/lat.JOB.gz || exit 1;
    ;;
  3) # 3 is "exact" in that we remove the old LM scores accepting any path
     # through G.fst (which is what we want as that happened in lattice
     # generation), but we add the new one with "phi matcher", only taking
     # backoff arcs if an explicit arc did not exist.
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      gunzip -c $indir/lat.JOB.gz \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \
      lattice-determinize ark:- ark:- \| \
      lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
      lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
      lattice-determinize ark:- ark:- \| \
      gzip -c \>$outdir/lat.JOB.gz || exit 1;
    ;;
  4) # 4 is also exact (like 3), but instead of subtracting the old LM-scores,
     # it removes the old graph scores entirely and adds in the lexicon,
     # grammar and transition weights.
    # The acoustic model is looked up in the parent of the input decode dir.
    mdl=$(dirname "$indir")/final.mdl
    if [ ! -f "$mdl" ]; then
      echo "No such model $mdl"
      exit 1
    fi
    $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
      gunzip -c $indir/lat.JOB.gz \| \
      lattice-scale --lm-scale=0.0 ark:- ark:- \| \
      lattice-to-phone-lattice $mdl ark:- ark:- \| \
      lattice-compose ark:- $outdir/Ldet.fst ark:- \| \
      lattice-determinize ark:- ark:- \| \
      lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
      lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \
      $mdl ark:- ark:- \| \
      gzip -c \>$outdir/lat.JOB.gz || exit 1;
    ;;
  *) # Anything else would previously rescore nothing and still go on to the
     # cleanup and scoring steps; reject it explicitly instead.
    echo "$0: invalid mode '$mode' (expected 1, 2, 3 or 4)"
    exit 1
    ;;
esac
# Drop the temporary determinized lexicon; it only exists for mode 4, so any
# complaint about a missing file is discarded.
rm $outdir/Ldet.fst 2>/dev/null

# $skip_scoring is run as a command (normally the builtin "true" or "false").
if $skip_scoring; then
  echo "Not scoring because requested so..."
else
  # Scoring was requested, so a missing or non-executable scorer is an error.
  if [ ! -x local/score.sh ]; then
    echo "Not scoring because local/score.sh does not exist or not executable."
    exit 1
  fi
  local/score.sh --cmd "$cmd" $data $newlang $outdir
fi

# The script reports success regardless of the scoring script's exit status.
exit 0