lmrescore_pruned.sh
4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash
# Copyright 2017 Hainan Xu
# Apache 2.0
# This script rescores lattices with a Kaldi RNNLM using a pruned algorithm.
# The details of the algorithm can be found at
# http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
# An example script that uses it is egs/swbd/s5c/local/rnnlm/run_lstm.sh
# Begin configuration section.
# The values below are defaults. They are expected to be overridden by the
# matching command-line options (e.g. --cmd, --skip-scoring) when
# ./utils/parse_options.sh is sourced after this section.

# Job launcher: runs the rescoring jobs and is forwarded to local/score.sh.
cmd='run.pl'

# Set to true to skip the local/score.sh step after rescoring.
skip_scoring='false'

# Acoustic scale, passed to the rescoring binary as --acoustic-scale.
acwt='0.1'

# Interpolation weight for RNNLM; it is checked to lie in [0, 1].
weight='0.5'

# Approximate the lattice-rescoring by limiting the max-ngram-order: when it
# is set, histories in the lattice are merged if they share the same n-gram
# history, which prevents the lattice from exploding exponentially. Details
# of the n-gram approximation method are described in section 2.3 of the paper
# http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
max_ngram_order='4'

# Limit on the max arcs in a lattice while rescoring, e.g. 20000.
# When left empty, no --max-arcs option is passed to the rescoring binary.
max_arcs=''

# Beam used in pruned lattice composition. This option affects speed and how
# large the composed lattice may be.
lattice_prune_beam='8'

# If true, a normalization step is added to the output of the RNNLM so that
# it adds up to *exactly* 1. This is not necessary: in this RNNLM setup a
# properly trained network automatically has its normalization term close
# to 1. The details can be found at
# http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf
normalize='false'
# End configuration section.
# Log the exact invocation: the script name followed by all arguments.
echo "$0 $*"

# Option handling lives in the shared helper.
# NOTE(review): presumably it consumes the leading --option arguments and
# leaves only the positional ones in "$@" -- confirm against the helper.
. ./utils/parse_options.sh

# Exactly five positional arguments are required; otherwise print the usage
# text and stop with a non-zero status.
if (( $# != 5 )); then
  printf '%s\n' \
    "Does language model rescoring of lattices (remove old LM, add new LM)" \
    "with Kaldi RNNLM using a pruned algorithm. See comments in file for details" \
    "" \
    "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\" \
    " <data-dir> <input-decode-dir> <output-decode-dir>" \
    " e.g.: $0 data/lang_tg exp/rnnlm_lstm/ data/test \\" \
    " exp/tri3/test_tg exp/tri3/test_rnnlm_4gram" \
    "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1
fi
# Pull in the environment setup from path.sh when the working directory
# provides one.
if [ -f path.sh ]; then
  . ./path.sh
fi

# Positional arguments, in the order given by the usage message.
oldlang=$1    # <old-lang-dir>
rnnlm_dir=$2  # <rnnlm-dir>
data=$3       # <data-dir>, handed to local/score.sh
indir=$4      # <input-decode-dir>, source of lat.*.gz and num_jobs
outdir=$5     # <output-decode-dir>, receives logs and rescored lattices

# Locate the old LM: G.fst is preferred; when it is absent fall back to
# G.carpa and add the option that marks the old LM as a const-arpa model.
oldlm=$oldlang/G.fst
carpa_option=
# The operand is quoted so that a path containing whitespace is still handed
# to `[` as one word instead of producing a malformed test.
if [ ! -f "$oldlm" ]; then
  echo "$0: file $oldlm not found; looking for $oldlang/G.carpa"
  oldlm=$oldlang/G.carpa
  carpa_option="--use-const-arpa=true"
fi
# Sanity checks on the inputs. Each failed check prints a message and exits
# with status 1. Every path operand is quoted: with an unquoted path that
# contains whitespace, `[` fails with a usage error and the check would be
# skipped instead of reported.
if [ ! -f "$oldlm" ]; then
  echo "$0: Missing file $oldlm"
  exit 1
fi

if [ ! -f "$rnnlm_dir/final.raw" ]; then
  echo "$0: Missing file $rnnlm_dir/final.raw"
  exit 1
fi

# Either embedding matrix is acceptable; fail only when both are absent.
if [ ! -f "$rnnlm_dir/feat_embedding.final.mat" ] &&
   [ ! -f "$rnnlm_dir/word_embedding.final.mat" ]; then
  echo "$0: Missing word embedding file"
  exit 1
fi

if [ ! -f "$oldlang/words.txt" ]; then
  echo "$0: Missing file $oldlang/words.txt"
  exit 1
fi

# At least one lattice archive must exist in the input decode directory.
# Only stdout of ls is discarded, so its own error message stays visible.
if ! ls "$indir"/lat.*.gz >/dev/null; then
  echo "$0: No lattices input directory $indir"
  exit 1
fi

# The interpolation weight has to lie in [0, 1]. The comparison is done in
# awk because the weight is a decimal fraction, which shell integer
# arithmetic cannot compare.
awk -v n="$0" -v w="$weight" 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1
# The RNNLM word list must equal the lattice word list plus one final entry:
# the last word of the RNNLM word list is an added <brk> word. That last line
# is dropped before the two lists are compared byte for byte.
# sed '$d' is used to drop it because a negative count for `head -n` is a
# GNU extension; where head rejects it, this check would always fail.
if ! sed '$d' "$rnnlm_dir/config/words.txt" | cmp "$oldlang/words.txt" -; then
  echo "$0: Word lists mismatch for lattices and RNNLM."
  exit 1
fi
# Optional flag that asks the rescoring binary to normalize the RNNLM output;
# it stays empty unless $normalize evaluates as a successful command (true).
normalize_opt=
if $normalize; then
  normalize_opt="--normalize-probs=true"
fi

# Options for the RNNLM's special symbols, taken verbatim from the model
# directory. The file is read unconditionally, so stop here when it cannot
# be read instead of carrying on with an empty option string.
special_symbol_opts=$(cat "$rnnlm_dir/special_symbol_opts.txt") || {
  echo "$0: Could not read $rnnlm_dir/special_symbol_opts.txt"
  exit 1
}
# Source of the word-embedding matrix given to the rescoring binary: the
# ready-made matrix when the model directory has one, otherwise a piped
# command that builds it from the word features and the feature embedding.
# The pipe form carries literal single quotes inside the value.
# NOTE(review): presumably they keep the pipe together as one argument once
# $cmd re-parses the command line -- confirm against the job launcher.
# The test operand is quoted so a path with whitespace stays a single word.
if [ -f "$rnnlm_dir/word_embedding.final.mat" ]; then
  word_embedding=$rnnlm_dir/word_embedding.final.mat
else
  word_embedding="'rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|'"
fi

# Pass --max-arcs only when a limit was requested.
max_arcs_opt=
if [ -n "$max_arcs" ]; then
  max_arcs_opt="--max-arcs=$max_arcs"
fi
# Create the output directory (with its log subdirectory) and carry the job
# count over from the input decode directory. Each step is checked so that
# an unwritable output location or a missing num_jobs file stops the script
# here rather than surfacing later inside the rescoring jobs.
mkdir -p "$outdir/log" || exit 1
nj=$(cat "$indir/num_jobs") || exit 1
cp "$indir/num_jobs" "$outdir" || exit 1
# Rescore the lattices with lattice-lmrescore-kaldi-rnnlm-pruned; the script
# exits with status 1 if the launcher reports failure.
# NOTE(review): presumably $cmd (run.pl or queue.pl, per the usage text)
# runs the command once for every JOB in 1..$nj and substitutes the job
# index for the literal "JOB" in the log name and the lattice file names --
# confirm against the job launcher.
# $special_symbol_opts, $normalize_opt, $max_arcs_opt, $carpa_option and
# $word_embedding are expanded unquoted, so empty ones vanish from the
# command line and multi-word values are split into separate words.
# Input lattices are read through "gunzip -c"; the output is piped through
# "gzip -c" using the "ark,t:" specifier (NOTE(review): presumably Kaldi's
# text-mode archive output -- confirm).
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
lattice-lmrescore-kaldi-rnnlm-pruned --lm-scale=$weight $special_symbol_opts \
--lattice-compose-beam=$lattice_prune_beam \
--acoustic-scale=$acwt --max-ngram-order=$max_ngram_order $normalize_opt $max_arcs_opt \
$carpa_option $oldlm $word_embedding "$rnnlm_dir/final.raw" \
"ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
# Score the rescored lattices unless scoring was switched off.
if ! $skip_scoring; then
  # Scoring is delegated to the recipe-local script, which must be executable.
  if [ ! -x local/score.sh ]; then
    echo "$0: Not scoring because local/score.sh does not exist or not executable."
    exit 1
  fi
  # Echo the scoring command for the log, then run it.
  echo local/score.sh --cmd "$cmd" "$data" "$oldlang" "$outdir"
  # A scoring failure is reported through the exit status; without this
  # check it would be masked by the final "exit 0".
  if ! local/score.sh --cmd "$cmd" "$data" "$oldlang" "$outdir"; then
    echo "$0: local/score.sh failed"
    exit 1
  fi
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi
exit 0