reverse_lm.sh
3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/bash
# Copyright 2012 Brno University of Technology (Author: Mirko Hannemann)
# JHU (Author: Dan Povey)
# Apache 2.0
# configuration section
tmpdir=data/local/lm_tmp # only for OOVs and checks
lexicon=data/local/lang_tmp.reverse/lexicon.txt # only for checks
# end config section
mkdir -p $tmpdir
echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: utils/reverse_lm.sh [options] <arpa-gz-file> <lang-dir> <out-dir>"
echo "e.g.: utils/reverse_lm.sh data/local/nist_lm/lm_tgpr_5k.arpa.gz data/lang.reverse data/lang_test_tgpr_5k.reverse"
echo "... where files from <lang-dir> are copied into <out-dir>"
echo "options:"
echo " --lexicon <lexicon-file> reversed lexicon (only for checks)"
exit 1;
fi
lm=$1 # gzipped arpa file
langdir=$2
outdir=$3 # output directory
# create the corresponding FST for the language model
# and the corresponding lang_test_* directory.
echo Preparing reverse language model from $lm into $outdir
echo "Finding OOVs and strange silences"
mkdir -p $outdir
for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
cp -r $langdir/$f $outdir
done
gunzip -c $lm | utils/find_arpa_oovs.pl $outdir/words.txt > $tmpdir/oovs.txt
# grep -v '<s> <s>' because the LM seems to have some strange and useless
# stuff in it with multiple <s>'s in the history. Encountered some other similar
# things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
# which are supposed to occur only at being/end of utt. These can cause
# determinization failures of CLG [ends up being epsilon cycles].
gunzip -c $lm | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' > $outdir/forward.arpa
echo "Mapping ARPA to reverse ARPA"
python utils/reverse_arpa.py $outdir/forward.arpa > $outdir/reverse.arpa
arpa2fst $outdir/reverse.arpa | fstprint | \
grep -v "230258.5" | \
utils/remove_oovs.pl $tmpdir/oovs.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$outdir/words.txt \
--osymbols=$outdir/words.txt --keep_isymbols=false --keep_osymbols=false \
| fstrmepsilon > $outdir/G_org.fst
#--arc_type=log
echo "Push weights to make it stochastic (log semi-ring)"
# delta must be very small otherwise weight pushing won't succeed
#fstpush --push_weights=true --push_labels=true --delta=1E-7 $outdir/G_log.fst >$outdir/G_log_pushed.fst
fstpushspecial --delta=1E-5 $outdir/G_org.fst >$outdir/G.fst
fstisstochastic $outdir/G.fst
# The output is like:
# 9.14233e-05 -0.259833
# we do expect the first of these 2 numbers to be close to zero (the second is
# nonzero because the backoff weights make the states sum to >1).
# Because of the <s> fiasco for these particular LMs, the first number is not
# as close to zero as it could be.
# Everything below is only for diagnostic.
# Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
# this might cause determinization failure of CLG.
# #0 is treated as an empty word.
if [ -f $lexicon ]; then
mkdir -p $tmpdir/g
awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
< "$lexicon" >$tmpdir/g/select_empty.fst.txt
fstcompile --isymbols=$outdir/words.txt --osymbols=$outdir/words.txt $tmpdir/g/select_empty.fst.txt | \
fstarcsort --sort_type=olabel | fstcompose - $outdir/G.fst > $tmpdir/g/empty_words.fst
fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
echo "Language model has cycles with empty words" && exit 1
rm -r $tmpdir/g
fi
echo "Succeeded in creating reversed language model."
rm -r $tmpdir