arpa2G.sh
3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/bin/bash
# Copyright 2013-2014 Johns Hopkins University (authors: Yenda Trmal, Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
#Simple utility script to convert the gzipped ARPA lm into a G.fst file
# Begin configuration section.
# NOTE(review): these defaults are presumably overridable from the command line
# through parse_options.sh (e.g. --unk-fraction 0.1) -- confirm against that helper.
oov_prob_file=  # file of "<word> <prob>" lines; when set, those words are added as unigrams
unk_fraction=   # factor multiplied into every OOV probability; required with oov_prob_file
cleanup=true    # when true, <dest-dir>/lm_tmp.gz is removed at the end of the script
#end configuration section.

# Log the command line. Quoted so that arguments containing whitespace or glob
# characters are printed as given instead of being re-split or expanded.
echo "$0" "$@"

[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh
. parse_options.sh || exit 1

# Exactly three positional arguments must remain after option parsing.
if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <arpa-lm-file> <lang-dir> <dest-dir>"
  echo "Options: --oov-prob-file <oov-prob-file> # e.g. data/local/oov2prob"
  echo " # with this option it will replace <unk> with OOVs in G.fst."
  echo "         --unk-fraction <fraction> # required with --oov-prob-file;"
  echo " # every OOV probability is multiplied by this value."
  exit 1
fi
set -e           # Exit on non-zero return code from any command
set -o pipefail  # A pipeline fails if any of its commands returns non-zero

# Positional arguments: input ARPA LM, lang directory (must contain words.txt),
# and the directory that receives G.fst.
lmfile=$1
langdir=$2
destdir=$3

# Create the destination (and any missing parents). Fail here with a clear
# message rather than later on an output redirection into a missing directory.
mkdir -p "$destdir" || { echo "$0: cannot create directory $destdir"; exit 1; }
if [ ! -z "$oov_prob_file" ]; then
if [ ! -s "$oov_prob_file" ]; then
echo "$0: oov-prob file $oov_prob_file does not exist"
exit 1;
fi
if [ -z "$unk_fraction" ]; then
echo "--oov-prob option requires --unk-fraction option";
exit 1;
fi
min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
if ($order == 1) { @A = split;
if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
echo "Minimum prob in LM file is $min_prob"
echo "$0: creating LM file with unk words, using $oov_prob_file, in $destdir/lm_tmp.gz"
gunzip -c $lmfile | \
perl -e ' ($oov_prob_file,$min_prob,$unk_fraction) = @ARGV; $ceilinged=0;
$min_prob < 0.0 || die "Bad min_prob"; # this is a log-prob
$unk_fraction > 0.0 || die "Bad unk_fraction"; # this is a prob
open(F, "<$oov_prob_file") || die "opening oov file";
while (<F>) { push @OOVS, $_; }
$num_oovs = @F;
while(<STDIN>) {
if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
else { print; } # print all lines unchanged except the one that says ngram 1=X.
if (m/^\\1-grams:$/) {
foreach $l (@OOVS) {
@A = split(" ", $l);
@A == 2 || die "bad line in oov2prob: $_;";
($word, $prob) = @A;
$log10prob = (log($prob * $unk_fraction) / log(10.0));
if ($log10prob > $min_prob) { $log10prob = $min_prob; $ceilinged++;}
print "$log10prob $word\n";
}
}} print STDERR "Ceilinged $ceilinged unk-probs\n";' \
$oov_prob_file $min_prob $unk_fraction | gzip -c > $destdir/lm_tmp.gz
lmfile=$destdir/lm_tmp.gz
fi
# Choose how to stream the LM based on its file extension. The command is kept
# in an array so that the LM path is passed as one argument even when it
# contains whitespace.
case "$lmfile" in
  *.bz2) decompress=(bunzip2 -c) ;;
  *.gz)  decompress=(gunzip -c) ;;
  *)     decompress=(cat) ;;
esac

# Build G.fst: drop every line that contains "<s> <s>", "</s> <s>" or
# "</s> </s>", pass the rest through arpa2fst and the eps2disambig/s2eps
# filters, then compile it with words.txt as both the input and the output
# symbol table, remove epsilons and sort arcs by output label.
"${decompress[@]}" "$lmfile" \
  | grep -v '<s> <s>' \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | arpa2fst - \
  | fstprint \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | fstcompile --isymbols="$langdir/words.txt" \
      --osymbols="$langdir/words.txt" --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon \
  | fstarcsort --sort_type=olabel > "$destdir/G.fst" || exit 1

# Informational only: a non-zero result here must not fail the script.
fstisstochastic "$destdir/G.fst" || true
# Remove the temporary LM if one was written. The flag is compared as a string
# instead of being executed as a command, and removal stays best-effort.
if [ "$cleanup" = true ]; then
  rm -f -- "$destdir/lm_tmp.gz" || true
fi

exit 0