Blame view
egs/iban/s5/local/arpa2G.sh
3.75 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
#!/bin/bash
# Copyright 2013-2014  Johns Hopkins University (authors: Yenda Trmal, Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Simple utility script to convert the gzipped ARPA lm into a G.fst file.
#
# Usage: arpa2G.sh [options] <arpa-lm-file> <lang-dir> <dest-dir>
#   <arpa-lm-file>  ARPA language model (.gz, .bz2 or plain text; the
#                   --oov-prob-file path always reads it with gunzip).
#   <lang-dir>      directory providing words.txt, used as the symbol table.
#   <dest-dir>      output directory; G.fst is written here.
#
# When --oov-prob-file is given, every "<word> <prob>" line of that file is
# inserted as an extra unigram into a temporary copy of the LM
# (<dest-dir>/lm_tmp.gz) before compilation.

# Begin configuration section.
# NOTE(review): these are presumably overridden by parse_options.sh from
# --oov-prob-file / --unk-fraction / --cleanup; confirm against that script.
oov_prob_file=   # file with one "<word> <prob>" pair per line
unk_fraction=    # factor (> 0) each OOV prob is multiplied by
cleanup=true     # remove <dest-dir>/lm_tmp.gz at the end
#end configuration section.

echo "$0" "$@"

[ -f ./path.sh ] && . ./path.sh
[ -f ./cmd.sh ] && . ./cmd.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [options] <arpa-lm-file> <lang-dir> <dest-dir>"
  echo "Options: --oov-prob-file <oov-prob-file>   # e.g. data/local/oov2prob"
  echo "             # with this option it will replace <unk> with OOVs in G.fst."
  echo "         --unk-fraction <fraction>         # required with --oov-prob-file;"
  echo "             # each OOV prob is multiplied by this value (must be > 0)."
  echo "         --cleanup <true|false>            # remove <dest-dir>/lm_tmp.gz"
  echo "             # when done (default: true)."
  exit 1;
fi

set -e           #Exit on non-zero return code from any command
set -o pipefail  #Exit if any of the commands in the pipeline will
                 #return non-zero return code

lmfile=$1
langdir=$2
destdir=$3

# -p: succeed if the directory already exists, create missing parents, and
# (unlike the silenced plain mkdir) report a real failure immediately.
mkdir -p "$destdir"

if [ -n "$oov_prob_file" ]; then
  if [ ! -s "$oov_prob_file" ]; then
    echo "$0: oov-prob file $oov_prob_file does not exist or is empty"
    exit 1;
  fi
  if [ -z "$unk_fraction" ]; then
    echo "--oov-prob-file option requires --unk-fraction option";
    exit 1;
  fi

  # Smallest unigram log10-prob in the LM, ignoring the -99 placeholder value.
  # It is used below as an upper bound for the injected OOV log-probs.
  min_prob=$(gunzip -c "$lmfile" | perl -e '
    $minlogprob = 0.0;
    while (<STDIN>) {
      if (m/\\(\d)-grams:/) { $order = $1; }
      if ($order == 1) {
        @A = split;
        if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }
      }
    }
    print $minlogprob')
  echo "Minimum prob in LM file is $min_prob"

  echo "$0: creating LM file with unk words, using $oov_prob_file, in $destdir/lm_tmp.gz"
  # Copy the LM, bumping the "ngram 1=N" header by the number of OOV entries
  # and emitting one unigram line per OOV right after the "\1-grams:" marker.
  gunzip -c "$lmfile" | \
    perl -e '
      ($oov_prob_file, $min_prob, $unk_fraction) = @ARGV;
      $ceilinged = 0;
      $min_prob < 0.0 || die "Bad min_prob";          # this is a log-prob
      $unk_fraction > 0.0 || die "Bad unk_fraction";  # this is a prob
      open(F, "<", $oov_prob_file) || die "opening oov file";
      while (<F>) { push @OOVS, $_; }
      close(F);
      # Count the lines that were read (@OOVS); @F is not the line array and
      # is always empty, which used to leave the unigram count unchanged.
      $num_oovs = @OOVS;
      while (<STDIN>) {
        if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
        else { print; } # print all lines unchanged except the one that says ngram 1=X.
        if (m/^\\1-grams:$/) {
          foreach $l (@OOVS) {
            @A = split(" ", $l);
            # Report the offending OOV line, not the current LM line.
            @A == 2 || die "bad line in oov2prob: $l";
            ($word, $prob) = @A;
            $log10prob = (log($prob * $unk_fraction) / log(10.0));
            # Cap each OOV at the lowest in-vocabulary unigram log-prob.
            if ($log10prob > $min_prob) { $log10prob = $min_prob; $ceilinged++; }
            print "$log10prob $word\n";
          }
        }
      }
      print STDERR "Ceilinged $ceilinged unk-probs\n";' \
    "$oov_prob_file" "$min_prob" "$unk_fraction" | gzip -c > "$destdir/lm_tmp.gz"
  lmfile=$destdir/lm_tmp.gz
fi

# Pick the decompressor from the file extension. An array keeps the LM path
# as a single argument even if it contains whitespace or glob characters.
if [[ $lmfile == *.bz2 ]]; then
  decompress=(bunzip2 -c "$lmfile")
elif [[ $lmfile == *.gz ]]; then
  decompress=(gunzip -c "$lmfile")
else
  decompress=(cat "$lmfile")
fi

# Drop ARPA lines containing "<s> <s>", "</s> <s>" or "</s> </s>", convert the
# LM to an FST, post-process its text form, then recompile it against
# words.txt without embedded symbol tables, remove epsilons and sort arcs by
# output label.
# NOTE(review): utils/eps2disambig.pl and utils/s2eps.pl are not visible here;
# presumably they rewrite <eps> and the sentence-boundary symbols -- confirm.
"${decompress[@]}" | \
  grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
  arpa2fst - | \
  fstprint | \
  utils/eps2disambig.pl | \
  utils/s2eps.pl | \
  fstcompile --isymbols="$langdir/words.txt" \
    --osymbols="$langdir/words.txt" --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=olabel > "$destdir/G.fst" || exit 1

# Informational only: a non-stochastic G.fst must not abort the script.
fstisstochastic "$destdir/G.fst" || true;

# Best-effort removal of the temporary LM (only present on the OOV path).
if [ "$cleanup" = true ]; then
  rm -f -- "$destdir/lm_tmp.gz" 2>/dev/null || true;
fi

exit 0