Blame view

egs/iban/s5/local/arpa2G.sh 3.75 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
  #!/bin/bash
  # Copyright 2013-2014  Johns Hopkins University (authors: Yenda Trmal, Daniel Povey)
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABILITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
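  # Simple utility script to convert an ARPA LM (gzipped, bzip2-compressed or
  # plain text, chosen by file extension) into a G.fst file. When
  # --oov-prob-file is used, the input LM is read with gunzip.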
  
  
  # Configuration defaults. parse_options.sh (sourced below, not shown here) is
  # expected to override these from the matching command-line options
  # --oov-prob-file, --unk-fraction and --cleanup -- confirm against that script.
  oov_prob_file=   # optional file of "<word> <prob>" lines, e.g. data/local/oov2prob
  unk_fraction=    # mandatory whenever oov_prob_file is set
  cleanup=true     # run as a command at the end; true removes <dest-dir>/lm_tmp.gz
  #end configuration section.



  # Log the invocation. Quoting keeps every argument intact instead of letting
  # the shell re-split it or expand glob characters in it.
  echo "$0" "$@"

  [ -f ./path.sh ] && . ./path.sh
  [ -f ./cmd.sh ]  && . ./cmd.sh
  . parse_options.sh || exit 1;

  # Exactly three positional arguments must remain after option parsing.
  if [ "$#" -ne 3 ]; then
    echo "Usage: $0 [options] <arpa-lm-file> <lang-dir> <dest-dir>"
    echo "Options: --oov-prob-file <oov-prob-file>   # e.g. data/local/oov2prob"
    echo "           # with this option it will replace <unk> with OOVs in G.fst."
    echo "         --unk-fraction <fraction>         # required with --oov-prob-file;"
    echo "           # each OOV probability is multiplied by this value."
    echo "         --cleanup <true|false>            # default true; if true, remove"
    echo "           # the temporary <dest-dir>/lm_tmp.gz when done."
    exit 1;
  fi
  
  set -e           # Exit on a non-zero return code from any command.
  set -o pipefail  # A pipeline fails if any command in it returns a
                   # non-zero return code.

  # Positional arguments, in the order given by the usage message.
  lmfile=$1    # ARPA language model to convert
  langdir=$2   # directory providing words.txt for fstcompile
  destdir=$3   # directory that receives G.fst (and the temporary lm_tmp.gz)

  # Create the destination directory, including any missing parents. This stays
  # best-effort as before: an already existing directory is not an error, and
  # a directory that cannot be created is reported by the later writes into it.
  mkdir -p "$destdir" 2>/dev/null || true
  
  
  if [ ! -z "$oov_prob_file" ]; then
    if [ ! -s "$oov_prob_file" ]; then
      echo "$0: oov-prob file $oov_prob_file does not exist"
      exit 1;
    fi
    if [ -z "$unk_fraction" ]; then
      echo "--oov-prob option requires --unk-fraction option";
      exit 1;
    fi
  
    min_prob=$(gunzip -c $lmfile | perl -e '  $minlogprob = 0.0;
       while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
        if ($order == 1) { @A = split;
         if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
    echo "Minimum prob in LM file is $min_prob"
  
    echo "$0: creating LM file with unk words, using $oov_prob_file, in $destdir/lm_tmp.gz"
    gunzip -c $lmfile | \
      perl -e ' ($oov_prob_file,$min_prob,$unk_fraction) = @ARGV; $ceilinged=0;
        $min_prob < 0.0 || die "Bad min_prob"; # this is a log-prob
        $unk_fraction > 0.0 || die "Bad unk_fraction"; # this is a prob
        open(F, "<$oov_prob_file") || die "opening oov file";
        while (<F>) { push @OOVS, $_; }
        $num_oovs = @F;
        while(<STDIN>) {
        if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n
  "; }
        else { print; } # print all lines unchanged except the one that says ngram 1=X.
        if (m/^\\1-grams:$/) {
          foreach $l (@OOVS) {
            @A = split(" ", $l);
            @A == 2 || die "bad line in oov2prob: $_;";
            ($word, $prob) = @A;
            $log10prob = (log($prob * $unk_fraction) / log(10.0));
            if ($log10prob > $min_prob) { $log10prob = $min_prob; $ceilinged++;}
            print "$log10prob $word
  ";
         }
       }} print STDERR "Ceilinged $ceilinged unk-probs
  ";' \
         $oov_prob_file $min_prob $unk_fraction | gzip -c > $destdir/lm_tmp.gz
    lmfile=$destdir/lm_tmp.gz
  fi
  
  # Choose the command that streams the LM as plain text, based on the file
  # extension. The command is held in an array so that a path containing
  # whitespace or glob characters still reaches it as one single argument.
  case "$lmfile" in
    *.bz2) decompress=(bunzip2 -c "$lmfile") ;;
    *.gz)  decompress=(gunzip -c "$lmfile") ;;
    *)     decompress=(cat "$lmfile") ;;
  esac

  # Build G.fst: drop every LM line containing "<s> <s>", "</s> <s>" or
  # "</s> </s>", then pass the text through arpa2fst, fstprint, the two utils/
  # helper scripts (not shown here) and fstcompile with $langdir/words.txt as
  # both symbol tables, and finally remove epsilons and sort arcs by output
  # label. Any failure in the pipeline aborts the script.
  "${decompress[@]}" | \
    grep -v '<s> <s>' | grep -v '</s> <s>' |  grep -v '</s> </s>' | \
    arpa2fst - | \
    fstprint | \
    utils/eps2disambig.pl | \
    utils/s2eps.pl | \
    fstcompile --isymbols="$langdir/words.txt" \
    --osymbols="$langdir/words.txt"  --keep_isymbols=false --keep_osymbols=false | \
    fstrmepsilon | fstarcsort --sort_type=olabel > "$destdir/G.fst" || exit 1
  # Informational only: a non-zero status from the check is ignored.
  fstisstochastic "$destdir/G.fst" || true;
  
  # Remove the temporary LM that the --oov-prob-file step may have written.
  # $cleanup is executed as a command, so it is expected to hold "true" or
  # "false". The file does not exist when no OOV file was given, hence any
  # error from rm is deliberately ignored.
  if $cleanup; then
    rm -- "$destdir/lm_tmp.gz" 2>/dev/null || true
  fi

  exit 0