Blame view

Scripts/utils/reverse_lm.sh 3.59 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
  #!/bin/bash
  
  # Copyright 2012  Brno University of Technology (Author: Mirko Hannemann)
  # JHU (Author: Dan Povey)
  # Apache 2.0
  
  # configuration section
  # NOTE(review): parse_options.sh (not visible here) is expected to map
  # --<name> command-line options onto these variables; the usage text
  # documents --lexicon.
  tmpdir=data/local/lm_tmp  # only for OOVs and checks
  lexicon=data/local/lang_tmp.reverse/lexicon.txt # only for checks
  # end config section

  echo "$0 $*"  # Print the command line for logging

  [ -f ./path.sh ] && . ./path.sh; # source the path.
  . parse_options.sh || exit 1;

  # Create the scratch directory only after option parsing, so that the
  # directory that gets created is the one the rest of the script writes to
  # (and not merely the default when the value was overridden).
  mkdir -p "$tmpdir" || exit 1
  
  # Exactly three positional arguments are required; anything else prints the
  # usage text and aborts with status 1.
  if [ "$#" -ne 3 ]; then
    printf '%s\n' \
      "Usage: utils/reverse_lm.sh [options] <arpa-gz-file> <lang-dir> <out-dir>" \
      "e.g.: utils/reverse_lm.sh data/local/nist_lm/lm_tgpr_5k.arpa.gz data/lang.reverse data/lang_test_tgpr_5k.reverse" \
      "... where files from <lang-dir> are copied into <out-dir>" \
      "options:" \
      " --lexicon <lexicon-file>   reversed lexicon (only for checks)"
    exit 1
  fi

  # Positional arguments, in the order given in the usage text.
  lm=$1       # gzipped arpa file
  langdir=$2  # source lang directory whose files are copied into $outdir
  outdir=$3   # output directory
  
  # create the corresponding FST for the language model
  # and the corresponding lang_test_* directory.

  echo "Preparing reverse language model from $lm into $outdir"
  echo "Finding OOVs and strange silences"
  mkdir -p "$outdir" || exit 1
  # Seed the output directory with the symbol tables, lexicon FSTs and the
  # phones/ directory from the source lang directory.  All paths are quoted so
  # that directories containing spaces or glob characters are handled.
  for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
    cp -r "$langdir/$f" "$outdir"
  done
  # NOTE(review): find_arpa_oovs.pl presumably lists the LM words that are
  # absent from words.txt; its output is consumed later by remove_oovs.pl.
  gunzip -c "$lm" | utils/find_arpa_oovs.pl "$outdir/words.txt" > "$tmpdir/oovs.txt"
  
  # grep -v '<s> <s>' because the LM seems to have some strange and useless
  # stuff in it with multiple <s>'s in the history.  Encountered some other similar
  # things in a LM from Geoff.  Removing all "illegal" combinations of <s> and </s>,
  # which are supposed to occur only at begin/end of utt.  These can cause
  # determinization failures of CLG [ends up being epsilon cycles].
  gunzip -c "$lm" | \
    grep -v '<s> <s>' | \
    grep -v '</s> <s>' | \
    grep -v '</s> </s>' > "$outdir/forward.arpa"
  echo "Mapping ARPA to reverse ARPA"
  # Abort if the reversal fails: a partial reverse.arpa would otherwise be
  # compiled into a bogus G and the script would still report success.
  python utils/reverse_arpa.py "$outdir/forward.arpa" > "$outdir/reverse.arpa" || exit 1
  # Compile the reversed ARPA into an FST over the word symbol table.
  # Lines containing the weight 230258.5 are dropped
  # (NOTE(review): presumably a sentinel "infinite" cost -- confirm), OOV arcs
  # are removed, and the helper scripts rewrite epsilon / sentence-boundary
  # symbols before compilation.
  arpa2fst "$outdir/reverse.arpa" | fstprint | \
    grep -v "230258.5" | \
    utils/remove_oovs.pl "$tmpdir/oovs.txt" | \
    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols="$outdir/words.txt" \
      --osymbols="$outdir/words.txt"  --keep_isymbols=false --keep_osymbols=false \
      | fstrmepsilon > "$outdir/G_org.fst" || exit 1
  #--arc_type=log
  
  echo "Push weights to make it stochastic (log semi-ring)"
  # delta must be very small otherwise weight pushing won't succeed
  #fstpush --push_weights=true --push_labels=true --delta=1E-7 $outdir/G_log.fst >$outdir/G_log_pushed.fst
  # Abort when pushing fails; otherwise a truncated G.fst would be left behind
  # and the script would go on to report success.
  fstpushspecial --delta=1E-5 "$outdir/G_org.fst" > "$outdir/G.fst" || exit 1

  # Informational only: the exit status of this check is not acted upon.
  fstisstochastic "$outdir/G.fst"
  # The output is like:
  # 9.14233e-05 -0.259833
  # we do expect the first of these 2 numbers to be close to zero (the second is
  # nonzero because the backoff weights make the states sum to >1).
  # Because of the <s> fiasco for these particular LMs, the first number is not
  # as close to zero as it could be.
  
  # Everything below is only for diagnostic.
  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
  # this might cause determinization failure of CLG.
  # #0 is treated as an empty word.

  # Quoted so that an empty $lexicon does not turn the test into the
  # always-true one-argument form "[ -f ]".
  if [ -f "$lexicon" ]; then
    mkdir -p "$tmpdir/g"
    # Build a one-state acceptor (text format) with a self-loop for every
    # lexicon line that has a single field, plus a self-loop for #0.
    # The printf format must end in the escape sequence \n: awk string
    # literals cannot contain a raw line break.
    awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \
      < "$lexicon"  > "$tmpdir/g/select_empty.fst.txt"
    # Restrict G to paths made only of those symbols.
    fstcompile --isymbols="$outdir/words.txt" --osymbols="$outdir/words.txt" "$tmpdir/g/select_empty.fst.txt" | \
      fstarcsort --sort_type=olabel | fstcompose - "$outdir/G.fst" > "$tmpdir/g/empty_words.fst"
    # A "cyclic ... y" line in the fstinfo output means the restricted FST
    # has a cycle; the matching line is printed before the error message.
    if fstinfo "$tmpdir/g/empty_words.fst" | grep cyclic | grep -w 'y'; then
      echo "Language model has cycles with empty words"
      exit 1
    fi
    rm -r "$tmpdir/g"
  fi
  echo "Succeeded in creating reversed language model."
  rm -r "$tmpdir"