Blame view

egs/wsj/s5/steps/make_phone_graph.sh 4.93 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
  #!/bin/bash
  
  # e.g.: steps/make_phone_graph.sh data/lang exp/tri2_ali_100k_nodup/ exp/tri2
  
  # Copyright 2013  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
  
  # This script makes a phone-based LM, without smoothing to unigram, that
  # is to be used for segmentation, and uses that together with a model to
  # make a decoding graph.
  # Uses SRILM.
  # See also utils/lang/make_phone_bigram_lm.sh.
  
  # Begin configuration section.
  # Defaults for the script's options.  They are assigned before
  # parse_options.sh is sourced below.
  # NOTE(review): presumably parse_options.sh lets each of these be overridden
  # as "--<name> <value>" on the command line -- confirm against that script.
  # stage: first stage to run; every "if [ $stage -le K ]" section further down
  # is skipped when stage is greater than K.
  stage=0
  # cmd: job-dispatch wrapper; not referenced anywhere else in this script.
  cmd=run.pl
  # N and P are passed to fstcomposecontext as --context-size and
  # --central-position, and also form part of the ilabels_N_P file name.
  N=3  # change N and P for non-trigram systems.
  P=1
  # tscale is passed to make-h-transducer as --transition-scale.
  tscale=1.0 # transition scale.
  # loopscale is passed to add-self-loops as --self-loop-scale.
  loopscale=0.1 # scale for self-loops.
  # End configuration section.
  
  echo "$0 $@"  # Print the command line for logging
  
  # path.sh is optional: it is sourced only if present in the current directory.
  [ -f ./path.sh ] && . ./path.sh; # source the path.
  # parse_options.sh is looked up via PATH; the script aborts if it cannot be
  # sourced successfully.
  . parse_options.sh || exit 1;
  
  # Exactly three positional arguments are required; otherwise print the usage
  # text and exit with status 1.
  if [ $# -ne 3 ]; then
    # The directory names are written as literal placeholders here: the
    # variables holding them are only assigned after this check, so expanding
    # them in the message would print empty strings.
    echo "Usage: $0  [options] <lang-dir> <alignment-dir> <model-dir>"
    echo " e.g.: $0 data/lang exp/tri3b_ali exp/tri4b_seg"
    echo "Makes the graph in <model-dir>/phone_graph, corresponding to the model in <model-dir>"
    echo "The alignments from <alignment-dir> are used to train the phone LM."
    exit 1;
  fi
  
  lang=$1     # lang dir: L.fst, phones.txt and phones/ are read from here.
  alidir=$2   # alignment dir: ali.*gz, final.mdl and phones.txt are read from here.
  dir=$3      # model dir: final.mdl and tree are read; output goes to $dir/phone_graph.
  
  
  # Fail early, with a message naming the file, if any required input is
  # missing.  The expansions are quoted so that a path containing whitespace or
  # glob characters is tested as a single file name.
  for f in "$lang/L.fst" "$alidir/ali.1.gz" "$alidir/final.mdl" "$dir/final.mdl"; do
    if [ ! -f "$f" ]; then
      echo "$0: expected $f to exist"
      exit 1;
    fi
  done
  
  # Make sure SRILM's ngram-count is reachable.  If it is not on PATH, fall
  # back to the copy under $KALDI_ROOT/tools/srilm/bin/<arch> and append that
  # directory to PATH; exit with status 1 if it is not there either.
  # "command -v" is used instead of "which": it is a shell builtin and prints
  # nothing on stdout when the tool is missing.
  loc=$(command -v ngram-count 2>/dev/null)
  if [ -z "$loc" ]; then
    if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
      sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
    else
      sdir=$KALDI_ROOT/tools/srilm/bin/i686
    fi
    if [ -f "$sdir/ngram-count" ]; then
      echo "Using SRILM tools from $sdir"
      export PATH=$PATH:$sdir
    else
      echo "You appear to not have SRILM tools installed, either on your path,"
      echo "or installed in $sdir. See tools/install_srilm.sh for installation"
      echo "instructions."
      exit 1
    fi
  fi
  
  # Exit on error status.  pipefail is enabled as well because most of the
  # stages below are pipelines: with "set -e" alone only the exit status of the
  # last command in a pipeline is checked, so a failure in an earlier command
  # would go unnoticed and leave a truncated or empty output file behind.
  set -e
  set -o pipefail
  
  # All outputs of this script are written under $dir/phone_graph.
  mkdir -p "$dir/phone_graph"
  
  # NOTE(review): presumably verifies that the lang directory and the alignment
  # directory use the same phone symbol table -- confirm against the helper.
  # Under "set -e" a non-zero exit status from it aborts this script.
  utils/lang/check_phones_compatible.sh "$lang/phones.txt" "$alidir/phones.txt"
  
  # Stage 0: convert the alignments into phone-LM training text.
  
  # Filter: reads "<key> <field> <field> ..." lines on stdin and writes, for
  # each input line, the fields after the key, each followed by one space, then
  # a newline.  The newline is written as the escape sequence \n: a raw line
  # break inside an awk string literal is a syntax error.
  strip_utt_ids() {
    awk '{for (x=2; x <= NF; x++) printf("%s ", $x); printf("\n"); }'
  }
  
  if [ "$stage" -le 0 ]; then
    echo "$0: creating phone LM-training data"
    # Every ali.*gz archive is decompressed and passed through ali-to-phones
    # (text output); the leading key of each line is then dropped and the
    # integer ids are mapped to symbols using $lang/phones.txt.
    gunzip -c "$alidir"/ali.*gz | ali-to-phones "$alidir/final.mdl" ark:- ark,t:- | \
      strip_utt_ids | \
      utils/int2sym.pl "$lang/phones.txt" > "$dir/phone_graph/train_phones.txt"
  fi
  
  # Stage 1: estimate an order-3 phone LM from the stage-0 text with SRILM's
  # ngram-count and write it, gzip-compressed, as arpa.gz.
  # NOTE(review): the smoothing flags (-addsmooth1, -kndiscount2/3,
  # -interpolate) are SRILM discounting options -- see the ngram-count manual
  # for their exact meaning.
  # Paths are quoted so a directory name with whitespace stays one argument.
  if [ "$stage" -le 1 ]; then
    echo "$0: building ARPA LM"
    ngram-count -text "$dir/phone_graph/train_phones.txt" -order 3  \
      -addsmooth1 1 -kndiscount2 -kndiscount3 -interpolate -lm "$dir/phone_graph/arpa.gz"
  fi
  
  # Set the unigram and unigram-backoff log-probs to -99.  we'll later remove the
  # arcs from the FST.  This is to avoid CLG blowup, and to increase speed.
  
  # Filter: ARPA LM text on stdin -> ARPA LM text on stdout.  Between the
  # "\1-grams" header and the "\2-grams:" header, every line with exactly
  # three fields is rewritten as "-99<TAB><field 2><TAB>-99"; all other lines
  # are copied unchanged.  The record terminator is written as the escape
  # sequence \n: a raw line break inside an awk string literal is a syntax
  # error.
  arpa_disable_unigrams() {
    awk '/\\1-grams/{state=1;} /\\2-grams:/{ state=2; }
       {if(state == 1 && NF == 3) { printf("-99\t%s\t-99\n", $2); } else {print;}}'
  }
  
  if [ "$stage" -le 2 ]; then
    echo "$0: removing unigrams from ARPA LM"
  
    gunzip -c "$dir/phone_graph/arpa.gz" | \
      arpa_disable_unigrams | \
      gzip -c >"$dir/phone_graph/arpa_noug.gz"
  fi
  
  # Stage 3: compile the modified ARPA LM into G_phones.fst.
  # The FST is printed as text, filtered with awk, recompiled and passed
  # through fstconnect.  The awk filter keeps a line only if it has fewer than
  # five fields or its fifth field is below 100.
  # NOTE(review): presumably the fifth field is the arc cost and the filter is
  # what drops the arcs that were given log-prob -99 -- confirm.
  # "#0" is quoted to make explicit that it is a literal symbol name, and the
  # paths are quoted so they stay single arguments.
  if [ "$stage" -le 3 ]; then
    echo "$0: creating G_phones.fst from ARPA"
    gunzip -c "$dir/phone_graph/arpa_noug.gz" | \
      arpa2fst --disambig-symbol='#0' --read-symbol-table="$lang/phones.txt" - - | \
      fstprint | awk '{if (NF < 5 || $5 < 100.0) { print; }}' | fstcompile | \
      fstconnect > "$dir/phone_graph/G_phones.fst"
    # Informational only: a failing stochasticity check does not stop the script.
    fstisstochastic "$dir/phone_graph/G_phones.fst" || echo "[info]: G_phones not stochastic."
  fi
  
  
  # Stage 4: build CLG.fst from G_phones.fst.
  # fstcomposecontext is run with context size N and central position P; it
  # reads the disambiguation symbols from $lang/phones/disambig.int and is
  # given the names of the ilabels_N_P and disambig_ilabels_N_P.int files under
  # phone_graph.  Its output is determinized into CLG.fst.
  # All expansions are quoted so that paths stay single arguments.
  if [ "$stage" -le 4 ]; then
    echo "$0: creating CLG."
  
    fstcomposecontext --context-size="$N" --central-position="$P" \
     --read-disambig-syms="$lang/phones/disambig.int" \
     --write-disambig-syms="$dir/phone_graph/disambig_ilabels_${N}_${P}.int" \
      "$dir/phone_graph/ilabels_${N}_${P}" < "$dir/phone_graph/G_phones.fst" | \
        fstdeterminize >"$dir/phone_graph/CLG.fst"
    # Informational only: a failing stochasticity check does not stop the script.
    fstisstochastic "$dir/phone_graph/CLG.fst"  || echo "[info]: CLG not stochastic."
  fi
  
  # Stage 5: build Ha.fst with make-h-transducer from the ilabels_N_P file
  # under phone_graph plus $dir/tree and $dir/final.mdl, using $tscale as the
  # transition scale.  The --disambig-syms-out file (disambig_tid.int) is
  # passed to fstrmsymbols when HCLGa.fst is built.
  # All expansions are quoted so that paths stay single arguments.
  if [ "$stage" -le 5 ]; then
    echo "$0: creating Ha.fst"
    make-h-transducer --disambig-syms-out="$dir/phone_graph/disambig_tid.int" \
      --transition-scale="$tscale" "$dir/phone_graph/ilabels_${N}_${P}" "$dir/tree" "$dir/final.mdl" \
         > "$dir/phone_graph/Ha.fst"
  fi
  
  # Stage 6: compose Ha.fst with CLG.fst, then determinize, remove the symbols
  # listed in disambig_tid.int, apply fstrmepslocal and minimize, writing
  # HCLGa.fst.
  # All expansions are quoted so that paths stay single arguments.
  if [ "$stage" -le 6 ]; then
    echo "$0: creating HCLGa.fst"
    fsttablecompose "$dir/phone_graph/Ha.fst" "$dir/phone_graph/CLG.fst" | \
        fstdeterminizestar --use-log=true | \
        fstrmsymbols "$dir/phone_graph/disambig_tid.int" | fstrmepslocal | \
        fstminimizeencoded > "$dir/phone_graph/HCLGa.fst" || exit 1;
    # Informational only: a failing stochasticity check does not stop the script.
    fstisstochastic "$dir/phone_graph/HCLGa.fst" || echo "HCLGa is not stochastic"
  fi
  
  # Stage 7: add self-loops to HCLGa.fst to produce the final HCLG.fst, then
  # copy the phone symbol table and the phones/ directory next to the graph.
  
  # Succeeds iff $1 is numerically equal to 1, so that "1", "1.0" and "1.00"
  # are all accepted; a plain string comparison against "1.0" would only
  # accept that exact spelling.
  is_unity_scale() {
    awk -v x="$1" 'BEGIN { exit !(x == 1) }'
  }
  
  if [ "$stage" -le 7 ]; then
    add-self-loops --self-loop-scale="$loopscale" --reorder=true \
      "$dir/final.mdl" < "$dir/phone_graph/HCLGa.fst" > "$dir/phone_graph/HCLG.fst" || exit 1;
  
    if is_unity_scale "$tscale" && is_unity_scale "$loopscale"; then
      # No point doing this test unless both the transition scale and the
      # self-loop scale are 1, as it is bound to fail.
      fstisstochastic "$dir/phone_graph/HCLG.fst" || echo "[info]: final HCLG is not stochastic."
    fi
  
    # $lang/phones.txt is the symbol table that corresponds to the output
    # symbols on the graph; decoding scripts expect it as words.txt.
    cp "$lang/phones.txt" "$dir/phone_graph/words.txt"
    cp -r "$lang/phones" "$dir/phone_graph/"
  fi