Blame view

egs/wsj/s5/utils/lang/make_unk_lm.sh 14.3 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
  #!/bin/bash
  
  # Copyright      2016 Johns Hopkins University (Author: Daniel Povey);
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABILITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  
  # Begin configuration section.
  # Defaults for the command-line options described in the usage message below;
  # utils/parse_options.sh (sourced after this section) is expected to override
  # them from the command line.
  stage=0                          # steps guarded by a lower stage number are skipped
  cmd=run.pl                       # wrapper used to launch and log the LM-building commands
  use_pocolm=true                  # false => build the LM with utils/lang/make_phone_lm.py
  ngram_order=4                    # validated below to lie in [2, 7]
  num_extra_ngrams=10000           # n-gram budget in addition to the unigrams
  min_word_length=2                # validated below to be 1 or 2
  position_dependent_phones=true   # emit phones carrying _B/_I/_E/_S suffixes
  phone_disambig_symbol='#1'       # symbol used on the input side of backoff arcs

  # end configuration sections

  # Pick up the environment set-up, if the caller's directory provides one.
  if [ -f path.sh ]; then
    . ./path.sh
  fi
  . utils/parse_options.sh
  
  if [ $# -ne 2 ]; then
    # Exactly two positional arguments are required.  Otherwise print the help
    # text to stdout (one printf argument per output line) and exit with status 1.

    # Synopsis and description.
    printf '%s\n' \
      "Usage: $0 [options] <input-dict-dir> <work-dir>" \
      "e.g.: $0 data/local/dict exp/make_unk" \
      "" \
      "This script creates, as an FST, a phone language model suitable for modeling" \
      "the unknown word.  It first trains a language model on the phone sequences of the" \
      "provided dictionary entries (which should be without any word-position-dependency" \
      "tags); it then creates an FST from it, while, for compactness after context-dependency" \
      "limiting the transitions to seen bigram pairs of phones.  Then, by composing with" \
      "a separate FST it converts it into word-position-dependent phones if applicable," \
      "while imposing a minimum-number-of-phones constraint." \
      ""

    # Positional arguments.
    printf '%s\n' \
      "  <input-dict-dir>:  A dictionary directory (as validated by validate_dict_dir.pl);" \
      "             the dictionary from this location (lexicon.txt, lexiconp.txt, or" \
      "             lexiconp_silprob.txt) will be used to train the language model on" \
      "             phones.  The files silence_phones.txt and nonsilence_phones.txt will" \
      "             be used to construct a symbol table used internally, and to" \
      "             exclude lexicon entries containing silences." \
      " <work-dir>:    A place to put logs and the output of this script.  The output of" \
      "                this script will be written to <work-dir>/unk_fst.txt (we write in" \
      "                text form so that it's independent of the phones.txt)."

    # Options.
    printf '%s\n' \
      "Options:" \
      "    --ngram-order <n>                 # (default: 4)  N-gram order of the phone-level language" \
      "                                      # model.  Must be in range [2, 7]" \
      "    --num-extra-ngrams <n>            # (default: 10000).  The maximum the number of n-grams" \
      "                                      # that may be present in the language model in addition" \
      "                                      # to the unigrams.  The LM will be pruned to achieve this." \
      "    --use-pocolm <true|false>         # (default: true).  If true, use pocolm to estimate the" \
      "                                      # language model; you will be prompted to install it if" \
      "                                      # needed.  (If false, we use the script make_phone_lm.py," \
      "                                      # which is simpler but the perplexity is not as good)." \
      "    --position-dependent-phones <true|false>  # (default: true).  If true, assume position-dependent" \
      "                                      # phones (although in any case the lexicon should use position-" \
      "                                      # independent phones).  If position-dependent phones are used," \
      "                                      # after creating the LM we compose with an FST that converts" \
      "                                      # into position-dependent phones while enforcing the natural" \
      "                                      # constraints that they form a single word." \
      "    --min-word-length <1|2>           # (default: 2).  May only be 1 or 2.  The minimum word length" \
      "                                      # (in number of phones) that is allowed" \
      "    --phone-disambig-symbol <symbol>  # default: '#1'.  This is the symbol that will be put on the" \
      "                                      # input side of backoff arcs.  You won't normally have to change" \
      "                                      # this because prepare_lang.sh expects '#1' there."
    exit 1;
  fi
  
  
  # Positional arguments: the input dictionary directory and the work/output
  # directory.
  dict_dir=$1
  dir=$2

  # From here on, abort on the first command that fails.
  set -e
  mkdir -p $dir/log

  # Stage 0: validate the input dictionary directory.  The validator's stdout
  # and stderr both go to a log file, which is shown only if validation fails.
  if [ $stage -le 0 ]; then
    validate_log=$dir/log/validate_dict_dir.log
    utils/validate_dict_dir.pl $dict_dir >$validate_log 2>&1 || {
      cat $validate_log
      echo "$0: failed to validate input dict-dir $dict_dir"
      exit 1
    }
  fi
  
  # Range-check the numeric options.  The tests are written as "not (lo <= x and
  # x <= hi)" so that an empty or non-integer value, which makes '[' fail, is
  # rejected as well.
  if ! { [ $ngram_order -ge 2 ] && [ $ngram_order -le 7 ]; }; then
    echo "$0: invalid --ngram-order $ngram_order (must be in [2,7])"
    exit 1
  fi

  if ! { [ $min_word_length -ge 1 ] && [ $min_word_length -le 2 ]; }; then
    echo "$0: invalid --min-word-length $min_word_length (must be in [1,2])"
    exit 1
  fi
  
  # The next command creates a symbol table that will cover all the symbols we might
  # possibly need in this script.  The word-position-dependent suffixes (_B and so on)
  # won't be needed if --position-dependent-phones is false, but it won't hurt.
  # Layout of $dir/phones.txt: "<eps> 0" first, then every phone from
  # silence_phones.txt and nonsilence_phones.txt (one per whitespace-separated
  # field) followed by its _B/_I/_S/_E variants, numbered from 1, and finally the
  # phone disambiguation symbol on the last line.
  cat $dict_dir/silence_phones.txt $dict_dir/nonsilence_phones.txt | \
    awk '{for(n=1;n<=NF;n++) print $n; }' | \
    awk '{print $1; print $1 "_B"; print $1 "_I"; print $1 "_S"; print $1 "_E";}' | \
    cat - <(echo "$phone_disambig_symbol") | \
    awk 'BEGIN{print "<eps> 0";} {print $1, NR;}' > $dir/phones.txt

  # The disambiguation symbol was appended last, so its integer id is the second
  # field of the last line of phones.txt.
  phone_disambig_int=$(tail -n 1 <$dir/phones.txt | awk '{print $2}')

  # Sanity check that we really obtained an integer.  '-eq' on quoted operands
  # makes '[' fail for an empty or non-numeric value; the previous unquoted
  # string comparison of the variable with itself could never fail.
  if ! [ "$phone_disambig_int" -eq "$phone_disambig_int" ] 2>/dev/null; then
    echo "$0: problem working out integer form of phone-disambig symbol."
    exit 1;
  fi
  
  # Choose the lexicon file to train on, in order of preference, and record the
  # index of the first field that holds a phone (the fields before it are the
  # word and, for the lexiconp* formats, the numeric fields that precede the
  # pronunciation).
  if [ -e $dict_dir/lexicon.txt ]; then
    src_dict=$dict_dir/lexicon.txt
    first_phone_field=2
  elif [ -e $dict_dir/lexiconp.txt ]; then
    src_dict=$dict_dir/lexiconp.txt
    first_phone_field=3
  elif [ -e $dict_dir/lexiconp_silprob.txt ]; then
    # Note: this previously pointed at "lexiconp_silprob.tt", a file name that
    # the existence check above never tests for.
    src_dict=$dict_dir/lexiconp_silprob.txt
    first_phone_field=6
  else
    echo "$0: expected file $dict_dir/lexiconp_silprob.txt to exist"
    exit 1
  fi
  
  # Flatten silence_phones.txt into one phone per line, so the awk program below
  # can load it as a simple lookup table.
  cat $dict_dir/silence_phones.txt | awk '{ for (i = 1; i <= NF; i++) print $i; }' > $dir/silence_phones.txt

  # Write the LM training text: one pronunciation per line, consisting of the
  # phone fields only (everything before field 'ff' is dropped).  Entries that
  # contain a silence phone, or that have no phone fields at all, are left out
  # and reported on stderr.
  awk -v dir=$dir -v ff=$first_phone_field '
    BEGIN {
      sil_file = dir "/silence_phones.txt";
      while ((getline < sil_file) > 0) sil[$1] = 1;
    }
    {
      keep = (NF >= ff);
      for (i = ff; i <= NF; i++) { if ($i in sil) keep = 0; }
      if (keep) {
        for (i = ff; i <= NF; i++) printf("%s ", $i);
        print "";
      } else {
        print("make_unk_lm.sh: info: not including dict line: ", $0) > "/dev/stderr";
      }
    }' <$src_dict >$dir/training.txt

  # The set of distinct phones that occur in the training text.
  cat $dir/training.txt | \
    awk '{ for (i = 1; i <= NF; i++) seen[$i] = 1; } END { for (ph in seen) print ph; }' \
    > $dir/all_nonsil_phones

  num_dict_lines=$(wc -l <$src_dict)
  num_train_lines=$(wc -l < $dir/training.txt)
  [ $num_train_lines -gt 0 ] || {
    echo "$0: something went wrong getting text to train phone-level LM."
    exit 1
  }
  echo "$0: training on $num_train_lines words out of $num_dict_lines in the "
  echo "     ... original dictionary (excluding words with silence phones)."
  
  
  # With fewer than 2000 training lines, fall back from pocolm to
  # make_phone_lm.py and tell the user why.
  if $use_pocolm && [ $num_train_lines -lt 2000 ]; then
    printf '%s\n' \
      "$0: the number of lines of training data is very small [$num_train_lines]." \
      "    Setting --use-pocolm to false since it probably won't work well" \
      "    on so little data (e.g. hard to estimate the discounting parameters)" \
      "    Using make_phone_lm.py instead."
    use_pocolm=false
  fi
  
  # Build the phone-level language model and write it, as a text-form FST, to
  # $dir/unk_fst_orig.txt.  Two alternative routes:
  #   - pocolm:  train (stage 1), prune and export to ARPA (stage 2), then
  #              convert the ARPA file to an FST restricted to the phone
  #              bigrams seen in training (stage 3);
  #   - otherwise: a single step (stage 1) using utils/lang/make_phone_lm.py.
  if $use_pocolm; then
    if [ ! -e $KALDI_ROOT/tools/pocolm ]; then
      echo "$0: $KALDI_ROOT/tools/pocolm does not exist:"
      echo " ... please do:  cd $KALDI_ROOT/tools; extras/install_pocolm.sh"
      echo " ... and then rerun this script."
      exit 1
    fi

    # Make the pocolm scripts (presumably train_lm.py, prune_lm_dir.py and
    # format_arpa_lm.py, used below) available on the PATH.
    PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH

    if [ $stage -le 1 ]; then
      echo "$0: training $ngram_order-gram LM with pocolm"

      mkdir -p $dir/pocolm/text
      heldout_ratio=5  # hold out one fifth of the data as validation to estimate
      # metaparameters; we'll fold it back in before estimating the
      # final LM.
      # Every heldout_ratio'th line goes to dev.txt, all other lines to train.txt.
      cat $dir/training.txt | awk -v h=$heldout_ratio '{if(NR%h == 0) print; }' > $dir/pocolm/text/dev.txt
      cat $dir/training.txt | awk -v h=$heldout_ratio '{if(NR%h != 0) print; }' > $dir/pocolm/text/train.txt


      # the following options are because we expect the amount of data to be small,
      # all the data subsampling isn't really needed and will increase the chance of
      # something going wrong.

      small_data_opts="--num-splits 4 --warm-start-ratio 1"
      $cmd $dir/log/train_lm.log \
           train_lm.py --wordlist $dir/all_nonsil_phones $small_data_opts \
           --fold-dev-into=train $dir/pocolm/text $ngram_order $dir/pocolm
    fi

    if [ $stage -le 2 ]; then
      echo "$0: pruning LM with pocolm"
      # Target size: the unigrams (one per phone) plus the allowed extra n-grams.
      num_words=$(wc -l <$dir/all_nonsil_phones)
      num_ngrams=$((num_extra_ngrams + num_words))

      # The pruned LM is written under $dir/pocolm, which already exists at this
      # point (this path used to be spelled "poclm").
      $cmd $dir/log/prune_lm_dir.log \
           prune_lm_dir.py --target-num-ngrams=$num_ngrams \
           $dir/pocolm/all_nonsil_phones_${ngram_order}.pocolm $dir/pocolm/lm_pruned

      # format as arpa.
      format_arpa_lm.py $dir/pocolm/lm_pruned > $dir/pocolm.arpa
    fi

    if [ $stage -le 3 ]; then
      echo "$0: applying bigram constraints and converting from ARPA to FST"
      # now get bigram constraints: we want to get an FST that only allows phone
      # bigrams that we've seen (this may enforce certain linguistic constraints,
      # and also stops the graph from blowing up too much once we introduce
      # phonetic context).
      # The NF > 0 is just a double-check that there are no empty prons, which
      # would be bad as it would allow an empty pronunciation of the unknown word.
      # Each pronunciation is wrapped in <s> ... </s> so that the allowed
      # sentence-initial and sentence-final phones are recorded as bigrams too.
      cat $dir/training.txt | awk '{ if (NF > 0) printf("<s> %s </s>\n", $0); }' | \
        awk '{for(n=1;n<NF;n++) { m=n+1; seen[ $n " " $m ] = 1; }} END{for(k in seen) print k;}' \
            > $dir/allowed_bigrams

      $cmd $dir/log/arpa2fst.log \
           utils/lang/internal/arpa2fst_constrained.py --verbose=3 \
             --disambig-symbol="$phone_disambig_symbol" \
           $dir/pocolm.arpa $dir/allowed_bigrams '>' $dir/unk_fst_orig.txt
    fi
  else

    if [ $stage -le 1 ]; then
      echo "$0: using make_phone_lm.py to create $ngram_order-gram language-model FST"
      # make_phone_lm.py is fed the training text in integer form and is given
      # the integer id of the disambiguation symbol; fields 3-4 of its output
      # are mapped back to symbols using phones.txt.  The quoted '|' and '>' are
      # passed through as arguments to $cmd rather than interpreted here.
      $cmd $dir/log/make_phone_lm.log \
           utils/sym2int.pl $dir/phones.txt $dir/training.txt '|' \
           utils/lang/make_phone_lm.py --verbose=2 \
           --phone-disambig-symbol=$phone_disambig_int \
           --num-extra-ngrams=$num_extra_ngrams \
           --ngram-order=$ngram_order '|' \
           utils/int2sym.pl -f 3-4 $dir/phones.txt '>' $dir/unk_fst_orig.txt
    fi
  fi
  
  
  # Symbol-table options shared by the fstcompile/fstprint calls in the rest of
  # the script: phones.txt is used for both input and output labels.
  sym_opts="--isymbols=$dir/phones.txt --osymbols=$dir/phones.txt"

  # Write $dir/constraint_fst.txt, a text-form FST that enforces the minimum
  # word length and, if requested, rewrites plain phones into their
  # word-position-dependent variants.  The one case that needs no constraint FST
  # finishes the whole script right here.
  if ! $position_dependent_phones; then
    if  [ $min_word_length == 1 ]; then
      echo "$0: no word-length constraint or word-position-dependency, so exiting."
      # There is no need to compose unk_fst_orig.txt with a separate FST: because of
      # the bigram constraints and because we ensure that there were no empty prons
      # in the dictionary (no empty lines in training.txt), the FST wouldn't allow
      # length-zero words anyway.
      cp $dir/unk_fst_orig.txt $dir/unk_fst.txt
      fstcompile $sym_opts <$dir/unk_fst.txt >$dir/unk.fst
      exit 0;
    else
      echo "$0: creating constraint_fst.txt for min-word-length=2 constraint."
      # min-word-length is 2; we need to apply that constraint.  A note on the FST
      # states: 0 is start state, 1 is "seen one phone", 2 is "seen two or more
      # phones".
      # We don't need to take into account the disambig symbol because we compose on
      # the right with this FST, and it doesn't appear on the output side.
      # NOTE(review): the composition step later in this script first runs
      # 'fstproject' on the LM FST, which (per the comments there) is meant to put
      # the disambiguation symbol on the output side as well; unlike the
      # position-dependent branch below, this FST has no arcs for that symbol.
      # Please confirm that backoff arcs survive the composition in this branch.
      cat $dir/all_nonsil_phones | \
        awk '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; }
                   print 2,0.0; }' > $dir/constraint_fst.txt
    fi
  else
    echo "$0: creating constraint_fst.txt for min-word-length=$min_word_length constraint, plus word-position-dependency conversion."

    # Add constraints and convert phones without tags into phones with the _B, _E, _I and _S
    # tags (begin, end, internal, singleton).

    # States:
    # 0 is start state,
    # 1 is "seen initial phone (and maybe internal phones) of multi-phone word",
    # 2 is "seen final phone of multi-phone word".
    # 3 is "seen phone of single-phone word"; note, if --min-word-length is 2,
    #      then state 3 will not exist.
    #
    # States 0, 1 and 2 each get a self-loop on the disambiguation symbol.
    # States 2 and (when it exists) 3 are final.  Every printf below ends in an
    # explicit "\n" so that each arc is written on its own line.

    cat $dir/all_nonsil_phones | \
      awk -v mwl=$min_word_length -v "disambig=$phone_disambig_symbol" \
        '{ ph[$1] = 1 }
         END {
           for (n = 0; n < 3; n++) print n, n, disambig, disambig;
           for (p in ph) {
             printf("0 1 %s %s_B\n", p, p);
             printf("1 1 %s %s_I\n", p, p);
             printf("1 2 %s %s_E\n", p, p);
             if (mwl == 1) printf("0 3 %s %s_S\n", p, p);
           }
           print 2, 0.0;
           if (mwl == 1) print 3, 0.0;
         }' >$dir/constraint_fst.txt
  fi
  
  
  echo "$0: creating final FST via composition, etc."

  # Compile the two text-form FSTs; the constraint FST is arc-sorted as well.
  fstcompile $sym_opts <$dir/constraint_fst.txt | fstarcsort > $dir/constraint.fst
  fstcompile $sym_opts <$dir/unk_fst_orig.txt >$dir/unk_orig.fst

  # What each stage of the pipeline below is for, in order:
  #   fstproject            projects on the input, so that the disambiguation
  #                         symbol appears on the output side also.
  #   fstcompose            applies the constraints and does the conversion; after
  #                         this the "correct" phones appear only on the output
  #                         side.
  #   fstproject (output)   copies the word-position-dependent phones to the
  #                         input side.
  #   fstpushspecial        pushes the weights, as the composition with the
  #                         constraint FST makes the FST quite non-stochastic
  #                         [weights per state do not sum up to one].
  #   fstminimizeencoded    combines states that are the same as far as their
  #                         output arcs are concerned; in the case where
  #                         --min-word-length is 1, this combines a lot of
  #                         final-states that have no transitions out of them.
  #   fstrmsymbols          makes sure the disambiguation symbol appears only on
  #                         the input side.
  fstproject $dir/unk_orig.fst |
    fstcompose - $dir/constraint.fst |
    fstproject --project_output=true |
    fstpushspecial |
    fstminimizeencoded |
    fstrmsymbols --remove-from-output=true <(echo $phone_disambig_int) >$dir/unk.fst

  # Also write the result in text form, which is the documented output of this
  # script.
  fstprint $sym_opts <$dir/unk.fst >$dir/unk_fst.txt


  exit 0;