Blame view
egs/wsj/s5/utils/lang/make_unk_lm.sh
14.3 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 |
#!/bin/bash

# Copyright 2016 Johns Hopkins University (Author: Daniel Povey);

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script builds a phone-level language model, in FST form, for modeling
# the unknown word.  This first part parses the options, validates the input
# dictionary directory, writes an internal symbol table ($dir/phones.txt) and
# selects which lexicon file the phone LM will be trained on.

# Begin configuration section.
cmd=run.pl
ngram_order=4
num_extra_ngrams=10000
position_dependent_phones=true
use_pocolm=true
min_word_length=2
stage=0
phone_disambig_symbol="#1"
# end configuration sections

[ -f path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <input-dict-dir> <work-dir>"
  echo "e.g.: $0 data/local/dict exp/make_unk"
  echo ""
  echo "This script creates, as an FST, a phone language model suitable for modeling"
  echo "the unknown word. It first trains a language model on the phone sequences of the"
  echo "provided dictionary entries (which should be without any word-position-dependency"
  echo "tags); it then creates an FST from it, while, for compactness after context-dependency"
  echo "limiting the transitions to seen bigram pairs of phones. Then, by composing with"
  echo "a separate FST it converts it into word-position-dependent phones if applicable,"
  echo "while imposing a minimum-number-of-phones constraint."
  echo ""
  echo " <input-dict-dir>: A dictionary directory (as validated by validate_dict_dir.pl);"
  echo " the dictionary from this location (lexicon.txt, lexiconp.txt, or"
  echo " lexiconp_silprob.txt) will be used to train the language model on"
  echo " phones. The files silence_phones.txt and nonsilence_phones.txt will"
  echo " be used to construct a symbol table used internally, and to"
  echo " exclude lexicon entries containing silences."
  echo " <work-dir>: A place to put logs and the output of this script. The output of"
  echo " this script will be written to <work-dir>/unk_fst.txt (we write in"
  echo " text form so that it's independent of the phones.txt)."
  echo "Options:"
  echo " --ngram-order <n> # (default: 4) N-gram order of the phone-level language"
  echo " # model. Must be in range [2, 7]"
  echo " --num-extra-ngrams <n> # (default: 10000). The maximum the number of n-grams"
  echo " # that may be present in the language model in addition"
  echo " # to the unigrams. The LM will be pruned to achieve this."
  echo " --use-pocolm <true|false> # (default: true). If true, use pocolm to estimate the"
  echo " # language model; you will be prompted to install it if"
  echo " # needed. (If false, we use the script make_phone_lm.py,"
  echo " # which is simpler but the perplexity is not as good)."
  echo " --position-dependent-phones <true|false> # (default: true). If true, assume position-dependent"
  echo " # phones (although in any case the lexicon should use position-"
  echo " # independent phones). If position-dependent phones are used,"
  echo " # after creating the LM we compose with an FST that converts"
  echo " # into position-dependent phones while enforcing the natural"
  echo " # constraints that they form a single word."
  echo " --min-word-length <1|2> # (default: 2). May only be 1 or 2. The minimum word length"
  echo " # (in number of phones) that is allowed"
  echo " --phone-disambig-symbol <symbol> # default: '#1'. This is the symbol that will be put on the"
  echo " # input side of backoff arcs. You won't normally have to change"
  echo " # this because prepare_lang.sh expects '#1' there."
  exit 1;
fi

dict_dir=$1
dir=$2

set -e
mkdir -p "$dir/log"

if [ "$stage" -le 0 ]; then
  # Send both stdout and stderr of the validator to the log, and show the log
  # to the user only if validation fails.
  if ! utils/validate_dict_dir.pl "$dict_dir" >"$dir/log/validate_dict_dir.log" 2>&1; then
    cat "$dir/log/validate_dict_dir.log"
    echo "$0: failed to validate input dict-dir $dict_dir"
    exit 1
  fi
fi

if ! [ "$ngram_order" -ge 2 ] || ! [ "$ngram_order" -le 7 ]; then
  echo "$0: invalid --ngram-order $ngram_order (must be in [2,7])"
  exit 1
fi

if ! [ "$min_word_length" -ge 1 ] || ! [ "$min_word_length" -le 2 ]; then
  echo "$0: invalid --min-word-length $min_word_length (must be in [1,2])"
  exit 1
fi

# The next command creates a symbol table that will cover all the symbols we might
# possibly need in this script.  The word-position-dependent suffixes (_B and so on)
# won't be needed if --position-dependent-phones is false, but it won't hurt.
# The disambiguation symbol is appended last, so it gets the highest integer id.
cat "$dict_dir/silence_phones.txt" "$dict_dir/nonsilence_phones.txt" | \
  awk '{for(n=1;n<=NF;n++) print $n; }' | \
  awk '{print $1; print $1 "_B"; print $1 "_I"; print $1 "_S"; print $1 "_E";}' | \
  cat - <(echo "$phone_disambig_symbol") | \
  awk 'BEGIN{print "<eps> 0";} {print $1, NR;}' > "$dir/phones.txt"

# The disambiguation symbol is on the last line of phones.txt; its id is field 2.
phone_disambig_int=$(tail -n 1 <"$dir/phones.txt" | awk '{print $2}')
# Check that we really obtained a positive integer (0 is reserved for <eps>).
# A non-numeric or empty value makes 'test' fail, which is what we want here.
if ! [ "$phone_disambig_int" -gt 0 ] 2>/dev/null; then
  echo "$0: problem working out integer form of phone-disambig symbol."
  exit 1;
fi

# Choose the lexicon to train on; first_phone_field is the index of the first
# field on each line that holds a phone (the fields before it are the word and,
# for lexiconp*.txt, probabilities).
if [ -e "$dict_dir/lexicon.txt" ]; then
  src_dict=$dict_dir/lexicon.txt
  first_phone_field=2
elif [ -e "$dict_dir/lexiconp.txt" ]; then
  src_dict=$dict_dir/lexiconp.txt
  first_phone_field=3
else
  if [ ! -e "$dict_dir/lexiconp_silprob.txt" ]; then
    echo "$0: expected file $dict_dir/lexiconp_silprob.txt to exist"
    exit 1
  fi
  src_dict=$dict_dir/lexiconp_silprob.txt
  first_phone_field=6
fi

# One silence phone per line, for use when filtering the lexicon.
awk '{for(n=1;n<=NF;n++) print $n; }' "$dict_dir/silence_phones.txt" > "$dir/silence_phones.txt"

# prepare the cleaned up version of the dictionary (to train our phone LM), with
# the first field (the word) removed, with prons that have silence phones in
# them removed, and with empty prons (which should not be allowed anyway, but
# just in case..) removed.
awk -v dir="$dir" -v ff="$first_phone_field" \
  'BEGIN{ while ((getline <(dir"/silence_phones.txt")) > 0) sil[$1]=1; }
   { ok=1;
     for (n=ff; n<=NF; n++) { if ($n in sil) ok=0; }
     if (ok && NF>=ff) { for (n=ff;n<=NF;n++) printf("%s ",$n); print ""; }
     else { print("make_unk_lm.sh: info: not including dict line: ", $0) >"/dev/stderr" }}' \
  <"$src_dict" >"$dir/training.txt"

# The set of phones that actually occur in the training text, one per line.
awk '{for(n=1;n<=NF;n++) seen[$n]=1; } END{for (k in seen) print k;}' \
  "$dir/training.txt" > "$dir/all_nonsil_phones"

num_dict_lines=$(wc -l <"$src_dict")
num_train_lines=$(wc -l <"$dir/training.txt")

if ! [ "$num_train_lines" -gt 0 ]; then
  echo "$0: something went wrong getting text to train phone-level LM."
  exit 1
fi

echo "$0: training on $num_train_lines words out of $num_dict_lines in the "
echo " ... original dictionary (excluding words with silence phones)."

if [ "$num_train_lines" -lt 2000 ] && $use_pocolm; then
  echo "$0: the number of lines of training data is very small [$num_train_lines]."
  echo " Setting --use-pocolm to false since it probably won't work well"
  echo " on so little data (e.g. hard to estimate the discounting parameters)"
  echo " Using make_phone_lm.py instead."
  use_pocolm=false
fi

if $use_pocolm; then
  if [ ! -e "$KALDI_ROOT/tools/pocolm" ]; then
    echo "$0: $KALDI_ROOT/tools/pocolm does not exist:"
    echo " ... please do: cd $KALDI_ROOT/tools; extras/install_pocolm.sh"
    echo " ... and then rerun this script."
    exit 1
  fi
  PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH

  if [ "$stage" -le 1 ]; then
    echo "$0: training $ngram_order-gram LM with pocolm"
    mkdir -p "$dir/pocolm/text"
    heldout_ratio=5  # hold out one fifth of the data as validation to estimate
                     # metaparameters; we'll fold it back in before estimating the
                     # final LM.
    awk -v h=$heldout_ratio '{if(NR%h == 0) print; }' "$dir/training.txt" > "$dir/pocolm/text/dev.txt"
    awk -v h=$heldout_ratio '{if(NR%h != 0) print; }' "$dir/training.txt" > "$dir/pocolm/text/train.txt"

    # the following options are because we expect the amount of data to be small,
    # all the data subsampling isn't really needed and will increase the chance of
    # something going wrong.
    small_data_opts="--num-splits 4 --warm-start-ratio 1"

    # $cmd and $small_data_opts are deliberately unquoted: both may hold
    # several words that must be split into separate arguments.
    $cmd $dir/log/train_lm.log \
      train_lm.py --wordlist $dir/all_nonsil_phones $small_data_opts \
      --fold-dev-into=train $dir/pocolm/text $ngram_order $dir/pocolm
  fi

  if [ "$stage" -le 2 ]; then
    echo "$0: pruning LM with pocolm"
    num_words=$(wc -l <"$dir/all_nonsil_phones")
    # The pruning target counts all n-grams, so add the unigrams (one per
    # phone) to the requested number of extra n-grams.
    num_ngrams=$((num_extra_ngrams + num_words))
    # Note: the 'poclm' directory name below is spelled this way in both
    # commands; it is kept as-is so existing work directories stay usable.
    $cmd $dir/log/prune_lm_dir.log \
      prune_lm_dir.py --target-num-ngrams=$num_ngrams \
      $dir/pocolm/all_nonsil_phones_${ngram_order}.pocolm $dir/poclm/lm_pruned
    # format as arpa.
    format_arpa_lm.py "$dir/poclm/lm_pruned" > "$dir/pocolm.arpa"
  fi

  if [ "$stage" -le 3 ]; then
    echo "$0: applying bigram constraints and converting from ARPA to FST"
    # now get bigram constraints: we want to get an FST that only allows phone
    # bigrams that we've seen (this may enforce certain linguistic constraints,
    # and also stops the graph from blowing up too much once we introduce
    # phonetic context).
    # The NF > 0 is just a double-check that there are no empty prons, which
    # would be bad as it would allow an empty pronunciation of the unknown word.
    # Each pronunciation must stay on its own line (hence the "\n"): otherwise
    # the second awk would see one giant line and record a spurious
    # "</s> <s>" bigram between consecutive pronunciations.
    awk '{ if (NF > 0) printf("<s> %s </s>\n", $0); }' "$dir/training.txt" | \
      awk '{for(n=1;n<NF;n++) { m=n+1; seen[ $n " " $m ] = 1; }} END{for(k in seen) print k;}' \
      > "$dir/allowed_bigrams"

    $cmd $dir/log/arpa2fst.log \
      utils/lang/internal/arpa2fst_constrained.py --verbose=3 \
      --disambig-symbol="$phone_disambig_symbol" \
      $dir/pocolm.arpa $dir/allowed_bigrams '>' $dir/unk_fst_orig.txt
  fi
else
  if [ "$stage" -le 1 ]; then
    echo "$0: using make_phone_lm.py to create $ngram_order-gram language-model FST"
    # The quoted '|' and '>' are passed through as arguments to $cmd rather
    # than being interpreted by this shell.
    $cmd $dir/log/make_phone_lm.log \
      utils/sym2int.pl $dir/phones.txt $dir/training.txt '|' \
      utils/lang/make_phone_lm.py --verbose=2 \
      --phone-disambig-symbol=$phone_disambig_int \
      --num-extra-ngrams=$num_extra_ngrams \
      --ngram-order=$ngram_order '|' \
      utils/int2sym.pl -f 3-4 $dir/phones.txt '>'$dir/unk_fst_orig.txt
  fi
fi

# Deliberately left unquoted where used: it expands to two separate options.
sym_opts="--isymbols=$dir/phones.txt --osymbols=$dir/phones.txt"

if ! $position_dependent_phones; then
  if [ "$min_word_length" -eq 1 ]; then
    echo "$0: no word-length constraint or word-position-dependency, so exiting."
    # There is no need to compose unk_fst_orig.txt with a separate FST: because of
    # the bigram constraints and because we ensure that there were no empty prons
    # in the dictionary (no empty lines in training.txt), the FST wouldn't allow
    # length-zero words anyway.
    cp "$dir/unk_fst_orig.txt" "$dir/unk_fst.txt"
    fstcompile $sym_opts <"$dir/unk_fst.txt" >"$dir/unk.fst"
    exit 0;
  else
    echo "$0: creating constraint_fst.txt for min-word-length=2 constraint."
    # min-word-length is 2; we need to apply that constraint.  A note on the FST
    # states: 0 is start state, 1 is "seen one phone", 2 is "seen two or more
    # phones".
    # We don't need to take into account the disambig symbol because we compose on
    # the right with this FST, and it doesn't appear on the output side.
    awk '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; }
           print 2,0.0; }' "$dir/all_nonsil_phones" > "$dir/constraint_fst.txt"
  fi
else
  echo "$0: creating constraint_fst.txt for min-word-length=$min_word_length constraint, plus word-position-dependency conversion."
  # Add constraints and convert phones without tags into phones with the _B, _E, _I and _S
  # tags (begin, end, internal, singleton).
  # States:
  #   0 is start state,
  #   1 is "seen initial phone (and maybe internal phones) of multi-phone word",
  #   2 is "seen final phone of multi-phone word".
  #   3 is "seen phone of single-phone word"; note, if --min-word-length is 2,
  #     then state 3 will not exist.
  # Every arc must be on its own line (hence the "\n" in each printf), as
  # required by the text FST format read by fstcompile.
  awk -v mwl="$min_word_length" -v "disambig=$phone_disambig_symbol" \
    '{ph[$1]=1} END{
       for (n=0; n<3; n++) print n,n,disambig,disambig;
       for (p in ph) {
         printf("0 1 %s %s_B\n", p, p);
         printf("1 1 %s %s_I\n", p, p);
         printf("1 2 %s %s_E\n", p, p);
         if (mwl==1) printf("0 3 %s %s_S\n", p, p);
       }
       print 2,0.0;
       if (mwl==1) print 3,0.0; }' "$dir/all_nonsil_phones" >"$dir/constraint_fst.txt"
fi

echo "$0: creating final FST via composition, etc."

fstcompile $sym_opts <"$dir/constraint_fst.txt" | fstarcsort > "$dir/constraint.fst"
fstcompile $sym_opts <"$dir/unk_fst_orig.txt" >"$dir/unk_orig.fst"

# The first 'fstproject' below projects on the input; it makes sure the
#   disambiguation symbol appears on the output side also.
# The fstcompose actually applies the constraints and does the conversion, but
#   after this the "correct" phones appear only on the output side.
# The second 'fstproject' copies the word-position-dependent phones to
#   the input side.
# The 'fstpushspecial' pushes the weights, as the composition with the
#   constraint FST makes the FST quite non-stochastic [weights per state do not
#   sum up to one].
# 'fstminimizeencoded' combines states that are the same as far as their output
#   arcs are concerned; in the case where --min-word-length is 1, this combines
#   a lot of final-states that have no transitions out of them.
# The 'fstrmsymbols' command makes sure the disambiguation symbol appears only
#   on the input side.
fstproject "$dir/unk_orig.fst" | \
  fstcompose - "$dir/constraint.fst" | \
  fstproject --project_output=true | \
  fstpushspecial | \
  fstminimizeencoded | \
  fstrmsymbols --remove-from-output=true <(echo "$phone_disambig_int") >"$dir/unk.fst"

fstprint $sym_opts <"$dir/unk.fst" >"$dir/unk_fst.txt"

exit 0;