#!/bin/bash

# This script trains LMs on the WSJ LM-training data.
# It requires that you have already run wsj_extend_dict.sh,
# to get the larger-size dictionary including all of CMUdict
# plus any OOVs and possible acronyms that we could easily
# derive pronunciations for.

dict_suffix=

echo "$0 $@"  # Print the command line for logging
. utils/parse_options.sh || exit 1;

dir=data/local/local_lm
srcdir=data/local/dict${dict_suffix}_larger
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH

( # First make sure the kaldi_lm toolkit is installed.
 cd $KALDI_ROOT/tools || exit 1;
 if [ -d kaldi_lm ]; then
   echo Not installing the kaldi_lm toolkit since it is already there.
 else
   echo Downloading and installing the kaldi_lm tools
   if [ ! -f kaldi_lm.tar.gz ]; then
     wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
   fi
   tar -xvzf kaldi_lm.tar.gz || exit 1;
   cd kaldi_lm
   make || exit 1;
   echo Done making the kaldi_lm tools
 fi
) || exit 1;

if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
  echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
  echo "You need to run local/wsj_extend_dict.sh before running this script."
  exit 1;
fi

# Get a wordlist-- keep everything but silence, which should not appear in
# the LM.
awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt

# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)"
gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
  'BEGIN{while((getline<w)>0) v[$1]=1;}
   {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}' | sed 's/ $//g' \
  | gzip -c > $dir/train_nounk.gz

# Get unigram counts (without bos/eos, but this doesn't matter here, it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
# Add a 1-count for each word in word-list by including that in the data,
# so all words appear.
gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
  awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
  sort -nr > $dir/unigram.counts

# Get "mapped" words-- a character encoding of the words that makes the
# common words very short.
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map

gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map \
  'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
   { for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' \
  | gzip -c >$dir/train.gz

# To save disk space, remove the un-mapped training data.  We could
# easily generate it again if needed.
rm $dir/train_nounk.gz

train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.

prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139

train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.

prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757

exit 0
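# --- Optional sketch, not part of the original recipe (never reached because
# of the "exit 0" above).  In the wsj/s5 setup the pruned ARPA LMs produced
# above are normally turned into decoding-graph LMs (G.fst) by a separate
# script (local/wsj_format_local_lms.sh), which calls utils/format_lm.sh.
# The lang-directory and output-directory names below are illustrative
# assumptions, and prune_lm.sh is assumed to have written the pruned 3-gram
# to $dir/3gram-mincount/lm_pr6.0.gz.
utils/format_lm.sh data/lang${dict_suffix} \
  $dir/3gram-mincount/lm_pr6.0.gz \
  $srcdir/lexicon.txt data/lang${dict_suffix}_test_tgpr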
### Below here, this script is showing various commands that
## were run during LM tuning.

train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.

prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
# 2.5 million N-grams.

prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139

train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.

prune_lm.sh --arpa 3.0 $dir/4gram-mincount
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
# 2.6 million N-grams.

prune_lm.sh --arpa 4.0 $dir/4gram-mincount
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
# 2.15 million N-grams.

prune_lm.sh --arpa 5.0 $dir/4gram-mincount
# 1.86 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023

prune_lm.sh --arpa 7.0 $dir/4gram-mincount
# 1.50 million N-grams
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757

train_lm.sh --arpa --lmtype 3gram $dir
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
# 20.0 million N-grams

! which ngram-count \
  && echo "SRILM tools not installed so not doing the comparison" && exit 1;

#################
# You could finish the script here if you wanted.
# Below is to show how to do baselines with SRILM.
# You'd have to install the SRILM toolkit first.

heldout_sent=10000 # Don't change this if you want results to be comparable
                   # with the kaldi_lm results.

sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
(echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s

# 3-gram:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437

# Trying 4-gram:
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822

# 3-gram with pruning:
ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
# Around 2.25M N-grams.
# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
# above, which gave 2.5 million N-grams and a perplexity of 156.

# Note: all SRILM experiments above fully discount all singleton 3- and 4-grams.
# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to
# the kaldi_lm experiments above without "-mincount").
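# Sketch (not one of the commands actually run above): the -gt3min/-gt4min
# variant mentioned in the note above, which keeps singleton 3- and 4-grams
# and so is closer to the kaldi_lm setup without "-mincount".  The output
# filename is just illustrative.
ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -gt3min 0 -gt4min 0 -lm $sdir/srilm.o4g.kn.nomin.gz
ngram -order 4 -lm $sdir/srilm.o4g.kn.nomin.gz -ppl $sdir/cleaned.heldout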
## From here on, this shows how to train with IRSTLM.
## This is not really working at the moment.

if [ -z "$IRSTLM" ] ; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
  echo "$0: Error: the IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but" >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi

idir=$dir/irstlm
mkdir -p $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | add-start-end.sh | \
  gzip -c > $idir/train.gz

dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}
                 FNR>1{if ($1 in v) {print $0;}}' > vocab.irstlm.20k

build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
  -n 3 -s improved-kneser-ney -b yes

# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599

# Perplexity is very bad: it should be ~141 (since we used the -p option), but
# it is 175.  Adding -debug 3 to the command line shows that the IRSTLM LM
# does not seem to sum to one properly, so it seems that it produces an LM
# that isn't interpretable in the normal way as an ARPA LM.
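# Sketch (not part of the original notes): the check referred to above --
# re-running the SRILM perplexity computation with verbose output, which also
# prints probability sums over each context and so makes the normalization
# problem visible.  The output is large, so it is redirected to a file here;
# the filename is just illustrative.
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout -debug 3 > $idir/ppl_debug3.txt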