  #!/bin/bash
  
  # This script trains LMs on the WSJ LM-training data.
  # It requires that you have already run wsj_extend_dict.sh,
  # to get the larger-size dictionary including all of CMUdict
  # plus any OOVs and possible acronyms that we could easily 
  # derive pronunciations for.
  
dict_suffix=   # Optional suffix on the source dictionary directory name (see srcdir below).
  
  echo "$0 $@"  # Print the command line for logging
  . utils/parse_options.sh || exit 1;
  
  dir=data/local/local_lm
  srcdir=data/local/dict${dict_suffix}_larger
  mkdir -p $dir
  . ./path.sh || exit 1; # for KALDI_ROOT
  export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH
  ( # First make sure the kaldi_lm toolkit is installed.
   cd $KALDI_ROOT/tools || exit 1;
   if [ -d kaldi_lm ]; then
     echo Not installing the kaldi_lm toolkit since it is already there.
   else
     echo Downloading and installing the kaldi_lm tools
     if [ ! -f kaldi_lm.tar.gz ]; then
       wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
     fi
     tar -xvzf kaldi_lm.tar.gz || exit 1;
     cd kaldi_lm
     make || exit 1;
     echo Done making the kaldi_lm tools
   fi
  ) || exit 1;
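
# Optional sanity check (not part of the original recipe): the kaldi_lm
# scripts used below (train_lm.sh, prune_lm.sh, get_word_map.pl) should now
# be reachable via the PATH set above; e.g.
# for tool in train_lm.sh prune_lm.sh get_word_map.pl; do
#   command -v $tool >/dev/null || echo "$0: warning: $tool not found on PATH"
# done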
  
  
  
  if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then
    echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist";
    echo "You need to run local/wsj_extend_dict.sh before running this script."
    exit 1;
  fi
  
  # Get a wordlist-- keep everything but silence, which should not appear in
  # the LM.
  awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt
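
# Optional check (not part of the original recipe): wordlist.txt should have
# one word per line and no silence entry; e.g.
# wc -l $dir/wordlist.txt
# grep -c -w '!SIL' $dir/wordlist.txt   # expect 0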
  
  # Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
  echo "Getting training data with OOV words replaced with <UNK> (train_nounk.gz)" 
  gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \
    'BEGIN{while((getline<w)>0) v[$1]=1;}
    {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
    | gzip -c > $dir/train_nounk.gz
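
# Optional check (not part of the original recipe): a rough OOV rate for the
# training text, i.e. what fraction of tokens became <UNK>:
# gunzip -c $dir/train_nounk.gz | \
#   awk '{for(i=1;i<=NF;i++){n++; if($i=="<UNK>") u++;}} END{printf("%d of %d tokens are <UNK>\n", u, n);}'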
  
# Get unigram counts (without bos/eos, but this doesn't matter here; it's
# only to get the word-map, which treats them specially & doesn't need their
# counts).
  # Add a 1-count for each word in word-list by including that in the data,
  # so all words appear.
  gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \
    awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
   sort -nr > $dir/unigram.counts
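
# Optional check (not part of the original recipe): unigram.counts holds
# "count word" pairs sorted by descending count, so the most frequent words
# are at the top:
# head $dir/unigram.counts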
  
  # Get "mapped" words-- a character encoding of the words that makes the common words very short.
  cat $dir/unigram.counts  | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map
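
# Optional check (not part of the original recipe): word_map holds one
# "word mapped-form" pair per line (this is how the awk command below reads
# it), with the most frequent words getting the shortest codes:
# head $dir/word_map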
  
  gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
    { for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz
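
# Optional check (not part of the original recipe): the mapping is invertible,
# so reversing word_map on a line of train.gz should give back readable text
# (words from wordlist.txt, or <UNK>):
# gunzip -c $dir/train.gz | head -n 1 | \
#   awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0) unmap[$2]=$1;}
#     {for(n=1;n<=NF;n++){printf("%s%s", unmap[$n], (n<NF ? " " : "\n"));}}'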
  
  # To save disk space, remove the un-mapped training data.  We could
  # easily generate it again if needed.
  rm $dir/train_nounk.gz 
  
  train_lm.sh --arpa --lmtype 3gram-mincount $dir
  #Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
  # 7.8 million N-grams.
  
  prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
  # 1.45 million N-grams.
  # Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
  
  train_lm.sh --arpa --lmtype 4gram-mincount $dir
  #Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
  # 10.3 million N-grams.
  
  prune_lm.sh --arpa 7.0 $dir/4gram-mincount
  # 1.50 million N-grams
  # Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
  
  
  exit 0
  
### Below here, this script shows various commands that
### were run during LM tuning.
  
  train_lm.sh --arpa --lmtype 3gram-mincount $dir
  #Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
  # 7.8 million N-grams.
  
  prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
  #Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740
  # 2.5 million N-grams.
  
  prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
  # 1.45 million N-grams.
  # Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
  
  train_lm.sh --arpa --lmtype 4gram-mincount $dir
  #Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
  # 10.3 million N-grams.
  
  prune_lm.sh --arpa 3.0 $dir/4gram-mincount
  #Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294
  # 2.6 million N-grams.
  
  prune_lm.sh --arpa 4.0 $dir/4gram-mincount
  # Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717
  # 2.15 million N-grams.
  
  prune_lm.sh --arpa 5.0 $dir/4gram-mincount
  # 1.86 million N-grams
  # Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023
  
  prune_lm.sh --arpa 7.0 $dir/4gram-mincount
  # 1.50 million N-grams
  # Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757
  
  train_lm.sh --arpa --lmtype 3gram $dir
  # Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866
  # 20.0 million N-grams
  
  ! which ngram-count  \
    && echo "SRILM tools not installed so not doing the comparison" && exit 1;
  
  #################
  # You could finish the script here if you wanted.
  # Below is to show how to do baselines with SRILM.
  #  You'd have to install the SRILM toolkit first.
  
heldout_sent=10000 # Don't change this if you want results to be comparable
    # with the kaldi_lm results above.
  sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
  mkdir -p $sdir
  gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout
  gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train
  (echo "<s>"; echo "</s>" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s
  
  # 3-gram:
  ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
    -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
  ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2
  #file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
  #0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437
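# Note: this is very close to the unpruned kaldi_lm 3gram-mincount result
# above (perplexity 141.44).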
  
  # Trying 4-gram:
  ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
    -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz
  ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout 
  #file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
  #0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822
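# Note: again close to the unpruned kaldi_lm 4gram-mincount result above
# (perplexity 126.73).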
  
  #3-gram with pruning:
  ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
    -prune 0.0000001 -map-unk "<UNK>" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz
  ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout 
  #file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs
  #0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616
  # Around 2.25M N-grams.
  # Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/"
  # above, which gave 2.5 million N-grams and a perplexity of 156.
  
# Note: all the SRILM experiments above fully discount all singleton 3-grams
# and 4-grams.  You can use "-gt3min 0" and "-gt4min 0" to stop this (this will
# be comparable to the kaldi_lm experiments above without "-mincount").
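# For example, a 3-gram LM that keeps singleton trigrams could be built roughly
# as follows (a sketch based on the note above, not a command from the original
# recipe; the output filename srilm.o3g.kn.nomin.gz is just an example name):
# ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \
#   -map-unk "<UNK>" -gt3min 0 -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.nomin.gz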
  
## From here on is how to train with IRSTLM.
## This is not really working at the moment.
  
if [ -z "$IRSTLM" ] ; then
    export IRSTLM=$KALDI_ROOT/tools/irstlm/
  fi
  export PATH=${PATH}:$IRSTLM/bin
  if ! command -v prune-lm >/dev/null 2>&1 ; then
    echo "$0: Error: the IRSTLM is not available or compiled" >&2
    echo "$0: Error: We used to install it by default, but." >&2
    echo "$0: Error: this is no longer the case." >&2
    echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
    echo "$0: Error: and run extras/install_irstlm.sh" >&2
    exit 1
  fi
  
  idir=$dir/irstlm
mkdir -p $idir
  gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | add-start-end.sh | \
    gzip -c > $idir/train.gz
  
dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}
  FNR>1{if ($1 in v) {print $0;}}' > vocab.irstlm.20k
  
  
  build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz  -p yes \
    -n 3 -s improved-kneser-ney -b yes
  # Testing perplexity with SRILM tools:
  ngram -lm $idir/lm_3gram.gz  -ppl $sdir/cleaned.heldout 
  #data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for <unk> in closed-vocabulary LM
  #file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs
  #0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599
  
# The perplexity is very bad: it should be ~141 (since we used the -p option),
# not 175.  Adding -debug 3 to the command line shows that the IRSTLM LM does
# not seem to sum to one properly, so it seems to produce an LM that isn't
# interpretable in the normal way as an ARPA LM.