egs/chime5/s5b/local/train_lms_srilm.sh

  #!/bin/bash
  # Copyright (c) 2017  Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe)
  # Apache 2.0
  
  export LC_ALL=C
  
  # Begin configuration section.
  words_file=
  train_text=
  dev_text=
  oov_symbol="<UNK>"
  # End configuration section
  
  echo "$0 $@"
  
  [ -f path.sh ]  && . ./path.sh
  . ./utils/parse_options.sh || exit 1
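  # Options are handled by utils/parse_options.sh, which maps command-line
  # switches onto the variables above (--words-file, --train-text, --dev-text,
  # --oov-symbol).  Illustrative invocation (paths are examples only):
  #   local/train_lms_srilm.sh --train-text data/train/text data data/srilm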
  
  echo "-------------------------------------"
  echo "Building an SRILM language model     "
  echo "-------------------------------------"
  
  if [ $# -ne 2 ] ; then
    echo "Incorrect number of parameters. "
    echo "Script has to be called like this:"
    echo "  $0 [switches] <datadir> <tgtdir>"
    echo "For example: "
    echo "  $0 data data/srilm"
    echo "The allowed switches are: "
    echo "    words_file=<word_file|>        word list file -- data/lang/words.txt by default"
    echo "    train_text=<train_text|>       data/train/text is used in case when not specified"
    echo "    dev_text=<dev_text|>           last 10 % of the train text is used by default"
    echo "    oov_symbol=<unk_sumbol|<UNK>>  symbol to use for oov modeling -- <UNK> by default"
    exit 1
  fi
  
  datadir=$1
  tgtdir=$2
  
  ##End of configuration
  loc=`which ngram-count`;
  if [ -z "$loc" ]; then
    echo >&2 "You do not appear to have the SRILM tools installed on your path."
    echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install them."
    exit 1
  fi
  
  # Prepare the destination directory
  mkdir -p $tgtdir
  
  for f in $words_file $train_text $dev_text; do
    [ ! -s $f ] && echo "No such file $f" && exit 1;
  done
  
  [ -z "$words_file" ] && words_file=$datadir/lang/words.txt
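  # Select the train/dev texts: if only --train-text is given, hold out its
  # last 10% as the dev set; if both --train-text and --dev-text are given,
  # use them as-is; if neither is given, fall back to $datadir/train/text
  # and $datadir/dev2h/text.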
  if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then
    nr=`cat  $train_text | wc -l`
    nr_dev=$(($nr / 10 ))
    nr_train=$(( $nr - $nr_dev ))
    orig_train_text=$train_text
    head -n $nr_train $train_text > $tgtdir/train_text
    tail -n $nr_dev $train_text > $tgtdir/dev_text
  
    train_text=$tgtdir/train_text
    dev_text=$tgtdir/dev_text
    echo "Using words file: $words_file"
    echo "Using train text: 9/10 of $orig_train_text"
    echo "Using dev text  : 1/10 of $orig_train_text"
  elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then
    echo "Using words file: $words_file"
    echo "Using train text: $train_text"
    echo "Using dev text  : $dev_text"
    train_text=$train_text
    dev_text=$dev_text
  else
    train_text=$datadir/train/text
    dev_text=$datadir/dev2h/text
    echo "Using words file: $words_file"
    echo "Using train text: $train_text"
    echo "Using dev text  : $dev_text"
  
  fi
  
  [ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1
  [ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1
  [ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" && exit 1
  
  
  # Extract the word list from the training dictionary; exclude special symbols
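  # Each line of words.txt is "<word> <integer-id>" (standard Kaldi format), so
  # awk keeps only the first column; the disambiguation symbol #0, <eps> and the
  # OOV symbol are not real words and must not enter the LM vocabulary.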
  sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '<eps>' | grep -v -F "$oov_symbol" > $tgtdir/vocab
  if (($?)); then
    echo "Failed to create vocab from $words_file"
    exit 1
  else
    # wc vocab # doesn't work due to some encoding issues
    echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{@A=split; $w+=scalar(@A); $l++;}END{print "$l lines, $w words\n";}'`
  fi
  
  # Kaldi transcript files contain Utterance_ID as the first word; remove it
  # We also have to avoid skewing the LM by incorporating the same sentences
  # from different channels
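  # The sed expressions strip channel/device markers (e.g. ".CH1", "U01_") from
  # the utterance IDs so that sort -u collapses duplicate recordings of the same
  # sentence; the perl one-liner then drops the leading utterance ID and keeps
  # only lines that still contain at least one word.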
  sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \
    perl -ane 'print join(" ", @F[1..$#F]) . "
  " if @F > 1' > $tgtdir/train.txt
  if (($?)); then
      echo "Failed to create $tgtdir/train.txt from $train_text"
      exit 1
  else
      echo "Removed first word (uid) from every line of $train_text"
      # wc text.train train.txt # doesn't work due to some encoding issues
      echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
      echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
  fi
  
  # Kaldi transcript files contain Utterance_ID as the first word; remove it
  sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \
    perl -ane 'print join(" ", @F[1..$#F]) . "
  " if @F > 1' > $tgtdir/dev.txt
  if (($?)); then
      echo "Failed to create $tgtdir/dev.txt from $dev_text"
      exit 1
  else
      echo "Removed first word (uid) from every line of $dev_text"
      # wc text.train train.txt # doesn't work due to some encoding issues
      echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
      echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
  fi
  
  
  echo "-------------------"
  echo "Good-Turing 3grams"
  echo "-------------------"
  ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  
  echo "-------------------"
  echo "Kneser-Ney 3grams"
  echo "-------------------"
  ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \
    -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \
    -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \
    -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \
    -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \
    -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \
    -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \
    -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \
    -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  
  
  echo "-------------------"
  echo "Good-Turing 4grams"
  echo "-------------------"
  ngram-count -lm $tgtdir/4gram.gt0111.gz \
    -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.gt0112.gz \
    -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.gt0122.gz \
    -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.gt0123.gz \
    -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.gt0113.gz \
    -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.gt0222.gz \
    -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.gt0223.gz \
    -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  
  echo "-------------------"
  echo "Kneser-Ney 4grams"
  echo "-------------------"
  ngram-count -lm $tgtdir/4gram.kn0111.gz \
    -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.kn0112.gz \
    -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.kn0113.gz \
    -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.kn0122.gz \
    -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.kn0123.gz \
    -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.kn0222.gz \
    -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  ngram-count -lm $tgtdir/4gram.kn0223.gz \
    -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \
    -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
  
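  # MaxEnt LM training in SRILM requires it to be built with libLBFGS; the
  # LIBLBFGS variable is expected to be set (e.g. by tools/env.sh after the
  # installation scripts have been run) when that support is available.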
  if [ ! -z "${LIBLBFGS}" ]; then
    #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault
    #instead of that, we simply output the model in the maxent format and convert it using the "ngram"
    echo "-------------------"
    echo "Maxent 3grams"
    echo "-------------------"
    sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
      ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\
      ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\
      sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1
  
    echo "-------------------"
    echo "Maxent 4grams"
    echo "-------------------"
    sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
      ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\
      ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\
      sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1
  else
    echo >&2  "SRILM is not compiled with the support of MaxEnt models."
    echo >&2  "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh"
    echo >&2  "which will take care of compiling the SRILM with MaxEnt support"
    exit 1;
  fi
  
  
  echo "--------------------"
  echo "Computing perplexity"
  echo "--------------------"
  (
    for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
    for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
  )  | sort  -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt
  
  echo "The perlexity scores report is stored in $tgtdir/perplexities.txt "
  echo ""
  
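  # For each order, pick the top matching entry of perplexities.txt and symlink
  # it as best_3gram.gz / best_4gram.gz, so the selected model can be referred
  # to by a fixed name instead of its cutoff-specific file name.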
  for best_ngram in {3,4}gram ; do
    outlm=best_${best_ngram}.gz
    lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ')
    echo "$outlm -> $lmfilename"
    (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm )
  done