  #!/bin/bash
  
  # Copyright 2016  Vimal Manohar
  # Apache 2.0
  #
  # This script trains an LM on the Broadcast News transcripts.
  # It is based on the example scripts distributed with PocoLM.

  # It first checks that the pocolm toolkit is installed and, if it is not,
  # prints instructions for installing it and exits.
  
  
  set -e
  set -o pipefail 
  set -u
  
  stage=0
  dir=data/local/local_lm
  cmd=run.pl
  vocab_size=   # Preferred vocabulary size
  
  echo "$0 $@"  # Print the command line for logging
  . utils/parse_options.sh || exit 1;
  
  lm_dir=${dir}/data
  
  mkdir -p $dir
  . ./path.sh || exit 1; # for KALDI_ROOT
  export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
  ( # First make sure the pocolm toolkit is installed.
   cd $KALDI_ROOT/tools || exit 1;
   if [ -d pocolm ]; then
     echo "$0: the pocolm toolkit is already installed."
   else
     echo "$0: Please install the PocoLM toolkit with: "
     echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
     exit 1;
   fi
  ) || exit 1;
  
  num_dev_sentences=4500
  # Note: GNU 'shuf' does not read bash's $RANDOM, so setting it does not by
  # itself make the dev/train split below reproducible; pass --random-source
  # to shuf if an exactly reproducible split is required.
  RANDOM=0
  
  if [ $stage -le 0 ]; then
    mkdir -p ${dir}/data
    mkdir -p ${dir}/data/text
  
    echo "$0: Getting the Data sources"
  
    rm ${dir}/data/text/* 2>/dev/null || true
  
    # Deduplicate the transcripts so that no sentence can appear in both the
    # training text and the dev set.
    # Replace 'train' with 'train_bn96' below in order to use only the 1996 HUB4 set.
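    # ('uniq -c' prefixes each unique sentence with its repeat count; after
    # shuffling, the first $num_dev_sentences unique sentences become the dev
    # set and the rest the training set, and the awk loops re-expand each
    # sentence to its original count so the LM still sees true frequencies.)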
    cat data/train/text | cut -d ' ' -f 2- | sort | uniq -c | \
      shuf > ${dir}/train_text_with_count
    head -n $num_dev_sentences < ${dir}/train_text_with_count | \
      awk '{for (i=0; i<$1; i++) {print $0;} }' | cut -d ' ' -f 2- > \
      ${dir}/data/text/dev.txt 
    tail -n +$((num_dev_sentences+1)) < ${dir}/train_text_with_count | \
      awk '{for (i=0; i<$1; i++) {print $0;} }' | cut -d ' ' -f 2- > \
      ${dir}/data/text/train.txt
  
    # Get text from NA News corpus 
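    # (pocolm treats each *.txt or *.txt.gz file under ${dir}/data/text as a
    # separate training data source named after the file, with dev.txt reserved
    # for metaparameter tuning, so the gzipped corpora can simply be symlinked in.)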
    for x in data/local/data/na_news/*; do
      y=`basename $x`
      [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz
    done
  
    # Get text from 1996 CSR HUB4 LM corpus
    for x in `cat data/local/data/csr96_hub4/{train,test}.filelist`; do
      gunzip -c $x
    done | gzip -c > ${dir}/data/text/csr96_hub4.txt.gz
    
    # Get text from 1995 CSR-IV HUB4 corpus
    cat data/local/data/csr95_hub4/dev95_text \
      data/local/data/csr95_hub4/eval95_text \
      data/local/data/csr95_hub4/train95_text | cut -d ' ' -f 2- > \
      ${dir}/data/text/csr95_hub4.txt
  
    # Get text from NA News supplement corpus 
    for x in data/local/data/na_news_supp; do
      y=`basename $x`
      [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz
    done
  
    # for reporting perplexities, we'll use the "real" dev set.
    # note, we can't put it in ${dir}/data/text/, because then pocolm would use
    # it as one of the data sources.
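    # In the stm files the transcript starts at field 7; ';;' lines are stm
    # comments, and segments marked IGNORE_TIME_SEGMENT_IN_SCORING are dropped.
    # The leading id (field 1) is carried through local/normalize_transcripts.pl
    # and then stripped by the final cut.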
    for x in dev96pe dev96ue eval96 eval97 eval98 eval99_1 eval99_2; do
      cat data/$x/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
        awk '!/IGNORE_TIME_SEGMENT_IN_SCORING/ {print $0}' | \
        local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
        cut -d ' ' -f 2- > ${dir}/data/${x}.txt
    done
  fi
  
  if [ $stage -le 1 ]; then
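    # Collect per-source word counts with pocolm's get_word_counts.py: one
    # <source>.counts file per text source (stage 2 below reads e.g.
    # train.counts and csr96_hub4.counts).  The .done marker lets this step be
    # skipped on re-runs.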
    mkdir -p $dir/data/work
    if [ ! -f $dir/data/work/word_counts/.done ]; then
      get_word_counts.py $dir/data/text $dir/data/work/word_counts
      touch $dir/data/work/word_counts/.done
    fi
  fi
  
  if [ $stage -le 2 ]; then
    # decide on the vocabulary.
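    # The procedure: merge per-source word counts with a per-source cutoff (the
    # numeric argument to local/lm/merge_word_counts.py appears to be a minimum
    # count), keep only tokens containing at least one letter, sort by
    # frequency, and optionally truncate to about $vocab_size entries -- the
    # awk truncation keeps printing until the count drops below the count seen
    # at position $vocab_size, so ties at the cutoff are retained.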
  
    # The NA News corpus is not clean, so it is better not to take the vocabulary from it.
    # for x in data/local/data/na_news/*; do
    #   y=$dir/data/work/word_counts/`basename $x`.counts
    #   [ -f $y ] && cat $y 
    # done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts
  
    cat $dir/data/work/word_counts/{train,dev}.counts | \
      local/lm/merge_word_counts.py 2 > $dir/data/work/train.wordlist_counts
  
    cat $dir/data/work/word_counts/csr96_hub4.counts | \
      local/lm/merge_word_counts.py 5 > $dir/data/work/csr96_hub4.wordlist_counts
  
    cat $dir/data/work/word_counts/csr95_hub4.counts | \
      local/lm/merge_word_counts.py 5 > $dir/data/work/csr95_hub4.wordlist_counts
  
    cat $dir/data/work/{train,csr96_hub4,csr95_hub4}.wordlist_counts | \
      perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[0] $F[1]\n"; }' | \
      local/lm/merge_word_counts.py 1 | sort -k 1,1nr > $dir/data/work/final.wordlist_counts
  
    if [ ! -z "$vocab_size" ]; then
      awk -v sz=$vocab_size 'BEGIN{count=-1;} 
      { i+=1; if (i == int(sz)) { count = $1; };
        if (count > 0 && count != $1) { exit(0); } 
        print $0;
      }' $dir/data/work/final.wordlist_counts
    else 
      cat $dir/data/work/final.wordlist_counts
    fi | awk '{print $2}' > $dir/data/work/wordlist
  fi
  
  order=4
  wordlist=$dir/data/work/wordlist
  
  min_counts='default=5 train=1 csr96_hub4=2,3 csr95_hub4=2,3'
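  # Per-data-source minimum n-gram counts passed to pocolm's train_lm.py via
  # --min-counts; comma-separated values such as '2,3' presumably set
  # order-dependent cutoffs, and 'default' covers the sources not listed here
  # (the NA News corpora).  See the pocolm documentation for the exact semantics.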
  
  lm_name="`basename ${wordlist}`_${order}"
  if [ -n "${min_counts}" ]; then
    lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "," "." | tr "=" "-"`"
  fi
  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
  
  export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
  
  if [ $stage -le 3 ]; then
    echo "$0: training the unpruned LM"
  
    $cmd ${unpruned_lm_dir}/log/train.log \
      train_lm.py  --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20  \
                   --limit-unk-history=true \
                   --fold-dev-into=train \
                   --min-counts="${min_counts}" \
                   ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
  
    for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
      $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}.log \
        get_data_prob.py ${dir}/data/${x}.txt ${unpruned_lm_dir} 
  
      cat ${unpruned_lm_dir}/log/compute_data_prob_${x}.log | grep -F '[perplexity'
    done
    
    # train_lm.py: You can set --bypass-metaparameter-optimization='0.829,0.997,0.066,0.014,0.171,0.244,0.063,0.001,0.023,0.004,0.014,0.006,0.018,0.027,0.082,1.000,0.004,0.007,0.024,0.703,0.108,0.046,0.019,0.848,0.258,0.208,0.195,0.889,0.297,0.282,0.242' to get equivalent results
    # train_lm.py: Ngram counts: 98768 + 26286404 + 21077207 + 17945418 = 65407797
    
    # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.88365261291 per word [perplexity = 132.112338899] over 18771.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.9299451353 per word [perplexity = 138.371920398] over 23710.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.8308081807 per word [perplexity = 125.312194639] over 20553.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.82377287988 per word [perplexity = 124.433679586] over 33234.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.88114977878 per word [perplexity = 131.782097071] over 33180.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.01175279868 per word [perplexity = 150.167719384] over 11529.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.01485733132 per word [perplexity = 150.634644387] over 16395.0 words.
    
  fi
  
  if [ $stage -le 4 ]; then
    echo "$0: pruning the LM (to larger size)"
    # Using 10 million n-grams for a big LM for rescoring purposes.
    size=10000000
    $cmd ${dir}/data/lm_${order}_prune_big/log/prune_lm.log \
      prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 \
      ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
  
    for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
      $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log \
        get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_big
  
      cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log | grep -F '[perplexity'
    done
  
    # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_big was -4.96695051249 per word [perplexity = 143.588348177] over 18771.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_big was -5.01232680304 per word [perplexity = 150.253941052] over 23710.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_big was -4.91227395027 per word [perplexity = 135.948202644] over 20553.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_big was -4.92411302883 per word [perplexity = 137.567269311] over 33234.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_big was -4.97443821579 per word [perplexity = 144.667530381] over 33180.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10483206523 per word [perplexity = 164.816389804] over 11529.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10905926136 per word [perplexity = 165.514575655] over 16395.0 words.
  
    mkdir -p ${dir}/data/arpa
    format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
  fi
  
  if [ $stage -le 5 ]; then
    echo "$0: pruning the LM (to smaller size)"
    # Use 2 million n-grams for a smaller LM for graph building.  Pruning from
    # the already-pruned big LM is faster than pruning from the unpruned LM.
    size=2000000
    
    $cmd ${dir}/data/lm_${order}_prune_small/log/prune_lm.log \
      prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big \
      ${dir}/data/lm_${order}_prune_small
  
    for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
      $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log \
        get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_small
  
      cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log | grep -F '[perplexity'
    done
  
    # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_small was -5.12459372596 per word [perplexity = 168.105830741] over 18771.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_small was -5.16866547448 per word [perplexity = 175.680231224] over 23710.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_small was -5.08096906048 per word [perplexity = 160.929931226] over 20553.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_small was -5.09222677679 per word [perplexity = 162.751870937] over 33234.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_small was -5.12842796263 per word [perplexity = 168.751625556] over 33180.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_small was -5.26755997571 per word [perplexity = 193.942161054] over 11529.0 words.
    # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_small was -5.27092234584 per word [perplexity = 194.595363921] over 16395.0 words.
  
    format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
  fi
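
  # The ARPA LMs written to ${dir}/data/arpa would typically be turned into
  # decoding/rescoring resources later in the recipe.  A hypothetical example
  # (the lang and lexicon paths below are illustrative, not taken from this
  # script):
  #   utils/format_lm.sh data/lang ${dir}/data/arpa/${order}gram_small.arpa.gz \
  #     data/local/dict/lexicon.txt data/lang_test
  #   utils/build_const_arpa_lm.sh ${dir}/data/arpa/${order}gram_big.arpa.gz \
  #     data/lang_test data/lang_test_fg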