egs/iam/v2/local/train_lm.sh
  #!/bin/bash
  
  # Copyright 2016  Vincent Nguyen
  #           2016  Johns Hopkins University (author: Daniel Povey)
  #           2017  Ashish Arora
  #           2017  Hossein Hadian
  # Apache 2.0
  #
  # This script trains an LM on the LOB+Brown text data and IAM training transcriptions.
  # It is based on the example scripts distributed with PocoLM
  
  # It will check whether pocolm is installed and, if not, ask you to install it.
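  #
  # Example invocation (for illustration; the options below are parsed by
  # utils/parse_options.sh from the variables defined in this script):
  #   local/train_lm.sh --stage 0 --vocab-size 50000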
  
  set -e
  stage=0
  vocab_size=50000
  
  echo "$0 $@"  # Print the command line for logging
  . ./utils/parse_options.sh || exit 1;
  
  dir=data/local/local_lm
  lm_dir=${dir}/data
  
  
  mkdir -p $dir
  . ./path.sh || exit 1; # for KALDI_ROOT
  export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
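  # (the pocolm commands used below, i.e. train_lm.py, format_arpa_lm.py,
  # prune_lm_dir.py and get_data_prob.py, are found via this PATH entry.)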
  ( # First make sure the pocolm toolkit is installed.
   cd $KALDI_ROOT/tools || exit 1;
   if [ -d pocolm ]; then
     echo Not installing the pocolm toolkit since it is already there.
   else
     echo "$0: Please install the PocoLM toolkit with: "
     echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
     exit 1;
   fi
  ) || exit 1;
  
  bypass_metaparam_optim_opt=
  # If you want to bypass the metaparameter optimization steps and instead use
  # specific metaparameters, un-comment the following line and set it to
  # appropriate values (you can find them in the output log of a previous
  # train_lm.py run).  The dev perplexity should be close to that of the
  # non-bypassed model.
  #bypass_metaparam_optim_opt=
  # Note: when re-using such parameters you may also need to remove the .done
  # file of the LM directory so that make_lm_dir.py is re-run, e.g.:
  #   rm -f ${lm_dir}/<lm-name>.pocolm/.done
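  #
  # For illustration only: this option is passed straight through to pocolm's
  # train_lm.py, so a bypass setting would look like the line below (the
  # numbers are made-up placeholders, not tuned values; copy real ones from
  # your own log):
  #bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018"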
  
  if [ $stage -le 0 ]; then
    mkdir -p ${dir}/data
    mkdir -p ${dir}/data/text
  
    echo "$0: Getting the Data sources"
  
    rm ${dir}/data/text/* 2>/dev/null || true
  
    # Using the LOB and Brown corpora.
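    # The pipeline below normalizes the text the same way as the rest of the
    # recipe: prepend_words.py adds a word-boundary marker in front of every
    # word, apply_bpe.py splits words into BPE sub-word units using the codes
    # in data/local/bpe.txt, and the sed command strips the "@@" continuation
    # markers so that each sub-word unit becomes a separate LM token.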
    if [ ! -f data/local/lob-train-only.txt ]; then
      cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \
        local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \
                                                 > data/local/lob-train-only.txt
    fi
    cat data/local/lob-train-only.txt | \
      utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
      | sed 's/@@//g' > ${dir}/data/text/lob.txt
    cat data/local/browncorpus/brown.txt | \
      utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
      | sed 's/@@//g' > ${dir}/brown.txt
    # the first 5000 lines of brown.txt are held out as the dev set below, so
    # start the training portion at line 5001 to avoid overlap.
    tail -n +5001 ${dir}/brown.txt > ${dir}/data/text/brown.txt
    if [ -d "data/local/wellingtoncorpus" ]; then
      cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \
        utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
        | sed 's/@@//g' > ${dir}/data/text/wellington.txt
    fi
  
    # hold out the first 5000 lines of the Brown corpus as the dev set.
    # Note: the name 'dev' is treated specially by pocolm; a file called
    # dev.txt automatically becomes the dev set.
    head -5000 ${dir}/brown.txt > ${dir}/data/text/dev.txt
  
    # use the training data as an additional data source.
    # we can later fold the dev data into this.
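    # (each line of data/train/text has the form "<utterance-id> <word1> <word2> ...";
    # the cut below keeps only the transcription.)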
    cat data/train/text | cut -d " " -f 2- >  ${dir}/data/text/iam.txt
  
    # for reporting perplexities, we'll use the "real" dev set (here, the IAM
    # test transcriptions).
    # (the held-out portion of the Brown corpus is used as ${dir}/data/text/dev.txt
    # to work out interpolation weights.)
    # note, we can't put it in ${dir}/data/text/, because then pocolm would use
    # it as one of the data sources.
    cut -d " " -f 2-  < data/test/text  > ${dir}/data/real_dev_set.txt
  
    # get the wordlist from the combined text (IAM + LOB + Brown, plus Wellington if present)
    if [ -d "data/local/wellingtoncorpus" ]; then
      cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | \
        grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
    else
      echo "$0: Wellington Corpus not found. Proceeding without using that corpus."
      cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | \
        grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
    fi
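    # word_count now contains "<count> <word>" lines, most frequent first; keep
    # the words of the top $vocab_size entries as the vocabulary.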
    head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
  fi
  
  order=6

  # decide on the vocabulary and the name of the LM directory.  These are
  # needed both by the training stage and by the later pruning stages, so they
  # are defined outside the stage checks; this lets the script be restarted
  # with a higher --stage.
  min_counts='brown=2 lob=2 iam=1'
  wordlist=${dir}/data/wordlist

  lm_name="`basename ${wordlist}`_${order}"
  if [ -n "${min_counts}" ]; then
    lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
  fi
  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
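  # (For reference: with the settings above, lm_name expands to
  # "wordlist_6_brown-2_lob-2_iam-1", so the unpruned LM is written to
  # ${lm_dir}/wordlist_6_brown-2_lob-2_iam-1.pocolm.)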
  
  if [ $stage -le 1 ]; then
    # Note: you'd point --wordlist at a previously determined word-list if you
    # had one that you wanted to use.
    # Note: if you train more than one order, or use a very large vocabulary,
    # you may also want to limit the maximum memory available to 'sort'.
    echo "$0: training the unpruned LM"
  
    train_lm.py  --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20  \
                 --limit-unk-history=true \
                 ${bypass_metaparam_optim_opt} \
                 ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
  
    mkdir -p ${dir}/data/arpa
    format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
  
    get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
  fi
  
  if [ $stage -le 2 ]; then
    echo "$0: pruning the LM (to larger size)"
    # Using 1 million n-grams for a big LM for rescoring purposes.
    size=1000000
    prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
  
    get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity'
  
    mkdir -p ${dir}/data/arpa
    format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
  fi
  
  if [ $stage -le 3 ]; then
    echo "$0: pruning the LM (to smaller size)"
    # Using 500,000 n-grams for a smaller LM for graph building.  We prune from
    # the bigger pruned LM; this is faster than pruning the unpruned LM directly.
    size=500000
    prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small
  
    get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity'
  
    format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
  fi