  #!/bin/bash
  
  # Copyright 2014 Vassil Panayotov
  # Apache 2.0
  
  # This is the top-level LM training script
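  #
  # Pipeline overview: split the raw text lists for parallel processing,
  # normalize the texts, build the de-duplicated corpus and select the
  # vocabulary, train the full 3-gram LM, produce the 'small' and 'medium'
  # pruned versions of it, and finally train the full 4-gram LM.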
  
  . ./path.sh || exit 1
  . ./cmd.sh || exit 1
  
  # set this to a higher value to skip some of the initial steps
  stage=1
  
  # how many words we want in the LM's vocabulary
  vocab_size=200000
  
  # LM pruning threshold for the 'small' trigram model
  prune_thresh_small=0.0000003
  
  # LM pruning threshold for the 'medium' trigram model
  prune_thresh_medium=0.0000001
  
  # how many text normalization jobs to run in parallel
  normjobs=2
  
  . utils/parse_options.sh || exit 1
  
  if [[ $# -ne 4 ]]; then
    echo "Usage: $1 <lm-texts-root> <tmp-dir> <txt-norm-root> <out-lm-dir>"
    echo "where,"
    echo "  <lm-text-root>: the root directory containing the raw(unnormalized) LM training texts"
    echo "  <tmp-dir>: store the temp files into this dir"
    echo "  <txt-norm-root>: store the normalized texts in subdirectories under this root dir"
    echo "  <out-lm-dir>: the directory to store the trained ARPA model"
    exit 1
  fi
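
  # Example invocation (the paths below are only placeholders; adjust them to
  # your setup):
  #   local/lm/train_lm.sh --normjobs 4 --vocab-size 200000 \
  #     /export/librispeech-lm-texts data/local/lm_tmp \
  #     data/local/lm_norm data/local/lm
  # Any of the variables defined above (stage, vocab_size, prune_thresh_*,
  # normjobs) can be overridden on the command line, thanks to
  # utils/parse_options.sh.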
  
  corpus_dir=$1/corpus
  tmp_dir=$2
  norm_dir=$3
  lm_dir=$4
  
  [[ -d "$corpus_dir" ]] || { echo "No such directory '$corpus_dir'"; exit 1; }
  
  split_prefix=$tmp_dir/split
  
  if [ "$stage" -le 1 ]; then
    mkdir -p $tmp_dir
    echo "Splitting into $normjobs parts, to allow for parallel processing ..."
    split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs | sed 's/,$//')}")
    find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
      tee $tmp_dir/all_texts.txt |\
      utils/split_scp.pl /dev/stdin $split_files
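    # split_scp.pl distributes the list of text directories into $normjobs
    # roughly equal parts, one per split file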
    echo "Checking the splits ..."
    total_count=$(wc -l <$tmp_dir/all_texts.txt)
    split_count=$(cat $split_files | wc -l | awk 'BEGIN{c=0} {c+=$1;} END{print c}')
    [[ "$total_count" -eq "$split_count" ]] || { echo "Inconsistent counts"; exit 1; }
  fi
  
  if [ "$stage" -le 2 ]; then
    echo "Performing text normalization ($normjobs jobs) - check $tmp_dir/txt_norm.JOB.log ..."
    mkdir -p $norm_dir
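    # $mkgraph_cmd is one of Kaldi's job runners (run.pl or queue.pl, defined in
    # cmd.sh); it substitutes JOB with 1..$normjobs, so each split file is
    # normalized by a separate job, logged to its own txt_norm.<n>.log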
    $mkgraph_cmd JOB=1:$normjobs $tmp_dir/txt_norm.JOB.log \
      local/lm/normalize_text.sh $split_prefix-JOB $norm_dir || exit 1
    echo "Finished OK"
  fi
  
  word_counts=$lm_dir/word_counts.txt
  vocab=$lm_dir/librispeech-vocab.txt
  full_corpus=$lm_dir/librispeech-lm-norm.txt.gz
  
  if [ "$stage" -le 3 ]; then
    echo "Selecting the vocabulary ($vocab_size words) ..."
    mkdir -p $lm_dir
    echo "Making the corpus and the vocabulary ..."
    # The following sequence of commands does the following:
    # 1) Eliminates duplicate sentences and saves the resulting corpus
    # 2) Splits the corpus into words
    # 3) Sorts the words with respect to their frequency
    # 4) Saves the list of the first $vocab_size words sorted by their frequencies
    # 5) Saves an alphabetically sorted vocabulary that includes the most frequent $vocab_size words
    for f in $(find $norm_dir -iname '*.txt'); do cat $f; done |\
      sort -u | tee >(gzip >$full_corpus) | tr -s '[[:space:]]' '\n' |\
      sort | uniq -c | sort -k1 -n -r |\
      head -n $vocab_size | tee $word_counts | awk '{print $2}' | sort >$vocab || exit 1
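    # As produced by 'uniq -c' above, each line of $word_counts has the form
    # "<count> <word>", while $vocab contains just the words, one per line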
    echo "Word counts saved to '$word_counts'"
    echo "Vocabulary saved as '$vocab'"
    echo "All unique sentences (in sorted order) stored in '$full_corpus'"
    echo "Counting the total number word tokens in the corpus ..."
    echo "There are $(wc -w < <(zcat $full_corpus)) tokens in the corpus"
  fi
  
  trigram_lm=$lm_dir/lm_tglarge.arpa.gz
  
  if [ "$stage" -le 4 ]; then
    echo "Training a 3-gram LM ..."
    command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
    echo "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine"
    echo "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html"
    ngram-count -order 3  -kndiscount -interpolate \
      -unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $trigram_lm || exit 1
    du -h $trigram_lm
  fi
  
  trigram_pruned_small=$lm_dir/lm_tgsmall.arpa.gz
  
  if [ "$stage" -le 5 ]; then
    echo "Creating a 'small' pruned 3-gram LM (threshold: $prune_thresh_small) ..."
    command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
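    # 'ngram -prune' performs entropy-based (Stolcke) pruning: n-grams whose
    # removal changes the model's perplexity by less than the given relative
    # threshold are dropped, shrinking the model at a small cost in accuracy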
    ngram -prune $prune_thresh_small -lm $trigram_lm -write-lm $trigram_pruned_small || exit 1
    du -h $trigram_pruned_small
  fi
  
  trigram_pruned_medium=$lm_dir/lm_tgmed.arpa.gz
  
  if [ "$stage" -le 5 ]; then
    echo "Creating a 'medium' pruned 3-gram LM (threshold: $prune_thresh_medium) ..."
    command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
    ngram -prune $prune_thresh_medium -lm $trigram_lm -write-lm $trigram_pruned_medium || exit 1
    du -h $trigram_pruned_medium
  fi
  
  fourgram_lm=$lm_dir/lm_fglarge.arpa.gz
  
  if [ "$stage" -le 4 ]; then
    # This requires even more RAM than the 3-gram
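    # The invocation is identical to the 3-gram training above, except for '-order 4'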
    echo "Training a 4-gram LM ..."
    command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
    ngram-count -order 4  -kndiscount -interpolate \
      -unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $fourgram_lm || exit 1
    du -h $fourgram_lm
  fi
  
  exit 0