train_lm.sh
#!/bin/bash
# Copyright 2014 Vassil Panayotov
# Apache 2.0
# This is the top-level LM training script
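#
# Example invocation (the paths below are illustrative placeholders; note that
# <lm-texts-root> is expected to contain a "corpus/" subdirectory):
#
#   local/lm/train_lm.sh /export/lm_texts data/local/lm_tmp \
#     data/local/lm_norm data/local/lm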
. ./path.sh || exit 1
. ./cmd.sh || exit 1
# use to skip some of the initial steps
stage=1
# how many words we want in the LM's vocabulary
vocab_size=200000
# LM pruning threshold for the 'small' trigram model
prune_thresh_small=0.0000003
# LM pruning threshold for the 'medium' trigram model
prune_thresh_medium=0.0000001
# how many text normalization jobs to run in parallel
normjobs=2
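# utils/parse_options.sh allows the variables above to be overridden from the
# command line, e.g. "--stage 3" or "--vocab-size 100000"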
. utils/parse_options.sh || exit 1
if [[ $# -ne 4 ]]; then
echo "Usage: $1 <lm-texts-root> <tmp-dir> <txt-norm-root> <out-lm-dir>"
echo "where,"
echo " <lm-text-root>: the root directory containing the raw(unnormalized) LM training texts"
echo " <tmp-dir>: store the temp files into this dir"
echo " <txt-norm-root>: store the normalized texts in subdirectories under this root dir"
echo " <out-lm-dir>: the directory to store the trained ARPA model"
exit 1
fi
corpus_dir=$1/corpus
tmp_dir=$2
norm_dir=$3
lm_dir=$4
[[ -d "$corpus_dir" ]] || { echo "No such directory '$corpus_dir'"; exit 1; }
split_prefix=$tmp_dir/split
if [ "$stage" -le 1 ]; then
mkdir -p $tmp_dir
echo "Splitting into $normjobs parts, to allow for parallel processing ..."
split_files=""
for n in $(seq $normjobs); do split_files="$split_files $split_prefix-$n"; done
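# list the corpus subdirectories (one per text source) and let split_scp.pl
# distribute them as evenly as possible across the split files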
find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
tee $tmp_dir/all_texts.txt |\
utils/split_scp.pl /dev/stdin $split_files
echo "Checking the splits ..."
total_count=$(wc -l <$tmp_dir/all_texts.txt)
split_count=$(cat $split_files | wc -l)
[[ "$total_count" -eq "$split_count" ]] || { echo "Inconsistent counts"; exit 1; }
fi
if [ "$stage" -le 2 ]; then
echo "Performing text normalization ($normjobs jobs) - check $tmp_dir/txt_norm.JOB.log ..."
mkdir -p $norm_dir
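# $mkgraph_cmd is the job-submission wrapper configured in cmd.sh (run.pl, queue.pl, ...);
# it runs normalize_text.sh $normjobs times in parallel, substituting the job index
# for JOB in both the arguments and the log file name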
$mkgraph_cmd JOB=1:$normjobs $tmp_dir/txt_norm.JOB.log \
local/lm/normalize_text.sh $split_prefix-JOB $norm_dir || exit 1
echo "Finished OK"
fi
word_counts=$lm_dir/word_counts.txt
vocab=$lm_dir/librispeech-vocab.txt
full_corpus=$lm_dir/librispeech-lm-norm.txt.gz
if [ "$stage" -le 3 ]; then
echo "Selecting the vocabulary ($vocab_size words) ..."
mkdir -p $lm_dir
echo "Making the corpus and the vocabulary ..."
# The following sequence of commands does the following:
# 1) Eliminates duplicate sentences and saves the resulting corpus
# 2) Splits the corpus into words
# 3) Sorts the words by frequency, most frequent first
# 4) Saves the counts for the $vocab_size most frequent words
# 5) Saves an alphabetically sorted vocabulary containing those $vocab_size most frequent words
for f in $(find $norm_dir -iname '*.txt'); do cat $f; done |\
sort -u | tee >(gzip >$full_corpus) | tr -s '[:space:]' '\n' |\
sort | uniq -c | sort -k1 -n -r |\
head -n $vocab_size | tee $word_counts | awk '{print $2}' | sort >$vocab || exit 1
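# each line of $word_counts has the form "<count> <WORD>" ('uniq -c' output), most frequent first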
echo "Word counts saved to '$word_counts'"
echo "Vocabulary saved as '$vocab'"
echo "All unique sentences (in sorted order) stored in '$full_corpus'"
echo "Counting the total number word tokens in the corpus ..."
echo "There are $(wc -w < <(zcat $full_corpus)) tokens in the corpus"
fi
trigram_lm=$lm_dir/lm_tglarge.arpa.gz
if [ "$stage" -le 4 ]; then
echo "Training a 3-gram LM ..."
command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
echo "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine"
echo "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html"
ngram-count -order 3 -kndiscount -interpolate \
-unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $trigram_lm || exit 1
du -h $trigram_lm
fi
trigram_pruned_small=$lm_dir/lm_tgsmall.arpa.gz
if [ "$stage" -le 5 ]; then
echo "Creating a 'small' pruned 3-gram LM (threshold: $prune_thresh_small) ..."
command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
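# SRILM's entropy-based pruning: n-grams whose removal changes the model's
# perplexity by less than the (relative) threshold are dropped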
ngram -prune $prune_thresh_small -lm $trigram_lm -write-lm $trigram_pruned_small || exit 1
du -h $trigram_pruned_small
fi
trigram_pruned_medium=$lm_dir/lm_tgmed.arpa.gz
if [ "$stage" -le 5 ]; then
echo "Creating a 'medium' pruned 3-gram LM (threshold: $prune_thresh_medium) ..."
command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
ngram -prune $prune_thresh_medium -lm $trigram_lm -write-lm $trigram_pruned_medium || exit 1
du -h $trigram_pruned_medium
fi
fourgram_lm=$lm_dir/lm_fglarge.arpa.gz
if [ "$stage" -le 4 ]; then
# This requires even more RAM than the 3-gram
echo "Training a 4-gram LM ..."
command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
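# same smoothing and vocabulary options as the 3-gram model above, but with -order 4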
ngram-count -order 4 -kndiscount -interpolate \
-unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $fourgram_lm || exit 1
du -h $fourgram_lm
fi
exit 0