#!/bin/bash
# Copyright 2016  Vincent Nguyen
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
#
# This script trains an LM on the Cantab-Tedlium text data and the tedlium
# acoustic training-data transcripts.
# It is based on the example scripts distributed with PocoLM.
# It first checks whether pocolm is installed and, if not, prints
# instructions for installing it and exits.
# It then gets the source data from the pre-downloaded Cantab-Tedlium files
# and the pre-prepared data/train text source.

set -e
stage=0

echo "$0 $@"  # Print the command line for logging
. utils/parse_options.sh || exit 1;

dir=data/local/local_lm
lm_dir=${dir}/data
mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
( # First make sure the pocolm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d pocolm ]; then
echo Not installing the pocolm toolkit since it is already there.
else
echo "$0: Please install the PocoLM toolkit with: "
echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
exit 1;
fi
) || exit 1;
num_dev_sentences=10000

# To bypass the (slow) metaparameter optimization steps, we pass in specific
# metaparameter values with the option below.  The values come from the
# output log of a previous full run of train_lm.py; they are for the 4-gram
# model (with min-counts) that is trained below, and the dev perplexity
# should be close to that of the non-bypassed model.  To re-run the
# optimization from scratch, set this to the empty string:
#   bypass_metaparam_optim_opt=
bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406"
# Note: when switching between bypassed and non-bypassed runs you may need to
# remove the .done files so that make_lm_dir.py is re-run, e.g.:
#   rm -f ${lm_dir}/*_${order}*.pocolm/.done
if [ $stage -le 0 ]; then
mkdir -p ${dir}/data
mkdir -p ${dir}/data/text
echo "$0: Getting the Data sources"
rm ${dir}/data/text/* 2>/dev/null || true
# Unzip TEDLIUM 6 data sources, remove </s>, gzip the result.
gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > ${dir}/data/text/train.txt.gz
# use a subset of the annotated training data as the dev set .
# Note: the name 'dev' is treated specially by pocolm; it automatically
# becomes the dev set.
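# (Kaldi 'text' files have the format '<utterance-id> word1 word2 ...', so
# the 'cut -d " " -f 2-' commands below strip off the utterance-ids.)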
head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
# .. and the rest of the training data as an additional data source.
# we can later fold the dev data into this.
tail -n +$((num_dev_sentences+1)) < data/train/text | cut -d " " -f 2- > ${dir}/data/text/ted.txt
# for reporting perplexities, we'll use the "real" dev set.
# (the subset of the training data in ${dir}/data/text/dev.txt is what is
# used to work out the interpolation weights.)
# note, we can't put it in ${dir}/data/text/, because then pocolm would use
# it as one of the data sources.
cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt
# get wordlist
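# The .dic file has one pronunciation per line; alternative pronunciations
# are marked with an index like "abandon(2)", which the sed command strips
# so that each word appears only once in the word list.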
awk '{print $1}' db/TEDLIUM_release-3/TEDLIUM.152k.dic | sed 's:([0-9])::g' | sort -u > ${dir}/data/wordlist
fi
order=4
if [ $stage -le 1 ]; then
# decide on the vocabulary.
# Note: you'd use --wordlist if you had a previously determined word-list
# that you wanted to use.
# Note: if you don't have a fixed word-list, train_lm.py can select the
# vocabulary size itself (see its --num-words option), and its --max-memory
# option can be used to restrict the maximum memory used by 'sort'.
echo "$0: training the unpruned LM"
min_counts='train=2 ted=1'
wordlist=${dir}/data/wordlist
lm_name="`basename ${wordlist}`_${order}"
if [ -n "${min_counts}" ]; then
lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
fi
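# With the settings above, lm_name expands to "wordlist_4_train-2_ted-1".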
unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \
            --limit-unk-history=true \
            --fold-dev-into=ted ${bypass_metaparam_optim_opt} \
            --min-counts="${min_counts}" \
            ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
#[perplexity = 157.87] over 18290.0 words
fi
if [ $stage -le 2 ]; then
echo "$0: pruning the LM (to larger size)"
# Using 10 million n-grams for a big LM for rescoring purposes.
size=10000000
prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity'
# current results, after adding --limit-unk-history=true:
# get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words.
mkdir -p ${dir}/data/arpa
format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
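# This bigger pruned LM is the one intended for lattice rescoring; the
# smaller LM built in the next stage is the one used for graph building.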
fi
if [ $stage -le 3 ]; then
echo "$0: pruning the LM (to smaller size)"
# Using 2 million n-grams for a smaller LM for graph building.  Pruning from
# the bigger pruned LM is faster than pruning the unpruned LM directly.
size=2000000
prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small
get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity'
# current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst):
# get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404] over 18290.0 words.
format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
fi
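
# Example usage (assuming this script lives in local/ of a standard Kaldi
# tedlium recipe directory, alongside utils/ and path.sh):
#   local/ted_train_lm.sh            # run all stages
#   local/ted_train_lm.sh --stage 2  # skip straight to the pruning stages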