lm_prep.sh
1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/bin/bash
## Language-model preparation for this example.
##
## Requires SRILM in the `tools` directory (install_srilm.sh).
## Only run this file from the example root directory:
##   $ ./local/lm_prep.sh

mkdir -p "data/local/tmp" "data/lang/tmp" || exit 1

# A missing path.sh almost always means the script was started from the wrong
# directory; stop here instead of failing later with a less obvious error.
if [ ! -f ./path.sh ]; then
  echo "$0: ./path.sh not found; run this script from the example root directory" >&2
  exit 1
fi
source ./path.sh

# Locate SRILM's ngram-count. The 64-bit build directory is preferred, then the
# 32-bit one. The binary itself is tested (not only its directory) so that an
# incomplete build in one directory does not hide a usable one in the other.
srilm_bin_root="../../../tools/srilm/bin"
ngram_count_exe=""
for srilm_arch in i686-m64 i686; do
  if [ -x "$srilm_bin_root/$srilm_arch/ngram-count" ]; then
    ngram_count_exe="$srilm_bin_root/$srilm_arch/ngram-count"
    break
  fi
done

if [ -z "$ngram_count_exe" ]; then
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  {
    echo
    echo "[!] Install SRILM in the 'tools' directory (install_srilm.sh)"
    echo
  } >&2
  exit 1
fi
########################
# data/local/tmp/lm_text
########################
# Text sentences input for language model generation,
# taken from data/[train|test]/text with the leading utterance ID
# (the first space-separated field) removed.
for lm_src in data/train/text data/test/text; do
  if [ ! -f "$lm_src" ]; then
    echo "$0: missing input file $lm_src" >&2
    exit 1
  fi
done

# `-f2-` is the portable spelling of GNU cut's `-f1 --complement`. Passing the
# files to cut directly (instead of through cat) also keeps the last line of
# the first file separate from the first line of the second when a trailing
# newline is missing.
cut -d' ' -f2- data/train/text data/test/text > data/local/tmp/lm_text || {
  echo "$0: failed to write data/local/tmp/lm_text" >&2
  exit 1
}
####################################
# data/local/tmp/3gram_lm.arpa.kn.gz
####################################
# Train a 3-gram ARPA language model from lm_text with SRILM's ngram-count and
# write the vocabulary to vocab-full.txt.
# NOTE: the output name contains "kn", but the active smoothing option is
# -wbdiscount (the Kneser-Ney options are the commented-out lines at the end
# of this section). The name is kept because later steps read this exact path.
if [ ! -s data/local/tmp/lm_text ]; then
  echo "$0: data/local/tmp/lm_text is missing or empty" >&2
  exit 1
fi

"$ngram_count_exe" -lm data/local/tmp/3gram_lm.arpa.kn.gz \
  -order 3 \
  -write-vocab data/local/tmp/vocab-full.txt \
  -sort \
  -wbdiscount \
  -unk \
  -map-unk "<UNK>" \
  -text data/local/tmp/lm_text || {
  echo "$0: ngram-count failed" >&2
  exit 1
}
# Alternative (disabled) Kneser-Ney settings:
# -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 \
# -kndiscount3 -gt3min 3 -order 3 \
#################
# data/lang/G.fst
#################
# Hand the ARPA LM and the lexicon to utils/format_lm.sh, with data/lang given
# as both the first and the last argument.
# NOTE(review): presumably these are the input and output lang directories;
# confirm that format_lm.sh supports using the same directory for both.
for lm_input in data/local/tmp/3gram_lm.arpa.kn.gz data/local/dict/lexicon.txt; do
  if [ ! -f "$lm_input" ]; then
    # Fail with a precise message instead of an error from inside format_lm.sh.
    echo "$0: missing input file $lm_input" >&2
    exit 1
  fi
done

utils/format_lm.sh data/lang \
  data/local/tmp/3gram_lm.arpa.kn.gz \
  data/local/dict/lexicon.txt \
  data/lang || {
  echo "$0: utils/format_lm.sh failed" >&2
  exit 1
}