voxforge_prepare_lm.sh
1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/bash
# Copyright 2012 Vassil Panayotov
# Apache 2.0
# Load the recipe's environment settings; give up if path.sh cannot be sourced.
if ! . ./path.sh; then
  exit 1
fi

echo "=== Building a language model ..."

# Directories used by this script: inputs/outputs live in $locdata,
# intermediate files in $loctmp.
locdata=data/local
loctmp=${locdata}/tmp

echo "--- Preparing a corpus from test and train transcripts ..."

# Default n-gram order of the language model. It is assigned before
# parse_options.sh is sourced -- presumably so that a command-line option can
# override it (confirm against utils/parse_options.sh).
order=3
. utils/parse_options.sh
# Prepare the LM training corpus from the transcripts.
#
# Each transcript line has its first space-separated field dropped (cut -f2-),
# runs of spaces are collapsed to a single space, and duplicate lines are
# removed.
#
# Reads:  $locdata/test_trans.txt, $locdata/train_trans.txt
# Writes: $loctmp/test_utt.txt (unique test utterances)
#         $loctmp/corpus.txt   (unique training utterances; the LM corpus)

# Make sure the scratch directory exists before redirecting output into it.
mkdir -p "$loctmp" || exit 1

# The training transcripts are essential: without them the corpus is empty.
if [ ! -f "$locdata/train_trans.txt" ]; then
  echo "$0: missing training transcripts: $locdata/train_trans.txt" >&2
  exit 1
fi

# The pattern '  *' (a space followed by zero or more spaces) is the POSIX
# spelling of "one or more spaces"; '\+' is a GNU sed extension.
cut -f2- -d' ' < "$locdata/test_trans.txt" \
  | sed -e 's:  *: :g' \
  | sort -u > "$loctmp/test_utt.txt"

# The test utterances are NOT removed from the training corpus in the current
# version of the recipe, because doing so causes problems in some of the later
# stages - e.g. too many OOV words in tri2b_mmi.
cut -f2- -d' ' < "$locdata/train_trans.txt" \
  | sed -e 's:  *: :g' \
  | sort -u > "$loctmp/corpus.txt" || exit 1

# Without pipefail only sort's status is visible above, so additionally make
# sure the pipeline actually produced some text.
if [ ! -s "$loctmp/corpus.txt" ]; then
  echo "$0: LM training corpus $loctmp/corpus.txt is empty" >&2
  exit 1
fi
# Locate SRILM's ngram-count.
#
# If it is already on PATH nothing else is done. Otherwise the SRILM build
# under $KALDI_ROOT/tools/srilm/bin is tried and, if ngram-count is found
# there, that directory is appended to PATH. If neither works the script
# prints installation advice to stderr and exits with status 1.
#
# 'command -v' is the portable replacement for 'which' and prints nothing when
# the tool is missing.
loc=$(command -v ngram-count)
if [ -z "$loc" ]; then
  # 'uname -m' reports only the machine hardware name, so a "64" that happens
  # to occur in the hostname or kernel version cannot cause a false match.
  if uname -m | grep -q 64; then # some kind of 64 bit...
    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
  else
    sdir=$KALDI_ROOT/tools/srilm/bin/i686
  fi
  if [ -f "$sdir/ngram-count" ]; then
    echo "Using SRILM tools from $sdir"
    export PATH=$PATH:$sdir
  else
    {
      echo "You appear to not have SRILM tools installed, either on your path,"
      echo "or installed in $sdir. See tools/install_srilm.sh for installation"
      echo "instructions."
    } >&2
    exit 1
  fi
fi
# Train the n-gram language model with SRILM.
#   -order        n-gram order (value of $order)
#   -write-vocab  file that receives the vocabulary seen in the corpus
#   -wbdiscount   use Witten-Bell discounting
#   -text         training corpus
#   -lm           output language model file
# Abort on failure so that the success message below is only printed when
# ngram-count actually succeeded.
if ! ngram-count -order "$order" -write-vocab "$locdata/vocab-full.txt" \
  -wbdiscount -text "$loctmp/corpus.txt" -lm "$locdata/lm.arpa"; then
  echo "$0: ngram-count failed" >&2
  exit 1
fi
echo "*** Finished building the LM model!"