gp_format_lm.sh
3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Abort on the first failing command, and let a failure in any stage of a
# pipeline fail the whole pipeline.
set -o errexit
set -o pipefail
. ./path.sh # Sets the PATH to contain necessary executables

# Begin configuration section.
filter_vocab_sri=false # if true, use SRILM to change the LM vocab
srilm_opts="-subset -prune-lowprobs -unk -tolower"
# end configuration sections

# Usage text. It is expanded right here, so the "default:" values it shows are
# the configuration defaults assigned above.
help_message="Usage: $(basename "$0") [options] LM-dir LC [LC ... ]
where LC is a 2-letter code for GlobalPhone languages, and LM-dir is assumed to
contain LMs for all the languages (e.g. RU.3gram.lm.gz for Russian).
options:
--help # print this message and exit
--filter-vocab-sri (true|false) # use SRILM to change the LM vocab (default: $filter_vocab_sri)
--srilm-opts STRING # options to pass to SRILM tools (default: '$srilm_opts')
"

# NOTE(review): presumably this consumes the --option arguments documented in
# the usage text and leaves only the positional ones in "$@" -- confirm against
# utils/parse_options.sh.
. utils/parse_options.sh

# At least the LM directory and one language code are required.
if [ $# -lt 2 ]; then
  # Print the text as data, never as a printf format string, so that any '%'
  # or backslash in it is shown literally; usage errors go to stderr.
  printf '%s\n' "$help_message" >&2
  exit 1
fi
# Positional arguments: the first one is the directory holding the LMs; every
# remaining one must be exactly two characters long (a language code) and is
# appended, space-separated, to LANGUAGES.
LMDIR=$1; shift
LANGUAGES=
while [ $# -gt 0 ]; do
  case "$1" in
    ??)
      LANGUAGES="$LANGUAGES $1"
      shift
      ;;
    *)
      # Stop explicitly with the usage text: this script defines no error
      # helper, so the failure has to be reported and raised right here.
      echo "Unknown argument: $1, exiting" >&2
      printf '%s\n' "${help_message:-}" >&2
      exit 1
      ;;
  esac
done
# Locate IRSTLM. If the caller did not provide IRSTLM (unset or empty), fall
# back to the copy under the Kaldi tools directory. The ':-' expansion keeps
# this test from aborting under 'set -u' when IRSTLM is unset.
if [ -z "${IRSTLM:-}" ]; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin

# 'prune-lm' is needed further down to create the pruned LMs; fail early with
# installation instructions if it cannot be found on PATH.
if ! command -v prune-lm >/dev/null 2>&1; then
  echo "$0: Error: the IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but." >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi
# For every requested language, format two LMs into test lang directories:
#   1. the full trigram LM from $LMDIR     -> data/<L>/lang_test_tg[_sri]
#   2. a pruned copy created with prune-lm -> data/<L>/lang_test_tgpr[_sri]
# The "_sri" suffix is used when the vocabulary is changed with SRILM.
# LANGUAGES is a space-separated list, so it is deliberately left unquoted.
for L in $LANGUAGES; do
  lm="$LMDIR/${L}.3gram.lm.gz"
  [ -f "$lm" ] || { echo "LM '$lm' not found" >&2; exit 1; }

  test="data/$L/lang_test_tg"
  if $filter_vocab_sri; then # use SRILM to change LM vocab
    utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
      "data/$L/lang" "$lm" "data/$L/local/dict/lexicon.txt" "${test}_sri"
  else # just remove out-of-lexicon words without renormalizing the LM
    utils/format_lm.sh "data/$L/lang" "$lm" "data/$L/local/dict/lexicon.txt" \
      "$test"
  fi

  # Create a pruned version of the LM for building the decoding graphs, using
  # 'prune-lm' from IRSTLM:
  mkdir -p "data/$L/local/lm"
  prune-lm --threshold=1e-7 "$lm" /dev/stdout | gzip -c \
    > "data/$L/local/lm/${L}.tgpr.arpa.gz"

  lm="data/$L/local/lm/${L}.tgpr.arpa.gz"
  test="data/$L/lang_test_tgpr"
  if $filter_vocab_sri; then # use SRILM to change LM vocab
    # Pass the same --srilm-opts as for the full LM, so that a user-supplied
    # setting applies to both SRILM-filtered LMs.
    utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
      "data/$L/lang" "$lm" "data/$L/local/dict/lexicon.txt" "${test}_sri"
  else # just remove out-of-lexicon words without renormalizing the LM
    utils/format_lm.sh "data/$L/lang" "$lm" "data/$L/local/dict/lexicon.txt" \
      "$test"
  fi
done