apply_unk_lm.sh
3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/bash
# Copyright 2016 Johns Hopkins University (Author: Daniel Povey);
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Begin configuration section.
# (This script currently defines no configurable options.)
# End configuration section.

# Purpose: splice a text-form unknown-word LM FST into <lang-dir>/L.fst and
# <lang-dir>/L_disambig.fst, replacing arcs labelled with the phone symbol #2.
# Arguments: $1 - text-form unknown-word LM FST; $2 - lang directory.
# Exit status: 0 on success, 1 on usage error, missing input, or bad labels;
# any failing pipeline stage also aborts the script (set -e -o pipefail).

echo "$0 $*"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <input-unk-lm-fst> <lang-dir>"
  echo "e.g.: $0 exp/make_unk/unk_fst.txt data/lang_unk"
  echo ""
  echo "This script, which is called from the end of prepare_lang.sh,"
  echo "inserts the unknown-word LM FST into the lexicon FSTs"
  echo "<lang-dir>/L.fst and <lang-dir>/L_disambig.fst in place of"
  echo "the special disambiguation symbol #2 (which was inserted by"
  echo "add_lex_disambig.pl as a placeholder for this FST)."
  echo ""
  echo " <input-unk-lm-fst>: A text-form FST, typically with the name"
  echo " unk_fst.txt. We will remove all symbols from the"
  echo " output before applying it."
  echo " <lang-dir>: A partially built lang/ directory. We modify"
  echo " L.fst and L_disambig.fst, and read phones.txt and words.txt."
  exit 1;
fi

unk_lm_fst=$1
lang=$2

# pipefail makes a failure in any stage of the FST pipelines below fatal,
# rather than only a failure of the last stage.
set -e -o pipefail

# phones.txt is read several times below, so check for it up front along with
# the other inputs instead of failing later with an obscure redirection error.
for f in "$unk_lm_fst" "$lang/L.fst" "$lang/L_disambig.fst" \
         "$lang/phones.txt" "$lang/words.txt" "$lang/oov.int"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

# Pick a label that is guaranteed not to occur in phones.txt: one more than the
# largest integer id in the file.  (Looking only at the last line would give a
# wrong answer if the file ended in a blank line or was not sorted by id.)
unused_phone_label=$(awk 'NF >= 2 && ($2 + 0) > max { max = $2 + 0 }
                          END { print max + 1 }' "$lang/phones.txt")

# Integer id of the placeholder symbol #2.
label_to_replace=$(awk '$1 == "#2" { print $2 }' "$lang/phones.txt")

if ! [[ "$unused_phone_label" =~ ^[0-9]+$ && "$label_to_replace" =~ ^[0-9]+$ ]]; then
  echo "$0: error getting unused phone label or label for #2"
  exit 1
fi

unk_temp_fst="$lang/unk_temp.fst"

# Remove intermediate files on every exit path, including failures under set -e.
cleanup() {
  rm -f -- "$unk_temp_fst" "$lang/L.fst.temp" "$lang/L_disambig.fst.temp"
}
trap cleanup EXIT

# Print the number of states of the binary FST in file $1, as reported by fstinfo.
get_num_states() {
  fstinfo "$1" | grep '# of states' | awk '{print $NF}'
}

# OK, now fstreplace works based on olabels, but we actually want to deal with ilabels,
# so we need to invert all the FSTs before and after doing fstreplace.
# The awk command overwrites the 4th field (the output symbol) of every arc
# line with <eps>, so the unknown-word FST emits no symbols of its own.
awk '{ if (NF >= 4) $4 = "<eps>"; print }' <"$unk_lm_fst" | \
  fstcompile --isymbols="$lang/phones.txt" --osymbols="$lang/words.txt" | \
  fstinvert >"$unk_temp_fst"

num_states_unk=$(get_num_states "$unk_temp_fst")

# fstreplace usage is:
# Usage: fstreplace root.fst rootlabel [rule1.fst label1 ...] [out.fst]
# ... the rootlabel should just be an otherwise unused symbol.
# all the labels are olabels (word labels).. that is hardcoded in fstreplace.

for f in L.fst L_disambig.fst; do
  # with OpenFst tools, to refer to the standard input/output you need to use
  # the empty string '' and not '-'.
  fstinvert "$lang/$f" | \
    fstreplace '' "$unused_phone_label" "$unk_temp_fst" "$label_to_replace" | \
    fstinvert >"$lang/$f.temp"

  num_states_old=$(get_num_states "$lang/$f")
  num_states_new=$(get_num_states "$lang/$f.temp")
  num_states_added=$((num_states_new - num_states_old))
  echo "$0: in $f, substituting in the unknown-word LM (which had $num_states_unk states) added $num_states_added new FST states."

  # Only replace the original once the new FST has been written and inspected.
  mv -f "$lang/$f.temp" "$lang/$f"
done

# unk_temp.fst is removed by the EXIT trap.
exit 0;