#!/bin/bash
# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey)
#           2016      Vimal Manohar
# Apache 2.0

# This script creates biased decoding graphs based on the data transcripts as
# HCLG.fsts.scp, in the specified directory; this can be consumed by
# decode_segmentation.sh.
# This is for use in data-cleanup and data-filtering.
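
# Example invocation (taken from the usage message below):
#   steps/cleanup/make_biased_lm_graphs.sh data/train data/lang \
#     exp/tri3_cleanup exp/tri3_cleanup/graphs
# which writes its output to exp/tri3_cleanup/graphs/HCLG.fsts.scp, indexed
# by utterance-id.
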
set -u
set -o pipefail
set -e

# Begin configuration section.
nj=10
cmd=run.pl
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
top_n_words=100  # Number of common words that we compile into each graph
                 # (the most frequent words in $text).
top_n_words_weight=1.0  # this weight is before renormalization; it can be more
                        # or less than 1.
min_words_per_graph=100  # Utterances will be grouped so that they have at least
                         # this many words, before making the graph.
stage=0
### options for make_one_biased_lm.py.
ngram_order=4  # maximum n-gram order to use (but see also --min-lm-state-count).
min_lm_state_count=10  # make this smaller (e.g. 2) for a more strongly biased LM.
discounting_constant=0.3  # strictly between 0 and 1.  Make this closer to 0 for
                          # a more strongly biased LM.
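# (Roughly speaking, with this kind of absolute discounting an n-gram seen c
# times in the utterance group contributes a discounted count of
# c - discounting_constant, with the subtracted mass given to the backoff
# distribution; e.g. with the default 0.3, a word pair seen once keeps 0.7 of
# its count, so values closer to 0 keep more probability mass on the exact
# transcript n-grams.)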
# End configuration options.

echo "$0 $@"  # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "usage: $0 <data-dir|text> <lang-dir> <dir> <graph-dir>"
  echo "e.g.: $0 data/train data/lang exp/tri3_cleanup exp/tri3_cleanup/graphs"
  echo " This script creates biased decoding graphs per utterance (or possibly"
  echo " groups of utterances, depending on --min-words-per-graph). Its output"
  echo " goes to <graph-dir>/HCLG.fsts.scp, indexed by utterance. Directory <dir> is"
  echo " required to be a model or alignment directory, containing 'tree' and 'final.mdl'."
  echo "Main options (for others, see top of script file)"
  echo "  --scale-opts <scale-opts>        # Options relating to language"
  echo "                                   # model scale; default is"
  echo "                                   # '--transition-scale=1.0 --self-loop-scale=0.1'"
  echo "  --top-n-words <N>                # Number of most-common-words to add with"
  echo "                                   # unigram probabilities into graph (default: 100)"
  echo "  --top-n-words-weight <float>     # Weight given to top-n-words portion of graph"
  echo "                                   # (before renormalizing); may be any positive"
  echo "                                   # number (default: 1.0)"
  echo "  --min-words-per-graph <N>        # A constant that controls grouping of utterances"
  echo "                                   # (we make the LMs for groups of utterances)."
  echo "                                   # Default: 100."
  echo "  --ngram-order <N>                # N-gram order in range [2,7].  Maximum n-gram order"
  echo "                                   # that may be used (but also see --min-lm-state-count)."
  echo "                                   # Default 4."
  echo "  --min-lm-state-count <N>         # Minimum count for an LM-state of order >2 to be"
  echo "                                   # kept; states with fewer counts are completely pruned"
  echo "                                   # away [bigram states are always kept]."
  echo "                                   # Default 10.  Smaller -> more strongly biased LM."
  echo "  --discounting-constant <float>   # Discounting constant for Kneser-Ney, strictly between 0"
  echo "                                   # and 1.  Default 0.3.  Smaller -> more strongly biased LM."
  echo "  --config <config-file>           # config containing options"
  echo "  --nj <nj>                        # number of parallel jobs"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs."
  exit 1;
fi

data_or_text=$1
lang=$2
dir=$3
graph_dir=$4

if [ -d $data_or_text ]; then
  text=$data_or_text/text
else
  text=$data_or_text
fi

mkdir -p $graph_dir

for f in $text $lang/oov.int $dir/tree $dir/final.mdl \
    $lang/L_disambig.fst $lang/phones/disambig.int; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

utils/lang/check_phones_compatible.sh $lang/phones.txt $dir/phones.txt
cp $lang/phones.txt $graph_dir
oov=$(cat $lang/oov.int) || exit 1;
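# oov.int holds the integer id of the OOV symbol; sym2int.pl below maps any
# transcript word not present in words.txt to this id.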
mkdir -p $graph_dir/log

# create top_words.{int,txt}
if [ $stage -le 0 ]; then
  export LC_ALL=C
  # the following pipe will be broken due to the 'head'; don't fail.
  set +o pipefail
  utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $text | \
    awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \
    sort -nr | head -n $top_n_words > $graph_dir/word_counts.int
  set -o pipefail
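  # word_counts.int now contains the $top_n_words most frequent word-ids with
  # their counts, one per line, most frequent first, e.g. (hypothetical):
  #   1500 5     i.e. "<count> <word-id>"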
  total_count=$(awk '{x+=$1} END{print x}' < $graph_dir/word_counts.int)
  # print top-n words with their unigram probabilities.
  awk -v tot=$total_count -v weight=$top_n_words_weight '{print $2, ($1*weight)/tot;}' \
    <$graph_dir/word_counts.int >$graph_dir/top_words.int
  utils/int2sym.pl -f 1 $lang/words.txt <$graph_dir/top_words.int >$graph_dir/top_words.txt
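  # top_words.int contains "<word-id> <weighted-unigram-prob>" lines whose
  # probabilities sum to $top_n_words_weight, e.g. (hypothetical) "5 0.0123";
  # top_words.txt is the same with word-ids replaced by the words themselves.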
fi

word_disambig_symbol=$(awk '$1 == "#0" {print $2}' $lang/words.txt)
if [ -z "$word_disambig_symbol" ]; then
  echo "$0: error getting word disambiguation symbol"
  exit 1
fi

mkdir -p $graph_dir/texts
split_text=
for n in $(seq $nj); do
  split_text="$split_text $graph_dir/texts/text.$n"
done
utils/split_scp.pl $text $split_text
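# $text has now been split into $nj roughly equal pieces,
# $graph_dir/texts/text.1 ... text.$nj, one per parallel job.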
mkdir -p $graph_dir/log $graph_dir/fsts

# Make $dir an absolute pathname.
dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD})

if [ $stage -le 1 ]; then
  echo "$0: creating utterance-group-specific decoding graphs with biased LMs"
  # These options are passed through directly to make_one_biased_lm.py.
  lm_opts="--word-disambig-symbol=$word_disambig_symbol --ngram-order=$ngram_order --min-lm-state-count=$min_lm_state_count --discounting-constant=$discounting_constant --top-words=$graph_dir/top_words.int"
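  # Pipeline below: sym2int.pl converts each transcript to integer word-ids;
  # make_biased_lms.py groups utterances (see --min-words-per-graph), writes
  # the utterance->group map to utt2group.JOB, and emits one biased LM FST
  # per group; compile-train-graphs-fsts then compiles each LM FST into a
  # full decoding graph, written as an archive plus an scp index.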
  $cmd JOB=1:$nj $graph_dir/log/compile_decoding_graphs.JOB.log \
    utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $graph_dir/texts/text.JOB \| \
    steps/cleanup/make_biased_lms.py --min-words-per-graph=$min_words_per_graph \
      --lm-opts="$lm_opts" $graph_dir/fsts/utt2group.JOB \| \
    compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \
      $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- \
      ark,scp:$graph_dir/fsts/HCLG.fsts.JOB.ark,$graph_dir/fsts/HCLG.fsts.JOB.scp || exit 1
fi

for j in $(seq $nj); do cat $graph_dir/fsts/HCLG.fsts.$j.scp; done > $graph_dir/fsts/HCLG.fsts.per_utt.scp
for j in $(seq $nj); do cat $graph_dir/fsts/utt2group.$j; done > $graph_dir/fsts/utt2group
cp $lang/words.txt $graph_dir/
cp -r $lang/phones $graph_dir/

# The following command converts the per-group scp into one indexed by utterance-id.
utils/apply_map.pl -f 2 $graph_dir/fsts/HCLG.fsts.per_utt.scp <$graph_dir/fsts/utt2group > $graph_dir/HCLG.fsts.scp
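# e.g. (hypothetical ids): if utt2group contains the line "utt1 g1" and
# HCLG.fsts.per_utt.scp contains "g1 .../fsts/HCLG.fsts.1.ark:1234", then
# HCLG.fsts.scp gets the line "utt1 .../fsts/HCLG.fsts.1.ark:1234".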

n1=$(wc -l < $text)
n2=$(wc -l < $graph_dir/HCLG.fsts.scp)
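# Sanity check: fail if fewer than 90% of the utterances in $text got an
# entry in HCLG.fsts.scp (n1 * 9 > n2 * 10 is equivalent to n2 < 0.9 * n1).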
if [ $(($n1 * 9)) -gt $(($n2 * 10)) ]; then
  echo "$0: too many utterances are missing from HCLG.fsts.scp; something seems to have gone wrong."
  exit 1
fi

exit 0;