make_phone_bigram_lang.sh
4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
# Apache 2.0. Copyright 2012, Johns Hopkins University (author: Daniel Povey)
# This script creates a "lang" directory of the "testing" type (including G.fst)
# given an existing "alignment" directory and an existing "lang" directory.
# The directory contains only single-phone words, and a bigram language model that
# is built without smoothing, on top of single phones. The point of no smoothing
# is to limit the number of transitions, so we can decode reasonably fast, and the
# graph won't blow up. This is probably going to be most useful for things like
# language-id.
#
# See also steps/make_phone_graph.sh
# Log the exact command line so that it appears in the experiment logs.
echo "$0 $*"

# Pick up tool paths if the recipe directory provides a path.sh.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1

if [ $# -ne 3 ]; then
  echo "Usage: $0: [options] <lang-dir> <ali-dir> <output-lang-dir>"
  echo "e.g.: $0: data/lang exp/tri3b_ali data/lang_phone_bg"
  exit 1
fi

# Positional arguments:
#   lang     - existing lang directory (source of phones.txt, topo, phones/)
#   alidir   - existing alignment directory (source of final.mdl, ali.*.gz)
#   lang_out - lang directory to create
lang=$1
alidir=$2
lang_out=$3

# Fail early on every input that the rest of the script reads, rather than
# part-way through after the output directory has already been populated.
for f in "$lang/phones.txt" "$lang/topo" "$alidir/final.mdl" "$alidir/ali.1.gz"; do
  if [ ! -f "$f" ]; then
    echo "Expected file $f to exist"
    exit 1
  fi
done
if [ ! -d "$lang/phones" ]; then
  echo "Expected directory $lang/phones to exist"
  exit 1
fi
# Create the skeleton of the output lang directory: phones.txt without
# disambiguation symbols, topo, a cleaned copy of phones/, and the OOV files.
mkdir -p "$lang_out" || exit 1

# Drop every line containing '#': no disambig symbols are needed, since
# G and L . G will be deterministic.
grep -v '#' "$lang/phones.txt" > "$lang_out/phones.txt" || exit 1

cp "$lang/topo" "$lang_out" || exit 1

# Replace any phones/ subdirectory left over from a previous run.  The copy
# uses the "src/." form because "cp -r src/ dst/" copies the directory itself
# with GNU cp but only its contents with BSD/macOS cp; "src/." behaves the
# same on both.
rm -rf "${lang_out:?}/phones"
mkdir -p "$lang_out/phones" || exit 1
cp -r "$lang/phones/." "$lang_out/phones/" || exit 1

# Word-boundary information and word-disambiguation lists copied from the
# input lang directory would no longer be valid here.
rm -f "$lang_out"/phones/word_boundary.* "$lang_out"/phones/wdisambig*

# The lists of disambig symbols are empty: not needed, since G.fst and
# L.fst * G.fst are determinizable without any.
for f in disambig.txt disambig.int disambig.csl \
         wdisambig.txt wdisambig_phones.int wdisambig_words.int; do
  : > "$lang_out/phones/$f" || exit 1
done

# Let the OOV symbol be the entry on the second line of phones.txt (the first
# line is presumably <eps>).  This is arbitrary; it is just so that
# validate_lang.pl succeeds.  We should never actually use this.
oov_sym=$(awk 'NR == 2 { print $1; exit }' "$lang_out/phones.txt")
oov_int=$(awk 'NR == 2 { print $2; exit }' "$lang_out/phones.txt")
if [ -z "$oov_sym" ] || [ -z "$oov_int" ]; then
  echo "$0: could not read a phone from the second line of $lang_out/phones.txt"
  exit 1
fi
echo "$oov_sym" > "$lang_out/oov.txt"
echo "$oov_int" > "$lang_out/oov.int"
# Get phone-level transcripts of the training data and turn them into an
# unsmoothed bigram language model over phones, written as G.fst.
#
# The Perl stage reads one utterance per line ("<utt-id> p1 p2 ..."), counts
# every adjacent pair after wrapping the sequence in <s> ... </s>, and prints
# an acceptor in OpenFst text format: one state per history (<s> is state 0),
# arc cost -log(count(p,q) / count(p)), and a final cost for each history
# that was seen before </s>.
#
# The pipeline runs in a subshell with pipefail so that a failure of any
# stage (not just the last one) aborts the script, without changing the
# pipeline semantics of the rest of the file.
(
  set -o pipefail
  ali-to-phones "$alidir/final.mdl" "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
    perl -e 'while (<>) {
      @A = split(" ", $_);
      shift @A;  # Remove the utterance-id.
      foreach $p (@A) { $phones{$p} = 1; }  # set of phones seen in the data.
      unshift @A, "<s>";
      push @A, "</s>";
      for ($n = 0; $n + 1 < @A; $n++) {
        $p = $A[$n]; $q = $A[$n+1];
        $count{$p,$q}++;     # bigram count.
        $histcount{$p}++;    # number of times p was a history.
      }
    }
    # Sort the phones so that the state numbering, and hence G.fst, is the
    # same on every run; the order returned by keys() is not stable.
    @phones = sort { $a <=> $b || $a cmp $b } keys %phones;
    unshift @phones, "<s>";
    # @phones is now all real phones, plus <s> (which becomes state 0).
    for ($n = 0; $n < @phones; $n++) {
      $phn2state{$phones[$n]} = $n;
    }
    foreach $p (@phones) {
      $src = $phn2state{$p};
      $hist = $histcount{$p};
      $hist > 0 || die "no bigram counts for history $p (were the alignments empty?)";
      foreach $q (@phones) {
        $c = $count{$p,$q};
        if (defined $c) {
          $cost = -log($c / $hist);  # cost on FST arc.
          $dest = $phn2state{$q};
          print "$src $dest $q $cost\n";  # Note: q is actually numeric.
        }
      }
      $c = $count{$p,"</s>"};
      if (defined $c) {
        $cost = -log($c / $hist);  # cost of ending the sequence after p.
        print "$src $cost\n";  # final-prob.
      }
    } ' | \
    fstcompile --acceptor=true | \
    fstarcsort --sort_type=ilabel > "$lang_out/G.fst"
) || { echo "$0: failed to build $lang_out/G.fst"; exit 1; }
# Symbols for phones and words are the same.
# Neither has disambig symbols.
cp "$lang_out/phones.txt" "$lang_out/words.txt" || exit 1

# L.fst has a single state (0, final with cost 0.0) and one self-loop per
# phone that maps the phone id to the identical word id.
grep -v '<eps>' "$lang_out/phones.txt" | \
  awk '{ printf("0 0 %s %s\n", $2, $2); } END { print("0 0.0"); }' | \
  fstcompile > "$lang_out/L.fst" || exit 1

# Note: the first two fields of align_lexicon.txt are interpreted as the word;
# the remaining fields are the phones that are in the pron of the word.  These
# are all the same, for us.  The lines are produced by awk instead of an
# unquoted shell loop so that a phone symbol containing glob characters
# (e.g. "?" or "*") is never expanded against file names.
grep -v '<eps>' "$lang_out/phones.txt" | \
  awk 'NF { print $1, $1, $1 }' > "$lang_out/phones/align_lexicon.txt"

# Just use one sym2int.pl command, since phones.txt and words.txt are identical.
utils/sym2int.pl "$lang_out/phones.txt" \
  < "$lang_out/phones/align_lexicon.txt" \
  > "$lang_out/phones/align_lexicon.int" || exit 1

# L and L_disambig are the same.
cp "$lang_out/L.fst" "$lang_out/L_disambig.fst" || exit 1

utils/validate_lang.pl --skip-disambig-check "$lang_out" || exit 1