get_prons.sh
7.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Daniel Povey)
# 2014 Guoguo Chen
# Apache 2.0
# Begin configuration section.
cmd=run.pl   # launcher used for the parallel nbest-to-prons jobs
stage=1      # stages numbered below this value are skipped
lmwt=10      # passed as --lm-scale to lattice-1best (lattice input only)
# End configuration options.

# Log the exact invocation.
echo "$0 $*"

# Pick up the environment from path.sh when one exists in the current directory.
if [ -f path.sh ]; then
  . ./path.sh
fi

# Presumably this turns "--cmd", "--stage", "--lmwt" and "--config" command-line
# options into assignments of the variables above -- confirm in parse_options.sh.
. parse_options.sh || exit 1;
# Exactly three positional arguments are required; anything else prints the
# usage text (one line per string below) and exits with status 1.
if [ $# -ne 3 ]; then
  printf '%s\n' \
    "usage: $0 <data-dir> <lang-dir> <dir>" \
    "e.g.: $0 data/train data/lang exp/tri3" \
    "or: $0 data/train data/lang exp/tri3/decode_dev" \
    "This script writes files prons.*.gz in the directory provided, which must" \
    "contain alignments (ali.*.gz) or lattices (lat.*.gz). These files are as" \
    "output by nbest-to-prons (see its usage message)." \
    "Main options (for others, see top of script file)" \
    " --config <config-file> # config containing options" \
    " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
    " --lmwt <lm-weight> # scale for LM, only applicable" \
    " # for lattice input (default: 10)"
  exit 1;
fi
# As the usage message of nbest-to-prons says, its output has lines that can be interpreted as
# <utterance-id> <begin-frame> <num-frames> <word> <phone1> <phone2> ... <phoneN>
# where the word and the phones are integer ids; you could convert these into text form
# using a command like:
# gunzip -c prons.*.gz | utils/int2sym.pl -f 4 words.txt | utils/int2sym.pl -f 5- phones.txt
# Positional arguments (see the usage message).
data=$1   # data directory; must contain utt2spk
lang=$2   # lang directory; words.txt, phones.txt, oov.int and phones/ are read from it
dir=$3    # directory holding ali.*.gz or lat.*.gz; all outputs are written here

# Fail early, with one uniform message, if a required input is missing.
# oov.int and phones.txt are checked here as well because they are read
# further down (oov.int just below, phones.txt by the int2sym.pl calls in the
# later stages); without the check a missing file only surfaced later.
for f in "$data/utt2spk" "$lang/words.txt" "$lang/phones.txt" "$lang/oov.int" \
  "$dir/num_jobs"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

# Number of pieces: names the split directory and is the JOB range used when
# the prons are generated.
nj=$(cat "$dir/num_jobs") || exit 1
sdata=$data/split$nj
# Value handed to "sym2int.pl --map-oov" when the transcripts are converted.
oov=$(cat "$lang/oov.int") || exit 1

# Re-split the data directory unless $sdata already exists and feats.scp is
# older than it.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh "$data" "$nj" || exit 1
# Locate final.mdl: prefer the one in $dir, otherwise fall back to the parent
# directory (e.g. when $dir is a decoding directory).
if [ -f $dir/final.mdl ]; then
  mdl=$dir/final.mdl
elif [ -f $dir/../final.mdl ]; then
  mdl=$dir/../final.mdl
else
  echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist."
  exit 1;
fi
# Choose the word-alignment command according to what the lang directory
# provides: word_boundary.int is preferred, align_lexicon.int is the fallback.
# The command is kept as a plain string because it is spliced, unquoted, into
# the $cmd pipelines further down.
if [ -f $lang/phones/word_boundary.int ]; then
  align_words_cmd="lattice-align-words $lang/phones/word_boundary.int $mdl ark:- ark:-"
elif [ -f $lang/phones/align_lexicon.int ]; then
  align_words_cmd="lattice-align-words-lexicon $lang/phones/align_lexicon.int $mdl ark:- ark:-"
else
  echo "$0: expected either $lang/phones/word_boundary.int or $lang/phones/align_lexicon.int to exist."
  exit 1;
fi
# Stage 1: write $dir/prons.JOB.gz, starting either from alignments
# (ali.*.gz, preferred when present) or from lattices (lat.*.gz).

# Neither kind of input is available: nothing can be done.
if [ ! -f $dir/ali.1.gz ] && [ ! -f $dir/lat.1.gz ]; then
  echo "$0: expected either $dir/ali.1.gz or $dir/lat.1.gz to exist."
  exit 1;
fi

if [ -f $dir/ali.1.gz ]; then
  echo "$0: $dir/ali.1.gz exists, so starting from alignments."
  if [ $stage -le 1 ]; then
    # Remove stale outputs; it is fine if there are none.
    rm $dir/prons.*.gz 2>/dev/null
    # linear-to-nbest combines each alignment with the integer-mapped
    # transcript from $sdata/JOB/text.  The two empty arguments are passed
    # through as-is (presumably optional cost inputs -- confirm against the
    # linear-to-nbest usage message).  $cmd and $align_words_cmd are left
    # unquoted on purpose so that they expand to several words.
    $cmd JOB=1:$nj $dir/log/nbest_to_prons.JOB.log \
      linear-to-nbest "ark:gunzip -c $dir/ali.JOB.gz|" \
      "ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$sdata/JOB/text |" \
      "" "" ark:- \| $align_words_cmd \| \
      nbest-to-prons $mdl ark:- "|gzip -c >$dir/prons.JOB.gz" || exit 1;
  fi
else
  echo "$0: $dir/lat.1.gz exists, so starting from lattices."
  if [ $stage -le 1 ]; then
    # Remove stale outputs; it is fine if there are none.
    rm $dir/prons.*.gz 2>/dev/null
    # Take the single best path of every lattice (LM scale $lmwt), align it
    # to words and dump the pronunciations.
    $cmd JOB=1:$nj $dir/log/nbest_to_prons.JOB.log \
      lattice-1best --lm-scale=$lmwt "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
      $align_words_cmd \| \
      nbest-to-prons $mdl ark:- "|gzip -c >$dir/prons.JOB.gz" || exit 1;
  fi
fi
# Stage 2: count how often each pronunciation occurs over all prons.*.gz.
# The first three fields of every input line (utterance id, begin frame,
# number of frames) are blanked, so the remaining "<word> <phone1> ..." part
# is the key.  Output lines are "<count> <word> <phone1> ... <phoneN>", still
# as integer ids, in $dir/pron_counts.int.
if [ "$stage" -le 2 ]; then
  (
    # Without pipefail only awk's status would be seen, so a corrupt or
    # missing prons.*.gz would silently yield incomplete counts.
    set -o pipefail
    gunzip -c "$dir"/prons.*.gz | \
      awk '{ $1=""; $2=""; $3=""; count[$0]++; } END{for (k in count) { print count[k], k; }}' \
      > "$dir/pron_counts.int"
  ) || exit 1
fi
# Stage 3: convert pron_counts.int to text (field 2 via words.txt, fields 3
# onwards via phones.txt) and sort by descending count into pron_counts.txt.
if [ "$stage" -le 3 ]; then
  (
    # Make a failure of any pipeline member fatal (missing input file,
    # int2sym.pl error, sort or write failure), as is already done for the
    # preceding stage; previously such failures were silently ignored.
    set -o pipefail
    utils/int2sym.pl -f 2 "$lang/words.txt" <"$dir/pron_counts.int" | \
      utils/int2sym.pl -f 3- "$lang/phones.txt" | \
      sort -nr > "$dir/pron_counts.txt"
  ) || exit 1
fi
# Stage 4 (only when the lang directory has phones/word_boundary.int):
# write pron_counts_nowb.txt, a copy of pron_counts.txt in which the _B, _I,
# _S, _E markers are removed from the phones.  This is often convenient when
# going back to a word-position-independent source lexicon.
if [ $stage -le 4 ] && [ -f $lang/phones/word_boundary.int ]; then
  # Tokens 0 and 1 (count and word) are left alone; only the phones that
  # follow are stripped.  Fields are re-joined with single spaces.
  cat $dir/pron_counts.txt | perl -ne '
    my @tok = split(" ", $_);
    foreach my $i (2 .. $#tok) { $tok[$i] =~ s/_[BISE]$//; }
    print join(" ", @tok), "\n";
  ' >$dir/pron_counts_nowb.txt
fi
# Stage 5: figure the count of silence before and after words (actually
# prons).  Three files are written to $dir:
#   pron_perutt_nowb.txt          one utterance per line, "word pron" pairs
#   pron_bigram_counts_nowb.txt   counts of adjacent "word pron" pairs
#   sil_counts_nowb.txt           silence / non-silence context counts
if [ "$stage" -le 5 ]; then
  (
    # Any failing pipeline member (gunzip, int2sym.pl, awk, perl, ...) aborts
    # the script; previously failures in this stage were silently ignored.
    set -o pipefail

    # 1. Create a text like file, but instead of putting words, we write
    #    "word pron" pairs. We change the format of prons.*.gz from
    #    pron-per-line to utterance-per-line (with "word pron" pairs
    #    tab-separated), and add <s> and </s> at the begin and end of each
    #    sentence. The _B, _I, _S, _E markers are removed from phones.
    gunzip -c "$dir"/prons.*.gz | utils/int2sym.pl -f 4 "$lang/words.txt" | \
      utils/int2sym.pl -f 5- "$lang/phones.txt" | cut -d ' ' -f 1,4- | awk '
      BEGIN { utter_id = ""; }
      {
        if (utter_id == "") { utter_id = $1; printf("%s\t<s>", utter_id); }
        # Force string comparison so numeric-looking ids such as 01 and 1
        # stay distinct.
        else if ((utter_id "") != ($1 "")) {
          printf "\t</s>\n"; utter_id = $1; printf("%s\t<s>", utter_id);
        }
        printf("\t%s", $2);
        for (n = 3; n <= NF; n++) { sub("_[BISE]$", "", $n); printf(" %s", $n); }
      }
      # Close the last utterance, but only if at least one line was read.
      END { if (utter_id != "") printf "\t</s>\n"; }' \
      > "$dir/pron_perutt_nowb.txt" || exit 1

    # 2. Collect bigram counts for words. To be more specific, we are
    #    actually collecting counts for "v ? w", where "?" represents silence
    #    or non-silence: the <eps> entries are dropped first, then adjacent
    #    pairs are counted.  Output lines are "<count>\t<v>\t<w>", sorted by
    #    key so that the file is reproducible.
    perl -ape 's/<eps>[^\t]*\t//g;' <"$dir/pron_perutt_nowb.txt" | perl -e '
      while (<>) {
        chomp; @col = split("\t");
        for ($i = 1; $i < scalar(@col) - 1; $i += 1) {
          $bigram{$col[$i] . "\t" . $col[$i + 1]} += 1;
        }
      }
      foreach $key (sort keys %bigram) {
        print "$bigram{$key}\t$key\n";
      }' > "$dir/pron_bigram_counts_nowb.txt" || exit 1

    # 3. Collect bigram counts for silence and words. The count file has 4
    #    fields for counts, followed by the "word pron" pair. All fields are
    #    separated by spaces:
    #    <sil-before-count> <nonsil-before-count> <sil-after-count> <nonsil-after-count> <word> <phone1> <phone2> ...
    #    The script name is handed to perl as an argument instead of being
    #    spliced into the perl source, so paths containing spaces or perl
    #    metacharacters cannot break the program.
    cut -f 2- <"$dir/pron_perutt_nowb.txt" | perl -e '
      # Name of the calling script, used only in the error message.
      $script = shift @ARGV;
      %sil_wpron = (); %nonsil_wpron = (); %wpron_sil = (); %wpron_nonsil = ();
      %words = ();
      while (<STDIN>) {
        chomp;
        @col = split(/[\t]+/, $_); @col >= 2 || die "${script}: bad line \"$_\"\n";
        for ($n = 0; $n < @col - 1; $n++) {
          # First word is not silence, collect the wpron_sil and wpron_nonsil
          # stats.
          if ($col[$n] !~ m/^<eps> /) {
            if ($col[$n + 1] =~ m/^<eps> /) { $wpron_sil{$col[$n]} += 1; }
            else { $wpron_nonsil{$col[$n]} += 1; }
            $words{$col[$n]} = 1;
          }
          # Second word is not silence, collect the sil_wpron and nonsil_wpron
          # stats.
          if ($col[$n + 1] !~ m/^<eps> /) {
            if ($col[$n] =~ m/^<eps> /) { $sil_wpron{$col[$n + 1]} += 1; }
            else { $nonsil_wpron{$col[$n + 1]} += 1; }
            $words{$col[$n + 1]} = 1;
          }
        }
      }
      foreach $wpron (sort keys %words) {
        # "+= 0" makes sure every counter is defined before it is printed.
        $sil_wpron{$wpron} += 0; $nonsil_wpron{$wpron} += 0;
        $wpron_sil{$wpron} += 0; $wpron_nonsil{$wpron} += 0;
        print "$sil_wpron{$wpron} $nonsil_wpron{$wpron} ";
        print "$wpron_sil{$wpron} $wpron_nonsil{$wpron} $wpron\n";
      }
    ' -- "$0" > "$dir/sil_counts_nowb.txt" || exit 1
  ) || exit 1
fi
# Tell the user where the outputs are.  pron_counts_nowb.txt is mentioned
# only when the lang directory has phones/word_boundary.int, the same
# condition under which that file is written.
printf '%s\n' \
  "$0: done writing prons to $dir/prons.*.gz, silence counts in " \
  "$0: $dir/sil_counts_nowb.txt and pronunciation counts in " \
  "$0: $dir/pron_counts.{int,txt}"
[ -f $lang/phones/word_boundary.int ] &&
  printf '%s\n' "$0: ... and also in $dir/pron_counts_nowb.txt"
exit 0;