tidigits_prepare_lang.sh
5.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This script prepares the lang/ directory.
#
# Source the recipe's path.sh from the current directory (its contents are not
# shown here; presumably it sets up PATH / KALDI_ROOT -- confirm).
. ./path.sh

# We describe the digits with a small phone-level lexicon; whole-word models
# would have been an equally valid choice.
tmpdir=data/local/dict
lang=data/lang
mkdir -p "$tmpdir"

# One lexicon entry per line: the word symbol, then its phone sequence.
# "z" is the word "zero" and "o" is the word "oh".
printf '%s\n' \
  'z z iy r ow' \
  'o ow' \
  '1 w ah n' \
  '2 t uw' \
  '3 th r iy' \
  '4 f ao r' \
  '5 f ay v' \
  '6 s ih k s' \
  '7 s eh v ah n' \
  '8 ey t' \
  '9 n ay n' \
  >"$tmpdir/lexicon.txt"
# and note, we'll have a silence phone, but it won't appear
# in this form of lexicon as there's no silence word; it's an option
# in the lexicon FST that gets added by the script.
mkdir -p "$lang/phones"

# Word symbol table: "<eps>" gets id 0, then the first field of every lexicon
# line is numbered from 1 in lexicon order.
awk 'BEGIN { print "<eps> 0"; id = 1 }
     { printf("%s %s\n", $1, id++) }' "$tmpdir/lexicon.txt" \
  >"$lang/words.txt"

# Phone list: "sil" first, then every distinct phone seen in fields 2..NF of
# the lexicon, in whatever order awk traverses the array.
awk '{ for (n = 2; n <= NF; n++) seen[$n] = 1 }
     END { print "sil"; for (w in seen) print w }' "$tmpdir/lexicon.txt" \
  >"$tmpdir/phone.list"

# Phone symbol table, numbered the same way as the word table.
awk 'BEGIN { print "<eps> 0"; id = 1 }
     { printf("%s %s\n", $1, id++) }' "$tmpdir/phone.list" \
  >"$lang/phones.txt"
p=$lang/phones

# "sil" is the only entry in the silence, context-independent and
# optional-silence phone lists.
for f in silence context_indep optional_silence; do
  echo sil >"$p/$f.txt"
done

# Every other entry of the phone list is a non-silence phone.
grep -vw sil "$tmpdir/phone.list" >"$p/nonsilence.txt"

# Both files are created empty: there are no disambiguation symbols, and no
# "extra questions" because we don't have things like tone, word-position or
# stress markings.
touch "$p/disambig.txt" "$p/extra_questions.txt"

# Phone sets: each phone is in a set of its own. Normally each line would hold
# several word-position-dependent or stress-dependent realizations of the
# same phone.
cat "$tmpdir/phone.list" >"$p/sets.txt"
# Map every phone list to integer ids (.int) using phones.txt. All lists except
# extra_questions and sets additionally get a .csl file: the ids on a single
# line, joined by colons.
for t in silence nonsilence context_indep optional_silence disambig extra_questions sets; do
  utils/sym2int.pl "$lang/phones.txt" <"$p/$t.txt" >"$p/$t.int"
  case "$t" in
    extra_questions | sets)
      ;; # .int only
    *)
      awk '{ printf("%s%d", (NR > 1 ? ":" : ""), $1) } END { printf "\n" }' \
        "$p/$t.int" >"$p/$t.csl"
      ;;
  esac
done

# roots.txt has one "shared split <phone>" line per phone; roots.int is the
# same with fields 3 onwards converted to integer ids.
awk '{ print "shared split " $1 }' "$tmpdir/phone.list" >"$p/roots.txt"
utils/sym2int.pl -f 3- "$lang/phones.txt" "$p/roots.txt" >"$p/roots.int"
# OOV words are mapped to "z". This setup has no OOVs, but the scripts expect
# oov.txt (and its integer form oov.int) to exist.
printf '%s\n' z >"$lang/oov.txt"
utils/sym2int.pl "$lang/words.txt" <"$lang/oov.txt" >"$lang/oov.int"
# Note: "word_boundary.{txt,int}" will not exist in this setup. This will mean it's
# not very easy to get word alignments, but it simplifies some things.
# Make the FST form of the lexicon (this includes optional silence): generate
# the text FST from the lexicon (the "0.5 sil" arguments are presumably the
# optional-silence probability and phone -- confirm against
# make_lexicon_fst.pl), compile it against the phone/word symbol tables, and
# sort the arcs on output label.
# The pipeline runs in a subshell with pipefail so that a failure in ANY stage
# stops the script here, instead of leaving a truncated L.fst behind and
# carrying on; the subshell keeps pipefail from leaking into the rest of the
# script.
(
  set -o pipefail
  utils/make_lexicon_fst.pl "$tmpdir/lexicon.txt" 0.5 sil \
    | fstcompile --isymbols="$lang/phones.txt" --osymbols="$lang/words.txt" \
        --keep_isymbols=false --keep_osymbols=false \
    | fstarcsort --sort_type=olabel >"$lang/L.fst"
) || { echo "$0: failed to create $lang/L.fst" >&2; exit 1; }

# Note: in this setup there are no "disambiguation symbols" because the lexicon
# contains no homophones; and there is no '#0' symbol in the LM because it's
# not a backoff LM, so L_disambig.fst is the same as L.fst.
cp "$lang/L.fst" "$lang/L_disambig.fst" \
  || { echo "$0: failed to create $lang/L_disambig.fst" >&2; exit 1; }
# Write the topology file: gen_topo.pl is given the number of states for
# non-silence and silence phones, followed by the colon-separated id lists of
# the non-silence and silence phones.
num_sil_states=5
num_nonsil_states=3
silphonelist=$(<"$lang/phones/silence.csl")
nonsilphonelist=$(<"$lang/phones/nonsilence.csl")
utils/gen_topo.pl "$num_nonsil_states" "$num_sil_states" \
  "$nonsilphonelist" "$silphonelist" >"$lang/topo"
# Now we prepare a simple grammar G.fst that's a kind of loop of
# digits (no silence in this, since that's handled in L.fst).
# There are 12 equally likely options at every step: 1-9, zero, oh, and
# end-of-sentence.
# penalty is the negated log-prob of one option, which becomes the cost on
# the FST.
penalty=$(perl -e '$prob = 1.0/12; print -log($prob); ')

# Text FST on stdin: one self-loop on state 0 per word, then state 0 as a final
# state. As for L.fst, pipefail (scoped to the subshell) makes a failure in any
# stage fatal rather than letting the script fall through to "exit 0".
(
  set -o pipefail
  {
    for x in z o 1 2 3 4 5 6 7 8 9; do
      # format is: from-state to-state input-symbol output-symbol cost
      echo "0 0 $x $x $penalty"
    done
    # format is: state final-cost
    echo "0 $penalty"
  } \
    | fstcompile --isymbols="$lang/words.txt" --osymbols="$lang/words.txt" \
        --keep_isymbols=false --keep_osymbols=false \
    | fstarcsort --sort_type=ilabel >"$lang/G.fst"
) || { echo "$0: failed to create $lang/G.fst" >&2; exit 1; }

# The script ends here; nothing after this line is executed.
exit 0;
# NOTE(review): everything from here on follows an unconditional "exit 0" and
# is therefore never executed as the file stands.
# Exactly one argument is required: the TIDIGITS directory (it is read from $1
# just below), so anything other than one argument is a usage error.
if [ $# -ne 1 ]; then
  echo "Argument should be the TIDIGITS directory, see ../run.sh for example."
  exit 1;
fi
tidigits=$1
tmpdir=$(pwd)/data/local/data
mkdir -p "$tmpdir"

# Note: the .wav files are not in .wav format but "sphere" format (this was
# produced in the days before Windows).
# List the training files and warn (without stopping) if the count is not the
# expected 8623.
find "$tidigits/tidigits/train" -name '*.wav' >"$tmpdir/train.flist"
n=$(( $(wc -l <"$tmpdir/train.flist") ))
if [ "$n" -ne 8623 ]; then
  echo "Unexpected number of training files $n versus 8623"
fi

# Same for the test files; 8700 are expected.
find "$tidigits/tidigits/test" -name '*.wav' >"$tmpdir/test.flist"
n=$(( $(wc -l <"$tmpdir/test.flist") ))
if [ "$n" -ne 8700 ]; then
  echo "Unexpected number of test files $n versus 8700"
fi

# sph2pipe must be present and executable under $KALDI_ROOT/tools.
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
[ -x "$sph2pipe" ] || {
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe"
  exit 1
}
# For each of the train and test sets, build data/<set>/{wav.scp,text,utt2spk,spk2utt}
# from the file list written earlier.
for x in train test; do
  # get scp file that has utterance-ids and maps to the sphere file.
  # The utterance-id is "<dir>_<name>", where <dir> is the two-character
  # directory holding the file and <name> is the file name without ".wav".
  perl -ane 'm|/(..)/([1-9zo]+[ab])\.wav| || die "bad line $_"; print "$1_$2 $_"; ' \
    <"$tmpdir/$x.flist" \
    | sort >"$tmpdir/${x}_sph.scp"

  # turn it into one that has a valid .wav format in the modern sense (i.e. RIFF format, not sphere).
  # This file goes into its final location.
  # The sph2pipe path is handed to awk with -v rather than spliced into the
  # program text, so whitespace or '%' in the path cannot break the command.
  mkdir -p "data/$x"
  awk -v sph2pipe="$sph2pipe" \
    '{ printf("%s %s -f wav %s |\n", $1, sph2pipe, $2); }' \
    <"$tmpdir/${x}_sph.scp" >"data/$x/wav.scp"

  # Now get the "text" file that says what the transcription is: the digit
  # characters of the utterance-id, separated by spaces.
  # (Input comes from the redirection only; no extra "cat |" in front.)
  perl -ane 'm/^(.._([1-9zo]+)[ab]) / || die; $text = join(" ", split("", $2)); print "$1 $text\n";' \
    <"data/$x/wav.scp" >"data/$x/text"

  # now get the "utt2spk" file that says, for each utterance, the speaker name
  # (the two characters before the underscore in the utterance-id).
  perl -ane 'm/^((..)_\S+) / || die; print "$1 $2\n"; ' \
    <"data/$x/wav.scp" >"data/$x/utt2spk"

  # create the file that maps from speaker to utterance-list.
  utils/utt2spk_to_spk2utt.pl <"data/$x/utt2spk" >"data/$x/spk2utt"
done
echo "Data preparation succeeded"