Blame view
egs/tidigits/s5/local/tidigits_prepare_lang.sh
5.74 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This script prepares the lang/ directory.
#
# This first part writes the pronunciation lexicon and the raw phone list
# under data/local/dict, and the word/phone symbol tables plus the per-category
# phone lists under data/lang.  It takes no arguments and must be run from the
# directory that contains path.sh.

. ./path.sh

# Decided to do this using something like a real lexicon, although we
# could also have used whole-word models.
tmpdir=data/local/dict
lang=data/lang
mkdir -p "$tmpdir"

# One entry per line: the word, followed by its phone sequence.
cat >"$tmpdir/lexicon.txt" <<EOF
z z iy r ow
o ow
1 w ah n
2 t uw
3 th r iy
4 f ao r
5 f ay v
6 s ih k s
7 s eh v ah n
8 ey t
9 n ay n
EOF
# and note, we'll have a silence phone, but it won't appear
# in this form of lexicon as there's no silence word; it's an option
# in the lexicon FST that gets added by the script.

mkdir -p "$lang/phones"

# symbol-table for words: "<eps> 0" first, then one "<word> <id>" line per
# lexicon entry, numbered from 1 in lexicon order.
awk 'BEGIN {print "<eps> 0"; n=1;} { printf("%s %s\n", $1, n++); }' \
  "$tmpdir/lexicon.txt" >"$lang/words.txt"

# list of phones: "sil" first, then every distinct phone used in the lexicon.
# The phones after "sil" come out in awk's array-iteration order.
awk '{for(n=2;n<=NF;n++) seen[$n]=1; } END{print "sil"; for (w in seen) { print w; }}' \
  "$tmpdir/lexicon.txt" >"$tmpdir/phone.list"

# symbol-table for phones: same layout as words.txt, numbered in phone.list order.
awk 'BEGIN {print "<eps> 0"; n=1;} { printf("%s %s\n", $1, n++); }' \
  "$tmpdir/phone.list" >"$lang/phones.txt"

p=$lang/phones
echo sil >"$p/silence.txt"
echo sil >"$p/context_indep.txt"
echo sil >"$p/optional_silence.txt"
grep -v -w sil "$tmpdir/phone.list" >"$p/nonsilence.txt"
touch "$p/disambig.txt" # disambiguation-symbols list, will be empty.
touch "$p/extra_questions.txt" # list of "extra questions"-- empty; we don't
  # have things like tone or word-positions or stress markings.
cat "$tmpdir/phone.list" >"$p/sets.txt" # list of "phone sets"-- each phone is in its
  # own set.  Normally, each line would have a bunch of word-position-dependent or
  # stress-dependent realizations of the same phone.
# Integer forms of the phone lists.  For each list, <name>.int holds the
# output of sym2int.pl run against phones.txt, and <name>.csl holds the same
# ids joined by ':' on a single newline-terminated line.
for t in silence nonsilence context_indep optional_silence disambig; do
  utils/sym2int.pl "$lang/phones.txt" <"$p/$t.txt" >"$p/$t.int"
  # awk emits ":id:id:..." followed by a newline; sed strips the leading ':'.
  awk '{printf(":%d", $1);} END{printf "\n"}' "$p/$t.int" | sed s/:// >"$p/$t.csl"
done
for t in extra_questions sets; do
  utils/sym2int.pl "$lang/phones.txt" <"$p/$t.txt" >"$p/$t.int"
done

# roots.txt: one "shared split <phone>" line per phone in phone.list.
# Fields 3 onward (the phone names) are then converted to integers.
awk '{printf("shared split %s\n", $1);}' "$tmpdir/phone.list" >"$p/roots.txt"
utils/sym2int.pl -f 3- "$lang/phones.txt" "$p/roots.txt" >"$p/roots.int"

echo z >"$lang/oov.txt" # we map OOV's to this.. there are no OOVs in this setup,
  # but the scripts expect this file to exist.
utils/sym2int.pl "$lang/words.txt" <"$lang/oov.txt" >"$lang/oov.int"

# Note: "word_boundary.{txt,int}" will not exist in this setup.  This will mean it's
# not very easy to get word alignments, but it simplifies some things.

# Make the FST form of the lexicon (this includes optional silence).
utils/make_lexicon_fst.pl "$tmpdir/lexicon.txt" 0.5 sil \
  | fstcompile --isymbols="$lang/phones.txt" --osymbols="$lang/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstarcsort --sort_type=olabel >"$lang/L.fst"

# Note: in this setup there are no "disambiguation symbols" because the lexicon
# contains no homophones; and there is no '#0' symbol in the LM because it's
# not a backoff LM, so L_disambig.fst is the same as L.fst.

cp "$lang/L.fst" "$lang/L_disambig.fst"

# Topology: gen_topo.pl is given the state counts and the two colon-separated
# phone-id lists, and its output becomes $lang/topo.
num_sil_states=5
num_nonsil_states=3
silphonelist=$(cat "$lang/phones/silence.csl")
nonsilphonelist=$(cat "$lang/phones/nonsilence.csl")
utils/gen_topo.pl "$num_nonsil_states" "$num_sil_states" \
  "$nonsilphonelist" "$silphonelist" >"$lang/topo"

# Now we prepare a simple grammar G.fst that's a kind of loop of
# digits (no silence in this, since that's handled in L.fst)
# there are 12 options: 1-9, zero, oh, and end-of-sentence.
penalty=$(perl -e '$prob = 1.0/12; print -log($prob); ') # negated log-prob,
  # which becomes the cost on the FST.
# Text form of the grammar, piped into fstcompile: a single state 0 with one
# self-loop per word (z, o, 1-9) and a final cost, every one weighted by
# $penalty.  Word symbols are looked up in $lang/words.txt on both the input
# and the output side, and the compiled FST is arc-sorted on input labels.
(
  for x in z o 1 2 3 4 5 6 7 8 9; do
    echo "0 0 $x $x $penalty" # format is: from-state to-state input-symbol output-symbol cost
  done
  echo "0 $penalty" # format is: state final-cost
) | fstcompile --isymbols="$lang/words.txt" --osymbols="$lang/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstarcsort --sort_type=ilabel >"$lang/G.fst"

# The script ends here and always reports success.
exit 0