Blame view

egs/tidigits/s5/local/tidigits_prepare_lang.sh 5.74 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  #!/bin/bash
  
  # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
  # Apache 2.0.
  
  # This script prepares the lang/ directory.
  #
  
  . ./path.sh 
  
  
  # Build the pronunciation lexicon and the word/phone symbol tables.
  #
  # Files written (relative to the current directory):
  #   data/local/dict/lexicon.txt  one "<word> <phone> <phone> ..." entry per line
  #   data/local/dict/phone.list   "sil" followed by every phone used in the lexicon
  #   data/lang/words.txt          "<symbol> <id>" per line; <eps> is 0, words
  #                                are numbered from 1 in lexicon order
  #   data/lang/phones.txt         "<symbol> <id>" per line; <eps> is 0, phones
  #                                are numbered from 1 in phone.list order
  #
  # Decided to do this using something like a real lexicon, although we
  # could also have used whole-word models.
  tmpdir=data/local/dict
  lang=data/lang
  mkdir -p "$tmpdir" "$lang/phones"

  # One lexicon entry per printf argument.  printf is used instead of a
  # here-document so the result does not depend on the column at which a
  # terminator line happens to start.
  printf '%s\n' \
    'z z iy r ow' \
    'o ow' \
    '1 w ah n' \
    '2 t uw' \
    '3 th r iy' \
    '4 f ao r' \
    '5 f ay v' \
    '6 s ih k s' \
    '7 s eh v ah n' \
    '8 ey t' \
    '9 n ay n' \
    >"$tmpdir/lexicon.txt"
  # and note, we'll have a silence phone, but it won't appear
  # in this form of lexicon as there's no silence word; it's an option
  # in the lexicon FST that gets added by the script.

  # symbol-table for words: first field of every lexicon line.
  awk 'BEGIN { print "<eps> 0"; n = 1 } { printf("%s %s\n", $1, n++) }' \
    "$tmpdir/lexicon.txt" >"$lang/words.txt"

  # list of phones: "sil" first, then each distinct phone of the lexicon.
  # The order of the non-silence phones is whatever order awk uses to walk
  # the array, so it may differ between awk implementations.
  awk '{ for (n = 2; n <= NF; n++) seen[$n] = 1 } END { print "sil"; for (w in seen) print w }' \
    "$tmpdir/lexicon.txt" >"$tmpdir/phone.list"

  # symbol-table for phones:
  awk 'BEGIN { print "<eps> 0"; n = 1 } { printf("%s %s\n", $1, n++) }' \
    "$tmpdir/phone.list" >"$lang/phones.txt"
  
  # Populate the phone-list files under $lang/phones.
  p=$lang/phones

  # "sil" is the only entry of the silence list, the context-independent
  # list and the optional-silence file.
  for sil_list in silence context_indep optional_silence; do
    echo sil >"$p/$sil_list.txt"
  done

  # Everything in the phone list except "sil" is a non-silence phone.
  grep -v -w sil "$tmpdir/phone.list" >"$p/nonsilence.txt"

  # disambig.txt: disambiguation-symbols list, will be empty.
  # extra_questions.txt: list of "extra questions"-- empty; we don't
  # have things like tone or word-positions or stress markings.
  touch "$p/disambig.txt" "$p/extra_questions.txt"

  # sets.txt: list of "phone sets"-- each phone is in its own set.
  # Normally, each line would have a bunch of word-position-dependent or
  # stress-dependent realizations of the same phone.
  cat "$tmpdir/phone.list" >"$p/sets.txt"
  
  # Produce the integer forms of the phone lists in $p:
  #   <list>.int  the list passed through utils/sym2int.pl with $lang/phones.txt
  #               (presumably replacing each symbol by its id -- see that script)
  #   <list>.csl  the same integers joined by ':' on a single line
  #   roots.txt   one "shared split <phone>" line per entry of phone.list
  #   roots.int   roots.txt passed through utils/sym2int.pl with "-f 3-"
  #               (presumably only fields 3 onwards are mapped -- TODO confirm)

  # int2csl: read one integer per line on stdin and write them to stdout as a
  # single colon-separated line.  Empty input yields one empty line.
  int2csl() {
    awk '{ printf(":%d", $1) } END { printf("\n") }' | sed 's/^://'
  }

  for t in silence nonsilence context_indep optional_silence disambig; do
    utils/sym2int.pl "$lang/phones.txt" <"$p/$t.txt" >"$p/$t.int"
    int2csl <"$p/$t.int" >"$p/$t.csl"
  done
  # These two lists get an .int form only, no .csl.
  for t in extra_questions sets; do
    utils/sym2int.pl "$lang/phones.txt" <"$p/$t.txt" >"$p/$t.int"
  done

  awk '{ printf("shared split %s\n", $1) }' "$tmpdir/phone.list" >"$p/roots.txt"
  utils/sym2int.pl -f 3- "$lang/phones.txt" "$p/roots.txt" >"$p/roots.int"
  
  # OOV word: we map OOV's to "z".. there are no OOVs in this setup,
  # but the scripts expect this file to exist.
  printf '%s\n' z >"$lang/oov.txt"
  # Integer form of the OOV word, looked up in the word symbol table.
  utils/sym2int.pl "$lang/words.txt" <"$lang/oov.txt" >"$lang/oov.int"
  
  # Note: "word_boundary.{txt,int}" will not exist in this setup.  This will mean it's
  # not very easy to get word alignments, but it simplifies some things.
  
  # Make the FST form of the lexicon (this includes optional silence).
  # The output of make_lexicon_fst.pl is compiled with the phone table as
  # input symbols and the word table as output symbols, then arc-sorted on
  # the output label.  The "0.5 sil" arguments are presumably the
  # optional-silence probability and phone -- TODO confirm against
  # utils/make_lexicon_fst.pl.
  utils/make_lexicon_fst.pl "$tmpdir/lexicon.txt" 0.5 sil \
    | fstcompile \
        --isymbols="$lang/phones.txt" \
        --osymbols="$lang/words.txt" \
        --keep_isymbols=false \
        --keep_osymbols=false \
    | fstarcsort --sort_type=olabel \
    >"$lang/L.fst"

  # Note: in this setup there are no "disambiguation symbols" because the lexicon
  # contains no homophones; and there is no '#0' symbol in the LM because it's
  # not a backoff LM, so L_disambig.fst is the same as L.fst.
  cp "$lang/L.fst" "$lang/L_disambig.fst"
  
  # Write $lang/topo with utils/gen_topo.pl, giving it the two state counts
  # and the colon-separated phone-id lists read from the .csl files.
  num_nonsil_states=3
  num_sil_states=5
  nonsilphonelist=$(cat $lang/phones/nonsilence.csl)
  silphonelist=$(cat $lang/phones/silence.csl)
  # The two lists are left unquoted exactly as before; each is expected to be
  # a single colon-separated word.
  utils/gen_topo.pl $num_nonsil_states $num_sil_states \
    $nonsilphonelist $silphonelist \
    >$lang/topo
  
  # Now we prepare a simple grammar G.fst that's a kind of loop of
  # digits (no silence in this, since that's handled in L.fst)
  # there are 12 options: 1-9, zero, oh, and end-of-sentence.
  # $penalty is the negated log-prob of one option, which becomes the cost
  # on the FST.
  penalty=$(perl -e '$prob = 1.0/12; print -log($prob); ')
  (
    # One arc from state 0 back to state 0 per word.
    # format is: from-state to-state input-symbol output-symbol cost
    for x in z o 1 2 3 4 5 6 7 8 9; do
      printf '0 0 %s %s %s\n' "$x" "$x" "$penalty"
    done
    # format is: state final-cost
    printf '0 %s\n' "$penalty"
  ) \
    | fstcompile \
        --isymbols="$lang/words.txt" \
        --osymbols="$lang/words.txt" \
        --keep_isymbols=false \
        --keep_osymbols=false \
    | fstarcsort --sort_type=ilabel \
    >"$lang/G.fst"


  # The lang directory is complete; stop here.
  exit 0
  
  
  
  
  
  
  
  
  # NOTE(review): nothing from here on is executed -- the script has already
  # terminated at the unconditional "exit 0" above.  The remainder looks like
  # TIDIGITS data preparation rather than lang preparation; confirm whether
  # it should be removed or moved to its own script.

  # Exactly one argument is expected: the TIDIGITS directory, read from $1
  # just below.
  if [ $# -ne 1 ]; then
     echo "Argument should be the TIDIGITS directory, see ../run.sh for example."
     exit 1;
  fi

  tidigits=$1

  # Absolute path of the working directory for the intermediate file lists.
  tmpdir=$(pwd)/data/local/data
  mkdir -p "$tmpdir"

  # Note: the .wav files are not in .wav format but "sphere" format (this was
  # produced in the days before Windows).

  # List the training files and warn (without aborting) on an unexpected count.
  find "$tidigits/tidigits/train" -name '*.wav' >"$tmpdir/train.flist"
  n=$(wc -l <"$tmpdir/train.flist")
  [ "$n" -eq 8623 ] || echo Unexpected number of training files $n versus 8623

  # Same for the test files.
  find "$tidigits/tidigits/test" -name '*.wav' >"$tmpdir/test.flist"
  n=$(wc -l <"$tmpdir/test.flist")
  [ "$n" -eq 8700 ] || echo Unexpected number of test files $n versus 8700
  
  # NOTE(review): this section sits after the unconditional "exit 0" of the
  # script and is therefore never executed.
  #
  # For each of "train" and "test", turn $tmpdir/<set>.flist into
  # data/<set>/{wav.scp,text,utt2spk,spk2utt}.

  # sph2pipe is looked up under $KALDI_ROOT, which this block does not set.
  sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  if [ ! -x "$sph2pipe" ]; then
     echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
     exit 1;
  fi

  for x in train test; do
    # get scp file that has utterance-ids and maps to the sphere file.
    # The id is "<dir>_<name>", taken from the last two path components.
    perl -ane 'm|/(..)/([1-9zo]+[ab])\.wav| || die "bad line $_"; print "$1_$2 $_"; ' \
      <"$tmpdir/$x.flist" \
      | sort >"$tmpdir/${x}_sph.scp"

    # turn it into one that has a valid .wav format in the modern sense (i.e. RIFF format, not sphere).
    # This file goes into its final location
    mkdir -p "data/$x"
    # The sph2pipe path is handed to awk as a variable rather than being
    # spliced into the awk program text.
    awk -v sph2pipe="$sph2pipe" '{ printf("%s %s -f wav %s |\n", $1, sph2pipe, $2) }' \
      <"$tmpdir/${x}_sph.scp" >"data/$x/wav.scp"

    # Now get the "text" file that says what the transcription is: the
    # characters of the utterance name, separated by spaces.
    perl -ane 'm/^(.._([1-9zo]+)[ab]) / || die; $text = join(" ", split("", $2)); print "$1 $text\n";' \
      <"data/$x/wav.scp" >"data/$x/text"

    # now get the "utt2spk" file that says, for each utterance, the speaker name
    # (the two characters before the underscore of the utterance-id).
    perl -ane 'm/^((..)_\S+) / || die; print "$1 $2\n"; ' \
      <"data/$x/wav.scp" >"data/$x/utt2spk"
    # create the file that maps from speaker to utterance-list.
    utils/utt2spk_to_spk2utt.pl <"data/$x/utt2spk" >"data/$x/spk2utt"
  done

  echo "Data preparation succeeded"