tidigits_prepare_lang.sh 5.74 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171


#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This script prepares the lang/ directory.
#

. ./path.sh 


# Decided to do this using something like a real lexicon, although we
# could also have used whole-word models.
tmpdir=data/local/dict
lang=data/lang
mkdir -p $tmpdir

cat >$tmpdir/lexicon.txt <<EOF
z z iy r ow
o ow
1 w ah n
2 t uw
3 th r iy
4 f ao r
5 f ay v
6 s ih k s
7 s eh v ah n
8 ey t
9 n ay n
EOF
# and note, we'll have a silence phone, but it won't appear
# in this form of lexicon as there's no silence word; it's an option
# in the lexicon FST that gets added by the script.

mkdir -p $lang/phones

# symbol-table for words:
cat $tmpdir/lexicon.txt | awk '{print $1}' | awk 'BEGIN {print "<eps> 0"; n=1;} { printf("%s %s\n", $1, n++); }' \
  >$lang/words.txt

# list of phones.
cat $tmpdir/lexicon.txt | awk '{for(n=2;n<=NF;n++) seen[$n]=1; } END{print "sil"; for (w in seen) { print w; }}' \
 >$tmpdir/phone.list

# symbol-table for phones:
cat $tmpdir/phone.list | awk 'BEGIN {print "<eps> 0"; n=1;} { printf("%s %s\n", $1, n++); }' \
  >$lang/phones.txt

p=$lang/phones
echo sil > $p/silence.txt
echo sil > $p/context_indep.txt
echo sil > $p/optional_silence.txt
grep -v -w sil $tmpdir/phone.list > $p/nonsilence.txt
touch $p/disambig.txt # disambiguation-symbols list, will be empty.
touch $p/extra_questions.txt # list of "extra questions"-- empty; we don't
 # have things like tone or word-positions or stress markings.
cat $tmpdir/phone.list > $p/sets.txt # list of "phone sets"-- each phone is in its
 # own set.  Normally, each line would have a bunch of word-position-dependenent or
 # stress-dependent realizations of the same phone.

for t in silence nonsilence context_indep optional_silence disambig; do
  utils/sym2int.pl $lang/phones.txt <$p/$t.txt >$p/$t.int
  cat $p/$t.int | awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $p/$t.csl 
done
for t in extra_questions sets; do
  utils/sym2int.pl $lang/phones.txt <$p/$t.txt >$p/$t.int
done

cat $tmpdir/phone.list | awk '{printf("shared split %s\n", $1);}' >$p/roots.txt
utils/sym2int.pl -f 3-  $lang/phones.txt $p/roots.txt >$p/roots.int

echo z > $lang/oov.txt # we map OOV's to this.. there are no OOVs in this setup,
   # but the scripts expect this file to exist.
utils/sym2int.pl $lang/words.txt <$lang/oov.txt >$lang/oov.int

# Note: "word_boundary.{txt,int}" will not exist in this setup.  This will mean it's
# not very easy to get word alignments, but it simplifies some things.

# Make the FST form of the lexicon (this includes optional silence).
utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \
  fstcompile --isymbols=$lang/phones.txt --osymbols=$lang/words.txt \
  --keep_isymbols=false --keep_osymbols=false | \
   fstarcsort --sort_type=olabel > $lang/L.fst 

# Note: in this setup there are no "disambiguation symbols" because the lexicon
# contains no homophones; and there is no '#0' symbol in the LM because it's
# not a backoff LM, so L_disambig.fst is the same as L.fst.

cp $lang/L.fst $lang/L_disambig.fst

num_sil_states=5
num_nonsil_states=3
silphonelist=`cat $lang/phones/silence.csl`
nonsilphonelist=`cat $lang/phones/nonsilence.csl`
utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$lang/topo

# Now we prepare a simple grammar G.fst that's a kind of loop of
# digits (no silence in this, since that's handled in L.fst)
# there are 12 options: 1-9, zero, oh, and end-of-sentence.
penalty=`perl -e '$prob = 1.0/12; print -log($prob); '` # negated log-prob,
  # which becomes the cost on the FST.
( for x in `echo z o 1 2 3 4 5 6 7 8 9`; do
   echo 0 0 $x $x $penalty   # format is: from-state to-state input-symbol output-symbol cost
 done 
 echo 0 $penalty # format is: state final-cost
) | fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
   --keep_isymbols=false --keep_osymbols=false |\
   fstarcsort --sort_type=ilabel > $lang/G.fst


exit 0;


if [ $# -ne 0 ]; then
   echo "Argument should be the TIDIGITS directory, see ../run.sh for example."
   exit 1;
fi


tidigits=$1

tmpdir=`pwd`/data/local/data
mkdir -p $tmpdir

# Note: the .wav files are not in .wav format but "sphere" format (this was 
# produced in the days before Windows).

find $tidigits/tidigits/train -name '*.wav' > $tmpdir/train.flist
n=`cat $tmpdir/train.flist | wc -l`
[ $n -eq 8623 ] || echo Unexpected number of training files $n versus 8623

find $tidigits/tidigits/test -name '*.wav' > $tmpdir/test.flist
n=`cat $tmpdir/test.flist | wc -l`
[ $n -eq 8700 ] || echo Unexpected number of test files $n versus 8700

sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
   echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
   exit 1;
fi

for x in train test; do
  # get scp file that has utterance-ids and maps to the sphere file.
  cat $tmpdir/$x.flist | perl -ane 'm|/(..)/([1-9zo]+[ab])\.wav| || die "bad line $_"; print "$1_$2 $_"; ' \
   | sort > $tmpdir/${x}_sph.scp
  # turn it into one that has a valid .wav format in the modern sense (i.e. RIFF format, not sphere).
  # This file goes into its final location
  mkdir -p data/$x
  awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < $tmpdir/${x}_sph.scp > data/$x/wav.scp

  # Now get the "text" file that says what the transcription is.
  cat data/$x/wav.scp | 
   perl -ane 'm/^(.._([1-9zo]+)[ab]) / || die; $text = join(" ", split("", $2)); print "$1 $text\n";' \
    <data/$x/wav.scp >data/$x/text

  # now get the "utt2spk" file that says, for each utterance, the speaker name.  
  perl -ane 'm/^((..)_\S+) / || die; print "$1 $2\n"; ' \
    <data/$x/wav.scp >data/$x/utt2spk
  # create the file that maps from speaker to utterance-list.
  utils/utt2spk_to_spk2utt.pl <data/$x/utt2spk >data/$x/spk2utt
done

echo "Data preparation succeeded"