#!/bin/bash 

# 2016 Modified by Tomohiro Tanaka at Tokyo Institute of Technology
# for Japanese speech recognition using CSJ.

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)  Tony Robinson
#           2015  Guoguo Chen

# This script trains an RNNLM on the CSJ LM-training data
# (the transcripts in data/local/train/text).  It requires that the
# dictionary directory data/local/dict${dict_suffix} already exists,
# since the word list is taken from its lexicon.txt.

# This script takes one command-line argument, the destination directory,
# plus options such as --cmd (see the configuration section below).
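# Example invocation (the destination directory name here is only illustrative):
#   local/csj_train_rnnlms.sh --cmd run.pl data/local/rnnlm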

# Begin configuration section.
rand_seed=0
cmd=run.pl
nwords=10000 # This is how many words we're putting in the vocab of the RNNLM. 
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Number of weights that are used for "direct" connections, in millions.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
threads=1 # for RNNLM-HS
bptt=6 # number of time steps to unfold in BPTT training of the RNNLM
bptt_block=10 # BPTT error propagation is applied in blocks of this many words
dict_suffix=
# End configuration section.
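# Note on the class setting above: with nwords=10000, sqrt(nwords) = 100, so
# class=200 satisfies the "somewhat larger than sqrt of nwords" rule of thumb.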

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# != 1 ]; then
   echo "Usage: local/wsj_train_rnnlms.sh [options] <dest-dir>"
   echo "For options, see top of script file"
   exit 1;
fi

dir=$1
text=data/local/train/text
lexicon=data/local/dict${dict_suffix}/lexicon.txt
mkdir -p $dir

rnnlm=$KALDI_ROOT/tools/$rnnlm_ver/rnnlm


export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH

( # First make sure the rnnlm toolkit is installed.
 # Note: this didn't work out of the box for me; I had to
 # change the g++ version to just "g++" (no cross-compilation
 # needed for me, as I ran on a machine that had been set up
 # as 64-bit by default).
 cd $KALDI_ROOT/tools || exit 1;
 if [ -f $rnnlm_ver/rnnlm ]; then
   echo Not installing the rnnlm toolkit since it is already there.
 else
   if [ $rnnlm_ver == "rnnlm-hs-0.1b" ]; then
       extras/install_rnnlm_hs.sh
   else
       echo Downloading and installing the rnnlm tools
       # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz
       if [ ! -f $rnnlm_ver.tgz ]; then
	   wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1;
       fi
       mkdir $rnnlm_ver
       cd $rnnlm_ver
       tar -xvzf ../$rnnlm_ver.tgz || exit 1;
       make CC=g++ || exit 1;
       echo Done making the rnnlm tools
   fi
 fi
) || exit 1;

echo "lexicon->wordlist"
cut -d' ' -f1 $lexicon  > $dir/wordlist.all

# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.                                                                                                   
echo "Getting training data with OOV words replaced with <UNK>"
cut -d' ' -f2- $text | awk -v w=$dir/wordlist.all \
  'BEGIN{while((getline<w)>0) v[$1]=1;}                                                                                                                                              
  {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
  | gzip -c > $dir/all.gz
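# For example (tokens here are purely illustrative), a line "w1 w2 w3" where only
# w1 and w3 appear in wordlist.all would be written to all.gz as "w1 <UNK> w3".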

echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$((heldout_sent+1)) | \
 perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
  > $dir/train.in # training data

# Words outside the chosen vocabulary will be represented by a single
# word-class <RNN_UNK>, which maps (with probabilities) to a whole class of words.

# Get unigram counts from our training data, and use them to select a word-list
# for RNNLM training, e.g. the 10k most frequent words.  The rest go into a
# single class; we (manually, at the shell level) assign probabilities to the
# words within that class.  Note: this word-list doesn't need to include </s>;
# that gets added automatically inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.
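# (Each entry of wordlist.all contributes one extra count below, so a vocabulary
# word that never occurs in train.in still ends up with a nonzero count.)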

echo "unigram-count"
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
  awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
    sort -nr > $dir/unigram.counts

head -n $nwords $dir/unigram.counts | awk '{print $2}' > $dir/wordlist.rnn
tail -n +$((nwords+1)) $dir/unigram.counts > $dir/unk_class.counts

tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts  >$dir/unk.probs
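# Optional sanity check: the class probabilities in unk.probs should sum to
# (approximately) 1, e.g.:
#   awk '{s+=$2} END{print s}' $dir/unk.probs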

for type in train valid; do
  cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
    'BEGIN{while((getline<w)>0) v[$1]=1;}                                                                                                                                            
    {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
    > $dir/$type
done
rm $dir/train.in # no longer needed-- and big.                                                                                                                                       

# Now randomize the order of the training data.                                                                                                                                     
echo "Randomize training data"
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
 sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train
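# (This is the usual decorate-sort-undecorate shuffle: prefix each line with a
# random key, sort on that key, then strip the key off again with cut.)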

echo "Finish data preparation!"

# OK we'll train the RNNLM on this data.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"

$cmd $dir/rnnlm.log \
   $rnnlm -threads $threads -independent -train $dir/train -valid $dir/valid \
   -rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt $bptt -bptt-block $bptt_block \
   -direct-order 4 -direct $direct -binary || exit 1;
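# Training progress can be monitored in $dir/rnnlm.log; the rnnlm tool
# typically reports training and validation entropy after each iteration.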


# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids

utils/rnnlm_compute_scores.sh --rnnlm_ver $rnnlm_ver $dir $dir/tmp.valid $dir/valid.with_ids \
  $dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes the utterance-ids,
  # which add one extra token per sentence; this accounts for the </s> at the end of
  # each sentence, so it is the correct number to normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores` 
echo Perplexity is $p | tee $dir/perplexity.log
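# In other words, perplexity = exp( sum(scores) / nw ), where the per-utterance
# scores in valid.scores are negated log-probabilities (as produced by
# rnnlm_compute_scores.sh) and nw includes one </s> token per sentence.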

rm $dir/train $dir/all.gz