  #!/usr/bin/env bash
  
  # This script prepares the things needed by rnnlm/train_rnnlm.sh: it
  # initializes the model and ensures the split-up, integerized training data
  # is present on disk.
  
  cmd=run.pl
  stage=0
  sampling=true            # add the option --sampling false to disable creation
                           # of sampling.lm
  words_per_split=5000000  # aim to train on about 5 million words per job, so
                           # that each job lasts at least a couple of minutes.
                           # If this would give us just one job, we repeat the
                           # archive as many times as needed to reach the
                           # target length.
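                           # (Illustrative arithmetic, with made-up totals:
                           # ~15M weighted training words gives 3 splits; ~2M
                           # words gives 1 split repeated about 3 times.)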
  unigram_factor=100.0     # Option used when pruning the LM used for sampling.
                           # You can increase this, e.g. to 200 or 400, if the LM used
                           # for sampling is too big, causing rnnlm-get-egs to
                           # take up too many CPUs worth of compute.
  
  . utils/parse_options.sh
  
  if [ $# != 3 ]; then
    echo "Usage: $0 [options] <text-dir> <rnnlm-config-dir> <rnnlm-dir>"
    echo "Sets up the directory <rnnlm-dir> for RNNLM training as done by"
    echo "rnnlm/train_rnnlm.sh, and initializes the model."
    echo " <text-dir> is as validated by rnnlm/validate_text_dir.py"
    echo " <rnnlm-config-dir> is as validated by rnnlm/validate_config_dir.sh."
    exit 1
  fi
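
  # Example invocation (these paths are hypothetical):
  #   rnnlm/prepare_rnnlm_dir.sh --sampling false --words-per-split 2000000 \
  #     data/rnnlm/text exp/rnnlm/config exp/rnnlm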
  
  
  text_dir=$1
  config_dir=$2
  dir=$3
  
  set -e
  . ./path.sh
  
  if [ $stage -le 0 ]; then
    echo "$0: validating input"
  
    rnnlm/validate_text_dir.py --spot-check=true $text_dir
  
    rnnlm/validate_config_dir.sh $text_dir $config_dir
  
    if ! mkdir -p $dir; then
      echo "$0: could not create RNNLM dir $dir"
      exit 1
    fi
  fi
  
  if [ $stage -le 1 ]; then
    if [ $config_dir != $dir/config ]; then
      echo "$0: copying config directory"
      mkdir -p $dir/config
      # copy expected things from $config_dir to $dir/config.
      for f in words.txt data_weights.txt oov.txt xconfig; do
        cp $config_dir/$f $dir/config
      done
      # features.txt is optional, check separately
      if [ -f $config_dir/features.txt ]; then
        cp $config_dir/features.txt $dir/config
      fi
    fi
  
    rnnlm/get_special_symbol_opts.py < $dir/config/words.txt > $dir/special_symbol_opts.txt
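    # special_symbol_opts.txt will contain something like
    # "--bos-symbol=1 --eos-symbol=2 --brk-symbol=3"; the exact integers depend
    # on where <s>, </s> and <brk> appear in words.txt.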
  fi
  
  if [ $stage -le 2 ]; then
    if [ ! -f $text_dir/dev.counts ] || [ $text_dir/dev.counts -ot $text_dir/dev.txt ]; then
      echo "$0: preparing unigram counts in $text_dir"
      rnnlm/get_unigram_counts.sh $text_dir
    fi
  fi
  
  
  if [ $stage -le 3 ]; then
    if [ -f $dir/config/features.txt ]; then
      # prepare word-level features in $dir/word_feats.txt
      # first we need the appropriately weighted unigram counts.
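      # (For orientation: each line of features.txt is "<feat-id> <feat-type> [args]"
      # with tab-separated fields; hypothetical example lines:
      #   0  constant  1.0
      #   1  special   <s>
      #   2  unigram
      # The awk check below just asks whether any feature has type 'unigram'.)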
  
      if awk '{if($2 == "unigram"){saw_unigram=1;}} END{exit(saw_unigram ? 0 : 1)}' $dir/config/features.txt; then
        # we need the unigram probabilities
        rnnlm/ensure_counts_present.sh $text_dir
  
        rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
          --unk-word=$(cat $dir/config/oov.txt) \
          --data-weights-file=$dir/config/data_weights.txt $text_dir \
          >$dir/unigram_probs.txt
        unigram_opt="--unigram-probs=$dir/unigram_probs.txt"
      else
        unigram_opt=
      fi
      rnnlm/get_word_features.py $unigram_opt --treat-as-bos='#0' \
        $dir/config/words.txt $dir/config/features.txt >$dir/word_feats.txt
    else
      [ -f $dir/word_feats.txt ] && rm $dir/word_feats.txt
    fi
  fi
  
  if [ $stage -le 4 ]; then
    echo "$0: preparing split-up integerized text data with weights"
    num_repeats=1
    num_splits=$(rnnlm/get_num_splits.sh $words_per_split $text_dir $dir/config/data_weights.txt)
    if [ $num_splits -lt 1 ]; then
      # the script outputs the negative of the num-repeats if the num-splits is 1
      # and the num-repeats is >1.
      num_repeats=$((-num_splits))
      num_splits=1
    fi
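    # Worked example (illustrative numbers): if get_num_splits.sh prints 8, we
    # train with 8 parallel splits; if it prints -3 (meaning 1 split, repeated
    # 3 times), we end up with num_splits=1 and num_repeats=3.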
  
    # note: the python script treats the empty unknown word as a special case,
    # so if oov.txt is empty we don't have to take any special action.
    rnnlm/prepare_split_data.py --unk-word="$(cat $dir/config/oov.txt)" \
       --vocab-file=$dir/config/words.txt --data-weights-file=$dir/config/data_weights.txt \
       --num-splits=$num_splits $text_dir $dir/text
    echo $num_repeats >$dir/text/info/num_repeats
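    # At this point $dir/text should contain 1.txt ... $num_splits.txt plus an
    # info/ subdirectory holding num_splits and num_repeats (stage 7 below
    # reads info/num_splits).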
  fi
  
  if [ $stage -le 5 ]; then
    echo "$0: initializing neural net"
    mkdir -p $dir/config/nnet
  
    steps/nnet3/xconfig_to_configs.py --xconfig-file=$dir/config/xconfig \
      --config-dir=$dir/config/nnet
  
    # initialize the neural net.
    nnet3-init $dir/config/nnet/ref.config $dir/0.raw
  fi
  
  embedding_dim=$(rnnlm/get_embedding_dim.py $dir/0.raw)
  
  if [ $stage -le 6 ]; then
    echo "$0: initializing embedding matrix"
    if [ -f $dir/config/features.txt ]; then
      # We are using sparse features.
      feat_dim=$(tail -n 1 $dir/config/features.txt | awk '{print $1 + 1;}')
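      # (this assumes features.txt is sorted by feature id, so the first field
      # of its last line is the highest id; adding 1 converts it to a dimension.)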
  
      first_element_opt=
      # use bash $'...' quoting so the tab is matched literally (grep does not
      # expand '\t'), and anchor with '^' so only feature id 0 matches.
      if grep -q $'^0\tconstant' $dir/config/features.txt; then
        first_element_opt="--first-element 1.0"
      fi
      # We may later make the stddev configurable, or remove the option; it was
      # added at a time when training stability was an issue.
      rnnlm/initialize_matrix.pl $first_element_opt --stddev 0.001 \
        $feat_dim $embedding_dim > $dir/feat_embedding.0.mat
  
      [ -f $dir/word_embedding.0.mat ] && rm $dir/word_embedding.0.mat
    else
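      # (as with feat_dim above: words.txt lines are "<word> <id>" in increasing
      # id order, so the last field of the last line is the highest word id.)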
      vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
      rnnlm/initialize_matrix.pl --first-column 1.0 $vocab_size $embedding_dim > $dir/word_embedding.0.mat
  
      [ -f $dir/feat_embedding.0.mat ] && rm $dir/feat_embedding.0.mat
    fi
  fi
  
  mkdir -p $dir/log
  
  if [ $stage -le 7 ]; then
    if $sampling; then
      echo "$0: preparing language model for sampling"
      num_splits=$(cat $dir/text/info/num_splits)
      text_files=$(for n in $(seq $num_splits); do echo -n $dir/text/$n.txt ''; done)
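      # text_files becomes a space-separated list like
      # "$dir/text/1.txt $dir/text/2.txt ..." covering all the splits.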
      vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
  
      special_symbol_opts=$(cat $dir/special_symbol_opts.txt)
  
      # this prints some nontrivial log information, so run using '$cmd' to ensure
      # the output gets saved.
      # ***NOTE*** we will likely later have to pass in options to this program to control
      # the size of the sampling LM.
      $cmd $dir/log/prepare_sampling_lm.log \
           rnnlm-get-sampling-lm --unigram-factor=$unigram_factor $special_symbol_opts \
                --vocab-size=$vocab_size  "cat $text_files|" $dir/sampling.lm
      echo "$0: done estimating LM for sampling."
    else
      [ -f $dir/sampling.lm ] && rm $dir/sampling.lm
    fi
  fi
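
  # For reference, the outputs this script leaves in $dir:
  #   config/                  copied configs, plus config/nnet/ from xconfig_to_configs.py
  #   special_symbol_opts.txt  options describing the special symbols in words.txt
  #   text/                    split-up integerized data, plus text/info/
  #   0.raw                    the initialized neural net
  #   feat_embedding.0.mat + word_feats.txt  (if config/features.txt exists;
  #                            unigram_probs.txt too, if a 'unigram' feature is used)
  #   word_embedding.0.mat     (otherwise)
  #   sampling.lm              pruned LM used by rnnlm-get-egs (if --sampling true)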