Blame view

egs/wsj/s5/steps/nnet2/train_discriminative.sh 17.5 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
  #!/bin/bash
  
  # Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
  
  # This script does MPFE, MMI or state-level minimum Bayes risk (sMBR)
  # discriminative training of neural nets (see the --criterion option).
  
  # Begin configuration section.
  # Every assignment below is a default that is in place before
  # parse_options.sh is sourced further down in this script.
  cmd=run.pl
  num_epochs=4       # Number of epochs of training
  learning_rate=0.00002
  effective_lrate=    # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet.
  acoustic_scale=0.1  # acoustic scale for MMI/MPFE/SMBR training.
  criterion=smbr
  boost=0.0       # option relevant for MMI
  drop_frames=false #  option relevant for MMI
  one_silence_class=true # Option relevant for MPE/SMBR
  num_jobs_nnet=4    # Number of neural net jobs to run in parallel.  Note: this
                     # will interact with the learning rates (if you decrease
                     # this, you'll have to decrease the learning rate, and vice
                     # versa).
  samples_per_iter=400000 # measured in frames, not in "examples"

  modify_learning_rates=true
  last_layer_factor=1.0  # relates to modify-learning-rates
  first_layer_factor=1.0 # relates to modify-learning-rates
  shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
                  # on each iter.  You could set it to 0 or to a large value for complete
                  # randomization, but this would both consume memory and cause spikes in
                  # disk I/O.  Smaller is easier on disk and memory but less random.  It's
                  # not a huge deal though, as samples are anyway randomized right at the start.
                  # NOTE(review): this variable is not referenced anywhere else in this script.


  stage=-8

  io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time.

  num_threads=16  # this is the default but you may want to change it, e.g. to 1 if
                  # using GPUs.
  parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know.
    # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
  transform_dir= # If this is a SAT system, directory for transforms
  cleanup=true   # If true, the training examples and all but the epoch-final models are deleted at the end.
  degs_dir=      # If set, discriminative examples are read from this directory and the
                 # stages that dump and shuffle examples are skipped.
  retroactive=false   # Passed as --retroactive to nnet-modify-learning-rates.
  online_ivector_dir= # If set, must contain ivector_period and ivector_online.scp;
                      # the iVectors are then appended to the input features.
  # End configuration section.
  
  
  # Record the exact invocation in the output, for logging.
  echo "$0 $@"

  # path.sh is optional: source it only when it exists in the current directory.
  [ ! -f path.sh ] || . ./path.sh
  # NOTE(review): parse_options.sh is not part of this file; presumably it turns
  # --option-name command-line flags into the configuration variables above.
  . parse_options.sh || exit 1
  
  
  # Exactly six positional arguments are required; anything else prints the
  # usage message and exits with status 1.  The defaults quoted in the message
  # are kept in step with the configuration section at the top of this script.
  if [ $# != 6 ]; then
    echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-model-file> <exp-dir>"
    echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe"
    echo ""
    echo "Main options (for others, see top of script file)"
    echo "  --config <config-file>                           # config file containing options"
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
    echo "  --num-epochs <#epochs|4>                         # Number of epochs of training"
    echo "  --learning-rate <learning-rate|0.00002>          # Learning rate to use"
    echo "  --effective-lrate <effective-learning-rate>      # If supplied, learning rate will be set to"
    echo "                                                   # this value times num-jobs-nnet."
    echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
    echo "                                                   # training (will affect results as well as speed; try 8, 16)"
    echo "                                                   # Note: if you increase this, you may want to also increase"
    echo "                                                   # the learning rate."
    echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
    echo "                                                   # as well as speed; may interact with batch size; if you increase"
    echo "                                                   # this, you may want to decrease the batch size.)"
    echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
    echo "                                                   # use multiple threads... "
    echo "  --io-opts <opts|\"--max-jobs-run 5\">                       # Options given to e.g. queue.pl for jobs that do a lot of I/O."
    echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
    echo "                                                   # process."
    echo "  --stage <stage|-8>                               # Used to run a partially-completed training process from somewhere in"
    echo "                                                   # the middle."
    echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
    echo "  --boost <boost|0.0>                              # Boosting factor for MMI (e.g., 0.1)"
    echo "  --modify-learning-rates <true,false|true>        # If true, modify learning rates to try to equalize relative"
    echo "                                                   # changes across layers."
    echo "  --degs-dir <dir|\"\">                              # Directory for discriminative examples, e.g. exp/foo/degs"
    echo "  --drop-frames <true,false|false>                 # Option that affects MMI training: if true, we exclude gradients from frames"
    echo "                                                   # where the numerator transition-id is not in the denominator lattice."
    echo "  --one-silence-class <true,false|true>            # Option that affects MPE/SMBR training (will tend to reduce insertions)"
    echo "  --online-ivector-dir <dir|\"\">                    # Directory for online-estimated iVectors, used in the"
    echo "                                                   # online-neural-net setup."
    exit 1;
  fi
  
  # Positional arguments, in the order shown in the usage message:
  #   <data> <lang> <ali-dir> <denlat-dir> <src-model-file> <exp-dir>
  data="$1"
  lang="$2"
  alidir="$3"
  denlatdir="$4"
  src_model="$5"
  dir="$6"
  
  
  # Files that are only required when online iVectors were requested.  The
  # variable is quoted in the test: unquoted, an empty value only gave the right
  # answer because "[ ! -z ]" degenerates to a one-argument test, and a value
  # containing whitespace made the test itself fail.
  extra_files=
  if [ -n "$online_ivector_dir" ]; then
    extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp"
  fi

  # Check some files.  The first missing one is reported and the script exits
  # with status 1.
  for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
           $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do
    [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  done
  
  # caution: $nj is the number of splits of the denlats and alignments, but
  # num_jobs_nnet is the number of nnet training jobs we run in parallel.
  nj=$(cat $alidir/num_jobs) || exit 1;

  # The alignments and the denominator lattices must have been produced with
  # the same number of jobs, because both are read per JOB index later on.
  if [ $nj == $(cat $denlatdir/num_jobs) ]; then
    :
  else
    echo "Number of jobs mismatch: $nj versus $(cat $denlatdir/num_jobs)"
    exit 1;
  fi
  
  # Create the output directories.  $dir/degs is only needed when the examples
  # are dumped by this script, i.e. when no --degs-dir was supplied.
  mkdir -p $dir/log || exit 1;
  if [ -z "$degs_dir" ]; then
    mkdir -p $dir/degs || exit 1;
  fi

  utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
  cp $lang/phones.txt $dir || exit 1;

  # The feature pipelines built below read $sdata/JOB/{feats.scp,cmvn.scp,utt2spk},
  # so stop here if the data directory cannot be split instead of failing later
  # inside the example-dumping jobs.
  sdata=$data/split$nj
  utils/split_data.sh $data $nj || exit 1;
  
  # function to remove egs that might be soft links.
  # Arguments: any number of paths.  For each one: if it is a symbolic link, the
  # file named by the output of utils/make_absolute.sh for that path is deleted
  # first; then the path itself is deleted.
  # Returns: the exit status of the last "rm" (0 when called without arguments).
  # The loop variable is local so that a call does not overwrite the script-level
  # variable "x", which this script also uses as its training-iteration counter;
  # arguments are taken from "$@" so that they are not split or globbed again.
  remove () {
    local path
    for path in "$@"; do
      [ -L "$path" ] && rm "$(utils/make_absolute.sh "$path")"
      rm "$path"
    done
  }
  
  # Options recorded in the alignment directory.  splice_opts and cmvn_opts are
  # optional (a missing file just yields an empty value); the silence phone
  # list is mandatory.
  splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
  cmvn_opts=$(cat $alidir/cmvn_opts 2>/dev/null)

  # Keep copies next to the trained model; the two option files may be absent,
  # so errors from copying them are discarded.
  cp $alidir/splice_opts $dir 2>/dev/null
  cp $alidir/cmvn_opts $dir 2>/dev/null
  cp $alidir/tree $dir
  
  # Only relevant when --online-ivector-dir was given.
  if [ -n "$online_ivector_dir" ]; then
    ivector_period=$(cat $online_ivector_dir/ivector_period)
    ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
    # const_dim_opt is handed to nnet-copy-egs-discriminative when the examples
    # are dumped; per the original author's note it allows only one iVector to
    # be written per example, rather than one per time-index.
    const_dim_opt="--const-feat-dim=$ivector_dim"
  fi
  
  ## Set up features.
  ## Don't support deltas, only LDA or raw (mainly because deltas are less frequently used).
  ## Unless feat_type is already set, pick "lda" when the alignment directory has
  ## a final.mat and the transform directory has no raw_trans.1; otherwise "raw".
  if [ -z "$feat_type" ]; then
    feat_type=raw
    if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then
      feat_type=lda
    fi
  fi
  echo "$0: feature type is $feat_type"

  # "feats" is a pipeline specifier; the literal word JOB is left in place for
  # the commands that are later run through $cmd.
  case "$feat_type" in
    lda)
      splice_opts=$(cat $alidir/splice_opts 2>/dev/null)
      cp $alidir/final.mat $dir
      feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
      ;;
    raw)
      feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
      ;;
    *)
      echo "$0: invalid feature type $feat_type" && exit 1;
      ;;
  esac
  
  # If no --transform-dir was given, fall back to the alignment directory when
  # it contains transforms (trans.1 or raw_trans.1).  The probe has to look in
  # $alidir: inside this branch $transform_dir is empty by definition, so
  # testing "$transform_dir/trans.1" could only ever look at "/trans.1".
  if [ -z "$transform_dir" ]; then
    if [ -f $alidir/trans.1 ] || [ -f $alidir/raw_trans.1 ]; then
      transform_dir=$alidir
    fi
  fi
  
  # When a transform directory is in effect, validate it and extend the feature
  # pipeline with a transform-feats stage.
  if [ -n "$transform_dir" ]; then
    echo "$0: using transforms from $transform_dir"
    if [ ! -s $transform_dir/num_jobs ]; then
      echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
    fi
    nj_orig=$(cat $transform_dir/num_jobs)

    # Raw features use raw_trans.*, any other feature type uses trans.*.
    trans=trans
    if [ $feat_type == "raw" ]; then
      trans=raw_trans
    fi

    # For LDA features the transform directory must carry the same final.mat
    # as the alignment directory.
    if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then
      echo "$0: LDA transforms differ between $alidir and $transform_dir"
      exit 1;
    fi

    if [ ! -f $transform_dir/$trans.1 ]; then
      echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
      exit 1;
    fi

    if [ $nj -ne $nj_orig ]; then
      # The job counts differ, so the per-job files cannot be addressed by JOB:
      # copy the transforms into an archive with an index.
      for n in $(seq $nj_orig); do
        cat $transform_dir/$trans.$n
      done | copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
      feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
    else
      # number of jobs matches with alignment dir.
      feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
    fi
  fi
  # With --online-ivector-dir, paste the (subsampled) online iVectors onto the
  # features.  The variable is quoted in the test: unquoted, an empty value only
  # gave the right answer because "[ ! -z ]" degenerates to a one-argument test,
  # and a value containing whitespace made the test fail, silently dropping the
  # iVectors from the pipeline.
  if [ -n "$online_ivector_dir" ]; then
    # add iVectors to the features.
    feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |"
  fi
  
  
  # Work out how many iterations make up one epoch.
  if [ -n "$degs_dir" ]; then
    # Ready-made examples: the value was stored alongside them.
    iters_per_epoch=$(cat $degs_dir/iters_per_epoch) || exit 1;
    [ -z "$iters_per_epoch" ] && exit 1;
    echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations"
  else
    if [ $stage -le -8 ]; then
      echo "$0: working out number of frames of training data"
      num_frames=$(steps/nnet2/get_num_frames.sh $data)
      echo $num_frames > $dir/num_frames
      # Working out number of iterations per epoch: frames divided by the
      # frames consumed per iteration across all jobs, rounded to the nearest
      # integer and never less than 1.
      iters_per_epoch=$(perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);") || exit 1;
      if [ $iters_per_epoch -eq 0 ]; then
        iters_per_epoch=1
      fi
      echo $iters_per_epoch > $dir/degs/iters_per_epoch  || exit 1;
    else
      # Resuming from a later stage: reuse what the earlier run wrote.
      num_frames=$(cat $dir/num_frames) || exit 1;
      iters_per_epoch=$(cat $dir/degs/iters_per_epoch) || exit 1;
    fi

    samples_per_iter_real=$(($num_frames/($num_jobs_nnet*$iters_per_epoch)))
    echo "$0: Every epoch, splitting the data up into $iters_per_epoch iterations,"
    echo "$0: giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
  fi
  
  
  # we create these data links regardless of the stage, as there are situations where we
  # would want to recreate a data link that had previously been deleted.
  # Links are only made when this script dumps the examples itself (no
  # --degs-dir) and a $dir/degs/storage directory exists.
  if [ -z "$degs_dir" ] && [ -d $dir/degs/storage ]; then
    echo "$0: creating data links for distributed storage of degs"
    # See utils/create_split_dir.pl for how this 'storage' directory
    # is created.
    for job in $(seq $num_jobs_nnet); do
      # one original archive per (training job, data split) pair
      for split in $(seq $nj); do
        utils/create_data_link.pl $dir/degs/degs_orig.$job.$split.ark
      done
      # one temporary and one final archive per (training job, iteration) pair
      for iter in $(seq 0 $(($iters_per_epoch-1))); do
        utils/create_data_link.pl $dir/degs/degs_tmp.$job.$iter.ark
        utils/create_data_link.pl $dir/degs/degs.$job.$iter.ark
      done
    done
  fi
  
  
  
  # Stage -7: derive the starting model $dir/0.mdl from the source model.
  if [ $stage -le -7 ]; then
    echo "$0: Copying initial model and modifying preconditioning setup"

    # Note, the baseline model probably had preconditioning, and we'll keep it;
    # but we want online preconditioning with a larger number of samples of
    # history, since in this setup the frames are only randomized at the segment
    # level so they are highly correlated.  It might make sense to tune this a
    # little, later on, although I doubt it matters once the --num-samples-history
    # is large enough.

    # --effective-lrate, when given, replaces the learning rate by
    # num_jobs_nnet * effective_lrate.
    if [ -n "$effective_lrate" ]; then
      learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);")
      echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate."
    fi

    # The escaped "|" is handed to $cmd as a literal argument, so the two
    # programs form one pipeline inside the job that $cmd runs.
    $cmd $dir/log/convert.log \
      nnet-am-copy --learning-rate=$learning_rate "$src_model" - \| \
      nnet-am-switch-preconditioning --num-samples-history=50000 - $dir/0.mdl \
      || exit 1;
  fi
  
  
  
  
  # Stage -6 (only when this script dumps the examples itself): create the
  # discriminative examples from features, alignments and denominator lattices.
  if [ $stage -le -6 ] && [ -z "$degs_dir" ]; then
    echo "$0: getting initial training examples by splitting lattices"

    # One output archive per training job.  The word JOB stays literal here;
    # presumably $cmd substitutes it for each job of JOB=1:$nj.
    egs_list=
    for n in $(seq 1 $num_jobs_nnet); do
      egs_list="$egs_list ark:$dir/degs/degs_orig.$n.JOB.ark"
    done

    $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \
      nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \
        $dir/0.mdl "$feats" \
        "ark,s,cs:gunzip -c $alidir/ali.JOB.gz |" \
        "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \
      nnet-copy-egs-discriminative $const_dim_opt ark:- $egs_list \
      || exit 1;
  fi
  
  # Stage -5 (only when this script dumps the examples itself): regroup the
  # examples by training job and iteration.
  if [ $stage -le -5 ] && [ -z "$degs_dir" ]; then
    echo "$0: rearranging examples into parts for different parallel jobs"

    # combine all the degs_orig.<job>.*.ark archives (over the $nj splits of the
    # data) and then split them into degs_tmp.<job>.<iter>.ark for the different
    # parts of the data, iter = 0 .. $iters_per_epoch-1.

    if [ $iters_per_epoch -eq 1 ]; then
      echo "Since iters-per-epoch == 1, just concatenating the data."
      for n in $(seq 1 $num_jobs_nnet); do
        cat $dir/degs/degs_orig.$n.*.ark > $dir/degs/degs_tmp.$n.0.ark || exit 1;
        remove $dir/degs/degs_orig.$n.*.ark  # don't "|| exit 1", due to NFS bugs...
      done
    else
      # We'll have to split it up using nnet-copy-egs-discriminative, with one
      # output archive per iteration.
      egs_list=
      for n in $(seq 0 $(($iters_per_epoch-1))); do
        egs_list="$egs_list ark:$dir/degs/degs_tmp.JOB.$n.ark"
      done
      $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/split_egs.JOB.log \
        nnet-copy-egs-discriminative --srand=JOB \
          "ark:cat $dir/degs/degs_orig.JOB.*.ark|" $egs_list \
        || exit 1;
      remove $dir/degs/degs_orig.*.*.ark
    fi
  fi
  
  
  # Stage -4 (only when this script dumps the examples itself): shuffle.
  if [ $stage -le -4 ] && [ -z "$degs_dir" ]; then
    # Next, shuffle the order of the examples in each of those files.
    # Each one should not be too large, so we can do this in memory.
    # Then combine the examples together to form suitable-size minibatches
    # (for discriminative examples, it's one example per minibatch, so we
    # have to combine the lattices).
    echo "Shuffling the order of training examples"
    echo "(in order to avoid stressing the disk, these won't all run at once)."

    # Note: the result of "remove" below is deliberately not checked; this is
    # a workaround for NFS bugs we encountered running this script with
    # Debian-7, NFS-v4.
    # Also, we should note that we used to do nnet-combine-egs-discriminative
    # at this stage, but if iVectors are used this would expand the size of
    # the examples on disk (because they could no longer be stored in the spk_info
    # variable of the discriminative example, no longer being constant), so
    # now we do the nnet-combine-egs-discriminative operation on the fly during
    # training.
    for n in $(seq 0 $(($iters_per_epoch-1))); do
      # The --srand expression keeps an escaped "$[...]" with a literal JOB, so
      # it reaches $cmd unevaluated.
      $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$n.JOB.log \
        nnet-shuffle-egs-discriminative "--srand=\$[JOB+($num_jobs_nnet*$n)]" \
        ark:$dir/degs/degs_tmp.JOB.$n.ark ark:$dir/degs/degs.JOB.$n.ark \
        || exit 1;
      remove $dir/degs/degs_tmp.*.$n.ark
    done
  fi
  
  # From here on the examples are read from $degs_dir; without --degs-dir that
  # is the directory this script filled, $dir/degs.
  [ -n "$degs_dir" ] || degs_dir=$dir/degs

  num_iters=$(($num_epochs * $iters_per_epoch))

  echo "$0: Will train for $num_epochs epochs = $num_iters iterations"

  # Choose the training binary by appending a suffix to
  # "nnet-train-discriminative".
  if [ $num_threads -eq 1 ]; then
    # this enables us to use GPU code if we have just one thread.
    train_suffix="-simple"
  else
    train_suffix="-parallel --num-threads=$num_threads"
  fi
  
  
  # Main training loop.  Iteration x starts from $dir/x.mdl and produces
  # $dir/(x+1).mdl; iterations below --stage are skipped, which allows a
  # partially completed run to be resumed.
  x=0
  while [ $x -lt $num_iters ]; do
    if [ $stage -le $x ]; then

      echo "Training neural net (pass $x)"

      # One training job per value of JOB.  Each job reads the example archive
      # for this position within the epoch (x modulo iters_per_epoch), combining
      # examples on the fly, and writes its own model $dir/(x+1).JOB.mdl.
      $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.JOB.log \
        nnet-train-discriminative$train_suffix --silence-phones=$silphonelist \
          --criterion=$criterion --drop-frames=$drop_frames \
          --one-silence-class=$one_silence_class --boost=$boost \
          --acoustic-scale=$acoustic_scale $dir/$x.mdl \
          "ark,bg:nnet-combine-egs-discriminative ark:$degs_dir/degs.JOB.$(($x % $iters_per_epoch)).ark ark:- |" \
          $dir/$(($x+1)).JOB.mdl \
        || exit 1;

      # The per-job models of this iteration, in job order.
      nnets_list=
      for n in $(seq $num_jobs_nnet); do
        nnets_list="$nnets_list $dir/$(($x+1)).$n.mdl"
      done

      # Merge the per-job models into the single model of the next iteration.
      $cmd $dir/log/average.$x.log \
        nnet-am-average $nnets_list $dir/$(($x+1)).mdl || exit 1;

      # Optionally rewrite the new model in place with adjusted learning rates,
      # based on the previous and the new model.
      if $modify_learning_rates; then
        $cmd $dir/log/modify_learning_rates.$x.log \
          nnet-modify-learning-rates --retroactive=$retroactive \
            --last-layer-factor=$last_layer_factor \
            --first-layer-factor=$first_layer_factor \
            $dir/$x.mdl $dir/$(($x+1)).mdl $dir/$(($x+1)).mdl || exit 1;
      fi

      # The per-job models are no longer needed once merged.
      rm $nnets_list
    fi

    x=$(($x+1))
  done

  # x now equals the number of iterations: point final.mdl at the last model,
  # replacing any link left over from an earlier run.
  rm $dir/final.mdl 2>/dev/null
  ln -s $x.mdl $dir/final.mdl


  echo Done
  
  # Optional cleanup (--cleanup true): drop the training examples and every
  # model that is not the last one of an epoch.
  if $cleanup; then
    echo Cleaning up data

    echo Removing training examples
    # only remove if directory is not a soft link.
    if [ -d $dir/degs ] && [ ! -L $dir/degs ]; then
      remove $dir/degs/degs.*
    fi

    echo Removing most of the models
    for iter in $(seq 0 $num_iters); do
      # delete all but the epoch-final models, i.e. keep only the iterations
      # that are a multiple of iters_per_epoch.
      if [ $(($iter % $iters_per_epoch)) -ne 0 ]; then
        rm $dir/$iter.mdl 2>/dev/null
      fi
    done
  fi
  
  # Give every epoch boundary a stable name: epochN.mdl links to the model of
  # iteration N * iters_per_epoch (epoch0.mdl is the starting model, 0.mdl).
  # Any link left over from an earlier run is replaced.
  for n in $(seq 0 $num_epochs); do
    x=$(($n * $iters_per_epoch))
    rm $dir/epoch$n.mdl 2>/dev/null
    ln -s $x.mdl $dir/epoch$n.mdl
  done