Scripts/steps/train_nnet_cpu_mmi.sh
  #!/bin/bash
  
  # Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
  
  # MMI (or boosted MMI) training (a.k.a. sequence training) of a neural-net-based
  # system as trained by train_nnet_cpu.sh.
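
  # Example invocation (a sketch only: the directory names and the $train_cmd
  # variable below are placeholders from a typical Kaldi setup, not anything this
  # script requires):
  #   steps/train_nnet_cpu_mmi.sh --boost 0.1 --cmd "$train_cmd" \
  #     data/train data/lang exp/tri4_nnet exp/tri4_nnet_ali \
  #     exp/tri4_nnet_denlats exp/tri4_nnet_mmi_b0.1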
  
  
  # Begin configuration section.
  cmd=run.pl
  epochs_per_ebw_iter=1 # Number of times we iterate over the whole
                         # data each time we do an "EBW" iteration.
  num_ebw_iters=4 # Number of "EBW" iterations.
  initial_learning_rate=0.001 # learning rate we start with.
  learning_rate_factor=1.0 # factor by which we change the learning
                           # rate each iteration (should be <= 1.0)
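  # (Illustrative schedule only, not a default: with initial_learning_rate=0.001 and
  #  learning_rate_factor=0.5, the rate is multiplied by 0.5 after each epoch, so with
  #  epochs_per_ebw_iter=1 successive EBW iterations would use 0.001, 0.0005, 0.00025, ...)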
  E=2.0  # this is slightly analogous to the constant E used in
         # Extended Baum-Welch updates of GMMs.  It slows down (and
         # somewhat regularizes) the update.
  
  minibatch_size=256 # since the learning rate is always quite low compared with
                     # what we have at the start of ML training, we can probably
                     # afford a somewhat higher minibatch size than there, as
                     # there is less risk of instability.
  
  samples_per_iter=400000 # In each phase of training, each job sees this many
                          # samples.  Note: this is only a suggestion; we will
                          # actually pick a nearby number that makes the number
                          # of iters per epoch a whole number.
  num_jobs_nnet=8 # Number of neural net training jobs to run in parallel.  This is
                  # not the same as the num-jobs (nj), which will be the same as in
                  # the alignment and denlat directories.
  stage=0
  sub_stage=-3 # this can be used to start from a particular sub-iteration of an
               # iteration
  acwt=0.1
  boost=0.0  # boosting factor for BMMI (you can try 0.1); this is applied per frame.
  transform_dir=  # Note: by default any transforms in $alidir will be used.
  
  parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
  io_opts="-tc 10" # max 10 jobs running at one time (these jobs do a lot of I/O).
  num_threads=16 # number of threads for the neural net trainer.
  mkl_num_threads=1
  random_copy=false
  # End configuration section.
  
  echo "$0 $@"  # Print the command line for logging
  
  if [ -f path.sh ]; then . ./path.sh; fi
  . parse_options.sh || exit 1;
  
  
  if [ $# != 6 ]; then
    echo "Usage: steps/train_nnet_cpu_mmi.sh [opts] <data> <lang> <src-dir> <ali-dir> <denlat-dir> <exp-dir>"
    echo ""
    echo "Main options (for others, see top of script file)"
    echo "Note, the terminology is: each iteration of EBW we do multiple epochs; each epoch"
    echo " we have multiple iterations of training (note the same as the EBW iters)."
    echo "  --config <config-file>                           # config file containing options"
    echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
    echo "  --num-ebw-iters <#iters|4>                       # number of pseudo-Extended-Baum-Welch iterations (default: 4)"
    echo "  --epochs-per-ebw-iter <#epochs|1>                # number of times to see all the data per EBW iter."
    echo "  --initial-learning-rate <initial-lrate|0.005>    # learning rate to use on the first iteration"
    echo "  --learning-rate-factor <lrate-factor|1.0>        # Factor by which to change the learning rate on each"
    echo "                                                   # EBW iteration (should be <= 1.0)"
    echo "  --num-jobs-nnet <num-jobs|8>                     # Number of parallel jobs to use for main neural net"
    echo "                                                   # training (will affect results as well as speed; try 8, 16)."
    echo "                                                   # Note: if you increase this, you may want to also increase"
    echo "                                                   # the learning rate."
    echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
    echo "                                                   # as well as speed; may interact with batch size; if you increase"
    echo "                                                   # this, you may want to decrease the batch size."
    echo "  --parallel-opts <opts|\"-pe smp 16\">            # extra options to pass to e.g. queue.pl for processes that"
    echo "                                                   # use multiple threads."
    echo "  --io-opts <opts|\"-tc 10\">                      # Options given to e.g. queue.pl for any especially I/O intensive jobs"
    echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
    echo "                                                   # should not get too large, e.g. >2k)."
    echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, for each"
    echo "                                                   # process.  Note: this will get modified to a number that will"
    echo "                                                   # divide the data into a whole number of pieces."
    echo "  --transform-dir <dir>                            # Directory to find fMLLR transforms; if not specified, "
    echo "                                                   # $alidir will be used if it has transforms"
    echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
    echo "                                                   # the middle."
    echo "  --sub-stage <sub-stage|0>                        # In conjunction with --stage, can be used to start a partially-completed"
    echo "                                                   # training process (refers to the phase number)"
    
  
    exit 1;
  fi
  
  data=$1
  lang=$2
  srcdir=$3
  alidir=$4 # Also used for transforms by default, if transform-dir not specified.
  denlatdir=$5
  dir=$6 # experimental directory
  
  # Check that some files exist, mostly to verify correct directory arguments.
  for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $alidir/ali.1.gz $denlatdir/lat.1.gz; do
    [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
  done
  
  mkdir -p $dir/log
  cp $srcdir/tree $dir
  learning_rate=$initial_learning_rate
  if [ $stage -ge -1 ]; then
    $cmd $dir/log/copy_initial.log \
     nnet-am-copy --learning-rate=$learning_rate $srcdir/final.mdl $dir/0.1.mdl || exit 1;
  fi
  
  nnet_context_opts="--left-context=`nnet-am-info $dir/0.1.mdl 2>/dev/null | grep -w left-context | awk '{print $2}'` --right-context=`nnet-am-info $dir/0.1.mdl 2>/dev/null | grep -w right-context | awk '{print $2}'`" || exit 1;
  
  silphonelist=`cat $lang/phones/silence.csl` || exit 1;
  
  nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
  nj2=`cat $denlatdir/num_jobs` || exit 1; # number of jobs in denlat dir
  [ "$nj" != "$nj2" ] && echo "Mismatch in #jobs $nj vs $nj2" && exit 1;
  
  sdata=$data/split$nj
  
  splice_opts=`cat $alidir/splice_opts 2>/dev/null`
  cp $alidir/splice_opts $dir 2>/dev/null
  cp $alidir/final.mat $dir 2>/dev/null # any LDA matrix...
  cp $alidir/tree $dir
  
  ## Set up features.  Note: these are different from the normal features
  ## because we have one rspecifier that has the features for the entire
  ## training set, not separate ones for each batch.
  if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
  echo "$0: feature type is $feat_type"
  
  case $feat_type in
    delta) all_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
       feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
     ;;
    lda) all_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$data/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
        feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
      ;;
    *) echo "$0: invalid feature type $feat_type" && exit 1;
  esac
  
  if [ -z "$transform_dir" ] && [ -f "$alidir/trans.1" ]; then 
    # --transform-dir option not set and $alidir has transforms in it.
    transform_dir=$alidir
  fi
  
  if [ ! -z "$transform_dir" ] && [ -f $transform_dir/trans.1 ]; then
    echo "$0: using transforms from $transform_dir"
    all_feats="$all_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
  else
    echo "$0: not using fMLLR transforms (assuming unadapted system)"
  fi
  
  echo "$0: working out number of frames of training data"
  
  num_frames=`feat-to-len scp:$data/feats.scp ark,t:- | awk '{x += $2;} END{print x;}'` || exit 1;
  
  # round to closest int
  iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
  [ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
  samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
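  # (Worked example with made-up numbers: if num_frames=10000000, num_jobs_nnet=8 and
  #  samples_per_iter=400000, then iters_per_epoch = int(10000000/(400000*8) + 0.5) = 3
  #  and samples_per_iter_real = 10000000/(8*3) = 416666.)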
  
  echo "Every EBW iteration, splitting the data up into $iters_per_epoch iterations,"
  echo "giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."
  
  mkdir -p $dir/post $dir/egs
  
  num_epochs=$[$num_ebw_iters*$epochs_per_ebw_iter]
  
  x=0
  while [ $x -lt $num_epochs ]; do
    z=$[$x / $epochs_per_ebw_iter];  # z is the (generally) smaller iteration number that identifies the EBW pass.
    if [ $x -eq $[$z * $epochs_per_ebw_iter] ]; then
      first_iter_of_epoch=true
      echo "Starting pass $z of EBW"
    else
      first_iter_of_epoch=false
    fi
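    # (Illustrative mapping: with epochs_per_ebw_iter=2, epochs x=0,1 belong to EBW
    #  pass z=0 and epochs x=2,3 to pass z=1; posteriors and training examples are
    #  re-dumped only on the first epoch of each EBW pass.)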
    echo "Epoch $x of $num_epochs"
  
    if [ $stage -le $x ] && $first_iter_of_epoch; then
      if [ $stage -lt $x ] || [ $sub_stage -le -3 ]; then
        # First get the per-frame posteriors, by rescoring the lattices; this
        # process also gives us at the same time the posteriors of each state for
        # each frame (by default, pruned to 0.01 with a randomized algorithm).
        # The matrix-logprob stage produces a diagnostic and passes the pseudo-log-like
        # matrix through unchanged.  (Note: nnet-logprob2-parallel can use up to
        # $num_threads threads, but in practice it may be limited by the speed of
        # the other elements of the pipe.)
        $cmd $parallel_opts JOB=1:$nj $dir/log/post.$z.JOB.log \
          nnet-logprob2-parallel --num-threads=$num_threads $dir/$x.1.mdl "$feats" \
            "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \
          matrix-logprob ark:- "ark:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $dir/$x.1.mdl ark:- ark:-|" ark:- \| \
          lattice-rescore-mapped $dir/$x.1.mdl "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark:- ark:- \| \
          lattice-boost-ali --b=$boost --silence-phones=$silphonelist $dir/$x.1.mdl ark:- "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
          lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
          post-to-pdf-post $dir/$x.1.mdl ark:- "ark:|gzip -c >$dir/post/den_post.$z.JOB.gz" || exit 1;
      fi
      if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then
        # run nnet-get-egs for all files, to get the training examples for each frame--
        # combines the feature and label/posterior information.  The posterior information
        # consists of 2 things: the numerator posteriors from the alignments, the denominator
        # posteriors from the lattices (times -1), and the smoothing posteriors from the 
        # neural net log-probs (times E).  
        # We copy the examples for each job round-robin to multiple archives, one for each
        # of 1...$num_jobs_nnet.  
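        # (Descriptive note, using ad-hoc shorthand: per frame, the combined posterior
        # weight on pdf j that reaches nnet-get-egs is approximately
        #   p_num(j) + E * p_smooth(j) - p_den(j),
        # i.e. positive weight on the numerator (reference) alignment, an E-scaled
        # smoothing term from the net's own posteriors, and negative weight on the
        # denominator lattice posteriors.)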
        egs_out=""
        for n in `seq 1 $num_jobs_nnet`; do
          # indexes are egs_orig.$z.$num_jobs_nnet.$nj
          egs_out="$egs_out ark:$dir/egs/egs_orig.$z.$n.JOB.ark"
        done
        $cmd JOB=1:$nj $dir/log/get_egs.$z.JOB.log \
           ali-to-pdf $dir/$x.1.mdl "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
           ali-to-post ark:- ark:- \| \
           sum-post --scale2=$E ark:- "ark:gunzip -c $dir/post/smooth_post.$z.JOB.gz|" ark:- \| \
           sum-post --scale2=-1.0 ark:- "ark:gunzip -c $dir/post/den_post.$z.JOB.gz|" ark:- \| \
           nnet-get-egs $nnet_context_opts "$feats" ark:- ark:- \| \
           nnet-copy-egs ark:- $egs_out || exit 1;
        rm $dir/post/smooth_post.$z.*.gz $dir/post/den_post.$z.*.gz 
      fi
      if $first_iter_of_epoch; then
        # Diagnostics-- work out an extra term in the objf that we have to add to
        # what we get from the nnet training.
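      # (Descriptive note: the value written to objf.$z.log below is
      #  acwt * (average numerator log-prob per frame)
      #    - (average denominator lattice log-like per frame),
      #  scraped from the post.$z.*.log files.)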
      tail -n 50 $dir/log/post.$z.*.log | perl -e '$acwt=shift @ARGV; $acwt>0.0 || die "bad acwt"; while(<STDIN>) { if (m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames.  Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames += $2; } if (m|matrix-logprob.+Average log-prob per frame is (\S+) over (\S+) frames|) { $tot_num_like += $1*$2; $tot_num_frames += $2; } } if (abs($tot_frames - $tot_num_frames) > 0.01*($tot_frames + $tot_num_frames)) { print STDERR "#frames differ $tot_frames vs $tot_num_frames\n"; }  $tot_den_lat_like /= $tot_frames; $tot_num_like /= $tot_num_frames; $objf = $acwt * $tot_num_like - $tot_den_lat_like; print $objf."\n"; ' $acwt > $dir/log/objf.$z.log
        echo "Objf on EBW iter $z is `cat $dir/log/objf.$z.log`"
      fi
      if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then
        echo "Merging training examples across original #jobs ($nj), and "
        echo "splitting across number of nnet jobs $num_jobs_nnet"
        egs_out2=""
        for n in `seq 1 $iters_per_epoch`; do
          # indexes of egs_merged are: egs_merged.$z.$iters_per_epoch.$num_jobs_nnet
          egs_out2="$egs_out2 ark:$dir/egs/egs_merged.$z.$n.JOB.ark"
        done
        # Note: in the following command, JOB goes from 1 to $num_jobs_nnet, so one
        # job per parallel training job (different from the previous command).
        # We combine (by concatenating the archives) over the index JOB of the previous
        # command, and write to multiple archives, this time one for each "sub-iter".
        # indexes of egs_orig are: egs_orig.$z.$num_jobs_nnet.$nj
        $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/merge_and_split.$x.JOB.log \
          cat $dir/egs/egs_orig.$z.JOB.*.ark \| \
          nnet-copy-egs --random=$random_copy "--srand=\$[JOB+($x*$num_jobs_nnet)]" \
            ark:- $egs_out2 '&&' rm $dir/egs/egs_orig.$z.JOB.*.ark || exit 1;
      fi
      if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then
        echo "Randomizing order of examples in each job"
        for n in `seq 1 $iters_per_epoch`; do
          s=$[$num_jobs_nnet*($n+($iters_per_epoch*$z))] # for srand
          $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$z.$n.JOB.log \
            nnet-shuffle-egs "--srand=\$[JOB+$s]" \
            ark:$dir/egs/egs_merged.$z.$n.JOB.ark ark:$dir/egs/egs.$z.$n.JOB.ark '&&' \
            rm $dir/egs/egs_merged.$z.$n.JOB.ark || exit 1;
        done
      fi
    fi
    if [ $stage -le $x ]; then
      # This block does the $iters_per_epoch iters of training.
      y=1; # y is the "sub-iteration" number.
      while [ $y -le $iters_per_epoch ]; do
        echo "Iteration $x, sub-iteration $y"
        if [ $stage -lt $x ] || [ $sub_stage -le $y ]; then
          $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.$y.JOB.log \
            nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
            $dir/$x.$y.mdl ark:$dir/egs/egs.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \
            || exit 1;
          nnets_list=
          for n in `seq 1 $num_jobs_nnet`; do
            nnets_list="$nnets_list $dir/$x.$y.$n.mdl"
          done
          if [ $y -eq $iters_per_epoch ]; then next_mdl=$dir/$[$x+1].1.mdl
          else next_mdl=$dir/$x.$[$y+1].mdl; fi
          # Average the parameters of all the parallel jobs.
          $cmd $dir/log/average.$x.$y.log \
             nnet-am-average $nnets_list $next_mdl || exit 1;
          rm $nnets_list
        fi
        y=$[$y+1]
      done
    fi
    if [ $learning_rate_factor != 1.0 ]; then
      learning_rate=`perl -e "print $learning_rate * $learning_rate_factor;"`;
      ! nnet-am-copy --print-args=false --learning-rate=$learning_rate $dir/$[$x+1].1.mdl $dir/$[$x+1].1.mdl && \
         echo Error changing learning rate of neural net && exit 1;
    fi
    x=$[$x+1]
  done
  
  rm $dir/final.mdl 2>/dev/null
  ln -s $x.1.mdl $dir/final.mdl
  
  echo Done