#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# MMI (or boosted MMI) training (A.K.A. sequence training) of a neural net based 
# system as trained by train_nnet_cpu.sh
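#
# An illustrative invocation (the directory names here are hypothetical and just
# follow the usual Kaldi layout; adapt them to your own setup):
#   steps/train_nnet_cpu_mmi.sh --boost 0.1 --initial-learning-rate 0.001 \
#     data/train data/lang exp/tri4_nnet exp/tri4_nnet_ali exp/tri4_denlats \
#     exp/tri4_nnet_mmi_b0.1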


# Begin configuration section.
cmd=run.pl
epochs_per_ebw_iter=1 # Number of times we iterate over the whole
                       # data each time we do an "EBW" iteration.
num_ebw_iters=4 # Number of "EBW" iterations.
initial_learning_rate=0.001 # learning rate we start with.
learning_rate_factor=1.0 # factor by which we change the learning
                         # rate each iteration (should be <= 1.0)
E=2.0  # this is slightly analogous to the constant E used in
       # Extended Baum-Welch updates of GMMs.  It slows down (and
       # somewhat regularizes) the update.
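       # Concretely, E scales the "smoothing" posteriors derived from the current
       # net's own log-probs before they are combined with the numerator posteriors
       # and the negated denominator posteriors (see the sum-post --scale2=$E stage
       # further down).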

minibatch_size=256 # since the learning rate is always quite low compared with
                   # what we have at the start of ML training, we can probably
                   # afford a somewhat higher minibatch size than there, as
                   # there is less risk of instability.

samples_per_iter=400000 # in each phase of training, see about this many samples
                        # per job.  Note: this is only a target; we will actually
                        # choose a number that makes the number of iterations per
                        # epoch a whole number.
num_jobs_nnet=8 # Number of neural net training jobs to run in parallel.  This is
                # not the same as the number of jobs (nj), which is taken from the
                # alignment and denlat directories.
stage=0
sub_stage=-3 # this can be used to start from a particular sub-iteration of an
             # iteration
acwt=0.1
boost=0.0  # boosting factor for BMMI (you can try 0.1); this is applied per frame.
transform_dir=  # Note: by default any transforms in $alidir will be used.

parallel_opts="-pe smp 16" # by default we use 16 threads; this lets the queue know.
io_opts="-tc 10" # max 10 jobs running at one time (a lot of I/O).
num_threads=16 # number of threads for the neural net trainer.
mkl_num_threads=1
random_copy=false
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 6 ]; then
  echo "Usage: steps/train_nnet_cpu_mmi.sh [opts] <data> <lang> <src-dir> <ali-dir> <denlat-dir> <exp-dir>"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "Note, the terminology is: each iteration of EBW we do multiple epochs; each epoch"
  echo " we have multiple iterations of training (note the same as the EBW iters)."
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-ebw-iters <#iters|4>                       # number of pseudo-Extended-Baum-Welch iterations (default: 4)"
  echo "  --epochs-per-ebw-iter <#epochs|1>                # number of times to see all the data per EBW iter."
  echo "  --initial-learning-rate <initial-lrate|0.005>    # learning rate to use on the first iteration"
  echo "  --learning-rate-factor <lrate-factor|1.0>        # Factor by which to change the learning rate on each"
  echo "                                                   # EBW iteration (should be <= 1.0)"
  echo "  --num-jobs-nnet <num-jobs|8>                     # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)."
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"-pe smp 16\">            # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads."
  echo "  --io-opts <opts|\"-tc 10\">                      # Options given to e.g. queue.pl for any especially I/O intensive jobs"
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, for each"
  echo "                                                   # process.  Note: this will get modified to a number that will"
  echo "                                                   # divide the data into a whole number of pieces."
  echo "  --transform-dir <dir>                            # Directory to find fMLLR transforms; if not specified, "
  echo "                                                   # $alidir will be used if it has transforms"
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --sub-stage <sub-stage|0>                        # In conjunction with --stage, can be used to start a partially-completed"
  echo "                                                   # training process (refers to the phase number)"
  

  exit 1;
fi

data=$1
lang=$2
srcdir=$3
alidir=$4 # Also used for transforms by default, if transform-dir not specified.
denlatdir=$5
dir=$6 # experimental directory

# Check that some files exist, mostly to verify correct directory arguments.
for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $alidir/ali.1.gz $denlatdir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

mkdir -p $dir/log
cp $srcdir/tree $dir
learning_rate=$initial_learning_rate
if [ $stage -ge -1 ]; then
  $cmd $dir/log/copy_initial.log \
     nnet-am-copy --learning-rate=$learning_rate $srcdir/final.mdl $dir/0.1.mdl || exit 1;
fi

left_context=`nnet-am-info $dir/0.1.mdl 2>/dev/null | grep -w left-context | awk '{print $2}'` || exit 1;
right_context=`nnet-am-info $dir/0.1.mdl 2>/dev/null | grep -w right-context | awk '{print $2}'` || exit 1;
nnet_context_opts="--left-context=$left_context --right-context=$right_context"

silphonelist=`cat $lang/phones/silence.csl` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
nj2=`cat $denlatdir/num_jobs` || exit 1; # number of jobs in denlat dir
[ "$nj" != "$nj2" ] && echo "Mismatch in #jobs $nj vs $nj2" && exit 1;

sdata=$data/split$nj

splice_opts=`cat $alidir/splice_opts 2>/dev/null`
cp $alidir/splice_opts $dir 2>/dev/null
cp $alidir/final.mat $dir 2>/dev/null # any LDA matrix...
cp $alidir/tree $dir

## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

case $feat_type in
  delta) all_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |"
     feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
   ;;
  lda) all_feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:$data/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
      feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

if [ -z "$transform_dir" ] && [ -f "$alidir/trans.1" ]; then 
  # --transform-dir option not set and $alidir has transforms in it.
  transform_dir=$alidir
fi

if [ ! -z "$transform_dir" ] && [ -f $transform_dir/trans.1 ]; then
  echo "$0: using transforms from $transform_dir"
  all_feats="$all_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
else
  echo "$0: not using fMLLR transforms (assuming unadapted system)"
fi

echo "$0: working out number of frames of training data"

num_frames=`feat-to-len scp:$data/feats.scp ark,t:- | awk '{x += $2;} END{print x;}'` || exit 1;

# round to closest int
iters_per_epoch=`perl -e "print int($num_frames/($samples_per_iter * $num_jobs_nnet) + 0.5);"` || exit 1;
[ $iters_per_epoch -eq 0 ] && iters_per_epoch=1
samples_per_iter_real=$[$num_frames/($num_jobs_nnet*$iters_per_epoch)]
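# As a purely illustrative example (hypothetical numbers): with num_frames=3600000,
# samples_per_iter=400000 and num_jobs_nnet=8, we get
# iters_per_epoch = int(3600000/(400000*8) + 0.5) = 1 and
# samples_per_iter_real = 3600000/(8*1) = 450000.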

echo "Every EBW iteration, splitting the data up into $iters_per_epoch iterations,"
echo "giving samples-per-iteration of $samples_per_iter_real (you requested $samples_per_iter)."

mkdir -p $dir/post $dir/egs

num_epochs=$[$num_ebw_iters*$epochs_per_ebw_iter]

x=0
while [ $x -lt $num_epochs ]; do
  z=$[$x / $epochs_per_ebw_iter];  # z is the (generally smaller) iteration number that identifies the EBW pass.
  if [ $x -eq $[$z * $epochs_per_ebw_iter] ]; then
    first_iter_of_epoch=true  # note: this flag really means "this is the first epoch of EBW pass $z".
    echo "Starting pass $z of EBW"
  else
    first_iter_of_epoch=false
  fi
  echo "Epoch $x of $num_epochs"

  if [ $stage -le $x ] && $first_iter_of_epoch; then
    if [ $stage -lt $x ] || [ $sub_stage -le -3 ]; then
      # First get the per-frame posteriors by rescoring the lattices; this
      # process also gives us, at the same time, the posteriors of each state for
      # each frame (by default, pruned to 0.01 with a randomized algorithm).
      # The matrix-logprob stage produces a diagnostic and passes the pseudo-log-like
      # matrix through unchanged.  (Note: nnet-logprob2-parallel can use up to
      # $num_threads threads, but in practice it may be limited by the speed of
      # the other elements of the pipe.)
      $cmd $parallel_opts JOB=1:$nj $dir/log/post.$z.JOB.log \
        nnet-logprob2-parallel --num-threads=$num_threads $dir/$x.1.mdl "$feats" \
          "ark:|prob-to-post ark:- ark:- | gzip -c >$dir/post/smooth_post.$z.JOB.gz" ark:- \| \
        matrix-logprob ark:- "ark:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $dir/$x.1.mdl ark:- ark:-|" ark:- \| \
        lattice-rescore-mapped $dir/$x.1.mdl "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark:- ark:- \| \
        lattice-boost-ali --b=$boost --silence-phones=$silphonelist $dir/$x.1.mdl ark:- "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
        lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
        post-to-pdf-post $dir/$x.1.mdl ark:- "ark:|gzip -c >$dir/post/den_post.$z.JOB.gz" || exit 1;
    fi
    if [ $stage -lt $x ] || [ $sub_stage -le -2 ]; then
      # run nnet-get-egs for all files, to get the training examples for each frame--
      # this combines the feature and label/posterior information.  The posterior
      # information consists of three things: the numerator posteriors from the
      # alignments, the denominator posteriors from the lattices (times -1), and the
      # smoothing posteriors from the neural net log-probs (times E).
      # We copy the examples for each job round-robin to multiple archives, one for each
      # of 1...$num_jobs_nnet.  
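      # The effective per-frame supervision posterior is therefore
      #   (numerator posterior) + E * (smoothing posterior) - (denominator posterior),
      # assembled by the two sum-post stages in the pipeline below.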
      egs_out=""
      for n in `seq 1 $num_jobs_nnet`; do
        # indexes are egs_orig.$z.$num_jobs_nnet.$nj
        egs_out="$egs_out ark:$dir/egs/egs_orig.$z.$n.JOB.ark"
      done
      $cmd JOB=1:$nj $dir/log/get_egs.$z.JOB.log \
         ali-to-pdf $dir/$x.1.mdl "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
         ali-to-post ark:- ark:- \| \
         sum-post --scale2=$E ark:- "ark:gunzip -c $dir/post/smooth_post.$z.JOB.gz|" ark:- \| \
         sum-post --scale2=-1.0 ark:- "ark:gunzip -c $dir/post/den_post.$z.JOB.gz|" ark:- \| \
         nnet-get-egs $nnet_context_opts "$feats" ark:- ark:- \| \
         nnet-copy-egs ark:- $egs_out || exit 1;
      rm $dir/post/smooth_post.$z.*.gz $dir/post/den_post.$z.*.gz 
    fi
    if $first_iter_of_epoch; then
      # Diagnostics-- work out an extra term in the objf that we have to add to
      # what we get from the nnet training.
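      # The perl snippet below parses the post.$z.*.log files and computes
      #   objf = acwt * (average numerator log-prob per frame)
      #          - (average denominator-lattice log-like per frame),
      # which is the extra term referred to above.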
      tail -n 50 $dir/log/post.$z.*.log | perl -e '$acwt=shift @ARGV; $acwt>0.0 || die "bad acwt"; while(<STDIN>) { if (m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames.  Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames += $2; } if (m|matrix-logprob.+Average log-prob per frame is (\S+) over (\S+) frames|) { $tot_num_like += $1*$2; $tot_num_frames += $2; } } if (abs($tot_frames - $tot_num_frames) > 0.01*($tot_frames + $tot_num_frames)) { print STDERR "#frames differ $tot_frames vs $tot_num_frames\n"; }  $tot_den_lat_like /= $tot_frames; $tot_num_like /= $tot_num_frames; $objf = $acwt * $tot_num_like - $tot_den_lat_like; print $objf."\n"; ' $acwt > $dir/log/objf.$z.log
      echo "Objf on EBW iter $z is `cat $dir/log/objf.$z.log`"
    fi
    if [ $stage -lt $x ] || [ $sub_stage -le -1 ]; then
      echo "Merging training examples across original #jobs ($nj), and "
      echo "splitting across number of nnet jobs $num_jobs_nnet"
      egs_out2=""
      for n in `seq 1 $iters_per_epoch`; do
        # indexes of egs_merged are: egs_merged.$z.$iters_per_epoch.$num_jobs_nnet
        egs_out2="$egs_out2 ark:$dir/egs/egs_merged.$z.$n.JOB.ark"
      done
      # Note: in the following command, JOB goes from 1 to $num_jobs_nnet, so one
      # job per parallel training job (different from the previous command).
      # We concatenate over the index JOB of the previous $cmd, and write to multiple
      # archives, this time one for each "sub-iter".
      # indexes of egs_orig are: egs_orig.$z.$num_jobs_nnet.$nj
      $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/merge_and_split.$x.JOB.log \
        cat $dir/egs/egs_orig.$z.JOB.*.ark \| \
        nnet-copy-egs --random=$random_copy "--srand=\$[JOB+($x*$num_jobs_nnet)]" \
          ark:- $egs_out2 '&&' rm $dir/egs/egs_orig.$z.JOB.*.ark || exit 1;
    fi
    if [ $stage -lt $x ] || [ $sub_stage -le 0 ]; then
      echo "Randomizing order of examples in each job"
      for n in `seq 1 $iters_per_epoch`; do
        s=$[$num_jobs_nnet*($n+($iters_per_epoch*$z))] # for srand
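        # (this seed construction gives each (EBW pass, sub-iteration, job) triple a distinct srand value)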
        $cmd $io_opts JOB=1:$num_jobs_nnet $dir/log/shuffle.$z.$n.JOB.log \
          nnet-shuffle-egs "--srand=\$[JOB+$s]" \
          ark:$dir/egs/egs_merged.$z.$n.JOB.ark ark:$dir/egs/egs.$z.$n.JOB.ark '&&' \
          rm $dir/egs/egs_merged.$z.$n.JOB.ark || exit 1;
      done
    fi
  fi
  if [ $stage -le $x ]; then
    # This block does the $iters_per_epoch iters of training.
    y=1; # y is the "sub-iteration" number.
    while [ $y -le $iters_per_epoch ]; do
      echo "Iteration $x, sub-iteration $y"
      if [ $stage -lt $x ] || [ $sub_stage -le $y ]; then
        $cmd $parallel_opts JOB=1:$num_jobs_nnet $dir/log/train.$x.$y.JOB.log \
          nnet-train-parallel --num-threads=$num_threads --minibatch-size=$minibatch_size \
          $dir/$x.$y.mdl ark:$dir/egs/egs.$z.$y.JOB.ark $dir/$x.$y.JOB.mdl \
          || exit 1;
        nnets_list=
        for n in `seq 1 $num_jobs_nnet`; do
          nnets_list="$nnets_list $dir/$x.$y.$n.mdl"
        done
        if [ $y -eq $iters_per_epoch ]; then next_mdl=$dir/$[$x+1].1.mdl
        else next_mdl=$dir/$x.$[$y+1].mdl; fi
        # Average the parameters of all the parallel jobs.
        $cmd $dir/log/average.$x.$y.log \
           nnet-am-average $nnets_list $next_mdl || exit 1;
        rm $nnets_list
      fi
      y=$[$y+1]
    done
  fi
  if [ $learning_rate_factor != 1.0 ]; then
    learning_rate=`perl -e "print $learning_rate * $learning_rate_factor;"`;
    ! nnet-am-copy --print-args=false --learning-rate=$learning_rate $dir/$[$x+1].1.mdl $dir/$[$x+1].1.mdl && \
       echo Error changing learning rate of neural net && exit 1;
  fi
  x=$[$x+1]
done

rm $dir/final.mdl 2>/dev/null
ln -s $x.1.mdl $dir/final.mdl

echo Done