#!/bin/bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.

# This script trains a fairly vanilla network with tanh nonlinearities.
#
# Every variable assigned in the configuration section is a default value.
# Each assignment must sit on its own line: a '#' starts a comment that runs
# to the end of the physical line, so statements sharing a line with a
# comment are silently dropped.

# Begin configuration section.
cmd=run.pl
num_epochs=15       # Number of epochs during which we reduce
                    # the learning rate; number of iterations is worked out from this.
num_epochs_extra=5  # Number of epochs after we stop reducing
                    # the learning rate.
num_iters_final=20  # Maximum number of final iterations to give to the
                    # optimization over the validation set.
initial_learning_rate=0.04
final_learning_rate=0.004
bias_stddev=0.0
shrink_interval=5   # shrink every $shrink_interval iters except while we are
                    # still adding layers, when we do it every iter.
shrink=true
num_frames_shrink=2000  # note: must be <= --num-frames-diagnostic option to get_egs.sh, if
                        # given.
softmax_learning_rate_factor=0.5  # Train this layer half as fast as the other layers.

hidden_layer_dim=300  # You may want this larger, e.g. 1024 or 2048.

minibatch_size=128  # by default use a smallish minibatch size for neural net
                    # training; this controls instability which would otherwise
                    # be a problem with multi-threaded update. Note: it also
                    # interacts with the "preconditioned" update which generally
                    # works better with larger minibatch size, so it's not
                    # completely cost free.

samples_per_iter=200000  # each iteration of training, see this many samples
                         # per job. This option is passed to get_egs.sh
num_jobs_nnet=16    # Number of neural net jobs to run in parallel. This option
                    # is passed to get_egs.sh.
get_egs_stage=0
spk_vecs_dir=

shuffle_buffer_size=5000  # This "buffer_size" variable controls randomization of the samples
                # on each iter. You could set it to 0 or to a large value for complete
                # randomization, but this would both consume memory and cause spikes in
                # disk I/O. Smaller is easier on disk and memory but less random. It's
                # not a huge deal though, as samples are anyway randomized right at the start.
add_layers_period=2  # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-5

io_opts="-tc 5"  # for jobs with a lot of I/O, limits the number running at one time.
splice_width=4   # meaning +- 4 frames on each side for second LDA
randprune=4.0    # speeds up LDA.
alpha=4.0
max_change=10.0
mix_up=0         # Number of components to mix up to (should be > #tree leaves, if
                 # specified.)
num_threads=16
parallel_opts="-pe smp $num_threads"  # using a smallish #threads by default, out of stability concerns.
  # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads.
cleanup=true
egs_dir=
lda_opts=
egs_opts=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

# Pick up the environment (tool paths) if the recipe directory provides one,
# then let parse_options.sh process the command-line options.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print usage and stop.
# The defaults quoted below mirror the values assigned in the configuration
# section above.
# NOTE(review): --initial-num-hidden-layers, --lda-dim, --num-utts-subset,
# --num-frames-diagnostic and --num-valid-frames-combine have no matching
# variable in the configuration section of this script; confirm whether they
# are really accepted here or belong to get_lda.sh / get_egs.sh
# (reachable through --lda-opts / --egs-opts).
if [ $# -ne 4 ]; then
  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-epochs <#epochs|15>                        # Number of epochs of main training"
  echo "                                                   # while reducing learning rate (determines #iterations, together"
  echo "                                                   # with --samples-per-iter and --num-jobs-nnet)"
  echo "  --num-epochs-extra <#epochs-extra|5>             # Number of extra epochs of training"
  echo "                                                   # after learning rate fully reduced"
  echo "  --initial-learning-rate <initial-learning-rate|0.04> # Learning rate at start of training, e.g. 0.02 for small"
  echo "                                                       # data, 0.01 for large data"
  echo "  --final-learning-rate <final-learning-rate|0.004>    # Learning rate at end of training, e.g. 0.004 for small"
  echo "                                                       # data, 0.001 for large data"
  echo "  --num-hidden-layers <#hidden-layers|3>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
  echo "  --initial-num-hidden-layers <#hidden-layers|1>   # Number of hidden layers to start with."
  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
  echo "  --mix-up <#pseudo-gaussians|0>                   # Can be used to have multiple targets in final output layer,"
  echo "                                                   # per context-dependent state. Try a number several times #states."
  echo "  --num-jobs-nnet <num-jobs|16>                    # Number of parallel jobs to use for main neural net"
  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
  echo "                                                   # Note: if you increase this, you may want to also increase"
  echo "                                                   # the learning rate."
  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
  echo "                                                   # as well as speed; may interact with batch size; if you increase"
  echo "                                                   # this, you may want to decrease the batch size."
  echo "  --parallel-opts <opts|\"-pe smp 16\">              # extra options to pass to e.g. queue.pl for processes that"
  echo "                                                   # use multiple threads."
  echo "  --io-opts <opts|\"-tc 5\">                         # Options given to e.g. queue.pl for jobs that do a lot of I/O."
  echo "  --minibatch-size <minibatch-size|128>            # Size of minibatch to process (note: product with --num-threads"
  echo "                                                   # should not get too large, e.g. >2k)."
  echo "  --samples-per-iter <#samples|200000>             # Number of samples of data to process per iteration, per"
  echo "                                                   # process."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --lda-dim <dim>                                  # Dimension to reduce spliced features to with LDA"
  echo "  --num-iters-final <#iters|20>                    # Number of final iterations to give to nnet-combine-fast to "
  echo "                                                   # interpolate parameters (the weights are learned with a validation set)"
  echo "  --num-utts-subset <#utts|300>                    # Number of utterances in subsets used for validation and diagnostics"
  echo "                                                   # (the validation subset is held out from training)"
  echo "  --num-frames-diagnostic <#frames|4000>           # Number of frames used in computing (train,valid) diagnostics"
  echo "  --num-valid-frames-combine <#frames|10000>       # Number of frames used in getting combination weights at the"
  echo "                                                   # very end."
  echo "  --stage <stage|-5>                               # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

# Check some files.
for f in "$data/feats.scp" "$lang/L.fst" "$alidir/ali.1.gz" "$alidir/final.mdl" "$alidir/tree"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

# Set some variables.
# num_leaves is the last field of the "number of pdfs" line printed by gmm-info.
num_leaves=$(gmm-info "$alidir/final.mdl" 2>/dev/null | awk '/number of pdfs/{print $NF}') || exit 1;

nj=$(cat "$alidir/num_jobs") || exit 1;  # number of jobs in alignment dir...
# in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh "$data" "$nj" || exit 1;

mkdir -p "$dir/log" || exit 1;
echo "$nj" > "$dir/num_jobs"

# splice_opts is optional in the alignment directory, hence the silenced errors.
splice_opts=$(cat "$alidir/splice_opts" 2>/dev/null)
cp "$alidir/splice_opts" "$dir" 2>/dev/null
cp "$alidir/tree" "$dir" || exit 1;

# Get list of validation utterances.
# -----------------------------------------------------------------------------
# NOTE(review): remainder of the training script, kept exactly as found.
#
# What the visible text does, in order:
#   * writes $dir/valid_uttlist and $dir/train_subset_uttlist from $data/utt2spk;
#   * runs steps/nnet2/get_lda.sh when stage <= -4, then reads $dir/feat_dim
#     and $dir/lda_dim;
#   * runs steps/nnet2/get_egs.sh when stage <= -3 and no egs_dir was given;
#     egs_dir then defaults to $dir/egs;
#   * overrides num_jobs_nnet with the value stored in $egs_dir/num_jobs_nnet,
#     printing a warning when the two differ;
#   * when stage <= -2, works out the (optionally speaker-vector-extended)
#     LDA / feature dimensions and begins writing $dir/nnet.config;
#   * per iteration x: computes the learning-rate string, averages models with
#     nnet-am-average | nnet-am-copy, then either shrinks with
#     nnet-combine-fast (when $shrink is true and x is a multiple of
#     shrink_interval) or runs nnet-am-fix, and mixes up at x == mix_up_iter;
#   * combines the selected models into $dir/final.mdl with nnet-combine-fast,
#     starts two background nnet-compute-prob jobs, and, if $cleanup is true,
#     removes $dir/egs/egs* (only when egs_dir is $dir/egs) and most
#     intermediate models.
#
# Known problems in this region -- restore it from a pristine copy of the
# script rather than repairing it by hand:
#   1. Line breaks have been lost. Every '#' comment therefore swallows the
#      code that follows it on the same physical line, and each "\ " is a
#      backslash-space, not a line continuation.
#   2. Source text is missing after 'cat >$dir/nnet.config <': the here-doc
#      bodies, the opening of the perl one-liner that computes learning_rate,
#      the opener of the loop closed by the later 'done', and the assignments
#      of x, num_iters, num_iters_reduce, num_iters_extra, mix_up_iter and
#      nnets_list are not visible.
#   3. $num_utts_subset is expanded by 'head -$num_utts_subset' but is not
#      assigned in the configuration section of this script.
#   4. The message "Setting num_iters_final=$num_iters_extra" is printed, but
#      no assignment to num_iters_final follows it in the visible text.
#   5. The two final nnet-compute-prob jobs are backgrounded and followed by
#      'sleep 2' rather than 'wait', so their exit status is never checked.
# -----------------------------------------------------------------------------
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; if [ $stage -le -4 ]; then echo "$0: calling get_lda.sh" steps/nnet2/get_lda.sh $lda_opts --splice-width $splice_width --cmd "$cmd" $data $lang $alidir $dir || exit 1; fi # these files will have been written by get_lda.sh feat_dim=`cat $dir/feat_dim` || exit 1; lda_dim=`cat $dir/lda_dim` || exit 1; if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then echo "$0: calling get_egs.sh" [ ! -z $spk_vecs_dir ] && spk_vecs_opt="--spk-vecs-dir $spk_vecs_dir"; steps/nnet2/get_egs.sh $spk_vecs_opt --samples-per-iter $samples_per_iter --num-jobs-nnet $num_jobs_nnet \ --splice-width $splice_width --stage $get_egs_stage --cmd "$cmd" $egs_opts \ $data $lang $alidir $dir || exit 1; fi if [ -z $egs_dir ]; then egs_dir=$dir/egs fi iters_per_epoch=`cat $egs_dir/iters_per_epoch` || exit 1; ! [ $num_jobs_nnet -eq `cat $egs_dir/num_jobs_nnet` ] && \ echo "$0: Warning: using --num-jobs-nnet=`cat $egs_dir/num_jobs_nnet` from $egs_dir" num_jobs_nnet=`cat $egs_dir/num_jobs_nnet` if ! [ $num_hidden_layers -ge 1 ]; then echo "Invalid num-hidden-layers $num_hidden_layers" exit 1 fi if [ $stage -le -2 ]; then echo "$0: initializing neural net"; # Get spk-vec dim (in case we're using them). if [ ! -z "$spk_vecs_dir" ]; then spk_vec_dim=$[$(copy-vector --print-args=false "ark:cat $spk_vecs_dir/vecs.1|" ark,t:- | head -n 1 | wc -w) - 3]; ! 
[ $spk_vec_dim -gt 0 ] && echo "Error getting spk-vec dim" && exit 1; ext_lda_dim=$[$lda_dim + $spk_vec_dim] extend-transform-dim --new-dimension=$ext_lda_dim $dir/lda.mat $dir/lda_ext.mat || exit 1; lda_mat=$dir/lda_ext.mat ext_feat_dim=$[$feat_dim + $spk_vec_dim] else spk_vec_dim=0 lda_mat=$dir/lda.mat ext_lda_dim=$lda_dim ext_feat_dim=$feat_dim fi stddev=`perl -e "print 1.0/sqrt($hidden_layer_dim);"` cat >$dir/nnet.config <$dir/hidden.config <= $n ? $f : $i*exp($x*log($f/$i)/$n));' $[$x+1] $num_iters_reduce $initial_learning_rate $final_learning_rate`; softmax_learning_rate=`perl -e "print $learning_rate * $softmax_learning_rate_factor;"`; nnet-am-info $dir/$[$x+1].1.mdl > $dir/foo 2>/dev/null || exit 1 nu=`cat $dir/foo | grep num-updatable-components | awk '{print $2}'` na=`cat $dir/foo | grep -v Fixed | grep AffineComponent | wc -l` # na is number of last updatable AffineComponent layer [one-based, counting only # updatable components.] lr_string="$learning_rate" for n in `seq 2 $nu`; do if [ $n -eq $na ] || [ $n -eq $[$na-1] ]; then lr=$softmax_learning_rate; else lr=$learning_rate; fi lr_string="$lr_string:$lr" done $cmd $dir/log/average.$x.log \ nnet-am-average $nnets_list - \| \ nnet-am-copy --learning-rates=$lr_string - $dir/$[$x+1].mdl || exit 1; if $shrink && [ $[$x % $shrink_interval] -eq 0 ]; then mb=$[($num_frames_shrink+$num_threads-1)/$num_threads] $cmd $parallel_opts $dir/log/shrink.$x.log \ nnet-subset-egs --n=$num_frames_shrink --randomize-order=true --srand=$x \ ark:$egs_dir/train_diagnostic.egs ark:- \| \ nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \ $dir/$[$x+1].mdl ark:- $dir/$[$x+1].mdl || exit 1; else # On other iters, do nnet-am-fix which is much faster and has roughly # the same effect. nnet-am-fix $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log fi if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then # mix up. 
echo Mixing up from $num_leaves to $mix_up components $cmd $dir/log/mix_up.$x.log \ nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \ $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1; fi rm $nnets_list fi x=$[$x+1] done # Now do combination. # At the end, final.mdl will be a combination of the last e.g. 10 models. nnets_list=() if [ $num_iters_final -gt $num_iters_extra ]; then echo "Setting num_iters_final=$num_iters_extra" fi start=$[$num_iters-$num_iters_final+1] for x in `seq $start $num_iters`; do idx=$[$x-$start] if [ $x -gt $mix_up_iter ]; then nnets_list[$idx]=$dir/$x.mdl # "nnet-am-copy --remove-dropout=true $dir/$x.mdl - |" fi done if [ $stage -le $num_iters ]; then num_egs=`nnet-copy-egs ark:$egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'` mb=$[($num_egs+$num_threads-1)/$num_threads] $cmd $parallel_opts $dir/log/combine.log \ nnet-combine-fast --num-threads=$num_threads --verbose=3 --minibatch-size=$mb \ "${nnets_list[@]}" ark:$egs_dir/combine.egs $dir/final.mdl || exit 1; fi # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. $cmd $dir/log/compute_prob_valid.final.log \ nnet-compute-prob $dir/final.mdl ark:$egs_dir/valid_diagnostic.egs & $cmd $dir/log/compute_prob_train.final.log \ nnet-compute-prob $dir/final.mdl ark:$egs_dir/train_diagnostic.egs & sleep 2 echo Done if $cleanup; then echo Cleaning up data if [ $egs_dir == "$dir/egs" ]; then echo Removing training examples rm $dir/egs/egs* fi echo Removing most of the models for x in `seq 0 $num_iters`; do if [ $[$x%10] -ne 0 ] && [ $x -lt $[$num_iters-$num_iters_final+1] ]; then # delete all but every 10th model; don't delete the ones which combine to form the final model. rm $dir/$x.mdl fi done fi