Blame view
egs/lre/v1/lid/train_full_ubm.sh
4.63 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#           2013  Daniel Povey
#           2014  David Snyder

# This trains a full-covariance UBM from an existing (diagonal or full) UBM,
# for a specified number of iterations.  This is for speaker-id systems
# (we use features specialized for that, and vad).
#
# Positional arguments:
#   <data>         directory that must contain feats.scp and vad.scp
#   <old-ubm-dir>  directory holding final.dubm (diagonal) or final.ubm (full)
#   <new-ubm-dir>  output directory; receives log/, num_jobs and final.ubm
#
# Stages (select the starting point with --stage):
#   -2     create <new-ubm-dir>/0.ubm from the model in <old-ubm-dir>
#   -1     Gaussian selection, done once using a diagonal form of 0.ubm
#   0..N-1 E-M iteration x: accumulate stats over $nj jobs, then re-estimate

# Begin configuration section.
nj=16
cmd=run.pl
stage=-2
num_gselect=20 # cutoff for Gaussian-selection that we do once at the start.
subsample=5
num_iters=4
min_gaussian_weight=1.0e-04
remove_low_count_gaussians=true # set this to false if you need #gauss to stay fixed.
cleanup=true
# End configuration section.

echo "$0 $*"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
# parse_options.sh is looked up via PATH; presumably path.sh arranges that
# and it maps --foo-bar options onto the config variables above
# (NOTE(review): confirm against the recipe's utils/ directory).
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: steps/train_full_ubm.sh <data> <old-ubm-dir> <new-ubm-dir>"
  echo "Trains a full-covariance UBM starting from an existing diagonal or"
  echo "full-covariance UBM system."
  echo " e.g.: steps/train_full_ubm.sh --num-iters 8 data/train exp/diag_ubm exp/full_ubm"
  echo "main options (for others, see top of script file)"
  echo " --config <config-file> # config containing options"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --nj <n|16> # number of parallel training jobs"
  echo " --num-gselect <n|20> # Number of Gaussians to select using"
  echo " # initial model (diagonalized if needed)"
  echo " --subsample <n|5> # Take every n'th sample, for efficiency"
  echo " --num-iters <n|4> # Number of iterations of E-M"
  # The default shown here must match min_gaussian_weight in the config section.
  echo " --min-gaussian-weight <weight|1.0e-04> # Minimum Gaussian weight (below this,"
  echo " # we won't update, and will remove Gaussians"
  echo " # if --remove-low-count-gaussians is true"
  echo " --remove-low-count-gaussians <true,false|true> # If true, remove Gaussians below min-weight"
  echo " # (will only happen on last iteration, in any case"
  echo " --cleanup <true,false|true> # If true, clean up accumulators, intermediate"
  echo " # models and gselect info"
  exit 1;
fi

data=$1
srcdir=$2
dir=$3

# Both the features and the VAD decisions are required inputs.
for f in "$data/feats.scp" "$data/vad.scp"; do
  if [ ! -f "$f" ]; then
    echo "No such file $f"
    exit 1;
  fi
done

mkdir -p "$dir/log"
echo "$nj" > "$dir/num_jobs"
sdata=$data/split$nj;
utils/split_data.sh "$data" "$nj" || exit 1;

## Set up features.
# This is a pipe specifier with a literal "JOB" placeholder; it is handed to the
# Kaldi binaries below through $cmd, which is invoked with JOB=1:$nj.
feats="ark,s,cs:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:$sdata/JOB/feats.scp ark:- | add-deltas-sdc ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- | subsample-feats --n=$subsample ark:- ark:- |"

# Stage -2: obtain the initial full-covariance model $dir/0.ubm.
# NOTE: $cmd is intentionally left unquoted everywhere, because it may carry
# extra options (e.g. "queue.pl <queue opts>") that must be word-split.
if [ "$stage" -le -2 ]; then
  if [ -f "$srcdir/final.dubm" ]; then # diagonal-covariance in $srcdir
    $cmd "$dir/log/convert_diag_to_full" \
      gmm-global-to-fgmm "$srcdir/final.dubm" "$dir/0.ubm" || exit 1;
  elif [ -f "$srcdir/final.ubm" ]; then
    cp "$srcdir/final.ubm" "$dir/0.ubm" || exit 1;
  else
    echo "$0: in $srcdir, expecting final.ubm or final.dubm to exist"
    exit 1;
  fi
fi

# Stage -1: Gaussian selection, written per job to $dir/gselect.JOB.gz.
if [ "$stage" -le -1 ]; then
  echo "$0: doing Gaussian selection (using diagonal form of model; selecting $num_gselect indices)"
  $cmd "JOB=1:$nj" "$dir/log/gselect.JOB.log" \
    gmm-gselect "--n=$num_gselect" "fgmm-global-to-gmm $dir/0.ubm - |" "$feats" \
    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# Stages 0 .. num_iters-1: E-M passes.  Pass x reads $dir/$x.ubm and writes
# $dir/$((x+1)).ubm.
x=0
while [ "$x" -lt "$num_iters" ]; do
  echo "Pass $x"
  next=$((x + 1))
  if [ "$stage" -le "$x" ]; then
    $cmd "JOB=1:$nj" "$dir/log/acc.$x.JOB.log" \
      fgmm-global-acc-stats "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" "$dir/$x.ubm" "$feats" \
      "$dir/$x.JOB.acc" || exit 1;

    if [ "$next" -eq "$num_iters" ]; then
      lowcount_opt="--remove-low-count-gaussians=$remove_low_count_gaussians" # as specified by user.
    else
      # On non-final iters, we in any case can't remove low-count Gaussians because it would
      # cause the gselect info to become out of date.
      lowcount_opt="--remove-low-count-gaussians=false"
    fi

    # The "$dir/$x.*.acc" glob stays inside the quoted pipe specifier on purpose:
    # it is not expanded here but passed on verbatim as part of the input pipe.
    $cmd "$dir/log/update.$x.log" \
      fgmm-global-est "$lowcount_opt" "--min-gaussian-weight=$min_gaussian_weight" --verbose=2 "$dir/$x.ubm" "fgmm-global-sum-accs - $dir/$x.*.acc |" \
      "$dir/$next.ubm" || exit 1;

    # Best-effort removal of this pass's accumulators and input model;
    # a failing rm does not abort the script.
    if $cleanup; then
      rm "$dir/$x".*.acc "$dir/$x.ubm"
    fi
  fi
  x=$next
done

# Best-effort removal of the Gaussian-selection archives.
if $cleanup; then
  rm "$dir"/gselect.*.gz
fi

# Replace any previous final.ubm with the model from the last pass.
rm -f "$dir/final.ubm"
mv "$dir/$x.ubm" "$dir/final.ubm" || exit 1;