#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).
#           2015  David Snyder
# Apache 2.0.
#
# This script is based on get_lda.sh in ../../steps/nnet2/, but has been
# modified for language-recognition purposes to use a sliding-window CMN.
#
# This script, which will generally be called from other neural-net training
# scripts, accumulates LDA statistics on the training data and estimates the
# LDA-like feature transform used to preprocess the input to the neural net.

# Begin configuration section.
cmd=run.pl
feat_type=
stage=0
splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context=  # left context for second LDA
right_context= # right context for second LDA
rand_prune=4.0 # Relates to a speedup we do for LDA.
within_class_factor=0.0001 # This affects the scaling of the transform rows...
                           # sorry for no explanation, you'll have to see the code.
transform_dir= # If supplied, overrides alidir
num_feats=10000 # maximum number of feature files to use.  Beyond a certain point
                # it just gets silly to use more data.
lda_dim= # This defaults to no dimension reduction.
online_ivector_dir=
ivector_randomize_prob=0.0 # if >0.0, randomizes iVectors during training with
                           # this prob per iVector.
ivector_dir=

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "Usage: lid/nnet2/get_lda.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: lid/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo " This script accumulates LDA statistics and estimates the LDA-like"
  echo " feature transform used for neural-net training."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --left-context <width|4>                         # Number of frames on left side to append for feature input;"
  echo "                                                   # overrides splice-width"
  echo "  --right-context <width|4>                        # Number of frames on right side to append for feature input;"
  echo "                                                   # overrides splice-width"
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from"
  echo "                                                   # somewhere in the middle."
  echo "  --online-ivector-dir <dir|none>                  # Directory produced by"
  echo "                                                   # steps/online/nnet2/extract_ivectors_online.sh"
  exit 1;
fi

data=$1
lang=$2
alidir=$3
dir=$4

[ -z "$left_context" ] && left_context=$splice_width
[ -z "$right_context" ] && right_context=$splice_width

[ ! -z "$online_ivector_dir" ] && \
  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"

# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

# Set some variables.
oov=`cat $lang/oov.int`
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;

nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in the alignment dir;
                                     # in this dir we'll have just one job.

sdata=$data/split$nj
utils/split_data.sh $data $nj

mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir

[ -z "$transform_dir" ] && transform_dir=$alidir

## Set up features.  Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
if [ -z "$feat_type" ]; then
  if [ -f $alidir/final.mat ] && ! [ -f $alidir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"

# If we have more than $num_feats feature files (default: 10k), we use a
# random subset.  This won't affect the transform much, and will spare us an
# unnecessary pass over the data.  Probably 10k is way too much, but for
# small datasets this phase is quite fast.
N=$[$num_feats/$nj]

case $feat_type in
  raw)
    feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
    ;;
  lda)
    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
    cp $alidir/{splice_opts,final.mat} $dir || exit 1;
    feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
  echo "$0: using transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
  echo "$0: using raw-fMLLR transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
fi
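# For reference: in the "raw" case with no fMLLR transforms, the rspecifier
# built above expands (for JOB=1, and assuming e.g. nj=4 so N=2500; the paths
# are illustrative) to something like:
#   ark,s,cs:utils/subset_scp.pl --quiet 2500 data/train/split4/1/feats.scp | \
#     apply-cmvn-sliding --center=true scp:- ark:- |
# The trailing "|" tells Kaldi binaries to run the quoted pipeline and read
# its archive output as a stream; the sed substitution below instantiates
# JOB=1 in the same way so we can probe the feature dimension.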
feats_one="$(echo "$feats" | sed s:JOB:1:g)"
# note: feat_dim is the raw, un-spliced feature dim without the iVectors.
feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;

spliced_feats="$feats splice-feats --left-context=$left_context --right-context=$right_context ark:- ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  # note: subsample-feats, with a negative value of n, repeats each feature n
  # times, so with --n=-$ivector_period each iVector is repeated to match the
  # frame rate of the spliced features.
  spliced_feats="$spliced_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
else
  ivector_dim=0
fi

# By default (if --lda-dim is not supplied): no dimension reduction; lda_dim
# becomes the full spliced dimension.
if [ -z "$lda_dim" ]; then
  spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)"
  lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1;
fi

if [ $stage -le 0 ]; then
  echo "$0: Accumulating LDA statistics."
  rm $dir/lda.*.acc 2>/dev/null # in case any are left over from before.
  $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
    acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$spliced_feats" ark,s,cs:- \
      $dir/lda.JOB.acc || exit 1;
fi

echo $feat_dim > $dir/feat_dim
echo $lda_dim > $dir/lda_dim
echo $ivector_dim > $dir/ivector_dim

if [ $stage -le 1 ]; then
  sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
  rm $dir/lda.*.acc
fi
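# Stage 2 below estimates the transform itself.  nnet-get-feature-transform
# computes an LDA-like transform from the accumulated stats; rather than fully
# whitening the within-class covariance as plain LDA would, it applies a
# scaling controlled by --within-class-factor (see get-feature-transform.h in
# the Kaldi source for the precise definition).  After it has run, the
# transform can be inspected in text form with, e.g.:
#   copy-matrix --binary=false $dir/lda.mat - | head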
if [ $stage -le 2 ]; then
  # There are various things that we sometimes (but not always) need the
  # within-class covariance and its Cholesky factor for, and we write these
  # to disk just in case.
  nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
    --write-within-covar=$dir/within_covar.spmat \
    --within-class-factor=$within_class_factor --dim=$lda_dim \
    $dir/lda.mat $dir/lda.acc \
    2>$dir/log/lda_est.log || exit 1;
fi

echo "$0: Finished estimating LDA"
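# Summary of the files this script writes to $dir:
#   lda.mat             the estimated LDA-like feature transform
#   lda.acc             the summed LDA statistics
#   cholesky.tpmat      Cholesky factor of the within-class covariance
#   within_covar.spmat  the within-class covariance itself
#   feat_dim, lda_dim, ivector_dim, num_jobs
#                       small text files recording dimensions and job count
#   tree                copied from the alignment directory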