Blame view
egs/wsj/s5/steps/train_lvtln.sh
14.9 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 |
#!/bin/bash

# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014  Vimal Manohar

# This training script trains linear-VTLN models starting from an existing
# system based on either LDA+MLLT or delta+delta-delta features.
# Works with either mfcc or plp features, but you need to set the
# --base-feat-type option.
# The resulting system can be used with align_lvtln.sh and/or decode_lvtln.sh
# to get VTLN warping factors for data, for warped data extraction, or (for
# the training data) you can use the warping factors this script outputs
# in $dir/final.warp
#
# Apache 2.0
#
# Usage:
#   train_lvtln.sh [options] <num-leaves> <tot-gauss> <data-dir> <lang-dir> \
#                  <alignment-dir> <exp-dir>
#
# Stages (select the restart point with --stage):
#   -6  compute warped features for a small utterance subset, one per warp class
#   -5  initialize the LVTLN transforms ($dir/0.lvtln)
#   -4  estimate initial per-speaker LVTLN transforms and warps ($dir/warp.0)
#   -3  accumulate tree stats
#   -2  cluster questions, build the tree, initialize the model ($dir/1.mdl)
#   -1  convert alignments from <alignment-dir> to the new tree
#    0  compile training graphs
#   1..num_iters-1  training passes (realignment / LVTLN re-estimation on the
#       iterations listed in realign_iters / lvtln_iters)
#
# Main outputs written to <exp-dir>: final.mdl, final.occs, final.alimdl,
# final.lvtln, final.warp (symlinks), trans.JOB, tree.

# Begin configuration.
stage=-6 #  This allows restarting after partway, when something went wrong.
config=
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
num_iters=35    # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
cmvn_opts=  # you can supply e.g. --cmvn-opts "--norm-vars=true" to turn on variance
     # normalization, but only if base system is the delta type, not LDA.
lvtln_iters="2 4 6 8 10 12 14 16 20"; # iters on which to recompute LVTLN transform
num_utt_lvtln_init=200; # number of utterances (subset) to initialize
                        # LVTLN transform.  Not too critical.
min_warp=0.85
max_warp=1.25
warp_step=0.01
base_feat_type=mfcc # or could be plp.
logdet_scale=0.0
# End configuration.

echo "$0 $*"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# Number of warp classes covering [min_warp, max_warp] in steps of warp_step,
# and the index of the class that corresponds to a warp factor of 1.0.
num_classes=$(perl -e "print int(1.5 + ($max_warp - $min_warp) / $warp_step);") || exit 1;
default_class=$(perl -e "print int(0.5 + (1.0 - $min_warp) / $warp_step);") || exit 1;

if [ $# != 6 ]; then
  echo "Usage: $0 <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
  echo "e.g.: $0 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
  echo "main options (for others, see top of script file)"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --config <config-file> # config containing options"
  echo " --stage <stage> # stage to do partial re-run from."
  exit 1;
fi

numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6

for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt $data/wav.scp; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

numgauss=$numleaves
incgauss=$(( ($totgauss - $numgauss) / $max_iter_inc ))  # per-iter increment for #Gauss
oov=$(cat "$lang/oov.int") || exit 1;
silphonelist=$(cat "$lang/phones/silence.csl") || exit 1;
ciphonelist=$(cat "$lang/phones/context_indep.csl") || exit 1;
nj=$(cat "$alidir/num_jobs") || exit 1;
# splice_opts is optional in the alignment directory; empty if absent.
splice_opts=$(cat "$alidir/splice_opts" 2>/dev/null)

mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;

cp $alidir/splice_opts $dir 2>/dev/null

# Set up the feature pipelines.  JOB (and CLASS, for featsub_warped) are
# placeholders that get substituted later.
#   sifeats:          speaker-independent features
#   feats:            sifeats with the per-speaker transform $dir/trans.JOB applied
#   featsub_unwarped: utterance-subset features for the default warp class
#   featsub_warped:   utterance-subset features for warp class CLASS
if [ ! -f $alidir/final.mat ]; then
  [ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
    echo "$0: warning: ignoring CMVN options from $alidir.";
  echo $cmvn_opts > $dir/cmvn_opts
  echo "$0: Using delta+delta-delta features since $alidir/final.mat does not exist"
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
  # for the subsets of features that we use to estimate the linear transforms, we don't
  # bother with CMVN.  This will give us wrong offsets on the transforms, but it will end
  # up not mattering because we allow an arbitrary offset (bias) term when we apply
  # these transforms.
  featsub_warped="ark:add-deltas ark:$dir/feats.CLASS.ark ark:- |"
  # you need to define CLASS when invoking $cmd.
  featsub_unwarped="ark:add-deltas ark:$dir/feats.$default_class.ark ark:- |"
else
  echo "$0: Using LDA features"
  # NOTE: this only prints a message; the script continues, and any supplied
  # $cmvn_opts is overwritten just below with the value copied from $alidir.
  [ ! -z "$cmvn_opts" ] && echo "$0: you cannot supply --cmvn-opts if base system is LDA."
  cp $alidir/final.mat $alidir/full.mat $alidir/splice_opts $alidir/cmvn_opts $dir 2>/dev/null
  cmvn_opts=$(cat "$dir/cmvn_opts" 2>/dev/null)
  sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
  featsub_warped="ark:splice-feats $splice_opts ark:$dir/feats.CLASS.ark ark:- | transform-feats $dir/final.mat ark:- ark:- |"
  # you need to define CLASS when invoking $cmd.
  featsub_unwarped="ark:splice-feats $splice_opts ark:$dir/feats.$default_class.ark ark:- | transform-feats $dir/final.mat ark:- ark:- |"
fi

if [ -f $data/utt2warp ]; then
  echo "$0: source data directory $data appears to already have VTLN.";
  exit 1;
fi

# create a small subset of utterances for purposes of initializing the LVTLN transform
# utils/shuffle_list.pl is deterministic, unlike sort -R.
awk '{print $1}' "$data/utt2spk" | utils/shuffle_list.pl | \
  head -n $num_utt_lvtln_init > $dir/utt_subset

if [ $stage -le -6 ]; then
  echo "$0: computing warped subset of features"
  if [ -f $data/segments ]; then
    echo "$0 [info]: segments file exists: using that."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/segments | extract-segments scp:$data/wav.scp - ark:- "
  else
    echo "$0 [info]: no segments file exists: using wav.scp directly."
    subset_feats="utils/filter_scp.pl $dir/utt_subset $data/wav.scp | wav-copy scp:- ark:- "
  fi
  rm $dir/.error 2>/dev/null
  # One background job per warp class; a failing job leaves $dir/.error behind,
  # which is checked after all jobs have finished.
  for ((c = 0; c < num_classes; c++)); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    # $subset_feats is deliberately unquoted: it has to split into words
    # (including the literal "|") for $cmd.
    $cmd $dir/log/compute_warped_feats.$c.log \
      $subset_feats \| compute-${base_feat_type}-feats --verbose=2 \
      --config=conf/${base_feat_type}.conf --vtln-warp=$this_warp ark:- ark:- \| \
      copy-feats --compress=true ark:- ark:$dir/feats.$c.ark || touch $dir/.error &
  done
  wait;
  if [ -f $dir/.error ]; then
    echo "$0: Computing warped features failed: check $dir/log/compute_warped_feats.*.log"
    exit 1;
  fi
fi

# Sanity check: the features computed above for the default (unwarped) class
# should match the features already stored in $data/feats.scp.
if ! utils/filter_scp.pl $dir/utt_subset $data/feats.scp | \
  compare-feats --threshold=0.98 scp:- ark:$dir/feats.$default_class.ark >/dev/null 2>&1; then
  echo "$0: features stored on disk differ from those computed with no warping."
  echo "  Possibly your feature type is wrong (--base-feat-type option)"
  exit 1;
fi

if [ $stage -le -5 ]; then
  echo "$0: initializing base LVTLN transforms in $dir/0.lvtln (ignore warnings below)"
  dim=$(feat-to-dim "$featsub_unwarped" - ) || exit 1;

  $cmd $dir/log/init_lvtln.log \
    gmm-init-lvtln --dim=$dim --num-classes=$num_classes --default-class=$default_class \
    $dir/0.lvtln || exit 1;

  # Per-frame weights with silence zeroed out, derived from the input alignments.
  $cmd JOB=1:$nj $dir/log/get_weights.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz |" ark:- \| \
    weight-silence-post 0.0 "$silphonelist" $alidir/final.mdl ark:- ark:- \| \
    post-to-weights ark:- "ark,scp:$dir/weights.JOB.ark,$dir/weights.JOB.scp" || exit 1

  for ((n = 1; n <= nj; n++)); do
    cat $dir/weights.$n.scp
  done > $dir/weights.scp

  # Train the transform of each warp class in turn, updating $dir/0.lvtln in place.
  for ((c = 0; c < num_classes; c++)); do
    this_warp=$(perl -e "print ($min_warp + ($c*$warp_step));")
    logfile=$dir/log/train_special.$c.log
    this_featsub_warped=${featsub_warped/CLASS/$c}
    if ! gmm-train-lvtln-special --warp=$this_warp --normalize-var=true \
      --weights-in="scp:$dir/weights.scp" \
      $c $dir/0.lvtln $dir/0.lvtln \
      "$featsub_unwarped" "$this_featsub_warped" 2>$logfile; then
      echo "$0: Error training LVTLN transform, see $logfile";
      exit 1;
    fi
  done
  rm $dir/final.lvtln 2>/dev/null
  ln -s 0.lvtln $dir/final.lvtln
fi

if [ $stage -le -4 ]; then
  echo "$0: computing initial LVTLN transforms for speakers"
  if [ -f $alidir/final.alimdl ]; then
    # if the base system was trained with SAT, it's probably better
    # to use the .alimdl, trained speaker-independent, to get the
    # LVTLN transforms (LVTLN may be closer to an unadapted system).
    echo "$0: to get initial LVTLN transforms, using $alidir/final.alimdl"
    srcmodel=$alidir/final.alimdl
  else
    srcmodel=$alidir/final.mdl
  fi
  $cmd JOB=1:$nj $dir/log/lvtln.0.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 "$silphonelist" $alidir/final.mdl ark:- ark:- \| \
    gmm-post-to-gpost $srcmodel "$sifeats" ark:- ark:- \| \
    gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
    --spk2utt=ark:$sdata/JOB/spk2utt $srcmodel \
    $dir/0.lvtln "$sifeats" ark:- ark:$dir/trans.JOB ark,t:$dir/warp.0.JOB || exit 1

  # consolidate the warps into one file.
  for ((j = 1; j <= nj; j++)); do
    cat $dir/warp.0.$j
  done > $dir/warp.0
  rm "$dir"/warp.0.*
fi

if [ $stage -le -3 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  sum-tree-stats $dir/treeacc "$dir"/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm "$dir"/*.treeacc
fi

if [ $stage -le -2 ]; then
  echo "$0: getting questions for tree-building, via clustering"
  # preparing questions, roots file...
  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  gmm-init-model --write-occs=$dir/1.occs \
    $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
  grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";

  gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi

if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
    "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

if [ $stage -le 0 ]; then
  echo "$0: compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
    "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
    "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

# Main training loop: passes 1 .. num_iters-1.  Each pass that actually runs
# reads $dir/$x.mdl and writes $dir/$((x+1)).mdl.
x=1
while [ $x -lt $num_iters ]; do
  echo "$0: training pass $x"

  if echo $realign_iters | grep -qw "$x"; then
    if [ $stage -le $x ]; then
      echo "$0: aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence $(cat $lang/phones/optional_silence.csl) $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
        "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
    fi
  fi

  if echo $lvtln_iters | grep -qw "$x"; then
    if [ $stage -le $x ]; then
      echo "Re-estimating LVTLN transforms"
      # The new transforms go to new_trans.JOB because "$feats" is still
      # reading trans.JOB while they are being estimated.
      $cmd JOB=1:$nj $dir/log/lvtln.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        weight-silence-post 0.0 "$silphonelist" $dir/$x.mdl ark:- ark:- \| \
        gmm-post-to-gpost $dir/$x.mdl "$feats" ark:- ark:- \| \
        gmm-est-lvtln-trans --logdet-scale=$logdet_scale --verbose=1 \
        --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \
        $dir/0.lvtln "$sifeats" ark:- ark:$dir/new_trans.JOB ark,t:$dir/warp.$x.JOB || exit 1

      for ((j = 1; j <= nj; j++)); do
        mv $dir/new_trans.$j $dir/trans.$j
      done
      # consolidate the warps into one file.
      for ((j = 1; j <= nj; j++)); do
        cat $dir/warp.$x.$j
      done > $dir/warp.$x
      rm "$dir"/warp.$x.*
    fi
  fi

  if [ $stage -le $x ]; then
    next=$(( $x + 1 ))
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --mix-up=$numgauss --power=$power \
      --write-occs=$dir/$next.occs $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$next.mdl || exit 1;
    rm $dir/$x.mdl "$dir"/$x.*.acc
    rm $dir/$x.occs
  fi

  if [ $x -le $max_iter_inc ]; then
    numgauss=$(( $numgauss + $incgauss ))
  fi
  x=$(( $x + 1 ))
done

if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is computed with the
  # speaker-independent features, but matches Gaussian-for-Gaussian with the
  # final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  if [ $(ls $dir/$x.*.acc | wc -w) -ne "$nj" ]; then
    echo "$0: Wrong #accs"
    exit 1;
  fi
  # Update model.
  $cmd $dir/log/est_alimdl.log \
    gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \
    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1;
  rm "$dir"/$x.*.acc
fi

# Diagnostics.
# final.warp must point at the warps of the last LVTLN re-estimation that
# actually ran.  The training loop only covers iterations 1..num_iters-1, so
# entries of lvtln_iters that are >= num_iters never produce a warp.N file;
# iteration 0 (the initial estimate, warp.0) is the fallback.
last_iter=0
for i in $lvtln_iters; do
  if [ "$i" -lt "$num_iters" ] && [ "$i" -gt "$last_iter" ]; then
    last_iter=$i
  fi
done
ln -sf warp.$last_iter $dir/final.warp

if [ -f $data/spk2gender ]; then
  # To make it easier to eyeball the male and female speakers' warps
  # separately, separate them out.
  for g in m f; do # means: for gender in male female
    utils/filter_scp.pl <(grep -w $g $data/spk2gender | awk '{print $1}') \
      < $dir/final.warp > $dir/final.warp.$g
    echo -n "The last few warp factors for gender $g are: "
    tail -n 10 $dir/final.warp.$g | awk '{printf("%s ", $2);}';
    echo
  done
fi

ln -sf $x.mdl $dir/final.mdl
ln -sf $x.occs $dir/final.occs
ln -sf $x.alimdl $dir/final.alimdl

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

echo "$0: Done training LVTLN system in $dir"