mixup.sh
5.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
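# Mix up (or down) the Gaussians of an existing model, then retrain it. With
# the default options this is two iterations of model training on the old
# alignments, a realignment, and then three more iterations of model training.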
# Begin configuration section.
# Defaults for the script's options (see the usage message further down for
# how the main ones are passed on the command line).

# Command used to launch jobs.
cmd="run.pl"
# Stage to do a partial re-run from.
stage=0

# Number of training iterations, and the space-separated list of iterations
# to realign on.
num_iters=5
realign_iters="3"

# Beam / retry-beam settings.
beam=10
retry_beam=40
# Factor by which to boost silence likelihoods in alignment.
boost_silence=1.0
# Scaling options handed to the aligner.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
# End configuration section.
# Print the command line for logging.
echo "$0 $*"

# Pull in the environment set-up if there is one in the working directory,
# then let parse_options.sh process the command line.
if [ -f path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Exactly five positional arguments must remain; otherwise show the usage
# message and stop.
if [ $# -ne 5 ]; then
  cat <<'EOF'
Usage: steps/mixup.sh <num-gauss> <data-dir> <lang-dir> <old-exp-dir> <exp-dir>
 e.g.: steps/mixup.sh 20000 data/train_si84 data/lang exp/tri3b exp/tri3b_20k
main options (for others, see top of script file)
 --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
 --config <config-file> # config containing options
 --stage <stage> # stage to do partial re-run from.
EOF
  exit 1
fi

numgauss="$1"   # <num-gauss>
data="$2"       # <data-dir>
lang="$3"       # <lang-dir>
srcdir="$4"     # <old-exp-dir>
dir="$5"        # <exp-dir>
# Check that the required inputs exist before touching the output directory.
# NOTE(review): final.mat is mandatory here, which means the "delta" feature
# type chosen later when final.mat is absent can never actually be reached --
# confirm whether delta models are meant to be supported.
for f in "$data/feats.scp" "$srcdir/final.mdl" "$srcdir/final.mat"; do
  if [ ! -f "$f" ]; then
    # Report under this script's own name ($0), as the other messages do.
    echo "$0: no such file $f"
    exit 1
  fi
done

# Reuse the job count of the source directory; the data is split the same way.
nj=$(cat "$srcdir/num_jobs") || exit 1
sdata=$data/split$nj
# splice_opts is optional in the source directory: empty if the file is absent.
splice_opts=$(cat "$srcdir/splice_opts" 2>/dev/null)

# Populate the output directory with what is carried over from the source.
mkdir -p "$dir/log"
cp "$srcdir/splice_opts" "$dir" 2>/dev/null
cp "$srcdir/final.mat" "$dir"
echo "$nj" > "$dir/num_jobs"
# Split the data unless a split directory exists and is newer than feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh "$data" "$nj" || exit 1
cp "$srcdir/tree" "$dir"
## Set up features.
# The feature type follows from the source directory: "lda" when it has a
# final.mat, "delta" otherwise.
if [ -f "$srcdir/final.mat" ]; then
  feat_type=lda
else
  feat_type=delta
fi
echo "$0: feature type is $feat_type"

# CMVN stage common to both feature types. "JOB" is kept as a literal
# placeholder in the string (presumably filled in per job by $cmd).
cmvn_pipe="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

# sifeats: the feature pipeline before any per-speaker transform is applied.
if [ "$feat_type" = delta ]; then
  sifeats="$cmvn_pipe add-deltas ark:- ark:- |"
elif [ "$feat_type" = lda ]; then
  sifeats="$cmvn_pipe splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
  cp "$srcdir/final.mat" "$dir"
else
  echo "Invalid feature type $feat_type"
  exit 1
fi

# feats: what the model is trained on. It equals sifeats unless the source
# directory has transforms (trans.1), in which case they are linked into $dir
# and applied on top.
feats="$sifeats"
if [ -f "$srcdir/trans.1" ]; then
  echo "Using transforms from $srcdir"
  rm "$dir"/trans.* 2>/dev/null
  ln.pl "$srcdir"/trans.* "$dir" # Link those transforms to current directory.
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
fi
## Done setting up features.
# Drop any training-graph FST links already in $dir, then link the ones from
# the source directory.
rm "$dir"/fsts.*.gz 2>/dev/null
ln.pl "$srcdir"/fsts.*.gz "$dir"

## Mix up old model
# Runs at stage 0 only. The same target is given to --mix-up and --mix-down,
# so the script works for mixing down as well. The result is $dir/1.mdl.
if [ "$stage" -le 0 ]; then
  echo "Mixing up old model to $numgauss Gaussians"
  if ! $cmd "$dir/log/mixup.log" \
    gmm-mixup --mix-up="$numgauss" --mix-down="$numgauss" \
    "$srcdir/final.mdl" "$srcdir/final.occs" "$dir/1.mdl"; then
    exit 1
  fi
fi
## Done.
# Directory to read alignments from: the source directory to begin with,
# switched to $dir from the first realignment iteration onwards.
cur_alidir=$srcdir
# If no realignment is requested, no alignments will be generated here, so
# link the source ones into $dir.
[ -z "$realign_iters" ] && ln.pl "$srcdir"/ali.*.gz "$dir"

# Main training loop. Iteration x reads $dir/$x.mdl and writes
# $dir/<x+1>.mdl and $dir/<x+1>.occs, removing iteration x's model, occs and
# accumulators afterwards. Iterations below $stage are skipped.
x=1
while [ "$x" -le "$num_iters" ]; do
  echo "$0: iteration $x"
  # Is x one of the iterations listed in realign_iters (whole-word match)?
  if echo "$realign_iters" | grep -qw -- "$x"; then
    if [ "$stage" -le "$x" ]; then
      echo "$0: realigning data"
      mdl="gmm-boost-silence --boost=$boost_silence $(cat "$lang/phones/optional_silence.csl") $dir/$x.mdl - |"
      # Pass the configured beam and retry_beam (defaults 10 and 40) instead
      # of fixed numbers, so that changing those settings has an effect.
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" \
        "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
        "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1
    fi
    # Set outside the stage check so that a partial re-run starting after
    # this iteration still reads the alignments in $dir.
    cur_alidir=$dir
  fi
  if [ "$stage" -le "$x" ]; then
    echo "$0: accumulating statistics"
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $cur_alidir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1
    echo "$0: re-estimating model"
    # There must be exactly one accumulator file per job.
    [ "$(ls $dir/$x.*.acc | wc -w)" -ne $nj ] && echo "$0: wrong #accs" && exit 1
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$((x+1)).occs $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$((x+1)).mdl || exit 1
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs 2>/dev/null
  fi
  x=$((x+1))
done
# Point final.mdl and final.occs at the files numbered $x, the index left by
# the training loop (the last model/occs written by gmm-est). Any previous
# final.* entry is removed first; the links are relative to $dir.
for ext in mdl occs; do
  rm "$dir/final.$ext" 2>/dev/null
  ln -s "$x.$ext" "$dir/final.$ext"
done
# When transforms are in use ($dir/trans.1 exists), also estimate an
# "alignment model" ($x.alimdl, linked as final.alimdl): statistics are
# accumulated by gmm-acc-stats-twofeats from the final model, given both the
# transformed features ($feats) and the untransformed ones ($sifeats).
if [ -f "$dir/trans.1" ]; then
  echo "$0: accumulating stats for alignment model."
  # Read the alignments from $cur_alidir, i.e. the directory the training
  # loop last used. $dir/ali.JOB.gz is only present if a realignment was done
  # here or the source alignments were linked in, so $dir alone can be empty
  # when realign_iters names no iteration that was actually run.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $cur_alidir/ali.JOB.gz|" ark:- \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1
  # There must be exactly one accumulator file per job.
  [ "$(ls $dir/$x.*.acc | wc -w)" -ne $nj ] && echo "$0: wrong #accs" && exit 1
  echo "$0: Re-estimating alignment model."
  # NOTE(review): final.occs is a symlink to $x.occs at this point, so
  # --write-occs presumably overwrites $x.occs through it -- confirm that
  # this is intended.
  $cmd $dir/log/est_alimdl.log \
    gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \
    "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1
  rm $dir/$x.*.acc
  rm $dir/final.alimdl 2>/dev/null
  ln -s $x.alimdl $dir/final.alimdl
fi

# Summarize the warnings found in the logs and finish.
utils/summarize_warnings.pl $dir/log
echo Done