#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey).
#           2015 David Snyder
# Apache 2.0.
#
# This script is based off of get_lda.sh in ../../steps/nnet2/, but has been
# modified for speaker recognition purposes to use a sliding-window CMN.
#
# This script, which will generally be called from other neural-net training
# scripts, accumulates LDA statistics from the training features and
# alignments, and estimates the LDA-like transform that is applied to the
# spliced features at the input of the neural net.
# Begin configuration section.
cmd=run.pl
feat_type=
stage=0
splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context= # left context for second LDA
right_context= # right context for second LDA
rand_prune=4.0 # Pruning threshold for a randomized speedup we do in the LDA
               # stats accumulation (passed to acc-lda as --rand-prune).
within_class_factor=0.0001 # Affects the scaling of the rows of the final
               # transform; see the nnet-get-feature-transform code for details.
transform_dir= # If supplied, overrides alidir
num_feats=10000 # Maximum number of feature files to use. Beyond a certain
                # point it just gets silly to use more data.
lda_dim= # This defaults to no dimension reduction.
online_ivector_dir=
ivector_randomize_prob=0.0 # If >0.0, randomizes iVectors during training with
                           # this probability per iVector.
ivector_dir=
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
  echo "Usage: steps/nnet2/get_lda.sh [opts] <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/nnet2/get_lda.sh data/train data/lang exp/tri3_ali exp/tri4_nnet"
  echo " This script computes the LDA-like transform that is applied to the"
  echo " spliced features at the input of the neural net."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --splice-width <width|4>                         # Number of frames on each side to append for feature input"
  echo "                                                   # (note: we splice processed, typically 40-dimensional frames)"
  echo "  --left-context <width|4>                         # Number of frames on left side to append for feature input; overrides splice-width"
  echo "  --right-context <width|4>                        # Number of frames on right side to append for feature input; overrides splice-width"
  echo "  --stage <stage|0>                                # Used to resume a partially-completed training process from somewhere in"
  echo "                                                   # the middle."
  echo "  --online-ivector-dir <dir|none>                  # Directory produced by"
  echo "                                                   # steps/online/nnet2/extract_ivectors_online.sh"
  exit 1;
fi
data=$1
lang=$2
alidir=$3
dir=$4
[ -z "$left_context" ] && left_context=$splice_width
[ -z "$right_context" ] && right_context=$splice_width
[ ! -z "$online_ivector_dir" ] && \
extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
# Check some files.
for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_files; do
[ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done
# Set some variables.
oov=`cat $lang/oov.int`
num_leaves=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/number of pdfs/{print $NF}'` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir...
                                     # in this dir we'll have just one job.
sdata=$data/split$nj
utils/split_data.sh $data $nj
mkdir -p $dir/log
echo $nj > $dir/num_jobs
cp $alidir/tree $dir
[ -z "$transform_dir" ] && transform_dir=$alidir
## Set up features. Note: these are different from the normal features
## because we have one rspecifier that has the features for the entire
## training set, not separate ones for each batch.
if [ -z "$feat_type" ]; then
  if [ -f $alidir/final.mat ] && ! [ -f $alidir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
fi
echo "$0: feature type is $feat_type"
# If we have more than $num_feats feature files (default: 10k),
# we use a random subset. This won't affect the transform much, and will
# spare us an unnecessary pass over the data. Probably 10k is
# way too much, but for small datasets this phase is quite fast.
N=$[$num_feats/$nj]
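# For example, with the default num_feats=10000 and an alignment directory
# that used nj=40 jobs, each job's subset would contain N = 10000/40 = 250
# feature files (integer division).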
case $feat_type in
  raw) feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |"
    ;;
  lda)
    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
    cp $alidir/{splice_opts,final.mat} $dir || exit 1;
    feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac
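# Either way, $feats is now a Kaldi pipe rspecifier in which JOB is a
# placeholder that $cmd substitutes with the job index; e.g. in the raw case
# job 1's pipeline would expand to something like
#   ark,s,cs:utils/subset_scp.pl --quiet 250 data/train/split40/1/feats.scp | \
#     apply-cmvn-sliding --center=true scp:- ark:- |
# (the subset size and split paths here are just illustrative).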
if [ -f $transform_dir/trans.1 ] && [ "$feat_type" != "raw" ]; then
  echo "$0: using transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
fi
if [ -f $transform_dir/raw_trans.1 ] && [ "$feat_type" == "raw" ]; then
  echo "$0: using raw-fMLLR transforms from $transform_dir"
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
fi
feats_one="$(echo "$feats" | sed s:JOB:1:g)"
# note: feat_dim is the raw, un-spliced feature dim without the iVectors.
feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;
# Splice the features with the wider context; by default there will be no
# dimension reduction afterwards (see lda_dim below).
spliced_feats="$feats splice-feats --left-context=$left_context --right-context=$right_context ark:- ark:- |"
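# After this splicing, each frame has dimension
# (left_context + right_context + 1) * feat_dim, e.g. 9 * feat_dim with the
# default splice_width of 4.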
if [ ! -z "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  # note: subsample-feats, with a negative value of n, repeats each feature |n| times.
  spliced_feats="$spliced_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- | ivector-randomize --randomize-prob=$ivector_randomize_prob ark:- ark:- |' ark:- |"
  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
else
  ivector_dim=0
fi
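# For example, if the online iVectors were extracted once every
# ivector_period=10 frames, subsample-feats --n=-10 repeats each iVector 10
# times so the iVector stream matches the frame rate, and paste-feats then
# appends the iVector dimensions to every spliced frame; ivector-randomize
# randomizes iVectors with probability $ivector_randomize_prob (a no-op at
# the default of 0.0).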
echo $ivector_dim >$dir/ivector_dim
if [ -z "$lda_dim" ]; then
spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)"
lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1;
fi
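# So by default lda_dim equals the full spliced dimension,
# (left_context + right_context + 1) * feat_dim + ivector_dim, and the
# resulting "LDA" decorrelates and rescales rather than reducing dimension.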
if [ $stage -le 0 ]; then
  echo "$0: Accumulating LDA statistics."
  rm $dir/lda.*.acc 2>/dev/null # in case any are left over from before.
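  # The pdf-level alignments provide the class labels for LDA: ali-to-post
  # converts alignments to posteriors, weight-silence-post gives silence
  # frames zero weight so they are excluded from the statistics, and acc-lda
  # accumulates the per-class statistics (randomly pruning small posteriors
  # for speed, per --rand-prune).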
  $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
    weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
    acc-lda --rand-prune=$rand_prune $alidir/final.mdl "$spliced_feats" ark,s,cs:- \
    $dir/lda.JOB.acc || exit 1;
fi
echo $feat_dim > $dir/feat_dim
echo $lda_dim > $dir/lda_dim
echo $ivector_dim > $dir/ivector_dim
if [ $stage -le 1 ]; then
  sum-lda-accs $dir/lda.acc $dir/lda.*.acc 2>$dir/log/lda_sum.log || exit 1;
  rm $dir/lda.*.acc
fi
if [ $stage -le 2 ]; then
  # There are various things that we sometimes (but not always) need
  # the within-class covariance and its Cholesky factor for, and we
  # write these to disk just in case.
  nnet-get-feature-transform --write-cholesky=$dir/cholesky.tpmat \
    --write-within-covar=$dir/within_covar.spmat \
    --within-class-factor=$within_class_factor --dim=$lda_dim \
    $dir/lda.mat $dir/lda.acc \
    2>$dir/log/lda_est.log || exit 1;
fi
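# nnet-get-feature-transform estimates the LDA-like transform from the summed
# statistics; since --dim here defaults to the full spliced dimension, no
# dimensions are discarded, and (roughly speaking) --within-class-factor
# scales down the rows corresponding to less discriminative directions.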
echo "$0: Finished estimating LDA"