dump_nnet_activations.sh
#!/bin/bash
# Copyright 2013 Daniel Povey
# Apache 2.0.
# This script was modified from ./extract_ivectors_online2.sh.  It is to be used
# when retraining, on some in-domain dataset, the top layer of a system that was
# trained on another, out-of-domain dataset.  It takes as input a directory such as
# nnet_gpu_online as prepared by ./prepare_online_decoding.sh, and a data directory;
# it processes the wave files to get features and iVectors, puts them through all
# but the last layer of the neural net in that directory, and dumps those final
# activations to a feats.scp file in the output directory.  These files might be
# quite large.  A typical feature dimension is 300 (the p-norm output dim).
# We compress these files (note: the compression is lossy).
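
# Example invocation (hypothetical paths; the argument order is shown in the
# usage message below):
#   ./dump_nnet_activations.sh --nj 30 --cmd "run.pl" --utts-per-spk-max 2 \
#     data/train exp/nnet2_online/nnet_a_online exp/nnet2_online/activations_train
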
# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
utts_per_spk_max=2 # maximum 2 utterances per "fake-speaker."
# End configuration section.
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
  echo "Usage: $0 [options] <data> <srcdir> <output-dir>"
  echo " e.g.: $0 data/train exp/nnet2_online/nnet_a_online exp/nnet2_online/activations_train"
  echo "Output is in <output-dir>/data/feats.scp"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue-opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of parallel jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --utts-per-spk-max <int;default=2>               # Controls splitting into 'fake speakers'."
  echo "                                                   # Set to 1 if compatibility with utterance-by-utterance"
  echo "                                                   # decoding is the only factor, and to larger if you care"
  echo "                                                   # also about adaptation over several utterances."
  exit 1;
fi
data=$1
srcdir=$2
dir=$3
for f in $data/wav.scp $srcdir/conf/online_nnet2_decoding.conf $srcdir/final.mdl; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done
# Set various variables.
mkdir -p $dir/log
echo $nj >$dir/num_jobs
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;
mkdir -p $dir/conf $dir/feats
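# The feature-pipeline config is just the online-decoding config with the
# endpointing options removed (those are only relevant when actually decoding);
# online2-wav-dump-features reads this config below.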
grep -v '^--endpoint' $srcdir/conf/online_nnet2_decoding.conf > $dir/conf/online_feature_pipeline.conf
if [ $stage -le 0 ]; then
  ns=$(wc -l <$data/spk2utt)
  if [ "$ns" == 1 -a "$utts_per_spk_max" != 1 ]; then
    echo "$0: you seem to have just one speaker in your database. This is probably not a good idea."
    echo " see http://kaldi-asr.org/doc/data_prep.html (search for 'bold') for why"
    echo " Setting --utts-per-spk-max to 1."
    utts_per_spk_max=1
  fi
  mkdir -p $dir/spk2utt_fake
  for job in $(seq $nj); do
    # create fake spk2utt files with a reduced number of utterances per speaker,
    # so the network is well adapted to using iVectors from small amounts of
    # training data.
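    # For example, with --utts-per-spk-max=2 a spk2utt line like
    #   spk1 utt1 utt2 utt3
    # is split into two 'fake speakers':
    #   spk1-000001 utt1 utt2
    #   spk1-000002 utt3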
    awk -v max=$utts_per_spk_max '{ n=2; count=0; while(n<=NF) {
      nmax=n+max; count++; printf("%s-%06x", $1, count); for (;n<nmax&&n<=NF; n++) printf(" %s", $n); print "";} }' \
        <$sdata/$job/spk2utt >$dir/spk2utt_fake/spk2utt.$job
  done
fi
if [ $stage -le 1 ]; then
  info=$dir/nnet_info
  nnet-am-info $srcdir/final.mdl >$info
  nc=$(grep num-components $info | awk '{print $2}');
  if grep SumGroupComponent $info >/dev/null; then
    nc_truncate=$[$nc-3]  # we did mix-up: remove AffineComponent,
                          # SumGroupComponent, SoftmaxComponent
  else
    nc_truncate=$[$nc-2]  # remove AffineComponent, SoftmaxComponent
  fi
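  # For example (hypothetical counts): if nnet-am-info reports num-components 20
  # and the model contains a SumGroupComponent, nc_truncate is 17, so nnet.raw
  # keeps components 1..17, i.e. everything up to the last hidden layer (the
  # p-norm output mentioned at the top of this script).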
  nnet-to-raw-nnet --truncate=$nc_truncate $srcdir/final.mdl $dir/nnet.raw
fi
if [ $stage -le 2 ]; then
echo "$0: dumping neural net activations"
# The next line is a no-op unless $dir/feats/storage/ exists; see utils/create_split_dir.pl.
for j in $(seq $nj); do utils/create_data_link.pl $dir/feats/feats.$j.ark; done
if [ -f $data/segments ]; then
wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |"
else
wav_rspecifier="scp,p:$sdata/JOB/wav.scp"
fi
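  # In the pipeline below, online2-wav-dump-features computes the same features
  # (including iVectors) that the online decoder would see, nnet-compute runs a
  # forward pass through the truncated network nnet.raw, and copy-feats writes
  # the resulting activations as compressed ark/scp files.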
  $cmd JOB=1:$nj $dir/log/dump_activations.JOB.log \
    online2-wav-dump-features --config=$dir/conf/online_feature_pipeline.conf \
      ark:$dir/spk2utt_fake/spk2utt.JOB "$wav_rspecifier" ark:- \| \
    nnet-compute $dir/nnet.raw ark:- ark:- \| \
    copy-feats --compress=true ark:- \
      ark,scp:$dir/feats/feats.JOB.ark,$dir/feats/feats.JOB.scp || exit 1;
fi
if [ $stage -le 3 ]; then
echo "$0: combining activations across jobs"
mkdir -p $dir/data
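  # Copy the rest of the source data directory (utt2spk, text, etc.) so that
  # $dir/data looks like a normal data directory; its feats.scp is overwritten
  # just below with the dumped activations.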
  cp -r $data/* $dir/data
  for j in $(seq $nj); do cat $dir/feats/feats.$j.scp; done >$dir/data/feats.scp || exit 1;
fi
if [ $stage -le 4 ]; then
echo "$0: computing [fake] CMVN stats."
# We shouldn't actually be doing CMVN, but the get_egs.sh script expects it,
# so create fake CMVN stats.
steps/compute_cmvn_stats.sh --fake $dir/data $dir/log $dir/feats || exit 1
fi
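# At this point $dir/data can be used like a normal data directory (feats.scp
# points to the compressed activations, with fake CMVN stats), e.g. by get_egs.sh
# when retraining the final layer of the network on the in-domain data.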
echo "$0: done. Output is in $dir/data/feats.scp"