make_bottleneck_features.sh
4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash
# Copyright 2016 Pegah Ghahremani
# This script dumps bottleneck features from a model trained using nnet3.
# CAUTION! This script isn't very suitable for dumping features from recurrent
# architectures such as LSTMs, because it doesn't support setting the chunk size
# and left and right context. (Those would have to be passed into nnet3-compute).
# See also chain/get_phone_post.sh.
# Begin configuration section.
# Default values for the command-line options (see the usage text below);
# parse_options.sh, sourced after this section, presumably overrides them
# from --<option> flags -- confirm against that script.
compress=true     # passed to copy-feats as --compress
ivector_dir=      # directory for ivectors; empty means no ivectors are used
use_gpu=false     # when true, nnet3-compute is run with --use-gpu=yes
cmd=queue.pl      # how to run jobs (e.g. utils/run.pl or utils/queue.pl)
nj=4              # number of parallel jobs
stage=1           # the feature-extraction step only runs when stage <= 1
# End configuration options.

# Print the command line for logging.
echo "$0 $@"

# Source the path setup if one exists in the current directory.
if [ -f path.sh ]; then
  . ./path.sh
fi

. parse_options.sh || exit 1
# The script takes four mandatory positional arguments and up to two optional
# ones; any other count prints the usage text and terminates with status 1.
if (( $# < 4 || $# > 6 )); then
  printf '%s\n' \
    'usage: steps/nnet3/make_bottleneck_features.sh <bnf-node-name> <input-data-dir> <bnf-data-dir> <nnet-dir> [<log-dir> [<bnfdir>] ]' \
    'e.g.: steps/nnet3/make_bottleneck_features.sh tdnn_bn.renorm data/train data/train_bnf exp/nnet3/tdnn_bnf exp_bnf/dump_bnf bnf' \
    'Note: <log-dir> defaults to <bnf-data-dir>/log and <bnfdir> defaults to' \
    ' <bnf-data-dir>/data' \
    'main options (for others, see top of script file)' \
    ' --config <config-file> # config containing options' \
    ' --nj <nj> # number of parallel jobs' \
    ' --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.' \
    ' --ivector-dir # directory for ivectors'
  exit 1
fi
# Positional arguments.
bnf_name=$1   # name of the nnet3 component-node whose output is dumped as BNF
data=$2       # input data directory
bnf_data=$3   # output data directory for the bottleneck features
nnetdir=$4    # directory holding the trained model

# Optional fifth and sixth arguments. The "-" form (rather than ":-") applies
# the default only when the argument was not supplied at all, so an explicitly
# passed empty string is kept as given.
logdir=${5-$bnf_data/log}
bnfdir=${6-$bnf_data/data}
# Read the CMVN options stored next to the model, then pick the model file:
# final.raw is preferred, final.mdl is the fallback, and it is an error if
# neither exists in nnetdir.
cmvn_opts=$(cat $nnetdir/cmvn_opts)
bnf_nnet=$nnetdir/final.raw
if [ ! -f $bnf_nnet ]; then
  if [ -f $nnetdir/final.mdl ]; then
    bnf_nnet=$nnetdir/final.mdl
  else
    echo "$0: No such file $bnf_nnet or $nnetdir/final.mdl"
    exit 1
  fi
fi
# Choose the nnet3-compute GPU flag. In the GPU case also set the extra
# option that is handed to $cmd, and refuse to continue when the binaries
# were not compiled for CUDA.
if ! $use_gpu; then
  echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads."
  compute_gpu_opt="--use-gpu=no"
else
  compute_queue_opt="--gpu 1"
  compute_gpu_opt="--use-gpu=yes"
  cuda-compiled || {
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
    echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  }
fi
## Set up input features of nnet
# name:  last path component of the input data directory, used in file names.
# sdata: per-job split of the input data directory.
name=$(basename $data)
sdata=$data/split$nj

# Create the output locations.
mkdir -p $logdir
mkdir -p $bnf_data
mkdir -p $bnfdir

# Record the number of jobs.
# NOTE(review): this is written into the model directory (nnetdir), not into
# one of the output directories -- confirm that is intended.
echo $nj > $nnetdir/num_jobs

# The input features must exist.
if [ ! -f $data/feats.scp ]; then
  echo >&2 "The file $data/feats.scp does not exist!"
  exit 1
fi

# Split the data unless a split directory exists that is newer than feats.scp.
if ! [[ -d $sdata && $data/feats.scp -ot $sdata ]]; then
  split_data.sh $data $nj || exit 1
fi
# ivectors are used only when an ivector directory was given; in that case it
# must pass the compatibility check against the model directory.
if [ -n "$ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $nnetdir $ivector_dir || exit 1
  use_ivector=true
else
  use_ivector=false
fi
# Input feature pipeline for one job: apply-cmvn (with the model's cmvn_opts)
# over that job's split of feats.scp, written to a pipe. "JOB" is a literal
# placeholder here; presumably $cmd substitutes the job index (see the
# JOB=1:$nj argument below) -- confirm against the $cmd implementation.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
# Online ivectors restricted to the utterances of one job. This string is
# always built, but it is only used when use_ivector is true.
ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $ivector_dir/ivector_online.scp |"
# Stage 1: run the network and dump the bottleneck layer output.
if [ $stage -le 1 ]; then
echo "$0: Generating bottleneck (BNF) features using $bnf_nnet model as output of "
echo " component-node with name $bnf_name."
# Config that declares an output node named "output" whose input is the
# requested bottleneck node; nnet3-copy applies it to the model on the fly.
echo "output-node name=output input=$bnf_name" > $bnf_data/output.config
modified_bnf_nnet="nnet3-copy --nnet-config=$bnf_data/output.config $bnf_nnet - |"
# Extra nnet3-compute options, non-empty only when ivectors are in use.
ivector_opts=
if $use_ivector; then
ivector_period=$(cat $ivector_dir/ivector_period) || exit 1;
# NOTE(review): $ivector_opts is expanded unquoted below, so it is word-split
# by this shell; the embedded single quotes around $ivector_feats only group
# the value if $cmd re-parses its arguments through a shell -- confirm.
ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'"
fi
# Run jobs 1..nj through $cmd with one log file per job. Each job pipes
# nnet3-compute into copy-feats (the pipe is escaped so that it is passed on
# to $cmd rather than interpreted here), which writes an ark/scp pair per job
# into bnfdir.
$cmd $compute_queue_opt JOB=1:$nj $logdir/make_bnf_$name.JOB.log \
nnet3-compute $compute_gpu_opt $ivector_opts "$modified_bnf_nnet" "$feats" ark:- \| \
copy-feats --compress=$compress ark:- ark,scp:$bnfdir/raw_bnfeat_$name.JOB.ark,$bnfdir/raw_bnfeat_$name.JOB.scp || exit 1;
fi
# Sanity check: the number of utterances with bottleneck features must equal
# the number of utterances in the input feats.scp.
#
# Only the scp files of jobs 1..nj are counted. Counting through a glob
# (raw_bnfeat_$name.*.scp) would also pick up stale files left in bnfdir by an
# earlier run with a larger --nj, as well as files that belong to another
# data set whose name starts with "$name.", and would then fail a run that
# actually succeeded.
N0=$(cat "$data/feats.scp" | wc -l)
N1=$(
  for n in $(seq "$nj"); do
    cat "$bnfdir/raw_bnfeat_$name.$n.scp"
  done | wc -l
)
if [[ "$N0" != "$N1" ]]; then
  echo "$0: Error generating BNF features for $name (original:$N0 utterances, BNF:$N1 utterances)"
  exit 1
fi
# Build the feats.scp of the output data directory by concatenating the
# per-job scp files in job order.
for job in $(seq $nj); do
  cat $bnfdir/raw_bnfeat_$name.$job.scp
done > $bnf_data/feats.scp

# Carry over whichever of these metadata files exist in the input data
# directory.
for meta in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do
  if [ -e $data/$meta ]; then
    cp -r $data/$meta $bnf_data/$meta
  fi
done
# Compute CMVN statistics for the new bottleneck-feature data directory.
# A failure here stops the script with a non-zero status, like every other
# step in this file, instead of falling through to the success message.
echo "$0: computing CMVN stats."
steps/compute_cmvn_stats.sh "$bnf_data" || {
  echo "$0: Error computing CMVN stats for $bnf_data" >&2
  exit 1
}
echo "$0: done making BNF features."
exit 0