make_bn_feats.sh
5.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0
# To be run from .. (one directory up from here)
# see ../run.sh for example
# Begin configuration section.
# Defaults for the command-line options. The usage text of this script lists
# --cmd, --nj, --remove-last-components and --use-gpu; presumably
# parse_options.sh maps every --some-name option onto the variable some_name
# defined here (TODO confirm against parse_options.sh).
nj=4 # number of parallel jobs
cmd=run.pl # how to run jobs (e.g. 'queue.pl <queue opts>')
remove_last_components=4 # remove N last components from the nnet
nnet_forward_opts= # extra options placed on the nnet-forward command line
use_gpu=no # value given to nnet-forward --use-gpu (no|yes|optional)
htk_save=false # 'false' writes ark/scp features; any other value saves HTK features
ivector= # rx-specifier with i-vectors (ark-with-vectors),
# End configuration section.
echo "$0 $@" # Print the command line for logging
# Source ./path.sh when it exists in the current directory.
if [ -f path.sh ]; then . ./path.sh; fi
# Source the option parser; abort if it cannot be sourced or reports failure.
. parse_options.sh || exit 1;
# Strict mode is switched on only here, i.e. after path.sh and
# parse_options.sh have been sourced, so those files run without -e/-u.
set -euo pipefail

# Exactly five positional arguments are required; otherwise print the usage
# text and stop.
if [[ $# -ne 5 ]]; then
  echo "usage: $0 [options] <tgt-data-dir> <src-data-dir> <nnet-dir> <log-dir> <abs-path-to-bn-feat-dir>";
  echo "options: "
  echo " --cmd 'queue.pl <queue opts>' # how to run jobs."
  echo " --nj <nj> # number of parallel jobs"
  echo " --remove-last-components <N> # number of NNet Components to remove from the end"
  echo " --use-gpu (no|yes|optional) # forwarding on GPU"
  exit 1
fi

# path.sh has already been sourced before the options were parsed, so it is
# deliberately not sourced a second time here: the extra pass was redundant
# and would have executed path.sh under strict mode.

# Positional arguments.
data=$1     # <tgt-data-dir>: data directory that receives the new features
srcdata=$2  # <src-data-dir>: data directory with the input features
nndir=$3    # <nnet-dir>: directory holding final.nnet and final.feature_transform
logdir=$4   # <log-dir>: directory for the log files
bnfeadir=$5 # <abs-path-to-bn-feat-dir>: directory for the feature archives
######## CONFIGURATION
# Create the output directories and copy the dataset metadata from $srcdata.
# The copied feats.scp is removed because this script writes a fresh one for
# the new features; cmvn.scp is removed as well (presumably because it
# describes the source features, not the new ones).
mkdir -p "$data" "$logdir" "$bnfeadir" || exit 1
utils/copy_data_dir.sh "$srcdata" "$data"
rm -f "$data"/{feats,cmvn}.scp 2>/dev/null

# make $bnfeadir an absolute pathname.
if [[ "$bnfeadir" != /* ]]; then
  bnfeadir=$PWD/$bnfeadir
fi

# Every input file must be present before any processing starts.
for f in "$srcdata/feats.scp" "$nndir/final.nnet" "$nndir/final.feature_transform"; do
  if [[ ! -f "$f" ]]; then
    echo "$0: Missing $f"
    exit 1
  fi
done

name=$(basename "$srcdata")
sdata=$srcdata/split$nj
# Split the source data into $nj parts unless a split directory exists that is
# newer than feats.scp.
[[ -d "$sdata" && "$srcdata/feats.scp" -ot "$sdata" ]] || split_data.sh "$srcdata" "$nj" || exit 1
# Build the feature extractor network: the feature transform followed by the
# MLP with its last $remove_last_components components removed. Messages
# written to stderr by nnet-concat are kept in $logdir/feature_extractor.log.
nnet=$bnfeadir/feature_extractor.nnet
if ! nnet-concat \
  $nndir/final.feature_transform \
  "nnet-copy --remove-last-components=$remove_last_components $nndir/final.nnet - |" \
  $nnet \
  2>$logdir/feature_extractor.log; then
  exit 1
fi
# Store the nnet-info output for the extractor next to the new data directory.
nnet-info $nnet >$data/feature_extractor.nnet-info
echo "Creating bn-feats into $data"
# PREPARE FEATURE EXTRACTION PIPELINE
# Assemble, in $feats, the input pipeline that is later handed to the
# forward pass. The literal word JOB is a placeholder that is replaced by the
# job index afterwards.
#
# import config,
# (each option string stays empty unless the matching file exists in $nndir)
online_cmvn_opts=
cmvn_opts=
delta_opts=
D=$nndir
if [[ -e "$D/online_cmvn_opts" ]]; then online_cmvn_opts=$(cat "$D/online_cmvn_opts"); fi
if [[ -e "$D/cmvn_opts" ]]; then cmvn_opts=$(cat "$D/cmvn_opts"); fi
if [[ -e "$D/delta_opts" ]]; then delta_opts=$(cat "$D/delta_opts"); fi
#
# Create the feature stream,
feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
# apply-cmvn-online (optional),
if [[ -n "$online_cmvn_opts" ]]; then
  if [[ ! -f "$nndir/global_cmvn_stats.mat" ]]; then
    echo "$0: Missing $nndir/global_cmvn_stats.mat"
    exit 1
  fi
  feats="$feats apply-cmvn-online $online_cmvn_opts --spk2utt=ark:$srcdata/spk2utt $nndir/global_cmvn_stats.mat ark:- ark:- |"
fi
# apply-cmvn (optional),
if [[ -n "$cmvn_opts" ]]; then
  # Check the very file the pipeline reads ($srcdata/cmvn.scp), not the copy
  # inside the split directory, so the check cannot disagree with the usage.
  if [[ ! -f "$srcdata/cmvn.scp" ]]; then
    echo "$0: Missing $srcdata/cmvn.scp"
    exit 1
  fi
  feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |"
fi
# add-deltas (optional),
if [[ -n "$delta_opts" ]]; then
  feats="$feats add-deltas $delta_opts ark:- ark:- |"
fi
# add-ivector (optional),
# When $D/ivector_dim exists, the --ivector option is mandatory and the
# i-vectors are appended to every frame of the feature stream.
if [[ -e "$D/ivector_dim" ]]; then
  # Quoted test: $ivector is an rx-specifier and may contain spaces.
  if [[ -z "$ivector" ]]; then
    echo "Missing --ivector, they were used in training!"
    exit 1
  fi
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  if [[ -e "$D/ivector_append_tool" ]]; then
    ivector_append_tool=$(cat "$D/ivector_append_tool")
  fi
  # Check dims,
  # (the pipeline of job 1 is probed with and without the i-vectors; the
  # difference of the two dimensions is the dimension of the i-vectors)
  feats_job_1=${feats//JOB/1}
  dim_raw=$(feat-to-dim "$feats_job_1" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  dim_expected=$(cat "$D/ivector_dim")
  if [[ "$dim_ivec" != "$dim_expected" ]]; then
    echo "Error, i-vector dim. mismatch (expected $dim_expected, got $dim_ivec in '$ivector')"
    exit 1
  fi
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi
# Run the forward pass through the feature extractor and store the output,
# either as ark/scp archives (htk_save == false) or as HTK feature files.
# $cmd and $nnet_forward_opts are left unquoted on purpose: both may hold
# several words that have to be split into separate arguments.
if [[ "$htk_save" == false ]]; then
  # Run the forward pass,
  $cmd JOB=1:$nj "$logdir/make_bnfeats.JOB.log" \
    nnet-forward $nnet_forward_opts --use-gpu=$use_gpu "$nnet" "$feats" \
    "ark,scp:$bnfeadir/raw_bnfea_$name.JOB.ark,$bnfeadir/raw_bnfea_$name.JOB.scp" \
    || exit 1
  # concatenate the .scp files
  # (the list is written from scratch, so a rerun cannot append duplicates)
  for ((n = 1; n <= nj; n++)); do
    cat "$bnfeadir/raw_bnfea_$name.$n.scp"
  done >"$data/feats.scp"
  # check sentence counts,
  N0=$(wc -l <"$srcdata/feats.scp")
  N1=$(wc -l <"$data/feats.scp")
  if ((N0 != N1)); then
    echo "$0: sentence-count mismatch, $srcdata $N0, $data $N1"
    exit 1
  fi
  echo "Succeeded creating MLP-BN features '$data'"
else # htk_save == true
  # Run the forward pass saving HTK features,
  # (\; and \| are passed on literally, so the command sequence and the pipe
  # are part of the job command given to $cmd, not of this shell)
  $cmd JOB=1:$nj "$logdir/make_bnfeats_htk.JOB.log" \
    mkdir -p "$data/htkfeats/JOB" \; \
    nnet-forward $nnet_forward_opts --use-gpu=$use_gpu "$nnet" "$feats" ark:- \| \
    copy-feats-to-htk --output-dir="$data/htkfeats/JOB" ark:- || exit 1
  # Make list of htk features,
  # (the pattern is quoted so that find receives it, instead of the shell
  # expanding it against the current directory)
  find "$data/htkfeats" -name '*.fea' >"$data/htkfeats.scp"
  # Check sentence counts,
  # (count the entries of the list; running find on the list file itself
  # would always report exactly one path)
  N0=$(wc -l <"$srcdata/feats.scp")
  N1=$(wc -l <"$data/htkfeats.scp")
  if ((N0 != N1)); then
    echo "$0: sentence-count mismatch, $srcdata $N0, $data/htk* $N1"
    exit 1
  fi
  echo "Succeeded creating MLP-BN features '$data/htkfeats.scp'"
fi