compute_vad_decision_gmm.sh
5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/bin/bash
# Copyright 2015 David Snyder
# Apache 2.0
#
# Compute GMM-based VAD output and optionally combine with
# the energy-based VAD decisions.
# Default values for the command-line options. The usage text below shows the
# corresponding --option spellings (underscores written as dashes).
nj=10                  # number of parallel jobs
cmd=run.pl             # job dispatcher; may carry its own options
map_config=            # passed as --map to compute-vad-from-frame-likes
merge_map_config=      # passed as --map to merge-vads
priors=                # passed as --priors to compute-vad-from-frame-likes
use_energy_vad=false   # if true, merge the result with the existing vad.scp
num_gselect=20         # passed as --n to gmm-gselect and fgmm-gselect
norm_vars=false        # passed as --norm-vars to apply-cmvn-sliding
center=true            # passed as --center to apply-cmvn-sliding
stage=-4               # steps numbered below this value are skipped
                       # (-2 model conversion, -1 gselect, 0 likelihoods + VAD)
# The final cleanup step tests $cleanup, which previously had no default and
# therefore always ran. Declaring it keeps that default and makes it
# switchable.
# NOTE(review): parse_options.sh presumably only accepts options whose
# variable is already declared here -- confirm.
cleanup=true           # remove the gselect/logprob intermediates at the end

echo "$0 $*" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

# At least five positional arguments are needed: the data dir, two or more
# GMM dirs, the log dir and the VAD dir.
if [ $# -lt 5 ]; then
  echo "Usage: $0 [options] <data-dir> <gmm-dir-1> ... <gmm-dir-N> <log-dir> <vad-dir>";
  echo "e.g.: $0 data/train exp/music_gmm exp/speech_gmm exp/noise_gmm exp/gmm_vad exp/gmm_vad"
  echo " Options:"
  echo " --map-config <config-file> # config passed to compute-vad-from-frame-likes"
  echo " --priors <comma-separated-floats> # list passed to compute-vad-from-frame-likes"
  echo " --merge-map-config <config-file> # config passed to merge-vads"
  echo " --use-energy-vad <true,false> # If true, look for a vad.scp file and combine it with this VAD"
  echo " --num-gselect <n> # number of Gaussians kept by gmm-gselect/fgmm-gselect"
  echo " --norm-vars <true,false> # passed to apply-cmvn-sliding"
  echo " --center <true,false> # passed to apply-cmvn-sliding"
  echo " --stage <stage> # skip the steps numbered below <stage>"
  echo " --cleanup <true,false> # If true, remove the gselect and logprob intermediate files"
  echo " --nj <nj> # number of parallel jobs"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1;
fi
# Positional arguments:
#   <data-dir> <gmm-dir-1> ... <gmm-dir-N> <log-dir> <vad-dir>
# Everything between the first argument and the last two is a GMM directory.
args=("$@")
num_gmms=$(($# - 3))
gmm_dirs=("${args[@]:1:num_gmms}") # The GMM directories
data=${args[0]}
log_dir=${args[num_gmms + 1]}
vad_dir=${args[num_gmms + 2]}

# Make $vad_dir an absolute pathname: a relative path is taken to be relative
# to the current working directory.
case "$vad_dir" in
  /*) ;;
  *) vad_dir="$PWD/$vad_dir" ;;
esac

# Use "name" as part of the name of the archive.
name=$(basename "$data")

mkdir -p "$vad_dir" || exit 1
mkdir -p "$log_dir" || exit 1
# Verify that the required input files exist before any work is started.
if $use_energy_vad; then
  for f in "$data/vad.scp" "$merge_map_config"; do
    # An empty --merge-map-config has always passed this check (the unquoted
    # test collapsed to `[ ! -f ]`), so keep accepting it explicitly.
    # NOTE(review): confirm merge-vads is happy with an empty --map value.
    [ -n "$f" ] || continue
    if [ ! -f "$f" ]; then
      echo "compute_vad_decision_gmm.sh: no such file $f"
      exit 1;
    fi
  done
fi

# Report the file that is actually missing; $f is unset here, or left over
# from the loop above.
if [ ! -f "$data/feats.scp" ]; then
  echo "compute_vad_decision_gmm.sh: no such file $data/feats.scp"
  exit 1;
fi
# Split the data directory into $nj pieces; the per-job feature lists are
# read from $sdata/JOB/feats.scp below.
utils/split_data.sh "$data" "$nj" || exit 1;
sdata=$data/split$nj;

# We assume that the same delta-opts is used for each
# GMM dir, so only the first one is consulted. A missing file simply leaves
# $delta_opts empty.
delta_opts=$(cat "${gmm_dirs[0]}/delta_opts" 2>/dev/null)
if [ -f "${gmm_dirs[0]}/delta_opts" ]; then
  # Keep a copy of the options next to the VAD output. The original target
  # was the undefined $dir, i.e. the filesystem root. Best effort only, as
  # before: a failed copy is not fatal.
  cp "${gmm_dirs[0]}/delta_opts" "$vad_dir/" 2>/dev/null
fi

## Set up features.
# JOB is a literal placeholder here.
# NOTE(review): presumably $cmd replaces it with the job index when invoked
# with JOB=1:$nj -- confirm.
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=300 ark:- ark:- |"
# Stage -2: for every GMM directory, run fgmm-global-to-gmm on its final.ubm
# and write the result to <vad-dir>/<gmm-name>_final.dubm.
if [ $stage -le -2 ]; then
  for gmm_dir in "${gmm_dirs[@]}"; do
    gmm_name=$(basename $gmm_dir)
    convert_log=${log_dir}/log/${gmm_name}_convert.log
    dubm=${vad_dir}/${gmm_name}_final.dubm
    $cmd $convert_log fgmm-global-to-gmm ${gmm_dir}/final.ubm $dubm || exit 1;
  done
fi
# Stage -1: Gaussian selection, one set of parallel jobs per GMM directory.
# gmm-gselect runs on the converted model (<gmm-name>_final.dubm) and its
# output is fed to fgmm-gselect on final.ubm; the result is gzipped into
# <vad-dir>/<gmm-name>_gselect.JOB.gz.
if [ $stage -le -1 ]; then
  echo "$0: doing Gaussian selection"
  for gmm_dir in "${gmm_dirs[@]}"; do
    gmm_name=$(basename $gmm_dir)
    gselect_log=${log_dir}/log/${gmm_name}_gselect.JOB.log
    dubm=${vad_dir}/${gmm_name}_final.dubm
    gselect_out="ark:|gzip -c >${vad_dir}/${gmm_name}_gselect.JOB.gz"
    # The pipe is escaped so that it is handed to $cmd as part of the job's
    # command line instead of being interpreted by this shell.
    $cmd JOB=1:$nj $gselect_log \
      gmm-gselect --n=$num_gselect $dubm "$feats" ark:- \| \
      fgmm-gselect --gselect=ark,s,cs:- --n=${num_gselect} ${gmm_dir}/final.ubm \
        "$feats" "$gselect_out" || exit 1;
  done
fi
# Stage 0: compute per-frame log-likelihoods under every GMM, turn them into
# VAD decisions, optionally merge those with the existing energy-based VAD,
# and finally assemble $data/vad.scp from the per-job scp files.
#
# $cmd is left unquoted on purpose throughout: it may consist of a program
# plus its own options.
frame_likes=() # one rspecifier per GMM, in the order of the GMM directories
if [ "$stage" -le 0 ]; then
  echo "$0: computing frame likelihoods"
  for gmm_dir in "${gmm_dirs[@]}"; do
    gmm_name=$(basename "$gmm_dir")
    logprob_ark="ark:${vad_dir}/${gmm_name}_logprob.JOB.ark"
    frame_likes+=("$logprob_ark")
    $cmd JOB=1:"$nj" "${log_dir}/log/get_${gmm_name}_logprob.JOB.log" \
      fgmm-global-get-frame-likes --average=false \
      "--gselect=ark,s,cs:gunzip -c ${vad_dir}/${gmm_name}_gselect.JOB.gz|" \
      "${gmm_dir}/final.ubm" "$feats" "$logprob_ark" || exit 1;
  done

  echo "$0: computing VAD decisions from frame likelihoods"
  $cmd JOB=1:"$nj" "${log_dir}/log/make_vad_gmm_${name}.JOB.log" \
    compute-vad-from-frame-likes "--map=${map_config}" "--priors=$priors" \
    "${frame_likes[@]}" \
    "ark,scp:${vad_dir}/vad_gmm_${name}.JOB.ark,${vad_dir}/vad_gmm_${name}.JOB.scp" \
    || exit 1;

  if $use_energy_vad; then
    echo "$0: merging with energy-based VAD decisions"
    $cmd JOB=1:"$nj" "${log_dir}/log/merge_vads_${name}.JOB.log" \
      merge-vads "--map=${merge_map_config}" "scp:$sdata/JOB/vad.scp" \
      "scp:${vad_dir}/vad_gmm_${name}.JOB.scp" \
      "ark,scp:${vad_dir}/vad_merged_${name}.JOB.ark,${vad_dir}/vad_merged_${name}.JOB.scp" \
      || exit 1;
  fi

  # Back up a previous vad.scp only if there is one. Without --use-energy-vad
  # the file normally does not exist yet, and an unconditional mv just fails.
  # A failed backup is fatal because the file is overwritten right below.
  if [ -f "${data}/vad.scp" ]; then
    echo "$0: moving old vad.scp to ${data}/vad.scp.bak"
    mv "${data}/vad.scp" "${data}/vad.scp.bak" || exit 1;
  fi

  # Concatenate the per-job scp files: the merged ones when the energy VAD
  # was combined in, the plain GMM ones otherwise.
  if $use_energy_vad; then
    vad_prefix=vad_merged
  else
    vad_prefix=vad_gmm
  fi
  for ((n = 1; n <= nj; n++)); do
    cat "${vad_dir}/${vad_prefix}_${name}.$n.scp" || exit 1;
  done > "${data}/vad.scp"
fi
# Sanity check: vad.scp should have exactly as many lines as feats.scp.
# `cat | wc -l` is used rather than a redirect so that a missing file still
# yields a count of 0 (and hence the fatal branch below) instead of an empty
# string.
nc=$(cat "$data/vad.scp" | wc -l)
nu=$(cat "$data/feats.scp" | wc -l)
if [ "$nc" -ne "$nu" ]; then
  echo "**Warning it seems not all of the speakers got VAD output ($nc != $nu);"
  echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh"
  # A partial result is only a warning; no output at all is an error.
  if [ "$nc" -eq 0 ]; then
    exit 1;
  fi
fi

echo "$0 created GMM-based VAD output for $name"

# Remove the per-job intermediate files. $cleanup may be unset when no
# default was declared for it; fall back to "true", which matches what the
# bare `if $cleanup` did in that case (an empty command succeeds).
if ${cleanup:-true}; then
  for gmm_dir in "${gmm_dirs[@]}"; do
    gmm_name=$(basename "$gmm_dir")
    # -f: the files may already be gone, e.g. when rerunning from a later
    # stage after an earlier cleanup.
    rm -f -- "${vad_dir}/${gmm_name}"_gselect.*.gz
    rm -f -- "${vad_dir}/${gmm_name}"_logprob.*.ark
  done
fi

exit 0;