prepare_targets_gmm.sh
12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#! /bin/bash
# Copyright 2017 Vimal Manohar
# Apache 2.0
# This script prepares targets for training a neural network for
# speech activity detection.
# See steps/segmentation/lats_to_targets.sh for details about the
# format of the targets.
# The targets are obtained from a combination
# of supervision-constrained lattices and lattices obtained by decoding.
# Also, we assume that the out-of-segment regions are all silence (target
# values of [ 1 0 0 ]). We merge the targets from the multiple sources
# by a weighted average using weights specified by --merge-weights. Also,
# the frames where the labels from multiple sources do not match are
# removed in the script steps/segmentation/merge_targets_dirs.sh.
# In this script, we use GMMs trained for ASR on in-domain data
# to generate the lattices required for creating the targets. To generate
# supervision-constrained lattices, we use speaker-adapted GMM models. To
# generate lattices without supervision, we use speaker-independent GMM models
# from the LDA+MLLT stage, but apply per-recording cepstral mean subtraction.
# The phones in the lattices are mapped deterministically to
# 0, 1, and 2 representing respectively silence, speech and garbage classes.
# The mapping is defined by --garbage-phones-list and --silence-phones-list
# options. But when these are unspecified, the silence phones other than
# oov are mapped to silence class and the oov is mapped to garbage class.
# ---------------------------------------------------------------------------
# Default option values. They are defined before utils/parse_options.sh is
# sourced further down in this script.
# ---------------------------------------------------------------------------

# Pipeline control: each step below is guarded by "[ $stage -le N ]".
stage=-1

# Job wrappers handed to the sub-scripts through their --cmd option.
train_cmd=run.pl
decode_cmd=run.pl

# Number of parallel jobs: nj is used for alignment and decoding, reco_nj
# for the steps that operate on whole recordings.
nj=4
reco_nj=4

# Language directory used for building the decoding graph.
lang_test=      # If different from $lang
# Pre-built decoding graph.
graph_dir=      # If not provided, a new one will be created using $lang_test

# Optional files listing the phones mapped to the garbage / silence classes.
garbage_phones_list=
silence_phones_list=

# Uniform segmentation options for decoding whole recordings.
# All values are in seconds.
max_segment_duration=10
overlap_duration=2.5
# If the last remaining piece when splitting uniformly is smaller than this
# duration, then the last piece is merged with the previous one.
max_remaining_duration=5

# Comma-separated weights, in this order: labels obtained from alignment,
# labels obtained from decoding, default labels in out-of-segment regions.
merge_weights=1.0,0.1,0.5
# Load the local environment setup when the working directory provides one.
if [ -f ./path.sh ]; then
  . ./path.sh
fi

# From here on: abort on command failure, on use of an unset variable, and
# on failure of any stage of a pipeline.
set -euo pipefail

# Command-line option handling.
# NOTE(review): presumably maps --option-name flags onto the variables
# defined above and shifts them off "$@" -- confirm against
# utils/parse_options.sh.
. utils/parse_options.sh

# Exactly six positional arguments are required; otherwise print the usage
# text and stop with a failure status.
if [ "$#" -ne 6 ]; then
  cat <<EOF
This script prepares targets for training neural network for
speech activity detction. The targets are obtained from a combination
of supervision-constrained lattices and lattices obtained by decoding.
See comments in the script for more details.
Usage: $0 <lang> <data> <whole-recording-data> <ali-model-dir> <model-dir> <dir>
e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a
Note: <whole-recording-data> is expected to have feats.scp and <data>
expected to have segments file. We will get the features for <data> by
using row ranges of <whole-recording-data>/feats.scp. This script will
work on a copy of <data> created to have the recording-id as the speaker-id.
EOF
  exit 1
fi
# ---------------------------------------------------------------------------
# Positional arguments
# ---------------------------------------------------------------------------

# Language directory; must match the one used to train the models.
lang=$1

# Segmented data directory and its whole-recording counterpart.
in_data_dir=$2
in_whole_data_dir=$3

# Model directory used to align the $data_dir to get target labels for
# training SAD. This should typically be a speaker-adapted system.
ali_model_dir=$4

# Model directory used to decode the whole-recording version of the
# $data_dir to get target labels for training SAD. This should typically be
# a speaker-independent system like an LDA+MLLT system.
model_dir=$5

# Output / working directory; created if it does not exist yet.
dir=$6
mkdir -p $dir

# When no separate test-time lang directory was given, use $lang.
[ -n "$lang_test" ] || lang_test=$lang
# ---------------------------------------------------------------------------
# Validate the inputs before any work is done.
# ---------------------------------------------------------------------------

# Graph-related files that must exist: the pieces needed to build a graph
# from $lang_test when no --graph-dir was given, otherwise the pre-built
# graph itself.
extra_files=
if [ -z "$graph_dir" ]; then
  extra_files="$extra_files $lang_test/G.fst $lang_test/phones.txt"
else
  extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt"
fi

# $garbage_phones_list and $silence_phones_list are intentionally left
# unquoted so that they vanish from the word list when they are empty
# (both are optional).
for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \
  $lang/phones.txt $garbage_phones_list $silence_phones_list \
  $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do
  if [ ! -f "$f" ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

utils/validate_data_dir.sh $in_data_dir || exit 1
utils/validate_data_dir.sh --no-text $in_whole_data_dir || exit 1

# Check the phone lists against $lang/phones.txt.
# When neither list is specified (the default), "cat" receives no file
# operands and would read this script's own standard input, which blocks on
# an interactive terminal and swallows input meant for something else.
# Redirecting stdin from /dev/null makes that case yield empty input
# immediately; when list files are given, cat reads them and ignores stdin.
if ! cat $garbage_phones_list $silence_phones_list </dev/null | \
  steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then
  echo "$0: Invalid $garbage_phones_list $silence_phones_list"
  exit 1
fi
# Short identifiers derived from the input directory names; they are used to
# name everything created under $dir.
data_id=$(basename $in_data_dir)
whole_data_id=$(basename $in_whole_data_dir)

# Work with a temporary data directory with recording-id as the speaker labels.
data_dir=$dir/${data_id}

# Stage 0: (re)create that working copy from scratch.
if [ $stage -le 0 ]; then
  # A leftover copy from a previous run is discarded; it is fine if there
  # is nothing to remove.
  rm -r $data_dir 2>/dev/null || true
  mkdir -p $data_dir

  utils/data/modify_speaker_info_to_recording.sh $in_data_dir $data_dir \
    || exit 1
  utils/validate_data_dir.sh --no-feats $data_dir || exit 1
fi
###############################################################################
# Features for the manual segments, and a working copy of the whole-recording
# data directory.
###############################################################################

# Work with a temporary data directory with CMVN stats computed using
# only the segments from the original data directory.
whole_data_dir=$dir/$whole_data_id

# Stage 1: derive feats.scp for the manual segments from the whole-recording
# data (via a scratch sub-directory "tmp"), then compute CMVN stats on them.
if [ $stage -le 1 ]; then
  utils/data/subsegment_data_dir.sh \
    $in_whole_data_dir ${data_dir}/segments ${data_dir}/tmp
  cp $data_dir/tmp/feats.scp $data_dir
  steps/compute_cmvn_stats.sh $data_dir || exit 1
fi

# Stage 2: copy the whole-recording data directory into $dir and give it the
# CMVN stats that were computed on the manual segments.
if [ $stage -le 2 ]; then
  utils/copy_data_dir.sh $in_whole_data_dir $whole_data_dir
  utils/fix_data_dir.sh $whole_data_dir

  # Copy the CMVN stats to the whole directory
  cp $data_dir/cmvn.scp $whole_data_dir
fi
###############################################################################
# Supervision-constrained lattices, produced with the alignment model.
###############################################################################

sup_lats_dir=$dir/$(basename ${ali_model_dir})_sup_lats_${data_id}

# Note: this step shares stage number 2 with the whole-data copy above.
if [ $stage -le 2 ]; then
  steps/align_fmllr_lats.sh \
    --nj $nj \
    --cmd "$train_cmd" \
    ${data_dir} ${lang} ${ali_model_dir} $sup_lats_dir || exit 1
fi
###############################################################################
# Split the whole-recording data directory into uniform pieces for decoding.
###############################################################################

uniform_seg_data_dir=$dir/${whole_data_id}_uniformseg_${max_segment_duration}sec
uniform_seg_data_id=$(basename $uniform_seg_data_dir)

if [ $stage -le 3 ]; then
  # Write a segments file for the whole-recording directory.
  utils/data/get_segments_for_data.sh ${whole_data_dir} > ${whole_data_dir}/segments

  mkdir -p $uniform_seg_data_dir

  # Cut those segments into overlapping pieces of bounded duration.
  utils/data/get_uniform_subsegments.py \
    --max-segment-duration $max_segment_duration \
    --overlap-duration $overlap_duration \
    --max-remaining-duration $max_remaining_duration \
    ${whole_data_dir}/segments \
    > $uniform_seg_data_dir/sub_segments

  # Build the sub-segmented data directory and reuse the CMVN stats of the
  # whole-recording directory.
  utils/data/subsegment_data_dir.sh \
    $whole_data_dir $uniform_seg_data_dir/sub_segments $uniform_seg_data_dir
  cp $whole_data_dir/cmvn.scp $uniform_seg_data_dir/
fi
model_id=$(basename $model_dir)

###############################################################################
# Decoding graph: only built here when no --graph-dir was supplied.
###############################################################################
if [ -z "$graph_dir" ]; then
  graph_dir=$dir/$model_id/graph

  # Skip the build when an HCLG.fst is already present in $graph_dir.
  if [ $stage -le 4 ] && [ ! -f $graph_dir/HCLG.fst ]; then
    # Work on a fresh private copy of the test lang directory.
    rm -r $dir/lang_test 2>/dev/null || true
    cp -r $lang_test/ $dir/lang_test
    utils/mkgraph.sh $dir/lang_test $model_dir $graph_dir || exit 1
  fi
fi
###############################################################################
# Decode the uniformly segmented data directory.
###############################################################################
model_id=$(basename $model_dir)
decode_dir=$dir/${model_id}/decode_${uniform_seg_data_id}

if [ $stage -le 5 ]; then
  mkdir -p $decode_dir

  # Copy the model files next to the decode directory, i.e. into
  # $dir/$model_id (the parent of $decode_dir).
  cp $model_dir/{final.mdl,final.mat,*_opts,tree} $dir/${model_id}
  cp $model_dir/phones.txt $dir/$model_id

  # A small beam and max-active are enough here: only the speech / silence
  # decisions matter, not the exact word sequences.
  steps/decode.sh \
    --cmd "$decode_cmd --mem 2G" \
    --nj $nj \
    --max-active 1000 \
    --beam 10.0 \
    --decode-extra-opts "--word-determinize=false" \
    --skip-scoring true \
    $graph_dir $uniform_seg_data_dir $decode_dir
fi
ali_model_id=$(basename $ali_model_dir)

###############################################################################
# Get frame-level targets from lattices for nnet training.
# Targets are matrices of 3 columns -- silence, speech and garbage.
# The target values are obtained by summing up posterior probabilities of
# arcs from lattice-arc-post over silence, speech and garbage phones.
###############################################################################

# Options shared by both conversions below.
lats_to_targets_opts=(
  --cmd "$train_cmd"
  --silence-phones "$silence_phones_list"
  --garbage-phones "$garbage_phones_list"
  --max-phone-duration 0.5
)

# Stage 6: targets from the supervision-constrained lattices.
if [ $stage -le 6 ]; then
  steps/segmentation/lats_to_targets.sh "${lats_to_targets_opts[@]}" \
    $data_dir $lang $sup_lats_dir \
    $dir/${ali_model_id}_${data_id}_sup_targets
fi

# Stage 7: targets from the lattices obtained by decoding.
if [ $stage -le 7 ]; then
  steps/segmentation/lats_to_targets.sh "${lats_to_targets_opts[@]}" \
    $uniform_seg_data_dir $lang $decode_dir \
    $dir/${model_id}_${uniform_seg_data_id}_targets
fi
###############################################################################
# Convert targets to be w.r.t. the whole data directory and subsample the
# targets by a factor of 3.
# Since the targets from transcript-constrained lattices have values only
# for the manual segments, these are converted to whole-recording level
# by inserting [ 0 0 0 ] for the out-of-manual-segment regions.
###############################################################################
sup_targets_dir=$dir/${ali_model_id}_${data_id}_sup_targets
whole_sup_targets_dir=$dir/${ali_model_id}_${whole_data_id}_sup_targets

if [ $stage -le 8 ]; then
  steps/segmentation/convert_targets_dir_to_whole_recording.sh \
    --cmd "$train_cmd" --nj $reco_nj \
    $data_dir $whole_data_dir \
    $sup_targets_dir \
    $whole_sup_targets_dir

  steps/segmentation/resample_targets_dir.sh \
    --cmd "$train_cmd" --nj $reco_nj \
    3 $whole_data_dir \
    $whole_sup_targets_dir \
    ${whole_sup_targets_dir}_sub3
fi

###############################################################################
# Same conversion for the targets obtained from decoding.
###############################################################################
decode_targets_dir=$dir/${model_id}_${uniform_seg_data_id}_targets
whole_decode_targets_dir=$dir/${model_id}_${whole_data_id}_targets

if [ $stage -le 9 ]; then
  steps/segmentation/convert_targets_dir_to_whole_recording.sh \
    --cmd "$train_cmd" --nj $reco_nj \
    $dir/${uniform_seg_data_id} $whole_data_dir \
    $decode_targets_dir \
    $whole_decode_targets_dir

  steps/segmentation/resample_targets_dir.sh \
    --cmd "$train_cmd" --nj $reco_nj \
    3 $whole_data_dir \
    $whole_decode_targets_dir \
    ${whole_decode_targets_dir}_sub3
fi
###############################################################################
# "Default targets" for the out-of-manual-segment regions.
# This setup assumes those regions are silence, i.e. [ 1 0 0 ].
###############################################################################
if [ $stage -le 10 ]; then
  # The default target vector is written to a file and handed to the
  # script below through --default-targets.
  echo " [ 1 0 0 ]" > $dir/default_targets.vec

  steps/segmentation/get_targets_for_out_of_segments.sh \
    --cmd "$train_cmd" \
    --nj $reco_nj \
    --frame-subsampling-factor 3 \
    --default-targets $dir/default_targets.vec \
    $data_dir $whole_data_dir \
    $dir/out_of_seg_${whole_data_id}_default_targets_sub3
fi
###############################################################################
# Merge targets for the same data from multiple sources (systems).
# --weights is used to give the targets from alignment a higher weight than
# the targets from decoding.
# If --remove-mismatch-frames is true, then frames where alignment and
# decoding disagree (more than 0.5 probability on different classes) are
# removed by setting their targets to [ 0 0 0 ].
###############################################################################
combined_targets_dir=$dir/${whole_data_id}_combined_targets_sub3

if [ $stage -le 11 ]; then
  # The three source directories are listed in the same order as the
  # weights in $merge_weights: alignment, decoding, out-of-segment default.
  steps/segmentation/merge_targets_dirs.sh \
    --cmd "$train_cmd" --nj $reco_nj \
    --weights $merge_weights \
    --remove-mismatch-frames true \
    $whole_data_dir \
    $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3 \
    $dir/${model_id}_${whole_data_id}_targets_sub3 \
    $dir/out_of_seg_${whole_data_id}_default_targets_sub3 \
    $combined_targets_dir
fi

# Publish the final targets at the top level of $dir. This runs regardless
# of the value of $stage.
cp $combined_targets_dir/targets.scp $dir/
echo "$0: Prepared targets in $dir/targets.scp"