subsegment_data_dir.sh
9.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/bin/bash
# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# This script allows you to specify a 'segments' file with segments
# relative to existing utterances, with lines like
# utterance_foo-1 utterance_foo 7.5 8.2
# utterance_foo-2 utterance_foo 8.9 10.1
# and a 'text' file with sub-segmented text like
# utterance_foo-1 hello there
# utterance_foo-2 how are you
# and combine this with an existing data-dir that was all relative
# to the original utterance-ids like 'utterance_foo', producing
# a new subsegmented output directory.
#
# It does the right thing for you on the various files that the
# data directory contained (except you have to recreate
# the CMVN stats).
# Default values of the command-line options.  utils/parse_options.sh (sourced
# below) is expected to override them from --segment-end-padding, --cmd and
# --nj; per the usage synopsis, options come before the positional arguments.
segment_end_padding=0.0  # added to the end times written to <destdir>/segments
cmd=run.pl               # passed as --cmd to utils/data/get_utt2num_frames.sh
nj=1                     # passed as --nj to utils/data/get_utt2num_frames.sh

. utils/parse_options.sh

# Exactly three or four positional arguments are accepted (the <text-file> is
# optional); anything else prints the usage message and fails.
if [ $# != 4 ] && [ $# != 3 ]; then
  echo "Usage: "
  echo " $0 [options] <srcdir> <subsegments-file> [<text-file>] <destdir>"
  echo "This script sub-segments a data directory. <subsegments-file> is to"
  echo "have lines of the form <new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>"
  echo "and <text-file> is of the form <new-utt> <word1> <word2> ... <wordN>."
  echo "This script appropriately combines the <subsegments-file> with the original"
  echo "segments file, if necessary, and if not, creates a segments file."
  echo "e.g.:"
  echo " $0 [options] data/train exp/tri3b_resegment/segments exp/tri3b_resegment/text data/train_resegmented"
  echo " Options:"
  echo " --segment-end-padding <padding-time> # e.g. 0.02. Default 0.0. If provided,"
  echo " # we will add this value to the end times of <destdir>/segments"
  echo " # when creating it. This can be useful to account for"
  echo " # end effects in feature generation. The reason this is"
  echo " # not just applied to the input segments file, is that"
  echo " # for purposes of computing the num-frames of the parts of"
  echo " # matrices in feats.scp, the padding should not be done."
  echo " --cmd <cmd> # Default run.pl. Passed on to utils/data/get_utt2num_frames.sh,"
  echo " # which is run when <srcdir> has a feats.scp."
  echo " --nj <num-jobs> # Default 1. Number of jobs passed on to"
  echo " # utils/data/get_utt2num_frames.sh."
  echo " See also: resolve_ctm_overlaps.py"
  exit 1;
fi
# Run everything below in the C locale.
LC_ALL=C
export LC_ALL

# Positional arguments: <srcdir> <subsegments-file> [<text-file>] <destdir>.
srcdir=$1
subsegments=$2

case $# in
  4)
    # Four arguments: the optional <text-file> was given, and it must exist.
    add_subsegment_text=true
    new_text=$3
    dir=$4
    [ -f "$new_text" ] || { echo "$0: no such file $new_text"; exit 1; }
    ;;
  *)
    # Otherwise the third argument is the destination directory and no text
    # file will be written.
    add_subsegment_text=false
    dir=$3
    ;;
esac
# The subsegments file and the source utt2spk are required inputs.
for f in "$subsegments" "$srcdir/utt2spk"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

# Everything below writes into $dir, so give up right away if it cannot be
# created.
if ! mkdir -p "$dir"; then
  echo "$0: failed to create directory $dir"
  exit 1
fi

if $add_subsegment_text; then
  # The text file must list the same new utterance-ids, in the same order, as
  # the subsegments file (only the first fields are compared).
  if ! cmp <(awk '{print $1}' <"$subsegments") <(awk '{print $1}' <"$new_text"); then
    echo "$0: expected the first fields of the files $subsegments and $new_text to be identical"
    exit 1
  fi
fi

# Check the format of the subsegments file: every line must have exactly four
# fields, and the end time (field 4) must be greater than the start time
# (field 3).  awk stops at the first bad line.
if ! awk '{if (NF != 4 || !($4 > $3)) { print("Bad line: " $0); exit(1) } }' <"$subsegments"; then
  echo "$0: failed checking subsegments file $subsegments"
  exit 1
fi

# From here on, abort on the first failing command or failing pipeline stage.
set -e
set -o pipefail
# Create a mapping from the new to old utterances (fields 1 and 2 of the
# subsegments file).  This file will be deleted later.
awk '{print $1, $2}' <"$subsegments" >"$dir/new2old_utt"

# Create the new utt2spk file: the old utterance-id in the second field of the
# mapping is replaced using the source utt2spk.
utils/apply_map.pl -f 2 "$srcdir/utt2spk" <"$dir/new2old_utt" >"$dir/utt2spk"
# .. and the new spk2utt file.
utils/utt2spk_to_spk2utt.pl <"$dir/utt2spk" >"$dir/spk2utt"

if $add_subsegment_text; then
  # The new text file is just what the user provides.
  cp "$new_text" "$dir/text"
fi

# Copy the source wav.scp unchanged.
cp "$srcdir/wav.scp" "$dir"

# These files are optional in the source directory; copy the ones that exist.
for f in reco2file_and_channel reco2dur; do
  if [ -f "$srcdir/$f" ]; then
    cp "$srcdir/$f" "$dir"
  fi
done
# Note on OFMT/CONVFMT below: awk formats non-integer numbers with "%.6g" by
# default, which would round times of 1000 seconds or more (e.g. 10000.57
# becomes 10000.6).  "%.10g" keeps the precision of long recordings while
# printing short times exactly as before.
if [ -f "$srcdir/segments" ]; then
  # We have to map the segments file.
  # What's going on below is a little subtle.
  # $srcdir/segments has lines like: <old-utt-id> <recording-id> <start-time> <end-time>
  # and $subsegments has lines like: <new-utt-id> <old-utt-id> <start-time> <end-time>
  # The apply_map.pl command replaces <old-utt-id> [the 2nd field of $subsegments]
  # with <recording-id> <start-time> <end-time>,
  # so after that first command we have lines like
  # <new-utt-id> <recording-id> <start-time-of-old-utt-within-recording> <end-time-of-old-utt-within-recording> \
  #    <start-time-of-new-utt-within-old-utt> <end-time-of-new-utt-within-old-utt>
  # which the awk command turns into:
  # <new-utt-id> <recording-id> <start-time-of-new-utt-within-recording> <end-time-of-new-utt-within-recording>
  # (with the padding added to the end time).
  utils/apply_map.pl -f 2 "$srcdir/segments" <"$subsegments" | \
    awk -v pad="$segment_end_padding" -v OFMT='%.10g' -v CONVFMT='%.10g' \
      '{ print $1, $2, $5+$3, $6+$3+pad; }' >"$dir/segments"
else
  # The subsegments file just becomes the segments file, with the padding
  # added to the end time.
  awk -v pad="$segment_end_padding" -v OFMT='%.10g' -v CONVFMT='%.10g' \
    '{$4 += pad; print}' <"$subsegments" >"$dir/segments"
fi

# Map utt2uniq through the new-to-old utterance mapping, if the source has one.
if [ -f "$srcdir/utt2uniq" ]; then
  utils/apply_map.pl -f 2 "$srcdir/utt2uniq" <"$dir/new2old_utt" >"$dir/utt2uniq"
fi
if [ -f "$srcdir/feats.scp" ]; then
  # We want to avoid recomputing the features.  We'll use sub-matrices of the
  # original feature matrices, using the [] notation that is available for
  # matrices in Kaldi.
  if [ ! -s "$srcdir/frame_shift" ]; then
    frame_shift=$(utils/data/get_frame_shift.sh "$srcdir") || exit 1
  else
    frame_shift=$(cat "$srcdir/frame_shift")
  fi
  echo "$0: note: frame shift is $frame_shift [affects feats.scp]"

  # The subsegments format is <new-utt-id> <old-utt-id> <start-time> <end-time>.
  #   e.g. 'utt_foo-1 utt_foo 7.21 8.93'
  # The first awk program below replaces this with the format:
  #   <new-utt-id> <old-utt-id> <first-frame> <last-frame>
  #   e.g. 'utt_foo-1 utt_foo 721 892'
  # and the apply_map.pl command replaces 'utt_foo' (the 2nd field) with its
  # corresponding entry from the original feats.scp, so we get a line like:
  #   e.g. 'utt_foo-1 foo-bar.ark:514231 721 892'
  # Note: the reason we subtract one from the last time is that it's going to
  # represent the 'last' frame, not the 'end' frame [i.e. not one past the last],
  # in the matlab-like, but zero-indexed [first:last] notion.  For instance, a
  # segment with 1 frame would have start-time 0.00 and end-time 0.01, which
  # would become the frame range [0:0].
  times_to_frames='{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}'

  # The second awk program turns this into something like
  #   utt_foo-1 foo-bar.ark:514231[721:892]
  # It has to be a bit careful because the format actually allows for more
  # general things like pipes that might contain spaces, so it has to be able
  # to produce output like the following:
  #   utt_foo-1 some command|[721:892]
  # It therefore prints all but the last three fields verbatim, and then the
  # third-from-last field with the last two fields appended as [first:last].
  append_range='{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}'

  # The 'end' frame is ensured to not exceed the feature archive size of
  # <old-utt-id>.  This is done using the script fix_subsegment_feats.pl.
  # e.g. if the number of frames in foo-bar.ark is 891, then the features are
  # truncated to that many frames:
  #   utt_foo-1 foo-bar.ark:514231[721:890]
  # Lastly, utils/data/normalize_data_range.pl will only do something
  # nontrivial if the original data-dir already had data-ranges in square
  # brackets.

  # Here, we compute the maximum 'end' frame allowed for each <new-utt-id>.
  # This is equal to the number of frames in the feature archive for <old-utt-id>.
  if [ ! -f "$srcdir/utt2num_frames" ]; then
    echo "$0: WARNING: Could not find $srcdir/utt2num_frames. It might take a long time to run get_utt2num_frames.sh."
    echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc."
  fi
  utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj "$nj" "$srcdir"

  awk '{print $1" "$2}' "$subsegments" | \
    utils/apply_map.pl -f 2 "$srcdir/utt2num_frames" >"$dir/utt2max_frames"

  # On failure, report it and exit with a non-zero status.
  awk -v s="$frame_shift" "$times_to_frames" <"$subsegments" | \
    utils/apply_map.pl -f 2 "$srcdir/feats.scp" | \
    awk "$append_range" | \
    utils/data/fix_subsegment_feats.pl "$dir/utt2max_frames" | \
    utils/data/normalize_data_range.pl >"$dir/feats.scp" || \
    { echo "Failed to create $dir/feats.scp"; exit 1; }

  # Parse the frame ranges from feats.scp, which is in the form of [first-frame:last-frame]
  # and write the number-of-frames = last-frame - first-frame + 1 for the utterance.
  perl -ne 'm/^(\S+) .+\[(\d+):(\d+)\]$/; print "$1 " . ($3-$2+1) . "\n"' \
    <"$dir/feats.scp" >"$dir/utt2num_frames"

  # Here we add frame ranges to the elements of vad.scp, as we did for rows of feats.scp above.
  if [ -f "$srcdir/vad.scp" ]; then
    awk -v s="$frame_shift" "$times_to_frames" <"$subsegments" | \
      utils/apply_map.pl -f 2 "$srcdir/vad.scp" | \
      awk "$append_range" | \
      utils/data/fix_subsegment_feats.pl "$dir/utt2max_frames" | \
      utils/data/normalize_data_range.pl >"$dir/vad.scp"
  fi
fi
# A cmvn.scp already present in the destination directory is removed; the
# user has to regenerate it from the features.
if [ -f "$dir/cmvn.scp" ]; then
  rm "$dir/cmvn.scp"
  echo "$0: warning: removing $dir/cmvn.scp, you will have to regenerate it from the features."
fi

# Remove the utt2dur file in case it's now invalid -- it will be regenerated
# from the segments file.  It is fine if the file does not exist.
rm "$dir/utt2dur" 2>/dev/null || true

# These files are optional in the source directory and are copied unchanged.
for f in spk2gender glm stm; do
  if [ -f "$srcdir/$f" ]; then
    cp "$srcdir/$f" "$dir"
  fi
done

# Files we know about but deliberately do not carry over.
for f in ctm; do
  if [ -f "$srcdir/$f" ]; then
    echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is "
    echo " ... not implemented yet (and probably it's not needed.)"
  fi
done

# The new-to-old utterance mapping was only a temporary file.
rm "$dir/new2old_utt"

echo "$0: subsegmented data from $srcdir to $dir"