segmentation.pl
16.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This program is for segmentation of data, e.g. long telephone conversations,
# into short chunks. The input (stdin) should be a sequence of lines like
# sw0-20348-A 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 ... 2 2 0 0 0
# where there is a number for each frame and the numbers mean 0 for silence, 1
# for noise, laughter and other nonspeech events, and 2 for speech. This will
# typically be derived from some kind of fast recognition (see
# ../steps/resegment_data.sh), followed by ali-to-phones --per-frame=true and
# then mapping phones to these classes 0, 1 and 2.
#
# The algorithm is as follows:
# (1) Find contiguous sequences of classes 1 or 2 (i.e. speech and/or noise), with e.g.
# "1 1 1 2 2" counted as a single contiguous sequence. Each such sequence is an
# initial segment.
# (2) While the proportion of silence in the segments is less than $silence_proportion,
# add a single silence frame to the left and right of each segment, as long
# as this does not take us past the ends of the file or into another segment.
# At this point, do not merge segments.
# (3) Merging segments:
# Get a list of all boundaries between segments that ended up touching each other
# during phase 2. Sort them according to the number of silence frames at the boundary,
# with those with the least silence to be processed first. Go through the boundaries
# in order, merging each pair of segments, as long as doing so does not create
# a segment larger than $max_segment_length.
# (4) Splitting excessively long segments:
# For all segments that are longer than $hard_max_segment_length, split them equally
# into the smallest number of pieces such that the pieces will be no longer than
# $hard_max_segment_length. Print a warning.
# (5) Removing any segments that contain no speech (i.e. segments that have only
# silence and noise).
#
# By default, the utterance-ids will be of the form <RECORDING-ID>-<START-TIME>-<END-TIME>,
# where <START-TIME> and <END-TIME> are measured in units of 0.01 seconds, using fixed-width
# integers with enough digits to print out all the segments (the number of digits being
# decided per line of the input). For instance, if the input recording-id was
# sw0-20348-A, an example line of the "segments-file" output would be:
# sw0-20348-A-00124-00298 sw0-20348-A 1.24 2.98
# (interpreted as <UTTERANCE-ID> <RECORDING-ID> <START-TIME> <END-TIME>)
# and the number of digits has to be that large because the same recording has
# a segment something like
# sw0-20348-A-13491-13606 sw0-20348-A 134.91 136.06
# The two "-" separators in the utterance-id are separately configurable by means of the
# --first-separator and --second-separator options. However, generally speaking,
# it is safer to use "-" than, say, "_", because "-" appears very early in the
# ASCII table, and using it as the separator will tend to ensure that when
# you sort the utterances and the recording-ids they will sort the same way.
# This matters because recording-ids will often equal speaker-ids, and Kaldi scripts
# require that the utterance-ids and speaker-ids sort in the "same order".
use Getopt::Long;
# ---------------------------------------------------------------------------
# Configuration.  These are package globals (deliberately not "my") that the
# subroutines defined further down read directly.  The values assigned here are
# the defaults; each can be overridden from the command line.
# ---------------------------------------------------------------------------
$silence_proportion = 0.2;        # Silence is added at the sides of segments
                                  # until this proportion of silence is reached.
$frame_shift = 0.01;              # Seconds per frame.  Affects the interpretation
                                  # of the length options below and the seconds
                                  # written to the "segments" file.
$max_segment_length = 15.0;       # Seconds.  Merging will not create a segment
                                  # longer than this.
$hard_max_segment_length = 30.0;  # Seconds.  Segments longer than this are
                                  # broken up (with a warning) even if there is
                                  # no silence.
$first_separator = "-";           # Separator between recording-id and
                                  # start-time, in the utterance-id.
$second_separator = "-";          # Separator between start-time and end-time,
                                  # in the utterance-id.
$remove_noise_only_segments = "true";  # "true" or "false"; if true, segments
                                       # that have no speech are removed.

# GetOptions returns false when it encounters an unknown or malformed option
# (it has already printed its own diagnostic by then).  Treat that exactly like
# stray positional arguments: print the usage text and stop, instead of quietly
# continuing with default values.
my $options_ok = GetOptions(
    'silence-proportion:f'         => \$silence_proportion,
    'frame-shift:f'                => \$frame_shift,
    'max-segment-length:f'         => \$max_segment_length,
    'hard-max-segment-length:f'    => \$hard_max_segment_length,
    'first-separator:s'            => \$first_separator,
    'second-separator:s'           => \$second_separator,
    'remove-noise-only-segments:s' => \$remove_noise_only_segments);

if (!$options_ok || @ARGV != 0) {
    print STDERR "$0:\n" .
        "Usage: segmentation.pl [options] < per-frame-archive > segments-file\n" .
        "This program is called from steps/resegment_data.sh. Please see\n" .
        "the extensive comment in the source. Options:\n" .
        "--silence-proportion <float> (default: $silence_proportion)\n" .
        "--frame-shift <float> (default: $frame_shift, in seconds)\n" .
        "--max-segment-length <float> (default: $max_segment_length, in seconds)\n" .
        "--hard-max-segment-length (default: $hard_max_segment_length, in seconds)\n" .
        "--first-separator <string> (default: $first_separator), affects utterance-ids\n" .
        "--second-separator <string> (default: $second_separator), affects utterance-ids\n" .
        "--remove-noise-only-segments <true|false> (default: true)\n";
    exit 1;
}

# Sanity checks on the option values; each dies with a message naming the
# offending value.
($silence_proportion > 0.01 && $silence_proportion < 0.99) ||
    die "Invalid silence-proportion value '$silence_proportion'";
($frame_shift > 0.0001 && $frame_shift <= 1.0) ||
    die "Very strange frame-shift value '$frame_shift'";
($max_segment_length > 1.0 && $max_segment_length < 100.0) ||
    die "Very strange max-segment-length value '$max_segment_length'";
($hard_max_segment_length > 4.0 && $hard_max_segment_length < 500.0) ||
    die "Very strange hard-max-segment-length value '$hard_max_segment_length'";
($hard_max_segment_length >= $max_segment_length) ||
    die "hard-max-segment-length may not be less than max-segment-length";
($remove_noise_only_segments eq 'false' ||
 $remove_noise_only_segments eq 'true') ||
    die "Option --remove-noise-only-segments takes args true or false";
sub get_initial_segments {
    # Stage (1): mark the initial segments.
    #
    # Reads the global per-frame class array @A (length $N) and writes the
    # global marker arrays @S and @E.  Every maximal run of non-zero frames
    # becomes one segment: $S[i] is set to 1 at the first frame of the run and
    # $E[j] is set to 1 at the index one past its last frame.  Takes no
    # arguments; the caller is expected to have zero-filled @S and @E.
    my $prev_active = 0;    # was the previous frame non-silence?
    foreach my $frame (0 .. $N - 1) {
        my $active = ($A[$frame] != 0) ? 1 : 0;
        if ($active && !$prev_active) {
            $S[$frame] = 1;    # silence -> non-silence: a segment starts here
        }
        elsif (!$active && $prev_active) {
            $E[$frame] = 1;    # non-silence -> silence: the segment ended
        }
        $prev_active = $active;
    }
    # Special case: the last frame of the file is non-silence, so the final
    # segment runs to the end of the file and its end marker goes at index $N.
    $E[$N] = 1 if $A[$N - 1] != 0;
}
sub set_silence_proportion {
    # Stage (2): widen segments into the surrounding silence.
    #
    # Operates on the globals @A, @S, @E and $N, and reads $silence_proportion
    # and $recording_id.  Segment boundaries are pushed outwards one frame at a
    # time until the total number of frames inside segments reaches the target
    # implied by $silence_proportion, or until no boundary can move any more.
    # Segments may end up touching each other, but are not merged here.

    # Pass 1: consistency check, and collect every boundary position.  On entry
    # the frames inside segments must be exactly the non-silence frames.
    $num_nonsil_frames = 0;
    $in_segment = 0;
    my @active_frames;    # positions of segment start/end markers, in order
    foreach my $pos (0 .. $N) {
        my $is_frame = ($pos < $N);    # index $N exists only in @E
        if ($is_frame && $S[$pos] == 1) {
            die if $in_segment != 0;
            $in_segment = 1;
            push @active_frames, $pos;
        }
        if ($E[$pos] == 1) {
            die if $in_segment != 1;
            $in_segment = 0;
            push @active_frames, $pos;
        }
        next unless $is_frame;
        my $expected = ($A[$pos] != 0) ? 1 : 0;
        die unless $in_segment == $expected;    # Just a check.
        $num_nonsil_frames++ if $in_segment;
    }
    die if $in_segment != 0;    # must not still be in a segment after file-end

    if ($num_nonsil_frames == 0) {
        print STDERR "$0: warning: no segments found for recording $recording_id\n";
        return;
    }

    # (target - nonsil) / target = sil-proportion
    #   ->  target = nonsil / (1 - sil-proportion)
    my $target_segment_frames = int($num_nonsil_frames / (1.0 - $silence_proportion));
    my $num_segment_frames = $num_nonsil_frames;

    # Pass 2: repeatedly sweep over the boundaries, moving each by one frame.
    while ($num_segment_frames < $target_segment_frames) {
        $changed = 0;
        foreach my $slot (0 .. $#active_frames) {
            my $pos = $active_frames[$slot];
            if ($E[$pos] == 1 && $pos < $N && $S[$pos] != 1) {
                # An end marker that is not at the end of the file and does not
                # coincide with a segment start: move it one frame right.
                $E[$pos] = 0;
                $E[$pos + 1] = 1;
                $active_frames[$slot] = $pos + 1;
                $num_segment_frames++;
                $changed = 1;
            }
            elsif ($pos < $N && $S[$pos] == 1 && $pos > 0 && $E[$pos] != 1) {
                # A start marker that is not at the beginning of the file and
                # does not coincide with a segment end: move it one frame left.
                $S[$pos] = 0;
                $S[$pos - 1] = 1;
                $active_frames[$slot] = $pos - 1;
                $num_segment_frames++;
                $changed = 1;
            }
            last if $num_segment_frames == $target_segment_frames;
        }
        last if $changed == 0;    # nothing could move; avoid an infinite loop
    }

    if ($num_segment_frames < $target_segment_frames) {
        my $proportion =
            ($num_segment_frames - $num_nonsil_frames) / $num_segment_frames;
        print STDERR "$0: warning: for recording $recording_id, only got a proportion " .
            "$proportion of silence frames, versus target $silence_proportion\n";
    }
}
sub merge_segments {
    # Stage (3): merge segments that ended up touching each other.
    #
    # Operates on the globals @A, @S, @E and $N, and reads $frame_shift and
    # $max_segment_length.  A "boundary" is an index that is both a segment
    # end and a segment start.  Boundaries are visited in order of increasing
    # amount of silence around them, and the two segments meeting at a boundary
    # are joined unless the result would be longer than $max_segment_length
    # seconds.  Takes no arguments.
    my @boundaries = ();
    my @num_silence_frames = ();  # indexed by boundary position: the number of
                                  # silence frames at that boundary, i.e. at
                                  # the end of the previous segment plus the
                                  # beginning of the next one.
    for (my $n = 0; $n < $N; $n++) {
        next unless ($S[$n] == 1 && $E[$n] == 1);
        push @boundaries, $n;
        my $num_sil = 0;
        # Note: here we can count the silence frames without regard to the
        # segment boundaries, since we'll hit non-silence before we get to the
        # end/beginning of these segments.
        for (my $p = $n; $p < $N; $p++) {
            last if $A[$p] != 0;
            $num_sil++;
        }
        for (my $p = $n - 1; $p >= 0; $p--) {
            last if $A[$p] != 0;
            $num_sil++;
        }
        $num_silence_frames[$n] = $num_sil;
    }

    # Sort on increasing amount of silence, so the segments with the least
    # silence between them are joined first.  Ties are broken by position so
    # that the processing order is explicit and deterministic.
    my @sorted_boundaries =
        sort { $num_silence_frames[$a] <=> $num_silence_frames[$b] || $a <=> $b }
        @boundaries;

    foreach my $n (@sorted_boundaries) {
        ($S[$n] == 1 && $E[$n] == 1) || die "merge_segments: lost boundary at $n";
        # Total number of frames in the two segments we would be merging.
        # Start the count from 2 because the loops below do not count the
        # first frame of the segment to the right or the last frame of the
        # segment to the left.
        my $num_frames = 2;
        my $p;
        for ($p = $n + 1; $p <= $N && $E[$p] == 0; $p++) {
            $num_frames++;
        }
        $E[$p] == 1 || die "merge_segments: no end marker after $n";
        for ($p = $n - 1; $p >= 0 && $S[$p] == 0; $p--) {
            $num_frames++;
        }
        $S[$p] == 1 || die "merge_segments: no start marker before $n";
        if ($num_frames * $frame_shift <= $max_segment_length) {
            # Join this pair of segments by erasing the shared boundary.
            $S[$n] = 0;
            $E[$n] = 0;
        }
    }
}
sub split_long_segments {
    # Stage (4): split excessively long segments.
    #
    # Operates on the globals @S, @E and $N, and reads $frame_shift,
    # $hard_max_segment_length and $recording_id.  Every segment longer than
    # $hard_max_segment_length seconds is cut into the smallest number of
    # near-equal pieces such that no piece exceeds that limit; a warning is
    # printed to STDERR for each segment that is split.  Takes no arguments.
    my $max_frames = int($hard_max_segment_length / $frame_shift);
    for (my $n = 0; $n < $N; $n++) {
        next unless $S[$n] == 1;    # only act where a segment starts

        # Find the end marker of this segment.
        my $p;
        for ($p = $n + 1; $p <= $N; $p++) {
            last if $E[$p] == 1;
        }
        ($p <= $N) || die "split_long_segments: no end marker after $n";

        my $segment_length = $p - $n;
        if ($segment_length > $max_frames) {
            # The segment is too long.  Work out how many pieces are needed:
            # an exact integer ceiling of $segment_length / $max_frames.
            my $num_pieces = int(($segment_length + $max_frames - 1) / $max_frames);
            my $segment_length_in_seconds = $segment_length * $frame_shift;
            print STDERR "$0: warning: for recording $recording_id, splitting segment of " .
                "length $segment_length_in_seconds seconds into $num_pieces pieces " .
                "(--hard-max-segment-length $hard_max_segment_length)\n";
            # Place the i'th cut at floor(i * length / pieces).  This spreads
            # the remainder over the pieces, so piece lengths differ by at most
            # one frame and none exceeds $max_frames (putting the whole
            # remainder into the last piece could push it over the limit).
            for (my $i = 1; $i < $num_pieces; $i++) {
                my $q = $n + int($i * $segment_length / $num_pieces);
                # Insert a segment boundary at frame $q.
                $S[$q] = 1;
                $E[$q] = 1;
            }
        }
        if ($p - 1 > $n) {
            $n = $p - 1;    # skip the rest of this segment; avoids redundant work
        }
    }
}
sub remove_noise_only_segments {
    # Stage (5): drop segments that contain no speech.
    #
    # Operates on the globals @A, @S, @E and $N.  For each segment, scans its
    # frames for class 2 (speech); if none is found, the segment's start and
    # end markers are cleared.  Takes no arguments.
    my $start = 0;
    while ($start < $N) {
        if ($S[$start] != 1) {
            $start++;
            next;
        }
        # A segment starts here; walk to its end marker, looking for speech.
        my $saw_speech = 0;
        my $end = $start;
        while ($end <= $N) {
            # An end marker sitting at $start itself belongs to the previous
            # segment, so it is ignored.
            last if $end != $start && $E[$end] == 1;
            $saw_speech = 1 if $A[$end] == 2;
            $end++;
        }
        die unless $E[$end] == 1;
        unless ($saw_speech) {
            # There was no speech in this segment, so remove it.
            $S[$start] = 0;
            $E[$end] = 0;
        }
        # Resume at the end marker; another segment may start right there.
        $start = $end;
    }
}
sub print_segments {
    # Writes one line per segment to the currently selected output handle, in
    # the form
    #   <UTTERANCE-ID> <RECORDING-ID> <START-TIME> <END-TIME>
    # with times in seconds (two decimals).  Reads the globals @S, @E, $N,
    # $frame_shift, $recording_id, $first_separator and $second_separator.
    # Also does some sanity checking of the marker arrays and dies if they are
    # inconsistent.  Takes no arguments.
    my @segments = ();    # each element is [start, end], in frames
    $N == @S || die;          # check array size.
    ($N + 1) == @E || die;    # check array size.
    my $max_end_time = 0;
    for (my $n = 0; $n < $N; $n++) {
        if ($E[$n] == 1 && $S[$n] != 1) {
            die "Ending segment before starting it: n=$n.\n";
        }
        if ($S[$n]) {
            my $p;
            for ($p = $n + 1; $p < $N && $E[$p] != 1; $p++) {
                $S[$p] && die;    # should not start a segment again before ending it
            }
            $E[$p] == 1 || die;
            push @segments, [$n, $p];
            $max_end_time = $p;
            # If another segment starts exactly where this one ends, revisit
            # that index; otherwise step past the end marker.  (The loop
            # increments $n again before the next iteration.)
            if ($p < $N && $S[$p] == 1) { $n = $p - 1; }
            else                        { $n = $p; }
        }
    }
    if (@segments == 0) {
        print STDERR "$0: warning: no segments for recording $recording_id\n";
    }

    # The times in the utterance-id are printed in hundredths of a second
    # (regardless of the value of $frame_shift), zero-padded to a fixed width
    # with "%05d" or similar; first work out how many digits to use.
    # NOTE(review): this loop generally yields one digit more than the largest
    # value strictly needs (e.g. 4 digits for 115).  It is kept unchanged
    # because the width is part of the emitted utterance-ids.
    my $max_end_time_hundredths_second = int(100.0 * $frame_shift * $max_end_time);
    my $num_digits = 1;
    my $i = 1;
    while ($i < $max_end_time_hundredths_second) {
        $i *= 10;
        $num_digits++;
    }
    my $format_str = "%0${num_digits}d";    # e.g. "%05d"

    foreach my $segment (@segments) {
        my ($start, $end) = @$segment;
        ($end > $start) || die;
        my $start_seconds = sprintf("%.2f", $frame_shift * $start);
        my $end_seconds   = sprintf("%.2f", $frame_shift * $end);
        # Round to the nearest hundredth before formatting with %d.  A product
        # such as "0.29" * 100 is 28.999... in floating point, and %d would
        # truncate it to 28, making the utterance-id disagree with the times
        # printed on the same line.
        my $start_str = sprintf($format_str, int($start_seconds * 100 + 0.5));
        my $end_str   = sprintf($format_str, int($end_seconds * 100 + 0.5));
        my $utterance_id = "${recording_id}${first_separator}${start_str}${second_separator}${end_str}";
        print "$utterance_id $recording_id $start_seconds $end_seconds\n"; # <-- Here is where the output happens.
    }
}
use Scalar::Util qw(looks_like_number);

# Main loop: each input line is one recording, "<recording-id> c1 c2 c3 ...",
# where each c is the class of one frame (0, 1 or 2).  The globals @A, @S, @E,
# $N and $recording_id set here are what the stage subroutines operate on.
while (<STDIN>) {
    @A = split;    # split line on whitespace.
    if (@A <= 1) {
        print STDERR "$0: warning: invalid input line $_";
        next;
    }
    $recording_id = shift @A;    # e.g. sw0-12430
    for (my $n = 0; $n < @A; $n++) {
        my $label = $A[$n];
        # Reject anything that is not numerically 0, 1 or 2.  The explicit
        # looks_like_number test matters: a non-numeric token would otherwise
        # compare equal to 0 and be silently treated as silence.
        if (!looks_like_number($label) ||
            ($label != 0 && $label != 1 && $label != 2)) {
            die "Invalid value $label: expecting 0, 1 or 2. Line is: $_";
        }
        $A[$n] = 0 + $label;    # cast to integer, might be a bit faster.
    }
    # The array @S will contain 1 if a segment starts there and 0
    # otherwise.  The array @E will contain 1 if a segment ends there
    # and 0 otherwise.
    $N = @A;                # number of elements in @A.  Used globally.
    @S = (0) x $N;          # 0 repeated $N times.
    @E = (0) x ($N + 1);    # 0 repeated $N+1 times (one more since if the last
                            # frame is in a segment, the end-marker goes one
                            # past that, at index $N.)
    get_initial_segments();      # stage (1) in the comment above.
    set_silence_proportion();    # stage (2)
    merge_segments();            # stage (3)
    split_long_segments();       # stage (4)
    if ($remove_noise_only_segments eq 'true') {
        remove_noise_only_segments();    # stage (5)
    }
    print_segments();
}