choose_utts_to_combine.py
16.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
#!/usr/bin/env python
# Copyright 2016 Vijayaditya Peddinti
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
from __future__ import print_function
import argparse
from random import randint
import sys
import os
from collections import defaultdict
# Command-line interface.  All five positional arguments are filenames: two
# inputs (spk2utt, utt2dur) followed by three outputs (utt2utts, utt2spk,
# utt2dur).
parser = argparse.ArgumentParser(description="""
This script, called from data/utils/combine_short_segments.sh, chooses consecutive
utterances to concatenate that will satisfy the minimum segment length. It uses the
--spk2utt file to ensure that utterances from the same speaker are preferentially
combined (as far as possible while respecting the minimum segment length).
If it has to combine utterances across different speakers in order to satisfy the
duration constraint, it will assign the combined utterances to the speaker which
contributed the most to the duration of the combined utterances.
The utt2utts output of this program is a map from new
utterance-id to a list of old utterance-ids, so for example if the inputs were
utt1, utt2 and utt3, and utterances 2 and 3 were combined, the output might look
like:
utt1 utt1
utt2-comb2 utt2 utt3
The utt2spk output of this program assigns utterances to the speakers of the input;
in the (hopefully rare) case where utterances were combined across speakers, it
will assign the utterance to whichever of the original speakers contributed the most
to the grouped utterance.
""")
parser.add_argument("--min-duration", type=float, default=1.55,
                    help="Minimum utterance duration")
parser.add_argument("spk2utt_in", type=str, metavar="<spk2utt-in>",
                    help="Filename of [input] speaker to utterance map needed "
                    "because this script tries to merge utterances from the "
                    "same speaker as much as possible, and also needs to produce "
                    "an output utt2spk map.")
parser.add_argument("utt2dur_in", type=str, metavar="<utt2dur-in>",
                    help="Filename of [input] utterance-to-duration map, with lines like 'utt1 1.23'.")
parser.add_argument("utt2utts_out", type=str, metavar="<utt2utts-out>",
                    help="Filename of [output] new-utterance-to-old-utterances map, with lines "
                    "like 'utt1 utt1' or 'utt2-comb2 utt2 utt3'")
parser.add_argument("utt2spk_out", type=str, metavar="<utt2spk-out>",
                    help="Filename of [output] utt2spk map, which maps new utterances to original "
                    "speakers. If utterances were combined across speakers, we map the new "
                    "utterance to the speaker that contributed the most to them.")
# The metavar here must name the utt2dur output; it is what the usage line shows.
parser.add_argument("utt2dur_out", type=str, metavar="<utt2dur-out>",
                    help="Filename of [output] utt2dur map, which is just the summations of "
                    "the durations of the source utterances.")
args = parser.parse_args()
# This LessThan is designed to be impervious to roundoff effects in cases where
# numbers are really always separated by a distance >> 1.0e-05. It will return
# false if x and y are almost identical, differing only by roundoff effects.
def LessThan(x, y):
    """Return True only if x is smaller than y by more than 1.0e-5.

    Values that differ by no more than that margin are treated as equal, so
    the result is False for them regardless of which one is nominally
    smaller.
    """
    tolerance = 1.0e-5
    upper_bound = y - tolerance
    return x < upper_bound
# This function implements the core of the utterance-combination code.
# The input 'durations' is a list of durations, which must all be
# > 0.0 (the function asserts this).  This function tries to combine consecutive indexes
# into groups such that for each group, the total duration is at
# least 'min_duration'. It returns a list of (start,end) indexes.
# For example, CombineList(0.1, [5.0,6.0,7.0]) would return
# [ (0,1), (1,2), (2,3) ] because no combination is necessary; each
# returned pair represents a singleton group.
# Or CombineList(1.0, [0.5, 0.6, 0.7]) would return
# [ (0,3) ].
# Or CombineList(1.0, [0.5, 0.6, 1.7]) would return
# [ (0,2), (2,3) ].
# Note: if sum(durations) < min_duration, this function will
# return everything in one group but of course the sum of durations
# will be less than min_duration.
def CombineList(min_duration, durations):
    """Group consecutive indexes so each group's total duration reaches a minimum.

    Args:
        min_duration: the target minimum total duration of a group; must be
            >= 0.0.
        durations: a list of durations, all of which must be > 0.0.  May be
            empty.

    Returns:
        A list of (start, end) index pairs, with 'end' exclusive, covering
        range(len(durations)) left to right.  For example
        CombineList(1.0, [0.5, 0.6, 1.7]) returns [(0, 2), (2, 3)].  If the
        sum of all durations is below min_duration, everything ends up in a
        single group.  An empty 'durations' gives an empty list.
    """
    assert min_duration >= 0.0
    if len(durations) == 0:
        # Nothing to group.  Return early because min() on an empty sequence
        # raises ValueError.
        return []
    assert min(durations) > 0.0

    num_utts = len(durations)

    # for each utterance-index i, group_start[i] gives us the
    # start-index of the group of utterances of which it's currently
    # a member.
    group_start = list(range(num_utts))
    # if utterance-index i currently corresponds to the start of a group
    # of utterances, then group_durations[i] is the total duration of
    # that utterance-group, otherwise undefined.
    group_durations = list(durations)
    # if utterance-index i currently corresponds to the start of a group
    # of utterances, then group_end[i] is the end-index (i.e. last index plus
    # one) of that utterance-group, otherwise undefined.
    group_end = [x + 1 for x in range(num_utts)]

    # indexes of all groups that are currently too short; processed from the
    # end of the list because pop() takes the last element.
    queue = [i for i in range(num_utts)
             if LessThan(group_durations[i], min_duration)]

    while queue:
        i = queue.pop()
        if group_start[i] != i or not LessThan(group_durations[i], min_duration):
            # this group no longer exists or already has at least the minimum duration.
            continue
        this_dur = group_durations[i]
        # left_dur is the duration of the group to the left of this group,
        # or 0.0 if there is no such group.
        left_dur = group_durations[group_start[i - 1]] if i > 0 else 0.0
        # right_dur is the duration of the group to the right of this group,
        # or 0.0 if there is no such group.
        right_dur = group_durations[group_end[i]] if group_end[i] < num_utts else 0.0

        # (0.0 is usable as the "no such group" marker because all durations
        # were asserted to be strictly positive.)
        if left_dur == 0.0 and right_dur == 0.0:
            # there is only one group.  Nothing more to merge; break
            assert group_start[i] == 0 and group_end[i] == num_utts
            break

        # work out whether to combine left or right,
        # by means of the combine_left variable [ True or False ]
        if left_dur == 0.0:
            combine_left = False
        elif right_dur == 0.0:
            combine_left = True
        elif LessThan(left_dur + this_dur, min_duration):
            # combining left would still be below the minimum duration->
            # combine right... if it's above the min duration then good;
            # otherwise it still doesn't really matter so we might as well
            # pick one.
            combine_left = False
        elif LessThan(right_dur + this_dur, min_duration):
            # combining right would still be below the minimum duration,
            # and combining left would be >= the min duration (else we wouldn't
            # have reached this line) -> combine left.
            combine_left = True
        elif LessThan(left_dur, right_dur):
            # if we reached here then combining either way would take us >= the
            # minimum duration; but if left_dur < right_dur then we combine left
            # because that would give us more evenly sized segments.
            combine_left = True
        else:
            # if we reached here then combining either way would take us >= the
            # minimum duration; but left_dur >= right_dur, so we combine right
            # because that would give us more evenly sized segments.
            combine_left = False

        if combine_left:
            assert left_dur != 0.0
            new_group_start = group_start[i - 1]
            group_end[new_group_start] = group_end[i]
            for j in range(group_start[i], group_end[i]):
                group_start[j] = new_group_start
                group_durations[new_group_start] += durations[j]
            # note: there is no need to add group_durations[new_group_start] to
            # the queue even if it is still below the minimum length, because it
            # would have previously had to have been below the minimum length,
            # therefore it would already be in the queue.
        else:
            assert right_dur != 0.0
            # group start doesn't change, group end changes.
            old_group_end = group_end[i]
            new_group_end = group_end[old_group_end]
            group_end[i] = new_group_end
            for j in range(old_group_end, new_group_end):
                group_durations[i] += durations[j]
                group_start[j] = i
            if LessThan(group_durations[i], min_duration):
                # the group starting at i is still below the minimum length, so
                # we need to put it back on the queue.
                queue.append(i)

    # walk the surviving groups from left to right to build the answer.
    ans = []
    cur_group_start = 0
    while cur_group_start < num_utts:
        ans.append((cur_group_start, group_end[cur_group_start]))
        cur_group_start = group_end[cur_group_start]
    return ans
def SelfTest():
    """Sanity-check CombineList with fixed cases and randomized inputs.

    Fails with an AssertionError if any check does not hold.
    """
    # (min_duration, durations, expected result) triples.
    fixed_cases = [
        (0.1, [5.0, 6.0, 7.0], [(0, 1), (1, 2), (2, 3)]),
        (0.5, [0.1, 6.0, 7.0], [(0, 2), (2, 3)]),
        (0.5, [6.0, 7.0, 0.1], [(0, 1), (1, 3)]),
        # in the two examples below, it combines with the shorter one if both
        # would be above min-dur.
        (0.5, [6.0, 0.1, 7.0], [(0, 2), (2, 3)]),
        (0.5, [7.0, 0.1, 6.0], [(0, 1), (1, 3)]),
        # in the example below, it combines with whichever one would
        # take it above the min-dur, if there is only one such.
        # note, it tests the 0.1 first as the queue is popped from the end.
        (1.0, [1.0, 0.5, 0.1, 6.0], [(0, 2), (2, 4)]),
    ]
    for case_min_dur, case_durs, expected in fixed_cases:
        assert CombineList(case_min_dur, case_durs) == expected

    min_duration = 0.05
    for _trial in range(100):
        num_utts = randint(1, 15)
        durations = [0.01 * randint(1, 10) for _n in range(num_utts)]
        ranges = CombineList(min_duration, durations)
        if len(ranges) > 1:
            # with more than one group, every group must reach min_duration.
            for (start, end) in ranges:
                this_dur = sum(durations[start:end])
                assert not LessThan(this_dur, min_duration)

        # the grouping must not be affected by very tiny differences in the
        # inputs.
        durations2 = [d + 1.0e-07 * randint(-5, 5) for d in durations]
        ranges2 = CombineList(min_duration, durations2)
        assert ranges2 == ranges
# This function figures out the grouping of utterances.
# The input is:
# 'min_duration' which is the minimum utterance length in seconds.
# 'spk2utt' which is a list of pairs (speaker-id, [list-of-utterances])
# 'utt2dur' which is a dict from utterance-id to duration (as a float)
# It returns a list of lists of utterances; each list corresponds to
# a group, e.g.
# [ ['utt1'], ['utt2', 'utt3'] ]
def GetUtteranceGroups(min_duration, spk2utt, utt2dur):
    """Work out which utterances should be grouped together.

    Args:
        min_duration: the minimum utterance length in seconds.
        spk2utt: a list of pairs (speaker-id, [list-of-utterances]).
        utt2dur: a dict from utterance-id to duration (as a float).

    Returns:
        A list of lists of utterance-ids, one inner list per group, e.g.
        [ ['utt1'], ['utt2', 'utt3'] ].

    Exits the program (via sys.exit) if an utterance listed in 'spk2utt' has
    no entry in 'utt2dur'.  Progress and warning messages go to stderr.
    """
    # utt_groups will be a list of lists of utterance-ids formed from the
    # first pass of combination.
    utt_groups = []
    # group_durations will be the durations of the corresponding elements of
    # 'utt_groups'.
    group_durations = []

    # This block calls CombineList for the utterances of each speaker
    # separately, in the 'first pass' of combination.
    for spk, utts in spk2utt:
        durations = []  # durations for this speaker's utts.
        for utt in utts:
            try:
                durations.append(utt2dur[utt])
            except KeyError:
                # only a missing key means "no duration"; other exceptions
                # (e.g. KeyboardInterrupt) must not be reported as such.
                sys.exit("choose_utts_to_combine.py: no duration available "
                         "in utt2dur file {0} for utterance {1}".format(
                             args.utt2dur_in, utt))
        ranges = CombineList(min_duration, durations)
        for start, end in ranges:  # each element of 'ranges' is a 2-tuple (start, end)
            utt_groups.append(utts[start:end])
            group_durations.append(sum(durations[start:end]))

    old_dur_sum = sum(utt2dur.values())
    new_dur_sum = sum(group_durations)
    if abs(old_dur_sum - new_dur_sum) > 0.0001 * old_dur_sum:
        print("choose_utts_to_combine.py: large difference in total "
              "durations: {0} vs {1} ".format(old_dur_sum, new_dur_sum),
              file=sys.stderr)

    # Now we combine the groups obtained above, in case we had situations where
    # the combination of all the utterances of one speaker were still below
    # the minimum duration.
    new_utt_groups = []
    ranges = CombineList(min_duration, group_durations)
    for start, end in ranges:
        # build a fresh list so the first-pass groups are left intact.
        this_group = []
        for first_pass_group in utt_groups[start:end]:
            this_group.extend(first_pass_group)
        new_utt_groups.append(this_group)

    print("choose_utts_to_combine.py: combined {0} utterances to {1} utterances "
          "while respecting speaker boundaries, and then to {2} utterances "
          "with merging across speaker boundaries.".format(
              len(utt2dur), len(utt_groups), len(new_utt_groups)),
          file=sys.stderr)
    return new_utt_groups
# Run the built-in sanity checks of CombineList on every invocation.
SelfTest()

if args.min_duration < 0.0:
    # A negative minimum would only trip the assertion inside CombineList
    # later on, so stop here with a clear message on stderr.
    sys.exit("choose_utts_to_combine.py: bad minimum duration {0}".format(
        args.min_duration))
# spk2utt is a list of 2-tuples (speaker-id, [list-of-utterances])
spk2utt = []
# utt2spk is a dict from utterance-id to speaker-id.
utt2spk = dict()
try:
    f = open(args.spk2utt_in)
except (IOError, OSError):
    sys.exit("choose_utts_to_combine.py: error opening --spk2utt={0}".format(args.spk2utt_in))
with f:
    for line in f:
        a = line.split()
        # each line must hold a speaker-id followed by at least one utterance-id.
        if len(a) < 2:
            sys.exit("choose_utts_to_combine.py: bad line in spk2utt file: " + line)
        spk = a[0]
        utts = a[1:]
        spk2utt.append((spk, utts))
        for utt in utts:
            if utt in utt2spk:
                sys.exit("choose_utts_to_combine.py: utterance {0} is listed more than once "
                         "in the spk2utt file {1}".format(utt, args.spk2utt_in))
            utt2spk[utt] = spk
# utt2dur is a dict from utterance-id (as a string) to duration in seconds (as a float)
utt2dur = dict()
try:
    f = open(args.utt2dur_in)
except (IOError, OSError):
    sys.exit("choose_utts_to_combine.py: error opening utt2dur file {0}".format(args.utt2dur_in))
with f:
    for line in f:
        try:
            # ValueError covers both a wrong number of fields (unpacking)
            # and a duration that does not parse as a float.
            [utt, dur] = line.split()
            dur = float(dur)
        except ValueError:
            sys.exit("choose_utts_to_combine.py: bad line in utt2dur file {0}: {1}".format(
                args.utt2dur_in, line))
        utt2dur[utt] = dur
utt_groups = GetUtteranceGroups(args.min_duration, spk2utt, utt2dur)

# utt_group_names holds one name per group, e.g. [ 'utt1', 'utt2-comb2', 'utt4', ... ]:
# a singleton group keeps its utterance-id, a merged group is named after
# its first utterance plus the number of utterances it contains.
utt_group_names = []
for group in utt_groups:
    if len(group) == 1:
        utt_group_names.append(group[0])
    else:
        utt_group_names.append("{0}-comb{1}".format(group[0], len(group)))
# write the utt2utts file: one line per new utterance, giving its name
# followed by the original utterance-ids it is made of.
try:
    with open(args.utt2utts_out, 'w') as f:
        for group_name, members in zip(utt_group_names, utt_groups):
            print(group_name, ' '.join(members), file=f)
except Exception as e:
    sys.exit("choose_utts_to_combine.py: exception writing to "
             "<utt2utts-out>={0}: {1}".format(args.utt2utts_out, str(e)))
# write the utt2spk file: one line per new utterance, giving its name and
# the speaker it is assigned to.
try:
    with open(args.utt2spk_out, 'w') as f:
        for new_utt, utt_group in zip(utt_group_names, utt_groups):
            spk_list = [utt2spk[utt] for utt in utt_group]
            if spk_list == [spk_list[0]] * len(utt_group):
                # all utterances in the group share one speaker.
                spk = spk_list[0]
            else:
                spk2dur = defaultdict(float)
                # spk2dur is a map from the speaker-id to the duration within this
                # utt, that it comprises.
                for utt in utt_group:
                    spk2dur[utt2spk[utt]] += utt2dur[utt]
                # the following code, which picks the speaker that contributed
                # the most to the duration of this utterance, is a little
                # complex because we want to break ties in a deterministic way
                # picking the earlier speaker (in sorted order) in case of a
                # tied duration.
                longest_spk_dur = -1.0
                spk = None
                for this_spk in sorted(spk2dur.keys()):
                    if LessThan(longest_spk_dur, spk2dur[this_spk]):
                        longest_spk_dur = spk2dur[this_spk]
                        spk = this_spk
                assert spk is not None
            print(new_utt, spk, file=f)
except Exception as e:
    sys.exit("choose_utts_to_combine.py: exception writing to "
             "<utt2spk-out>={0}: {1}".format(args.utt2spk_out, str(e)))
# write the utt2dur file: one line per new utterance, giving its name and
# the sum of the durations of the utterances it is made of.
try:
    with open(args.utt2dur_out, 'w') as f:
        for utt_name, members in zip(utt_group_names, utt_groups):
            member_durations = [utt2dur[utt] for utt in members]
            duration = sum(member_durations)
            print(utt_name, duration, file=f)
except Exception as e:
    sys.exit("choose_utts_to_combine.py: exception writing to "
             "<utt2dur-out>={0}: {1}".format(args.utt2dur_out, str(e)))