#!/usr/bin/env python
# Copyright 2015 David Snyder
# Apache 2.0.
#
# This script refines the annotation files produced by
# make_annotations_bn.py. To create unambiguous annotations, it
# removes any part of a segment that overlaps with another segment,
# merges contiguous segments that carry the same annotation, and
# keeps only segments longer than a designated minimum length.
#
# This file is meant to be invoked from make_bn.sh.
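#
# Expected invocation (argument roles taken from main() below):
#   refine_annotations_bn.py <out-dir> <frames-per-sec> <min-seg-length>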
from __future__ import division
import sys, os

# Render a (start, end) segment as an output line. Inverted or
# zero-length segments produce no output.
def seg_to_string(seg):
  start = seg[0]
  end = seg[1]
  if start < end:
    return str(start) + " " + str(end) + "\n"
  else:
    return ""
def process_segs(raw_segs):
  segs = []
  for seg in raw_segs:
    lower, upper = [float(i) for i in seg.rstrip().split(" ")]
    segs.append((lower, upper))
  return segs

def resegment(music, speech, other, frame_length, min_seg):
  frame2classes = []
  max_duration = 0
  all_segs = music + speech + other
  for (start, end) in all_segs:
    if end > max_duration:
      max_duration = end
  # Cover the full duration, including any fractional final second.
  num_frames = int(max_duration * frame_length)
  for i in range(0, num_frames):
    frame2classes.append([])
  annotate_frames(frame2classes, music, "music", frame_length, num_frames)
  annotate_frames(frame2classes, speech, "speech", frame_length, num_frames)
  annotate_frames(frame2classes, other, "other", frame_length, num_frames)
  # A frame claimed by more than one class is ambiguous, so it is
  # relabeled "other" and later dropped from the output.
  curr_class = None
  for i in range(0, len(frame2classes)):
    if len(frame2classes[i]) != 1 or frame2classes[i][0] == "other":
      curr_class = "other"
    elif frame2classes[i][0] == "music":
      curr_class = "music"
    elif frame2classes[i][0] == "speech":
      curr_class = "speech"
    else:
      curr_class = "other"
    frame2classes[i] = curr_class
  # Merge runs of identically labeled frames into segments, keeping
  # only music and speech segments longer than min_seg.
  new_music = []
  new_speech = []
  if len(frame2classes) == 0:
    return new_music, new_speech
  curr_class = frame2classes[0]
  start_frame = 0
  for i in range(1, len(frame2classes)):
    if curr_class != frame2classes[i]:
      start = float(start_frame) / frame_length
      end = float(i) / frame_length
      if end - start > min_seg:
        if curr_class == "music":
          new_music.append((start, end))
        elif curr_class == "speech":
          new_speech.append((start, end))
      start_frame = i
      curr_class = frame2classes[i]
  # Flush the final run of frames, which the loop above does not emit.
  start = float(start_frame) / frame_length
  end = float(len(frame2classes)) / frame_length
  if end - start > min_seg:
    if curr_class == "music":
      new_music.append((start, end))
    elif curr_class == "speech":
      new_speech.append((start, end))
  return new_music, new_speech
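
# Toy illustration (not from the original recipe): with
# frame_length=100 and min_seg=0.0,
#   music  = [(0.0, 2.0)]
#   speech = [(1.0, 3.0)]
# the overlapping span [1.0, 2.0) becomes "other", so resegment()
# returns music = [(0.0, 1.0)] and speech = [(2.0, 3.0)].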

# Mark every frame covered by each segment with the given annotation.
# Frame indices are clamped to num_frames.
def annotate_frames(frame2classes, segs, annotation, frame_length, num_frames):
  for (start, end) in segs:
    frame_start = min(int(start * frame_length), num_frames)
    frame_end = min(int(end * frame_length), num_frames)
    for i in range(frame_start, frame_end):
      frame2classes[i].append(annotation)

def main():
  out_dir = sys.argv[1]
  frames_per_sec = int(sys.argv[2])
  min_seg_length = float(sys.argv[3])
  with open(os.path.join(out_dir, "utt_list"), 'r') as f:
    utts = f.readlines()
  for line in utts:
    utt = line.rstrip()
    speech_filename = os.path.join(out_dir, utt + "_speech.key")
    music_filename = os.path.join(out_dir, utt + "_music.key")
    other_filename = os.path.join(out_dir, utt + "_other.key")
    with open(speech_filename, 'r') as f:
      raw_speech_segs = f.readlines()
    with open(music_filename, 'r') as f:
      raw_music_segs = f.readlines()
    with open(other_filename, 'r') as f:
      raw_other_segs = f.readlines()
    speech_segs = process_segs(raw_speech_segs)
    music_segs = process_segs(raw_music_segs)
    other_segs = process_segs(raw_other_segs)
    music_segs, speech_segs = resegment(music_segs, speech_segs, other_segs,
                                        frames_per_sec, min_seg_length)
    speech_output = ""
    music_output = ""
    for seg in music_segs:
      music_output = music_output + seg_to_string(seg)
    for seg in speech_segs:
      speech_output = speech_output + seg_to_string(seg)
    # Write the refined annotations alongside the originals.
    with open(speech_filename + ".refined", 'w') as speech_fi:
      speech_fi.write(speech_output)
    with open(music_filename + ".refined", 'w') as music_fi:
      music_fi.write(music_output)

if __name__ == "__main__":
  main()