make_annotations_bn.py
4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
# Copyright 2015 David Snyder
# Apache 2.0.
#
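# This script creates three files for each HUB4 Broadcast News
# transcript file: <utt>_speech.key, <utt>_music.key and <utt>_other.key
# (the "other" file holds commercial, filler, local-news and "Other"
# events). Each line of an output file defines the start and end times
# of an individual event. A list of the processed transcripts is also
# written to utt_list in the output directory.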
#
# This file is meant to be invoked by make_bn.sh.
from __future__ import print_function
import sys, re, os
def is_speech(line):
    """Return True when *line* contains both ``<Segment`` and ``Speaker=``."""
    return "<Segment" in line and "Speaker=" in line
def is_other_type2(line):
    """Return True when *line* is tagged Commercial, Filler or Local_News."""
    markers = ("Type=Commercial", "Type=Filler", "Type=Local_News")
    return any(marker in line for marker in markers)
def is_music(line):
    """Return True when *line* carries the ``Type=Music`` marker."""
    return "Type=Music" in line
def is_other_type1(line):
    """Return True when *line* carries the ``Type=Other`` marker."""
    return "Type=Other" in line
def extract_speech(line):
    """Parse the start and end times from a speech segment line.

    Args:
        line: annotation line containing ``S_time=<number>`` and
            ``E_time=<number>``; the numbers may be integers or decimals.

    Returns:
        A ``(start, end)`` tuple of floats. When start is greater than end
        a message is printed, but the pair is still returned; dropping the
        annotation is left to the caller.

    Raises:
        ValueError: if ``S_time`` or ``E_time`` cannot be found in the line.
    """
    times = []
    for attr in ("S_time", "E_time"):
        # Raw string with an escaped dot: the separator must be a literal
        # '.', and the fractional part is optional so integers also parse.
        m = re.search(r'(?<=' + attr + r'=)\d+(?:\.\d+)?', line)
        if m is None:
            raise ValueError(
                "Missing or malformed {} on line: {}".format(attr, line))
        times.append(float(m.group(0)))
    start, end = times
    if start > end:
        print("Skipping annotation where end time is before start time: {}".format(line))
    return start, end
def extract_other_type2(line):
    """Parse the start and end times from a section-style "other" line.

    Args:
        line: annotation line containing ``S_time=<number>`` and
            ``E_time=<number>``; the numbers may be integers or decimals.

    Returns:
        A ``(start, end)`` tuple of floats. When start is greater than end
        a message is printed, but the pair is still returned; dropping the
        annotation is left to the caller.

    Raises:
        ValueError: if ``S_time`` or ``E_time`` cannot be found in the line.
    """
    times = []
    for attr in ("S_time", "E_time"):
        # Raw string with an escaped dot: the separator must be a literal
        # '.', and the fractional part is optional so integers also parse.
        m = re.search(r'(?<=' + attr + r'=)\d+(?:\.\d+)?', line)
        if m is None:
            raise ValueError(
                "Missing or malformed {} on line: {}".format(attr, line))
        times.append(float(m.group(0)))
    start, end = times
    if start > end:
        print("Skipping annotation where end time is before start time: {}".format(line))
    return start, end
def extract_music(line):
    """Parse the timestamp and on/off state from a music background line.

    Args:
        line: annotation line containing ``Time=<number>`` and
            ``Level=<token>``; only the first character of the level token
            is examined.

    Returns:
        A ``(time, is_on)`` tuple: the timestamp as a float, and True when
        the level starts with ``L`` or ``H``, False when it starts with ``O``.

    Raises:
        ValueError: if ``Time`` or ``Level`` cannot be found in the line.
        SystemExit: with status 1 when the level is not one of L, H or O.
    """
    # Raw string with an escaped dot; the fractional part is optional so
    # integer timestamps also parse.
    time_match = re.search(r'(?<=Time=)\d+(?:\.\d+)?', line)
    level_match = re.search(r'(?<=Level=)\w', line)
    if time_match is None or level_match is None:
        raise ValueError(
            "Missing or malformed Time/Level on line: {}".format(line))
    timestamp = float(time_match.group(0))
    level = level_match.group(0)
    if level in ("L", "H"):
        return timestamp, True
    if level == "O":
        return timestamp, False
    print("Encountered bad token on line: {}".format(line))
    # Exit non-zero so a calling shell script can detect the failure.
    sys.exit(1)
def extract_other_type1(line):
    """Parse the timestamp and on/off state from an "Other" background line.

    Args:
        line: annotation line containing ``Time=<number>`` and
            ``Level=<token>``; only the first character of the level token
            is examined.

    Returns:
        A ``(time, is_on)`` tuple: the timestamp as a float, and True when
        the level starts with ``L`` or ``H``, False when it starts with ``O``.

    Raises:
        ValueError: if ``Time`` or ``Level`` cannot be found in the line.
        SystemExit: with status 1 when the level is not one of L, H or O.
    """
    # Raw string with an escaped dot; the fractional part is optional so
    # integer timestamps also parse.
    time_match = re.search(r'(?<=Time=)\d+(?:\.\d+)?', line)
    level_match = re.search(r'(?<=Level=)\w', line)
    if time_match is None or level_match is None:
        raise ValueError(
            "Missing or malformed Time/Level on line: {}".format(line))
    timestamp = float(time_match.group(0))
    level = level_match.group(0)
    if level in ("L", "H"):
        return timestamp, True
    if level == "O":
        return timestamp, False
    print("Encountered bad token on line: {}".format(line))
    # Exit non-zero so a calling shell script can detect the failure.
    sys.exit(1)
def process_file(annos):
    """Convert the lines of one transcript into speech/music/other key text.

    Each line is classified with the ``is_*`` predicates in this file.
    Speech and section-style "other" lines carry explicit start/end times.
    Music and background "Other" lines are on/off events: a segment opens
    at the first "on" event and closes at the next "off" event. A segment
    still open at the end of input is closed at the largest time seen.

    Args:
        annos: iterable of annotation lines from one transcript.

    Returns:
        A ``(speech, music, other)`` tuple of strings, each holding one
        ``"<start> <end>\\n"`` entry per segment. ``other`` lists the
        background "Other" segments first, then the section-style ones.
    """
    speech = []
    music = []
    other_type1 = []
    other_type2 = []
    # Start time of the currently open on/off segment, or None when closed.
    music_start = None
    other_start = None
    max_time = 0.0
    for line in annos:
        if is_speech(line):
            speech_start, speech_end = extract_speech(line)
            if speech_start > speech_end:
                # Inverted segment: extract_speech already reported it as
                # skipped, so really leave it out of the output.
                continue
            speech.append("{} {}\n".format(speech_start, speech_end))
            max_time = max(speech_end, max_time)
        elif is_other_type2(line):
            section_start, section_end = extract_other_type2(line)
            if section_start > section_end:
                # Same handling as inverted speech segments above.
                continue
            other_type2.append("{} {}\n".format(section_start, section_end))
            max_time = max(section_end, max_time)
        elif is_music(line):
            event_time, is_on = extract_music(line)
            max_time = max(event_time, max_time)
            if is_on and music_start is None:
                music_start = event_time
            elif not is_on and music_start is not None:
                music.append("{} {}\n".format(music_start, event_time))
                music_start = None
        elif is_other_type1(line):
            event_time, is_on = extract_other_type1(line)
            max_time = max(event_time, max_time)
            if is_on and other_start is None:
                other_start = event_time
            elif not is_on and other_start is not None:
                other_type1.append("{} {}\n".format(other_start, event_time))
                other_start = None
    # Close segments that never received an "off" event.
    if music_start is not None:
        music.append("{} {}\n".format(music_start, max_time))
    if other_start is not None:
        other_type1.append("{} {}\n".format(other_start, max_time))
    return "".join(speech), "".join(music), "".join(other_type1 + other_type2)
def main():
    """Convert every ``.txt`` transcript under a directory into key files.

    Reads the input directory from ``sys.argv[1]`` and the output directory
    from ``sys.argv[2]``. The input directory is walked recursively. For
    each ``<utt>.txt`` the files ``<utt>_speech.key``, ``<utt>_music.key``
    and ``<utt>_other.key`` are written to the output directory. Finally
    ``utt_list`` is written there with one processed utterance name per line.

    Raises:
        SystemExit: with status 1 when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        print("Usage: {} <in-dir> <out-dir>".format(sys.argv[0]),
              file=sys.stderr)
        sys.exit(1)
    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    utts = []
    for root, dirs, files in os.walk(in_dir):
        for fname in files:
            if not fname.endswith(".txt"):
                continue
            with open(os.path.join(root, fname), 'r') as anno_fi:
                speech, music, other = process_file(anno_fi.readlines())
            # Strip only the trailing extension; str.replace would also
            # remove any ".txt" occurring earlier in the file name.
            utt = fname[:-len(".txt")]
            utts.append(utt + "\n")
            outputs = (
                ("_speech.key", speech),
                ("_music.key", music),
                ("_other.key", other),
            )
            for suffix, content in outputs:
                with open(os.path.join(out_dir, utt + suffix), 'w') as out_fi:
                    out_fi.write(content)
    with open(os.path.join(out_dir, "utt_list"), 'w') as utts_fi:
        utts_fi.write("".join(utts))
# Run the conversion only when this file is executed as a script, not
# when it is imported as a module.
if __name__=="__main__":
    main()