make_annotations_bn.py 4.68 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154


#!/usr/bin/env python
# Copyright 2015   David Snyder
# Apache 2.0.
#
# This script creates three files for each HUB4 Broadcast News
# transcript file: one each for the speech, music, and other
# annotations (commercial, filler and local-news sections are written
# to the "other" file). Each line of the output files defines the
# start and end times of an individual event.
#
# This file is meant to be invoked by make_bn.sh.

from __future__ import print_function
import sys, re, os

def is_speech(line):
  """Return True if the line holds a <Segment tag that names a speaker."""
  return "<Segment" in line and "Speaker=" in line

def is_other_type2(line):
  """Return True if the line is tagged Commercial, Filler or Local_News."""
  markers = ("Type=Commercial", "Type=Filler", "Type=Local_News")
  return any(marker in line for marker in markers)

def is_music(line):
  """Return True if the line carries a Type=Music attribute."""
  return "Type=Music" in line

def is_other_type1(line):
  """Return True if the line carries a Type=Other attribute."""
  return "Type=Other" in line

def extract_speech(line):
  """Parse the S_time and E_time attributes of a speech segment line.

  Args:
    line: A transcript line containing "S_time=<number>" and
      "E_time=<number>".

  Returns:
    A (start, end) tuple of floats.

  Raises:
    ValueError: If either attribute cannot be found on the line.
  """
  # Raw strings keep "\d" from being read as a string escape, and the
  # decimal point is escaped so it cannot match an arbitrary character.
  # The fractional part is optional so whole-number times parse as well.
  m_start = re.search(r'(?<=S_time=)\d+(?:\.\d+)?', line)
  m_end = re.search(r'(?<=E_time=)\d+(?:\.\d+)?', line)
  if m_start is None or m_end is None:
    raise ValueError("Missing S_time or E_time on line: {}".format(line))
  start = float(m_start.group(0))
  end = float(m_end.group(0))
  if start > end:
    # NOTE(review): despite the wording of this message the pair is still
    # returned; nothing in this function drops the annotation.
    print("Skipping annotation where end time is before start time: {}".format(line))
  return start, end

def extract_other_type2(line):
  """Parse the S_time and E_time attributes of a typed section line.

  Args:
    line: A transcript line containing "S_time=<number>" and
      "E_time=<number>".

  Returns:
    A (start, end) tuple of floats.

  Raises:
    ValueError: If either attribute cannot be found on the line.
  """
  # Raw strings keep "\d" from being read as a string escape, and the
  # decimal point is escaped so it cannot match an arbitrary character.
  # The fractional part is optional so whole-number times parse as well.
  m_start = re.search(r'(?<=S_time=)\d+(?:\.\d+)?', line)
  m_end = re.search(r'(?<=E_time=)\d+(?:\.\d+)?', line)
  if m_start is None or m_end is None:
    raise ValueError("Missing S_time or E_time on line: {}".format(line))
  start = float(m_start.group(0))
  end = float(m_end.group(0))
  if start > end:
    # NOTE(review): despite the wording of this message the pair is still
    # returned; nothing in this function drops the annotation.
    print("Skipping annotation where end time is before start time: {}".format(line))
  return start, end

def extract_music(line):
  """Parse the Time and Level attributes of a music background line.

  Args:
    line: A transcript line containing "Time=<number>" and "Level=<word>".

  Returns:
    A (time, is_on) tuple. time is a float; is_on is True when the level
    starts with "L" or "H" and False when it starts with "O" (presumably
    Low / High / Off -- confirm against the transcript specification).

  Raises:
    ValueError: If the Time or Level attribute cannot be found.
    SystemExit: If the level starts with any other character. The exit
      carries the error message, so the process status is non-zero.
  """
  # Raw strings keep "\d" and "\w" from being read as string escapes; the
  # decimal point is escaped and the fractional part is optional.
  m_time = re.search(r'(?<=Time=)\d+(?:\.\d+)?', line)
  m_level = re.search(r'(?<=Level=)\w', line)
  if m_time is None or m_level is None:
    raise ValueError("Missing Time or Level on line: {}".format(line))
  time = float(m_time.group(0))
  level = m_level.group(0)
  if level in ("L", "H"):
    return time, True
  if level == "O":
    return time, False
  # A bare sys.exit() would report success (status 0); passing the message
  # prints it to stderr and exits with a failure status instead.
  sys.exit("Encountered bad token on line: {}".format(line))

def extract_other_type1(line):
  """Parse the Time and Level attributes of an "other" background line.

  Args:
    line: A transcript line containing "Time=<number>" and "Level=<word>".

  Returns:
    A (time, is_on) tuple. time is a float; is_on is True when the level
    starts with "L" or "H" and False when it starts with "O" (presumably
    Low / High / Off -- confirm against the transcript specification).

  Raises:
    ValueError: If the Time or Level attribute cannot be found.
    SystemExit: If the level starts with any other character. The exit
      carries the error message, so the process status is non-zero.
  """
  # Raw strings keep "\d" and "\w" from being read as string escapes; the
  # decimal point is escaped and the fractional part is optional.
  m_time = re.search(r'(?<=Time=)\d+(?:\.\d+)?', line)
  m_level = re.search(r'(?<=Level=)\w', line)
  if m_time is None or m_level is None:
    raise ValueError("Missing Time or Level on line: {}".format(line))
  time = float(m_time.group(0))
  level = m_level.group(0)
  if level in ("L", "H"):
    return time, True
  if level == "O":
    return time, False
  # A bare sys.exit() would report success (status 0); passing the message
  # prints it to stderr and exits with a failure status instead.
  sys.exit("Encountered bad token on line: {}".format(line))

def process_file(annos):
  """Turn the lines of one transcript into speech, music and other keys.

  Speech lines and Commercial/Filler/Local_News lines each carry their own
  start and end time and produce one output line apiece. Music and Other
  background lines are on/off markers: an "on" marker opens a region and
  the next "off" marker closes it. A region still open after the last line
  is closed at the largest time seen anywhere in the file.

  Args:
    annos: An iterable of transcript lines.

  Returns:
    A (speech, music, other) tuple of strings. Each string holds zero or
    more "<start> <end>\n" lines. In "other", the Type=Other regions come
    first, followed by the Commercial/Filler/Local_News sections.
  """
  # Collect lines in lists and join once at the end; re-formatting the
  # whole accumulated string on every append is quadratic in output size.
  speech = []
  music = []
  other_type1 = []
  other_type2 = []
  # Start time of the currently open region, or None when no region is open.
  music_start = None
  other_start = None
  max_time = 0.0
  for line in annos:
    if is_speech(line):
      start, end = extract_speech(line)
      speech.append("{} {}\n".format(start, end))
      max_time = max(end, max_time)
    elif is_other_type2(line):
      start, end = extract_other_type2(line)
      other_type2.append("{} {}\n".format(start, end))
      max_time = max(end, max_time)
    elif is_music(line):
      time, is_on = extract_music(line)
      max_time = max(time, max_time)
      if is_on and music_start is None:
        music_start = time
      elif not is_on and music_start is not None:
        music.append("{} {}\n".format(music_start, time))
        music_start = None
      # A repeated "on" or a stray "off" marker is ignored.
    elif is_other_type1(line):
      time, is_on = extract_other_type1(line)
      max_time = max(time, max_time)
      if is_on and other_start is None:
        other_start = time
      elif not is_on and other_start is not None:
        other_type1.append("{} {}\n".format(other_start, time))
        other_start = None

  # Close any region that never received its "off" marker.
  if music_start is not None:
    music.append("{} {}\n".format(music_start, max_time))
  if other_start is not None:
    other_type1.append("{} {}\n".format(other_start, max_time))

  other = "".join(other_type1) + "".join(other_type2)
  return "".join(speech), "".join(music), other

def main():
  """Convert every .txt transcript under an input directory to key files.

  Command-line arguments:
    sys.argv[1]: Directory that is searched recursively for ".txt" files.
    sys.argv[2]: Existing directory that receives the output files.

  For each transcript <utt>.txt this writes <utt>_speech.key,
  <utt>_music.key and <utt>_other.key into the output directory, and
  finally an "utt_list" file with one <utt> per line.
  """
  if len(sys.argv) < 3:
    sys.exit("Usage: {} <in-dir> <out-dir>".format(sys.argv[0]))
  in_dir = sys.argv[1]
  out_dir = sys.argv[2]
  utts = []
  for root, _dirs, files in os.walk(in_dir):
    for fname in files:
      if not fname.endswith(".txt"):
        continue
      # Context managers make sure every handle is flushed and closed.
      with open(os.path.join(root, fname), 'r') as anno_fi:
        anno_in = anno_fi.readlines()
      speech, music, other = process_file(anno_in)
      # Strip only the trailing extension; str.replace would also remove
      # any ".txt" occurring earlier in the file name.
      utt = fname[:-len(".txt")]
      utts.append(utt + "\n")
      outputs = (
        ("_speech.key", speech),
        ("_music.key", music),
        ("_other.key", other),
      )
      for suffix, contents in outputs:
        with open(os.path.join(out_dir, utt + suffix), 'w') as out_fi:
          out_fi.write(contents)
  with open(os.path.join(out_dir, "utt_list"), 'w') as utts_fi:
    utts_fi.write("".join(utts))

# Run the conversion only when this file is executed as a script, not when
# it is imported.
if __name__=="__main__":
  main()