#!/usr/bin/env python
# Copyright 2015 David Snyder
# Apache 2.0.
#
# This script refines the annotation files produced by
# make_annotations_bn.py. To create unambiguous annotations, it
# removes any part of a segment that overlaps with another segment,
# merges contiguous segments that carry the same annotation, and
# keeps only segments longer than a designated minimum length.
#
# This file is meant to be invoked from make_bn.sh.
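#
# Expected invocation (argument roles taken from main() below):
#   refine_annotations_bn.py <out-dir> <frames-per-sec> <min-seg-length>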
from __future__ import division
import sys, os

# Render a (start, end) segment as an output line. Inverted or
# zero-length segments produce no output.
def seg_to_string(seg):
  start = seg[0]
  end = seg[1]
  if start < end:
    return str(start) + " " + str(end) + "\n"
  else:
    return ""
def process_segs(raw_segs):
  segs = []
  for seg in raw_segs:
    lower, upper = [float(i) for i in seg.rstrip().split(" ")]
    segs.append((lower, upper))
  return segs

def resegment(music, speech, other, frame_length, min_seg):
  frame2classes = []
  max_duration = 0
  all_segs = music + speech + other
  for (start, end) in all_segs:
    if end > max_duration:
      max_duration = end
  # Cover the full duration, including any fractional final second.
  num_frames = int(max_duration * frame_length)
  for i in range(0, num_frames):
    frame2classes.append([])
  annotate_frames(frame2classes, music, "music", frame_length, num_frames)
  annotate_frames(frame2classes, speech, "speech", frame_length, num_frames)
  annotate_frames(frame2classes, other, "other", frame_length, num_frames)
  # A frame claimed by more than one class is ambiguous, so it is
  # relabeled "other" and later dropped from the output.
  curr_class = None
  for i in range(0, len(frame2classes)):
    if len(frame2classes[i]) != 1 or frame2classes[i][0] == "other":
      curr_class = "other"
    elif frame2classes[i][0] == "music":
      curr_class = "music"
    elif frame2classes[i][0] == "speech":
      curr_class = "speech"
    else:
      curr_class = "other"
    frame2classes[i] = curr_class
  # Merge runs of identically labeled frames into segments, keeping
  # only music and speech segments longer than min_seg.
  new_music = []
  new_speech = []
  if len(frame2classes) == 0:
    return new_music, new_speech
  curr_class = frame2classes[0]
  start_frame = 0
  for i in range(1, len(frame2classes)):
    if curr_class != frame2classes[i]:
      start = float(start_frame) / frame_length
      end = float(i) / frame_length
      if end - start > min_seg:
        if curr_class == "music":
          new_music.append((start, end))
        elif curr_class == "speech":
          new_speech.append((start, end))
      start_frame = i
      curr_class = frame2classes[i]
  # Flush the final run of frames, which the loop above does not emit.
  start = float(start_frame) / frame_length
  end = float(len(frame2classes)) / frame_length
  if end - start > min_seg:
    if curr_class == "music":
      new_music.append((start, end))
    elif curr_class == "speech":
      new_speech.append((start, end))
  return new_music, new_speech
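
# Toy illustration (not from the original recipe): with
# frame_length=100 and min_seg=0.0,
#   music  = [(0.0, 2.0)]
#   speech = [(1.0, 3.0)]
# the overlapping span [1.0, 2.0) becomes "other", so resegment()
# returns music = [(0.0, 1.0)] and speech = [(2.0, 3.0)].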

# Mark every frame covered by each segment with the given annotation.
# Frame indices are clamped to num_frames.
def annotate_frames(frame2classes, segs, annotation, frame_length, num_frames):
  for (start, end) in segs:
    frame_start = min(int(start * frame_length), num_frames)
    frame_end = min(int(end * frame_length), num_frames)
    for i in range(frame_start, frame_end):
      frame2classes[i].append(annotation)

def main():
  out_dir = sys.argv[1]
  frames_per_sec = int(sys.argv[2])
  min_seg_length = float(sys.argv[3])
  with open(os.path.join(out_dir, "utt_list"), 'r') as f:
    utts = f.readlines()
  for line in utts:
    utt = line.rstrip()
    speech_filename = os.path.join(out_dir, utt + "_speech.key")
    music_filename = os.path.join(out_dir, utt + "_music.key")
    other_filename = os.path.join(out_dir, utt + "_other.key")
    with open(speech_filename, 'r') as f:
      raw_speech_segs = f.readlines()
    with open(music_filename, 'r') as f:
      raw_music_segs = f.readlines()
    with open(other_filename, 'r') as f:
      raw_other_segs = f.readlines()
    speech_segs = process_segs(raw_speech_segs)
    music_segs = process_segs(raw_music_segs)
    other_segs = process_segs(raw_other_segs)
    music_segs, speech_segs = resegment(music_segs, speech_segs, other_segs,
                                        frames_per_sec, min_seg_length)
    speech_output = ""
    music_output = ""
    for seg in music_segs:
      music_output = music_output + seg_to_string(seg)
    for seg in speech_segs:
      speech_output = speech_output + seg_to_string(seg)
    # Write the refined annotations alongside the originals.
    with open(speech_filename + ".refined", 'w') as speech_fi:
      speech_fi.write(speech_output)
    with open(music_filename + ".refined", 'w') as music_fi:
      music_fi.write(music_output)

if __name__ == "__main__":
  main()