Blame view

egs/bn_music_speech/v1/local/refine_annotations_bn.py 3.83 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
  #!/usr/bin/env python
  # Copyright 2015   David Snyder
  # Apache 2.0.
  #
  # This script refines the annotation files produced by
  # make_annotations_bn.py. In order to create unambiguous annotations,
  # we remove any part of a segment that overlaps with another. Also,
  # this script merges together contiguous segments that have the
  # same annotation, and ensures that only segments longer than a
  # designated length are created.
  #
  # This file is meant to be invoked from make_bn.sh.
  from __future__ import division
  import sys, os
  
  def seg_to_string(seg):
    """Format a segment as one newline-terminated "<start> <end>" line.

    Args:
      seg: a sequence whose first two items are the start and end of
        the segment.

    Returns:
      The formatted line, or an empty string when the segment is empty
      or inverted (start >= end), so such segments add nothing to the
      output.
    """
    start = seg[0]
    end = seg[1]
    if start >= end:
      return ""
    # The line terminator is written as an escape sequence; a raw line
    # break inside the quotes would leave the string literal unterminated.
    return str(start) + " " + str(end) + "\n"
  
  def process_segs(raw_segs):
    """Parse text lines of the form "<start> <end>" into float pairs.

    Args:
      raw_segs: an iterable of strings, each holding two numbers
        separated by a single space (trailing whitespace is ignored).

    Returns:
      A list of (start, end) tuples of floats, in input order.

    Raises:
      ValueError: if a line does not hold exactly two space-separated
        fields, or a field is not a valid float.
    """
    def parse_line(line):
      # Tuple unpacking enforces exactly two fields per line.
      lower, upper = map(float, line.rstrip().split(" "))
      return (lower, upper)

    return [parse_line(line) for line in raw_segs]
  
  def resegment(music, speech, other, frame_length, min_seg):
    """Rebuild non-overlapping music and speech segments frame by frame.

    The timeline is cut into frames. Each frame collects the annotation
    of every input segment that covers it. A frame keeps its class only
    when exactly one "music" or "speech" annotation covers it; frames
    with no annotation, with several annotations, or annotated "other"
    are treated as "other". Consecutive frames of the same class are
    then merged back into segments.

    Args:
      music: list of (start, end) tuples annotated as music.
      speech: list of (start, end) tuples annotated as speech.
      other: list of (start, end) tuples annotated as other.
      frame_length: number of frames per unit of time; despite the name
        it is used as a rate (a time is multiplied by it to get a frame
        index).
      min_seg: only segments strictly longer than this are kept.

    Returns:
      A (new_music, new_speech) tuple of lists of (start, end) tuples.
      Both lists are empty when the input covers no frames.
    """
    all_segs = music + speech + other
    max_duration = max([end for (_, end) in all_segs] + [0])
    # Convert to frames before truncating, so the fractional part of the
    # last time unit is still covered by frames.
    num_frames = int(max_duration * frame_length)
    if num_frames <= 0:
      return [], []

    frame2classes = [[] for _ in range(num_frames)]
    annotate_frames(frame2classes, music, "music", frame_length, num_frames)
    annotate_frames(frame2classes, speech, "speech", frame_length, num_frames)
    annotate_frames(frame2classes, other, "other", frame_length, num_frames)

    # Collapse each frame's list of annotations into a single class.
    frame_labels = []
    for labels in frame2classes:
      if len(labels) == 1 and labels[0] in ("music", "speech"):
        frame_labels.append(labels[0])
      else:
        frame_labels.append("other")

    new_music = []
    new_speech = []
    kept = {"music": new_music, "speech": new_speech}
    start_frame = 0
    # Iterating one step past the last frame closes the final run, which
    # has no following class change to trigger it.
    for i in range(1, num_frames + 1):
      if i < num_frames and frame_labels[i] == frame_labels[start_frame]:
        continue
      start = float(start_frame) / frame_length
      end = float(i) / frame_length
      label = frame_labels[start_frame]
      if label in kept and end - start > min_seg:
        kept[label].append((start, end))
      start_frame = i

    return new_music, new_speech


  def annotate_frames(frame2classes, segs, annotation, frame_length, max_duration):
    """Append an annotation to every frame covered by the given segments.

    Args:
      frame2classes: list with one list of annotations per frame; it is
        modified in place.
      segs: list of (start, end) tuples.
      annotation: the value appended to each covered frame.
      frame_length: number of frames per unit of time.
      max_duration: upper bound on frame indices (a frame count, not a
        time); segment boundaries beyond it are clipped to it.
    """
    for (start, end) in segs:
      # Clamp to [0, max_duration]: a negative index would otherwise wrap
      # around and annotate frames at the end of the list.
      frame_start = max(0, min(int(start * frame_length), max_duration))
      frame_end = max(0, min(int(end * frame_length), max_duration))
      for i in range(frame_start, frame_end):
        frame2classes[i].append(annotation)
  
  def main():
    """Refine the music and speech annotation files of every utterance.

    Command-line arguments:
      1. out_dir: directory holding "utt_list" and, for each utterance
         listed in it, "<utt>_speech.key", "<utt>_music.key" and
         "<utt>_other.key".
      2. frames per second (integer).
      3. minimum segment length (float).

    For each utterance the three key files are read and resegmented, and
    the results are written to "<utt>_speech.key.refined" and
    "<utt>_music.key.refined".
    """
    if len(sys.argv) < 4:
      sys.exit("Usage: " + sys.argv[0]
               + " <out-dir> <frames-per-sec> <min-seg-length>")
    out_dir = sys.argv[1]
    frames_per_sec = int(sys.argv[2])
    min_seg_length = float(sys.argv[3])

    # "with" blocks close every file even if an error occurs part-way.
    with open(os.path.join(out_dir, "utt_list"), 'r') as utt_fi:
      utts = [line.rstrip() for line in utt_fi]

    for utt in utts:
      if not utt:
        # A blank line would otherwise resolve to "<out_dir>/_speech.key".
        continue
      speech_filename = os.path.join(out_dir, utt + "_speech.key")
      music_filename = os.path.join(out_dir, utt + "_music.key")
      other_filename = os.path.join(out_dir, utt + "_other.key")

      with open(speech_filename, 'r') as speech_in:
        speech_segs = process_segs(speech_in.readlines())
      with open(music_filename, 'r') as music_in:
        music_segs = process_segs(music_in.readlines())
      with open(other_filename, 'r') as other_in:
        other_segs = process_segs(other_in.readlines())

      music_segs, speech_segs = resegment(
          music_segs, speech_segs, other_segs, frames_per_sec, min_seg_length)

      speech_output = "".join(seg_to_string(seg) for seg in speech_segs)
      music_output = "".join(seg_to_string(seg) for seg in music_segs)

      with open(speech_filename + ".refined", 'w') as speech_fi:
        speech_fi.write(speech_output)
      with open(music_filename + ".refined", 'w') as music_fi:
        music_fi.write(music_output)
  
  # Run the refinement only when executed as a script, not when imported.
  if __name__ == "__main__":
    main()