Blame view
egs/bn_music_speech/v1/local/refine_annotations_bn.py
3.83 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
#!/usr/bin/env python
# Copyright 2015 David Snyder
# Apache 2.0.
#
# This script refines the annotation files produced by
# make_annotations_bn.py. In order to create unambiguous annotations,
# we remove any part of a segment that overlaps with another. Also,
# this script merges together contiguous segments that have the
# same annotation, and ensures that only segments longer than a
# designated length are created.
#
# This file is meant to be invoked from make_bn.sh.
from __future__ import division
import sys, os

def seg_to_string(seg):
    """Format a (start, end) segment as "start end " (trailing space).

    Degenerate segments (start >= end) produce the empty string so they
    are silently dropped from the output.
    """
    start, end = seg
    if start < end:
        return str(start) + " " + str(end) + " "
    return ""

def process_segs(raw_segs):
    """Parse raw "start end" text lines into a list of (float, float) tuples."""
    segs = []
    for seg in raw_segs:
        lower, upper = [float(i) for i in seg.rstrip().split(" ")]
        segs.append((lower, upper))
    return segs

def resegment(music, speech, other, frame_length, min_seg):
    """Discretize the timeline into frames and rebuild unambiguous segments.

    Each frame claimed by more than one class (or by "other", or by no
    class at all) is relabeled "other".  Contiguous runs of identically
    labeled frames are merged, and only runs longer than min_seg seconds
    of "music" or "speech" are kept.

    Args:
      music, speech, other: lists of (start, end) segments, in seconds.
      frame_length: frames per second used for the discretization.
      min_seg: minimum duration, in seconds, of an output segment.

    Returns:
      (new_music, new_speech): disjoint, merged (start, end) segment lists.
    """
    max_duration = 0
    for (start, end) in music + speech + other:
        if end > max_duration:
            max_duration = end
    # BUGFIX: convert to frames *before* truncating.  The original
    # int(max_duration) * frame_length discarded up to one second of
    # audio at the end of the recording.
    num_frames = int(max_duration * frame_length)
    frame2classes = [[] for _ in range(num_frames)]
    annotate_frames(frame2classes, music, "music", frame_length, num_frames)
    annotate_frames(frame2classes, speech, "speech", frame_length, num_frames)
    annotate_frames(frame2classes, other, "other", frame_length, num_frames)

    # Collapse each frame's label list to one class.  A frame is "music"
    # or "speech" only when exactly one annotation claims it; anything
    # ambiguous (overlap, no label, or explicit "other") becomes "other".
    for i in range(len(frame2classes)):
        if len(frame2classes[i]) == 1 and frame2classes[i][0] in ("music", "speech"):
            frame2classes[i] = frame2classes[i][0]
        else:
            frame2classes[i] = "other"

    new_music = []
    new_speech = []
    if not frame2classes:
        # BUGFIX: with no input segments the original indexed
        # frame2classes[0] and crashed; return empty results instead.
        return new_music, new_speech

    def emit(label, start_frame, end_frame):
        # Record one run of frames if it is long enough; "other" runs
        # are intentionally discarded.
        start = float(start_frame) / frame_length
        end = float(end_frame) / frame_length
        if end - start > min_seg:
            if label == "music":
                new_music.append((start, end))
            elif label == "speech":
                new_speech.append((start, end))

    curr_class = frame2classes[0]
    start_frame = 0
    for i in range(1, len(frame2classes)):
        if curr_class != frame2classes[i]:
            emit(curr_class, start_frame, i)
            start_frame = i
            curr_class = frame2classes[i]
    # BUGFIX: the original only emitted on a class transition, so the
    # final run of frames (the last segment of every utterance) was
    # silently dropped; emit it here.
    emit(curr_class, start_frame, len(frame2classes))
    return new_music, new_speech

def annotate_frames(frame2classes, segs, annotation, frame_length, max_duration):
    """Append `annotation` to every frame covered by each (start, end) segment.

    Frame indices are clamped to max_duration (total frame count) so
    segments extending past the end of the timeline are truncated.
    """
    for (start, end) in segs:
        frame_start = min(int(start * frame_length), max_duration)
        frame_end = min(int(end * frame_length), max_duration)
        for i in range(frame_start, frame_end):
            frame2classes[i].append(annotation)

def main():
    """Refine <utt>_{speech,music,other}.key files into .refined versions.

    argv: out_dir frames_per_sec min_seg_length
    """
    out_dir = sys.argv[1]
    frames_per_sec = int(sys.argv[2])
    min_seg_length = float(sys.argv[3])
    # BUGFIX: use context managers throughout; the original leaked every
    # file handle it opened (read and write).
    with open(os.path.join(out_dir, "utt_list"), 'r') as fi:
        utts = fi.readlines()
    for line in utts:
        utt = line.rstrip()
        speech_filename = os.path.join(out_dir, utt + "_speech.key")
        music_filename = os.path.join(out_dir, utt + "_music.key")
        other_filename = os.path.join(out_dir, utt + "_other.key")
        with open(speech_filename, 'r') as fi:
            speech_segs = process_segs(fi.readlines())
        with open(music_filename, 'r') as fi:
            music_segs = process_segs(fi.readlines())
        with open(other_filename, 'r') as fi:
            other_segs = process_segs(fi.readlines())
        music_segs, speech_segs = resegment(music_segs, speech_segs,
                                            other_segs, frames_per_sec,
                                            min_seg_length)
        music_output = "".join(seg_to_string(seg) for seg in music_segs)
        speech_output = "".join(seg_to_string(seg) for seg in speech_segs)
        with open(speech_filename + ".refined", 'w') as fo:
            fo.write(speech_output)
        with open(music_filename + ".refined", 'w') as fo:
            fo.write(music_output)

if __name__ == "__main__":
    main()