Yannick Estève / ONTRAC-Kaldi

Blame view

egs/bn_music_speech/v1/local/make_annotations_bn.py 4.68 KB
  #!/usr/bin/env python
  # Copyright 2015   David Snyder
  # Apache 2.0.
  #
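  # This script creates three files for each HUB4 Broadcast News
  # transcript file: one each for the speech, music, and other
  # annotations (commercial, filler, and local-news sections are folded
  # into "other"). Each line of the output files defines the start and
  # end times of an individual event. A list of the processed
  # transcripts is also written to "utt_list".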
  #
  # This file is meant to be invoked by make_bn.sh.
  
  from __future__ import print_function
  import sys, re, os
  
  def is_speech(line):
    """Return True if the line is a <Segment ...> tag that names a speaker."""
    has_segment_tag = "<Segment" in line
    has_speaker = "Speaker=" in line
    return has_segment_tag and has_speaker
  
  def is_other_type2(line):
    """Return True if the line has a Commercial, Filler, or Local_News type."""
    other_markers = ("Type=Commercial", "Type=Filler", "Type=Local_News")
    return any(marker in line for marker in other_markers)
  
  def is_music(line):
    """Return True if the line carries a "Type=Music" attribute."""
    return "Type=Music" in line
  
  def is_other_type1(line):
    """Return True if the line carries a "Type=Other" attribute."""
    return "Type=Other" in line
  
  def extract_speech(line):
    """Parse the start and end times of a speech segment line.

    Args:
      line: A transcript line containing "S_time=<seconds>" and
        "E_time=<seconds>" attributes.

    Returns:
      A (start, end) tuple of floats.

    Raises:
      AttributeError: If either attribute is missing from the line.

    NOTE: when start > end a message is printed, but the pair is still
    returned; nothing is actually skipped inside this function.
    """
    # Raw strings avoid invalid-escape warnings. The dot is escaped so only
    # a real decimal point is accepted, and the fractional part is optional
    # so whole-second times such as "S_time=12" parse as well.
    start_match = re.search(r'(?<=S_time=)\d+(?:\.\d+)?', line)
    start = float(start_match.group(0))
    end_match = re.search(r'(?<=E_time=)\d+(?:\.\d+)?', line)
    end = float(end_match.group(0))
    if start > end:
      print("Skipping annotation where end time is before start time: {}".format(line))
    return start, end
  
  def extract_other_type2(line):
    """Parse the start and end times of a commercial/filler/local-news line.

    Args:
      line: A transcript line containing "S_time=<seconds>" and
        "E_time=<seconds>" attributes.

    Returns:
      A (start, end) tuple of floats.

    Raises:
      AttributeError: If either attribute is missing from the line.

    NOTE: when start > end a message is printed, but the pair is still
    returned; nothing is actually skipped inside this function.
    """
    # Raw strings avoid invalid-escape warnings. The dot is escaped so only
    # a real decimal point is accepted, and the fractional part is optional
    # so whole-second times such as "S_time=12" parse as well.
    start_match = re.search(r'(?<=S_time=)\d+(?:\.\d+)?', line)
    start = float(start_match.group(0))
    end_match = re.search(r'(?<=E_time=)\d+(?:\.\d+)?', line)
    end = float(end_match.group(0))
    if start > end:
      print("Skipping annotation where end time is before start time: {}".format(line))
    return start, end
  
  def extract_music(line):
    """Parse the time and on/off state from a music background line.

    Args:
      line: A transcript line containing "Time=<seconds>" and
        "Level=<token>" attributes.

    Returns:
      A (time, is_on) tuple. is_on is True when the first character of the
      level token is "L" or "H", and False when it is "O".

    Raises:
      AttributeError: If either attribute is missing from the line.
      SystemExit: With status 1 if the level token starts with any other
        character.
    """
    # Raw string with an escaped dot; the fractional part is optional so
    # whole-second times such as "Time=7" parse as well.
    time_match = re.search(r'(?<=Time=)\d+(?:\.\d+)?', line)
    timestamp = float(time_match.group(0))
    # Only the first character of the level token is inspected.
    level_match = re.search(r'(?<=Level=)\w', line)
    level = level_match.group(0)
    if level in ("L", "H"):
      return timestamp, True
    if level == "O":
      return timestamp, False
    # A bare sys.exit() would report success (status 0) to the calling
    # script; report the failure on stderr and exit non-zero instead.
    print("Encountered bad token on line: {}".format(line), file=sys.stderr)
    sys.exit(1)
  
  def extract_other_type1(line):
    """Parse the time and on/off state from an "Other" background line.

    Args:
      line: A transcript line containing "Time=<seconds>" and
        "Level=<token>" attributes.

    Returns:
      A (time, is_on) tuple. is_on is True when the first character of the
      level token is "L" or "H", and False when it is "O".

    Raises:
      AttributeError: If either attribute is missing from the line.
      SystemExit: With status 1 if the level token starts with any other
        character.
    """
    # Raw string with an escaped dot; the fractional part is optional so
    # whole-second times such as "Time=7" parse as well.
    time_match = re.search(r'(?<=Time=)\d+(?:\.\d+)?', line)
    timestamp = float(time_match.group(0))
    # Only the first character of the level token is inspected.
    level_match = re.search(r'(?<=Level=)\w', line)
    level = level_match.group(0)
    if level in ("L", "H"):
      return timestamp, True
    if level == "O":
      return timestamp, False
    # A bare sys.exit() would report success (status 0) to the calling
    # script; report the failure on stderr and exit non-zero instead.
    print("Encountered bad token on line: {}".format(line), file=sys.stderr)
    sys.exit(1)
  
  def process_file(annos):
    """Collect speech, music, and other event times from transcript lines.

    Each line is classified with is_speech / is_other_type2 / is_music /
    is_other_type1 (checked in that order) and its times are extracted with
    the matching extract_* function. Speech and type-2 "other" lines carry
    both a start and an end time. Music and type-1 "other" lines are on/off
    markers: an "on" marker opens a segment (if none is open) and the next
    "off" marker closes it.

    Args:
      annos: An iterable of transcript lines.

    Returns:
      A (speech, music, other) tuple of strings. Each string holds one
      "<start> <end>" pair per line, terminated by a newline. "other" is the
      type-1 segments followed by the type-2 segments.
    """
    speech = []
    music = []
    other_type2 = []
    other_type1 = []
    # Start time of the currently open on/off segment, or None if none is open.
    music_start = None
    other_start = None
    # Largest time seen so far; used to close segments left open at the end.
    max_time = 0.0
    for line in annos:
      if is_speech(line):
        speech_start, speech_end = extract_speech(line)
        speech.append("{} {}\n".format(speech_start, speech_end))
        max_time = max(speech_end, max_time)
      elif is_other_type2(line):
        other_type2_start, other_type2_end = extract_other_type2(line)
        other_type2.append("{} {}\n".format(other_type2_start, other_type2_end))
        max_time = max(other_type2_end, max_time)
      elif is_music(line):
        time, is_on = extract_music(line)
        max_time = max(time, max_time)
        if is_on and music_start is None:
          music_start = time
        elif not is_on and music_start is not None:
          music.append("{} {}\n".format(music_start, time))
          music_start = None
      elif is_other_type1(line):
        time, is_on = extract_other_type1(line)
        max_time = max(time, max_time)
        if is_on and other_start is None:
          other_start = time
        elif not is_on and other_start is not None:
          other_type1.append("{} {}\n".format(other_start, time))
          other_start = None

    # A segment that was switched on but never off runs to the last time seen.
    if music_start is not None:
      music.append("{} {}\n".format(music_start, max_time))
    if other_start is not None:
      other_type1.append("{} {}\n".format(other_start, max_time))

    other = "".join(other_type1) + "".join(other_type2)
    return "".join(speech), "".join(music), other
  
  def main():
    """Convert every ".txt" transcript under in_dir into key files in out_dir.

    Command-line arguments:
      sys.argv[1]: Input directory, walked recursively for "*.txt" files.
      sys.argv[2]: Output directory. For each transcript "<utt>.txt" the files
        "<utt>_speech.key", "<utt>_music.key" and "<utt>_other.key" are
        written there, plus a single "utt_list" with one <utt> per line.
    """
    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    suffix = ".txt"
    utts = []
    for root, _, files in os.walk(in_dir):
      for fname in files:
        if not fname.endswith(suffix):
          continue
        with open(os.path.join(root, fname), 'r') as anno_fi:
          speech, music, other = process_file(anno_fi.readlines())
        # Strip only the trailing extension; str.replace would also remove
        # any ".txt" occurring earlier in the name.
        utt = fname[:-len(suffix)]
        utts.append(utt)
        outputs = (
          ("_speech.key", speech),
          ("_music.key", music),
          ("_other.key", other),
        )
        for key_suffix, contents in outputs:
          # "with" guarantees each file is flushed and closed.
          with open(os.path.join(out_dir, utt + key_suffix), 'w') as key_fi:
            key_fi.write(contents)
    with open(os.path.join(out_dir, "utt_list"), 'w') as utts_fi:
      utts_fi.write("".join(utt + "\n" for utt in utts))
  
  # Run the conversion only when executed as a script, not when imported.
  if __name__ == "__main__":
    main()