Yannick Estève / ONTRAC-Kaldi

Blame view

egs/hub4_english/s5/local/data_prep/hub4_utils.py 5 KB
  # Copyright 2016    Vimal Manohar
  # Apache 2.0.
  
  """This module contains utilities for preparing the HUB4 broadcast news
  evaluation corpora.
  """
  
  import os
  import re
  import sys
  
  
  def parse_uem_line(reco, line):
      """This method parses a 'line' from the UEM for recording 'reco'
      and returns the line converted to kaldi segments format.
      The format of UEM is
      <file-id> <channel> <start-time> <end-time>
  
      We force the channel to be 1 and take the file-id to be the recording-id.
      """
      line = line.strip()
      if len(line) == 0 or line[0:2] == ";;":
          return None
      parts = line.split()
  
      if reco is None:
          reco = parts[0]
  
      # The channel ID is expected to be 1.
      if parts[1] != "1":
          raise TypeError("Invalid line {0}".format(line))
  
      start_time = float(parts[2])
      end_time = float(parts[3])
  
      utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100),
                                         int(end_time * 100))
      return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time)
  
  
  def parse_cmu_seg_line(line, prepend_reco_to_spk=False):
      """This line parses a 'line' from the CMU automatic segmentation for
      recording.
      The CMU segmentation has the following format:
      <file> <channel> <speaker> <start-time> <end-time> <condition>
  
      e.g.:
      h4e_98_1 1 F0-0000     0.00    28.22 F0
  
      We force the channel to be 1 and take the file-id to be the recording-id.
      """
      line = line.strip()
      if len(line) == 0 or line[0:2] == ";;":
          return None
      parts = line.split()
  
      # Actually a file, but we assuming 1-1 mapping to recording and force
      # channel to be 1.
      reco = parts[0]
  
      # The channel ID is expected to be 1.
      if parts[1] != "1":
          raise TypeError("Invalid line {0}".format(line))
      spk = parts[2]
  
      start_time = float(parts[3])
      end_time = float(parts[4])
  
      if prepend_reco_to_spk:
          spk = reco + '-' + spk
          utt = "{spk}-{0:06d}-{1:06d}".format(int(start_time * 100),
                                               int(end_time * 100), spk=spk)
      else:
          utt = "{spk}-{reco}-{0:06d}-{1:06d}".format(int(start_time * 100),
                                                      int(end_time * 100),
                                                      reco=reco, spk=spk)
  
      segment_line = "{0} {1} {st:.3f} {end:.3f}".format(
          utt, reco, st=start_time, end=end_time)
      utt2spk_line = "{0} {1}".format(utt, spk)
  
      return (segment_line, utt2spk_line)
  
  
  def normalize_csr_transcript(text, noise_word, spoken_noise_word):
      """Normalize broadcast news transcript for audio."""
      text = text.upper()
  
      # Remove long event markings
      text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text)
      # Remove comments
      text = re.sub(r"\{\{[^}]*\}\}", "", text)
      # Replace alternative words with a single one (second alternative)
      text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text)
      # Remove partial word completions
      text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text)
      # Remove accent marks and diacritics
      text = re.sub(r"\\[3-8]", "", text)
  
      # Remove unclear speech markings
      text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
      text = re.sub(r"#", "", text)   # Remove overlapped speech markings
      # Remove invented word markings
      text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
      # Replace speaker-made noises with <SPOKEN_NOISE>
      text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]",
                    spoken_noise_word, text)
      # Replace noise with <NOISE>
      text = re.sub(r"\[[^]]+\]", noise_word, text)
      text = re.sub(r"\+([^+]+)\+", r"\1", text)
  
      # Remove periods after letter.
      text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text)
      # Replace \. with .
      text = re.sub(r"\\.", r".", text)
  
      text1 = []
      for word in text.split():
          if word == spoken_noise_word or word == noise_word:
              text1.append(word)
              continue
  
          # Remove mispronunciation brackets
          word = re.sub(r"^@(\w+)$", r"\1", word)
          # Remove everything other than the standard ASCII symbols
          word = re.sub("[^A-Za-z0-9.' _-]", "", word)
          text1.append(word)
      return " ".join(text1)
  
  
  def remove_punctuations(text):
      """Remove punctuations and some other processing for text sentence."""
      # Remove HTML new lines that are not end of sentences
      text1 = re.sub("
  ", " ", text)
  
      # Remove some markers like double dash that are normally used to separate
      # name titles in newspapers.
      text1 = re.sub(r"(&[^;]+;|--)", " ", text1)
  
      # Remove quotation marks
      text1 = re.sub(r"''|``|\(|\)", " ", text1)
  
      # Remove everything other than the standard ASCII symbols
      text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1)
  
      # Replace multiple .'s with single and then remove isolated '.'
      text1 = re.sub(r"\.[.]+ ", ".", text1)
      text1 = re.sub(r" \. ", " ", text1)
  
      # Remove isolated '-'
      text1 = re.sub(r" - ", " ", text1)
  
      # Replace multiple spaces with single.
      text1 = re.sub(r"[ ]+", " ", text1)
  
      return text1