Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/dict/internal/get_subsegments.py 4.88 KB
  #!/usr/bin/env python
  
  # Copyright 2018 Xiaohui Zhang
  # Apache 2.0.
  
  # we're using python 3.x style print but want it to work in python 2.x,
  from __future__ import print_function
  import argparse
  import sys
  import string
  
  def GetArgs():
      """Parse the command line and return the checked argument namespace.

      Four positional arguments are expected: the input ctm, the vocab file,
      the output subsegment file and the output text file. The full command
      line is echoed to stderr before parsing.

      Returns:
          Whatever CheckArgs() returns for the parsed namespace.
      """
      # Adjacent string literals are concatenated verbatim, so every fragment
      # that continues a sentence must end with an explicit space.
      parser = argparse.ArgumentParser(
          description="The purpose of this script is to use a ctm and a vocab file "
          "to extract sub-utterances and a sub-segmentation. Extracted sub-utterances "
          "are all the strings of consecutive in-vocab words from the ctm "
          "surrounded by an out-of-vocab word at each end if present.",
          # argparse re-wraps the epilog, so shell line continuations would end
          # up in the middle of a line; the example is given as one command.
          epilog="e.g. steps/dict/internal/get_subsegments.py "
          "exp/tri3_lex_0.4_work/phonetic_decoding/word.ctm "
          "exp/tri3_lex_0.4_work/learn_vocab.txt "
          "exp/tri3_lex_0.4_work/resegmentation/subsegments "
          "exp/tri3_lex_0.4_work/resegmentation/text "
          "See steps/dict/learn_lexicon_greedy.sh for an example.")

      parser.add_argument("ctm", metavar='<ctm>', type=str,
                          help="Input ctm file. "
                          "Each line must be <utt-id> <channel> <start-time> <duration> <word>")
      parser.add_argument("vocab", metavar='<vocab>', type=str,
                          help="Vocab file. "
                          "Each line must be <word>")
      parser.add_argument("subsegment", metavar='<subsegment>', type=str,
                          help="Subsegment file. Each line is in format: "
                          "<new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>")
      parser.add_argument("text", metavar='<text>', type=str,
                          help="Text file. Each line is in format: "
                          "<new-utt> <word1> <word2> ... <wordN>.")

      # Log the invocation so it shows up in the calling script's log.
      print(' '.join(sys.argv), file=sys.stderr)

      args = parser.parse_args()
      args = CheckArgs(args)

      return args
  
  def CheckArgs(args):
      """Open the input/output files named in ``args`` and attach the handles.

      Args:
          args: namespace with string attributes ``ctm``, ``vocab``,
              ``subsegment`` and ``text``. ``"-"`` for ``ctm`` or ``vocab``
              means standard input; an empty ``vocab`` means "no vocab file".

      Returns:
          The same namespace, with ``ctm_handle``, ``vocab_handle`` (``None``
          when ``vocab`` is empty), ``subsegment_handle`` and ``text_handle``
          set.
      """
      if args.ctm == "-":
          args.ctm_handle = sys.stdin
      else:
          args.ctm_handle = open(args.ctm)

      # Always define the attribute so later readers never hit AttributeError.
      args.vocab_handle = None
      # Compare by value: `is not ''` tests object identity, not equality.
      if args.vocab != '':
          if args.vocab == "-":
              # The vocab is an input, so "-" must map to stdin (stdout is
              # not readable).
              args.vocab_handle = sys.stdin
          else:
              args.vocab_handle = open(args.vocab)

      args.subsegment_handle = open(args.subsegment, 'w')
      args.text_handle = open(args.text, 'w')

      return args
  
  def GetSubsegments(args, vocab):
      """Cut the utterances of a ctm into sub-utterances and write them out.

      Lines of ``args.ctm_handle`` are processed in order. For each word:

      * ``<eps>`` only extends the end time of the current sub-utterance;
      * a word contained in ``vocab`` is appended to the current sub-utterance
        and extends its end time;
      * any other word closes the current sub-utterance when the previous
        word was ``<eps>`` or in ``vocab`` (the closing word is included in
        it), and in every case becomes the first word of the next one.

      New sub-utterance ids are ``<utt-id>-<n>`` with ``n`` starting at 1 for
      each utterance. Results are written sorted by new id: one
      ``<new-utt> <words...>`` line to ``args.text_handle`` and one
      ``<new-utt> <old-utt> <start> <end>`` line to ``args.subsegment_handle``.

      Args:
          args: object providing ``ctm_handle`` (iterable of ctm lines),
              ``text_handle`` and ``subsegment_handle`` (writable files).
          vocab: container of words supporting the ``in`` operator.

      Raises:
          Exception: if a ctm line has fewer than five whitespace-separated
              fields.
      """
      sub_utt = []
      # True when the current word is '<eps>' or in vocab, i.e. it extends the
      # current sub-utterance instead of closing it.
      extends_run = False
      extends_run_last = False
      utt_id_last = None
      start_times = {}
      end_times = {}
      sub_utts = {}
      sub_utt_id = 1
      sub_utt_id_last = 1
      end_time_last = 0.0
      for line in args.ctm_handle:
          splits = line.strip().split()
          if len(splits) < 5:
              raise Exception("problematic line", line)

          utt_id = splits[0]
          start = float(splits[2])
          end_time = start + float(splits[3])
          word = splits[4]
          if utt_id != utt_id_last:
              # New utterance: flush what is pending for the previous one
              # (only if it holds more than one word) and restart numbering.
              sub_utt_id = 1
              if len(sub_utt) > 1:
                  key_last = utt_id_last + '-' + str(sub_utt_id_last)
                  sub_utts[key_last] = (utt_id_last, sub_utt)
                  end_times[key_last] = end_time_last
              sub_utt = []
              start_times[utt_id + '-' + str(sub_utt_id)] = start
              extends_run_last = False

          key = utt_id + '-' + str(sub_utt_id)
          if word == '<eps>':
              extends_run = True
              end_times[key] = end_time
          elif word in vocab:
              extends_run = True
              sub_utt.append(word)
              end_times[key] = end_time
          else:
              extends_run = False
              if extends_run_last:
                  # This word terminates the running sub-utterance.
                  sub_utt.append(word)
                  sub_utts[key] = (utt_id, sub_utt)
                  end_times[key] = end_time
                  sub_utt_id += 1
              # It also opens the next sub-utterance (fresh list, so the one
              # just stored above is not modified).
              sub_utt = [word]
              start_times[utt_id + '-' + str(sub_utt_id)] = start
          utt_id_last = utt_id
          sub_utt_id_last = sub_utt_id
          extends_run_last = extends_run
          end_time_last = end_time

      if extends_run:
          # End of input while a sub-utterance is still open. The last word
          # has already been appended inside the loop, so it must not be added
          # again here (doing so duplicated the final word in the text).
          # NOTE: unlike the utterance-boundary flush above, this flush is not
          # conditioned on the sub-utterance holding more than one word.
          key_last = utt_id_last + '-' + str(sub_utt_id_last)
          sub_utts[key_last] = (utt_id_last, sub_utt)
          end_times[key_last] = end_time_last

      for new_utt in sorted(sub_utts):
          old_utt, words = sub_utts[new_utt]
          print(new_utt, ' '.join(words), file=args.text_handle)
          print(new_utt, old_utt, start_times[new_utt], end_times[new_utt],
                file=args.subsegment_handle)
  
  def ReadVocab(vocab_file_handle):
      """Collect the words listed in a vocab file into a set.

      Blank lines are ignored. A falsy handle yields an empty set.

      Args:
          vocab_file_handle: open, readable vocab file with one word per
              line, or a falsy value.

      Returns:
          A set with every word read.

      Raises:
          Exception: if a line contains more than one whitespace-separated
              token.
      """
      words = set()
      if not vocab_file_handle:
          return words

      for entry in vocab_file_handle.readlines():
          tokens = entry.split()
          if not tokens:
              # Empty or whitespace-only line.
              continue
          if len(tokens) != 1:
              raise Exception('Invalid format of line ' + entry
                                  + ' in vocab file.')
          words.add(tokens[0])
      return words
  
  def Main():
      """Script entry point: parse arguments, read the vocab, write outputs.

      Every file handle attached to the argument namespace is closed before
      returning (also on error) so the output files are flushed; the standard
      streams are left open.
      """
      args = GetArgs()
      try:
          # vocab_handle may be absent or None when no vocab file was given;
          # ReadVocab returns an empty vocab for a falsy handle.
          vocab = ReadVocab(getattr(args, 'vocab_handle', None))
          GetSubsegments(args, vocab)
      finally:
          std_streams = (sys.stdin, sys.stdout, sys.stderr)
          for name in ('ctm_handle', 'vocab_handle',
                       'subsegment_handle', 'text_handle'):
              handle = getattr(args, name, None)
              if handle is None:
                  continue
              if any(handle is stream for stream in std_streams):
                  continue
              handle.close()


  if __name__ == "__main__":
      Main()