egs/wsj/s5/steps/dict/internal/sum_arc_info.py
  #!/usr/bin/env python
  
  # Copyright 2018   Xiaohui Zhang
  # Apache 2.0
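
  # This script sums up per-arc lattice statistics (e.g. as produced by
  # lattice-arc-post) for lexicon learning: it strips word-boundary markers
  # from the phones using the supplied phone map, accumulates the posterior of
  # each (word, utterance, pronunciation) triple, and writes the (optionally
  # normalized) pronunciation posteriors to the output stats file.
  #
  # Illustrative invocation (the file names are placeholders; see
  # steps/dict/learn_lexicon_greedy.sh for real usage):
  #   steps/dict/internal/sum_arc_info.py --set-sum-to-one true \
  #     arc_info.txt phone_map.txt stats.txt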
  
  from __future__ import print_function
  from collections import defaultdict
  import argparse
  import sys
  
  class StrToBoolAction(argparse.Action):
      """ A custom action to convert bools from shell format i.e., true/false
          to python format i.e., True/False """
      def __call__(self, parser, namespace, values, option_string=None):
          if values == "true":
              setattr(namespace, self.dest, True)
          elif values == "false":
              setattr(namespace, self.dest, False)
          else:
              raise Exception("Unknown value {0} for --{1}".format(values, self.dest))
  
  
  def GetArgs():
      parser = argparse.ArgumentParser(
          description = "Accumulate statistics from per-arc lattice statistics "
          "for lexicon learning.",
          epilog = "See steps/dict/learn_lexicon_greedy.sh for an example.")

      parser.add_argument("--set-sum-to-one", type = str, default = True,
                          action = StrToBoolAction, choices = ["true", "false"],
                          help = "If true, normalize the posteriors such that the sum "
                          "of pronunciation posteriors of a word in an utterance is 1.")
      parser.add_argument("arc_info_file", metavar = "<arc-info-file>", type = str,
                          help = "File containing per-arc statistics; "
                          "each line must be <utt-id> <start-frame> <duration> "
                          "<posterior> <word> <phones-with-word-boundary-markers>")
      parser.add_argument("phone_map", metavar = "<phone-map>", type = str,
                          help = "An input phone map used to remove word boundary "
                          "markers from phones; generated in steps/cleanup/debug_lexicon.sh")
      parser.add_argument("stats_file", metavar = "<out-stats-file>", type = str,
                          help = "Write accumulated statistics to this file; "
                          "each line is <word> <utt-id> <start-frame> <posterior> "
                          "<phones-without-word-boundary-markers>")
  
      print (' '.join(sys.argv), file=sys.stderr)
  
      args = parser.parse_args()
      args = CheckArgs(args)
  
      return args
  
  def CheckArgs(args):
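      """Open the input/output streams; "-" means reading the arc info from
      stdin and writing the stats to stdout."""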
      if args.arc_info_file == "-":
          args.arc_info_file_handle = sys.stdin
      else:
          args.arc_info_file_handle = open(args.arc_info_file)
      
      args.phone_map_handle = open(args.phone_map)
  
      if args.stats_file == "-":
          args.stats_file_handle = sys.stdout
      else:
          args.stats_file_handle = open(args.stats_file, "w")
  
      return args
  
  def Main():
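      """Accumulate per-pronunciation posteriors from the arc info file and
      write one line per (word, utterance, pronunciation) to the stats file."""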
      args = GetArgs()
  
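      # Accumulators, keyed on (word, utt) unless stated otherwise:
      #   stats[(word, utt)][pron]  -> summed posterior of this pronunciation
      #   sum_tot[(word, utt)]      -> total posterior mass over all pronunciations
      #   start_frames[(word, utt)] -> start frame of the first occurrence seen
      #   prons[word]               -> list of distinct pronunciations seen for the word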
      prons = defaultdict(list)
      start_frames = {}
      stats = defaultdict(lambda: defaultdict(float))
      sum_tot = defaultdict(float)
  
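      # Load the phone map: each line contains a phone with word-boundary
      # markers followed by its stripped form (e.g. "AH_B AH"), so that
      # pronunciations can be compared independently of word position.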
      phone_map = {}
      for line in args.phone_map_handle.readlines():
          splits = line.strip().split()
          phone_map[splits[0]] = splits[1]
  
      for line in args.arc_info_file_handle.readlines():
          splits = line.strip().split()
  
          if (len(splits) == 0):
              continue
  
          if (len(splits) < 6):
              raise Exception('Invalid format of line ' + line
                                  + ' in ' + args.arc_info_file)
  
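          # Expected arc_info fields (see the argparse help): splits[0] = utt-id,
          # splits[1] = start frame, splits[2] = duration (unused here),
          # splits[3] = posterior, splits[4] = word, splits[5:] = phones with
          # word-boundary markers.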
          utt = splits[0]
          start_frame = int(splits[1])
          word = splits[4]
          count = float(splits[3])
          phones_unmapped = splits[5:]
          phones = ' '.join(phone_map[phone] for phone in phones_unmapped)
          if word == '<eps>':
              continue
          if (word, utt) not in start_frames:
              start_frames[(word, utt)] = start_frame

          stats[(word, utt)][phones] += count
          sum_tot[(word, utt)] += count
  
          if phones not in prons[word]:
              prons[word].append(phones)
  
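      # Write one output line per (word, utt, pronunciation):
      #   <word> <utt-id> <start-frame> <posterior> <phones-without-markers>
      # With --set-sum-to-one true (the default) the posteriors of a word's
      # pronunciations in an utterance are rescaled to sum to one, e.g. summed
      # posteriors of 0.6 and 0.3 become 2/3 and 1/3.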
      for (word, utt) in stats:
          count_sum = 0.0
          counts = dict()
          for phones in stats[(word, utt)]:
              count = stats[(word, utt)][phones]
              count_sum += count
              counts[phones] = count
          # By default (--set-sum-to-one true) we normalize the pron posteriors of
          # each word in each utterance, so that they sum up exactly to one. If a
          # word occurs twice in an utterance, the effect of this operation is to
          # average the posteriors of the two occurrences, so that there is only
          # one "equivalent occurrence" of this word in the utterance. However,
          # this case should be extremely rare if the utterances are already short
          # sub-utterances produced by steps/dict/internal/get_subsegments.py.
          for phones in stats[(word, utt)]:
              count = counts[phones]
              if args.set_sum_to_one:
                  count /= count_sum
              print(word, utt, start_frames[(word, utt)], count, phones,
                    file=args.stats_file_handle)
          # Diagnostic info implying an incomplete arc_info or multiple occurrences
          # of a word in an utterance:
          # if count_sum < 0.9 or count_sum > 1.1:
          #     print(word, utt, start_frames[(word, utt)], count_sum,
          #           stats[(word, utt)], file=sys.stderr)
  
      args.stats_file_handle.close()
  
  if __name__ == "__main__":
      Main()