egs/wsj/s5/steps/dict/select_prons_greedy.py

  #!/usr/bin/env python
  
  # Copyright 2018  Xiaohui Zhang
  # Apache 2.0.
  
  from __future__ import print_function
  from collections import defaultdict
  import argparse
  import sys
  import math
  
  def GetArgs():
      parser = argparse.ArgumentParser(
          description = "Use a greedy framework to select pronunciation candidates "
          "from three sources: a reference lexicon, a G2P lexicon and a phonetic-decoding "
          "(PD) lexicon. Basically, this script implements Alg. 1 in the paper: "
          "Acoustic data-driven lexicon learning based on a greedy pronunciation "
          "selection framework, by X. Zhang, V. Manohar, D. Povey and S. Khudanpur, "
          "Interspeech 2017. The inputs are an arc-stats file, containing "
          "acoustic evidence (tau_{uwb} in the paper), and three source lexicons "
          "(phonetic-decoding(PD)/G2P/ref). The output is the learned lexicon for "
          "all words in the arc-stats (acoustic evidence) file.",
          epilog = "See steps/dict/learn_lexicon_greedy.sh for example.")
      parser.add_argument("--alpha", type = str, default = "0,0,0",
                          help = "Scaling factors for the likelihood reduction threshold "
                          "of the three pronunciation candidate sources: phonetic-decoding (PD), "
                          "G2P and reference. The valid range of each dimension is [0, 1], and "
                          "a large value means we prune pronunciations from this source more "
                          "aggressively. Setting a dimension to zero means we never want to remove "
                          "pronunciations from that source. See Section 4.3 in the paper for details.")
      parser.add_argument("--beta", type = str, default = "0,0,0",
                          help = "Smoothing factors for the likelihood reduction term "
                          "of the three pronunciation candidate sources: phonetic-decoding (PD), "
                          "G2P and reference. The valid range of each dimension is [0, 100], and "
                          "a large value means we prune pronunciations from this source more "
                          "aggressively. See Section 4.3 in the paper for details.")
      parser.add_argument("--delta", type = float, default = 0.000000001,
                          help = "Floor value of the pronunciation posterior statistics. "
                          "The valid range is (0, 0.01). "
                          "See Section 3 in the paper for details.")
      parser.add_argument("silence_phones_file", metavar = "<silphone-file>", type = str,
                          help = "File containing a list of silence phones.")
      parser.add_argument("arc_stats_file", metavar = "<arc-stats-file>", type = str,
                          help = "File containing word-pronunciation statistics obtained from lattices; "
                          "each line must be <word> <utt-id> <start-frame> <count> <phones>")
      parser.add_argument("word_counts_file", metavar = "<counts-file>", type = str,
                          help = "File containing word counts in acoustic training data; "
                          "each line must be <word> <count>.")
      parser.add_argument("ref_lexicon", metavar = "<reference-lexicon>", type = str,
                          help = "The reference lexicon (most probably hand-derived)."
                          "Each line must be <word> <phones>")
      parser.add_argument("g2p_lexicon", metavar = "<g2p-expanded-lexicon>", type = str,
                          help = "Candidate ronouciations from G2P results."
                          "Each line must be <word> <phones>")
      parser.add_argument("pd_lexicon", metavar = "<phonetic-decoding-lexicon>", type = str,
                          help = "Candidate ronouciations from phonetic decoding results."
                          "Each line must be <word> <phones>")
      parser.add_argument("learned_lexicon", metavar = "<learned-lexicon>", type = str,
                          help = "Learned lexicon.")
  
  
      print (' '.join(sys.argv), file=sys.stderr)
  
      args = parser.parse_args()
      args = CheckArgs(args)
  
      return args
  
  def CheckArgs(args):
      args.silence_phones_file_handle = open(args.silence_phones_file)
      if args.arc_stats_file == "-":
          args.arc_stats_file_handle = sys.stdin
      else:
          args.arc_stats_file_handle = open(args.arc_stats_file)
      args.word_counts_file_handle = open(args.word_counts_file)
      args.ref_lexicon_handle = open(args.ref_lexicon)
      args.g2p_lexicon_handle = open(args.g2p_lexicon)
      args.pd_lexicon_handle = open(args.pd_lexicon)
      args.learned_lexicon_handle = open(args.learned_lexicon, "w")
      
      alpha = args.alpha.strip().split(',')
      if len(alpha) != 3:
          raise Exception('Invalid alpha ', args.alpha)
      for i in range(0,3):
          if float(alpha[i]) < 0 or float(alpha[i]) > 1:
              raise Exception('alpha ', alpha[i],
                              ' is invalid, it must be within [0, 1].')
          if float(alpha[i]) == 0:
              alpha[i] = -1e-3
          # The absolute likelihood loss (search for loss_abs) is supposed to be positive.
          # But it could be negative near zero because of numerical precision limit.
          # In this case, even if alpha is set to be zero, which means we never want to
          # remove pronunciation from that source, the quality score (search for q_b)
          # could still be negative, which means this pron could be potentially removed.
          # To prevent this, we set alpha as a negative value near zero to ensure
          # q_b is always positive.
  
      args.alpha = [float(alpha[0]), float(alpha[1]), float(alpha[2])]
      print("[alpha_{pd}, alpha_{g2p}, alpha_{ref}] is: ", args.alpha)
      beta = args.beta.strip().split(',')
      if len(beta) != 3:
          raise Exception('Invalid beta ', args.beta)
      for i in range(0,3):
          if float(beta[i]) < 0 or float(beta[i]) > 100:
              raise Exception('beta ', beta[i], 
                              ' is invalid, it must be within [0, 100].')
      args.beta = [float(beta[0]), float(beta[1]), float(beta[2])]
      print("[beta_{pd}, beta_{g2p}, beta_{ref}] is: ", args.beta)
  
      if args.delta <= 0 or args.delta > 0.01:
          raise Exception('delta ', args.delta, ' is invalid, it must be within '
                          '(0, 0.01).')
      print("delta is: ", args.delta)
  
      return args
  
  def ReadArcStats(arc_stats_file_handle):
      stats = defaultdict(lambda : defaultdict(dict))
      stats_summed = defaultdict(float)
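      # Resulting structures:
      #   stats[word][(utt-id, start-frame)][pron] = soft count, i.e. the per-arc acoustic
      #     evidence tau_{uwb} in the paper (pron is a space-joined phone string);
      #   stats_summed[(word, pron)] = the same soft counts summed over all arcs.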
      for line in arc_stats_file_handle.readlines():
          splits = line.strip().split()
  
          if (len(splits) == 0):
              continue
  
          if (len(splits) < 5):
              raise Exception('Invalid format of line ' + line
                              + ' in arc stats file.')
          utt = splits[1]
          start_frame = int(splits[2])
          word = splits[0]
          count = float(splits[3])
          phones = splits[4:]
          phones = ' '.join(phones)
          stats[word][(utt, start_frame)][phones] = count
          stats_summed[(word, phones)] += count
      return stats, stats_summed
  
  def ReadWordCounts(word_counts_file_handle):
      counts = {}
      for line in word_counts_file_handle.readlines():
          splits = line.strip().split()
          if len(splits) < 2:
              raise Exception('Invalid format of line ' + line
                                  + ' in counts file.')
          word = splits[0]
          count = int(splits[1])
          counts[word] = count
      return counts
  
  def ReadLexicon(args, lexicon_file_handle, counts):
      # We skip any word not present in counts (i.e. not seen in the acoustic training
      # data), because we only learn prons for words that have acoustic examples.
      lexicon = defaultdict(set)
      for line in lexicon_file_handle.readlines():
          splits = line.strip().split()
          if len(splits) == 0:
              continue
          if len(splits) < 2:
              raise Exception('Invalid format of line ' + line
                                  + ' in lexicon file.')
          word = splits[0]
          if word not in counts:
              continue
          phones = ' '.join(splits[1:])
          lexicon[word].add(phones)
      return lexicon
  
  def FilterPhoneticDecodingLexicon(args, pd_lexicon):
      # We want to remove all candidates which contain silence phones
      silphones = set()
      for line in args.silence_phones_file_handle:
          silphones.add(line.strip())
      rejected_candidates = set()
      for word, prons in pd_lexicon.items():
          for pron in prons:
              for phone in pron.split():
                  if phone in silphones:
                     rejected_candidates.add((word, pron))
                     break
      for word, pron in rejected_candidates:
          pd_lexicon[word].remove(pron)
      return pd_lexicon
  
  # One iteration of Expectation-Maximization computation (Eq. 3-4 in the paper).
  def OneEMIter(args, word, stats, prons, pron_probs, debug=False):
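      # Sketch of the update implemented below, in the paper's notation (assuming
      # M_w = len(stats[word]) arcs for this word and soft counts tau floored at args.delta):
      #   E-step (per arc u):  gamma_{u,b} = tau_{u,b} * p_b / sum_{b'} tau_{u,b'} * p_{b'}
      #   M-step:              p_b <- (1 / M_w) * sum_u gamma_{u,b}
      # The returned log_like is the average over arcs of log(sum_b tau_{u,b} * p_b).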
      prob_acc = [0.0 for i in range(len(prons[word]))]
      s = sum(pron_probs)
      for i in range(len(pron_probs)):
          pron_probs[i] = pron_probs[i] / s
      log_like = 0.0
      for (utt, start_frame) in stats[word]:
          prob = []
          soft_counts = []
          for i in range(len(prons[word])):
              phones = prons[word][i]
              soft_count = stats[word][(utt, start_frame)].get(phones, 0)
              if soft_count < args.delta: 
                  soft_count = args.delta
              soft_counts.append(soft_count)
          prob = [i[0] * i[1] for i in zip(soft_counts, pron_probs)]
          for i in range(len(prons[word])):
              prob_acc[i] += prob[i] / sum(prob)
          log_like += math.log(sum(prob))
      pron_probs = [1.0 / float(len(stats[word])) * p for p in prob_acc]
      log_like = 1.0 / float(len(stats[word])) * log_like
      if debug:
          print("Log_like of the word: ", log_like, "pron probs: ", pron_probs)
      return pron_probs, log_like
  
  def SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon, diagnostic_info=False):
      prons = defaultdict(list) # Put all possible prons from three source lexicons into this dictionary
      src = {} # Source of each (word, pron) pair: 'P' = phonetic-decoding, 'G' = G2P, 'R' = reference
      learned_lexicon = defaultdict(set) # Put all selected prons in this dictionary
      for lexicon in ref_lexicon, g2p_lexicon, pd_lexicon:
          for word in lexicon:
              for pron in lexicon[word]:
                  prons[word].append(pron)
      for word in prons:
          for pron in prons[word]:
              if word in pd_lexicon and pron in pd_lexicon[word]:
                  src[(word, pron)] = 'P'
              if word in g2p_lexicon and pron in g2p_lexicon[word]:
                  src[(word, pron)] = 'G'
              if word in ref_lexicon and pron in ref_lexicon[word]:
                  src[(word, pron)] = 'R'
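      # Note: when a pron appears in more than one source lexicon, the later assignment
      # above wins, so the effective source precedence is reference > G2P > phonetic-decoding.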
     
      for word in prons:
          if word not in stats:
              continue
          n = len(prons[word])
          pron_probs = [1/float(n) for i in range(n)]
          if diagnostic_info:
              print("pronunciations of word '{}': {}".format(word, prons[word]))
          active_indexes = set(range(len(prons[word])))
         
          deleted_prons = [] # indexes of prons to be deleted
          soft_counts_normalized = []
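          # Greedy pruning loop: run EM to convergence on the active candidates; then, for
          # each active pron, tentatively zero out its prob, re-run EM and measure the
          # likelihood loss. Each pron whose quality score q_b = loss - thr is negative
          # becomes a deletion candidate, the one with the lowest q_b is removed, and we
          # repeat until no quality score is negative or only one pron remains.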
          while len(active_indexes) > 1:
              log_like = 1.0
              log_like_last = -1.0
              num_iters = 0
              while abs(log_like - log_like_last) > 1e-7:
                  num_iters += 1
                  log_like_last = log_like
                  pron_probs, log_like = OneEMIter(args, word, stats, prons, pron_probs, False)
                  if log_like_last == 1.0 and len(soft_counts_normalized) == 0: # the first iteration
                      soft_counts_normalized = pron_probs
                      if diagnostic_info:
                          print("Avg.(over all egs) soft counts: {}".format(soft_counts_normalized))
              if diagnostic_info:
                  print("\nLog_like after {} iters of EM: {}, estimated pron_probs: {}\n".format(
                          num_iters, log_like, pron_probs))
              candidates_to_delete = []
              
              for i in active_indexes:
                  pron_probs_mod = [p for p in pron_probs]
                  pron_probs_mod[i] = 0.0
                  for j in range(len(pron_probs_mod)):
                      if j in active_indexes and j != i:
                          pron_probs_mod[j] += 0.01
                  pron_probs_mod = [s / sum(pron_probs_mod) for s in pron_probs_mod]
                  log_like2 = 1.0
                  log_like2_last = -1.0
                  num_iters2 = 0
                  # Run EM until convergence
                  while abs(log_like2 - log_like2_last) > 0.001 :
                      num_iters2 += 1
                      log_like2_last = log_like2
                      pron_probs_mod, log_like2 = OneEMIter(args, word, stats,
                                                            prons, pron_probs_mod, False)
                  
                  loss_abs = log_like - log_like2 # absolute likelihood loss before normalization
                  # (supposed to be positive, but could be negative near zero because of numerical precision limit).
                  log_delta = math.log(args.delta)
                  thr = -log_delta
                  loss = loss_abs
                  source = src[(word, prons[word][i])]
                  if diagnostic_info:
                      print("\nSetting the pron_prob of '{}', whose source is {}, to zero results in {}"
                            " loss in avg. log-likelihood; num. iters until convergence: {}.".format(
                              prons[word][i], source, loss, num_iters2))
                  # Compute the quality score q_b = loss_abs * M_w / (M_w + beta_s(b)) + alpha_s(b) * log_delta,
                  # where M_w = len(stats[word]). See Sec. 4.3 and Alg. 1 in the paper.
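                  # For intuition (illustrative numbers, not recommendations): with the
                  # default delta = 1e-9 we have -log_delta ~= 20.7, so alpha_s(b) = 0.5
                  # would put the threshold thr at roughly 10.4 nats of smoothed loss.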
                  if source == 'P':
                     thr *= args.alpha[0]
                     loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[0])
                  if source == 'G':
                     thr *= args.alpha[1]
                     loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[1])
                  if source == 'R':
                     thr *= args.alpha[2]
                     loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[2])
                  if loss - thr < 0: # loss - thr here is just q_b
                     if diagnostic_info:
                         print("Smoothed log-like loss {} is smaller than threshold {}, so the quality "
                               "score {} is negative; adding this pron to the list of candidates "
                               "to delete.".format(loss, thr, loss - thr))
                     candidates_to_delete.append((loss-thr, i))
              if len(candidates_to_delete) == 0:
                  break
              candidates_to_delete_sorted = sorted(candidates_to_delete,
                                                   key=lambda candidate: candidate[0])
  
              deleted_candidate = candidates_to_delete_sorted[0]
              active_indexes.remove(deleted_candidate[1])
              pron_probs[deleted_candidate[1]] = 0.0
              for i in range(len(pron_probs)):
                  if i in active_indexes:
                      pron_probs[i] += 0.01
              pron_probs = [s / sum(pron_probs) for s in pron_probs]
              source = src[(word, prons[word][deleted_candidate[1]])]
              pron = prons[word][deleted_candidate[1]]
              soft_count = soft_counts_normalized[deleted_candidate[1]]
              quality_score = deleted_candidate[0]
              # This part of diagnostic info provides hints to the user on how to adjust the parameters.
              if diagnostic_info:
                  print("removed pron {}, from source {} with quality score {:.5f}".format(
                          pron, source, quality_score)) 
                  if (source == 'P' and soft_count > 0.7 and len(stats[word]) > 5):
                      print("WARNING: alpha_{{pd}} or beta_{{pd}} may be too large! "
                            "For the word '{}' whose count is {}, the candidate "
                            "pronunciation from phonetic decoding '{}' with normalized "
                            "soft count {} (out of 1) is rejected. It shouldn't have been "
                            "rejected if alpha_{{pd}} is smaller than {}.".format(
                              word, len(stats[word]), pron, soft_count, -loss / log_delta),
                            file=sys.stderr)
                      if loss_abs > thr:
                          print("    or beta_{{pd}} is smaller than {}.".format(
                                  (loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
                  if (source == 'G' and soft_count > 0.7 and len(stats[word]) > 5):
                      print("WARNING: alpha_{{g2p}} or beta_{{g2p}} may be too large! "
                            "For the word '{}' whose count is {}, the candidate "
                            "pronunciation from G2P '{}' with normalized "
                            "soft count {} (out of 1) is rejected. It shouldn't have been "
                            "rejected if alpha_{{g2p}} is smaller than {}.".format(
                              word, len(stats[word]), pron, soft_count, -loss / log_delta),
                            file=sys.stderr)
                      if loss_abs > thr:
                          print("    or beta_{{g2p}} is smaller than {}.".format(
                                  (loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
              deleted_prons.append(deleted_candidate[1])
          for i in range(len(prons[word])):
              if i not in deleted_prons:
                  learned_lexicon[word].add(prons[word][i])
  
      return learned_lexicon
  
  def WriteLearnedLexicon(learned_lexicon, file_handle):
      for word, prons in learned_lexicon.items():
          for pron in prons:
              print('{0} {1}'.format(word, pron), file=file_handle)
      file_handle.close()
  
  def Main():
      args = GetArgs()
      
      # Read in three lexicon sources, word counts, and pron stats.
      counts = ReadWordCounts(args.word_counts_file_handle)
      ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts)
      g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts)
      pd_lexicon =  ReadLexicon(args, args.pd_lexicon_handle, counts)
      stats, stats_summed = ReadArcStats(args.arc_stats_file_handle)
      pd_lexicon = FilterPhoneticDecodingLexicon(args, pd_lexicon)
                    
      # Select prons to construct the learned lexicon.
      learned_lexicon = SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon)
      
      # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov.
      WriteLearnedLexicon(learned_lexicon, args.learned_lexicon_handle)
  
  if __name__ == "__main__":
      Main()
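
  # Illustrative invocation (paths and option values are hypothetical; see
  # steps/dict/learn_lexicon_greedy.sh for how this script is actually used):
  #   steps/dict/select_prons_greedy.py --alpha 0.1,0.1,0 --beta 20,20,0 --delta 1e-9 \
  #     silence_phones.txt arc_stats.txt word_counts.txt \
  #     ref_lexicon.txt g2p_lexicon.txt pd_lexicon.txt learned_lexicon.txt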