Blame view

egs/wsj/s5/steps/dict/apply_lexicon_edits.py 4.44 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
  #!/usr/bin/env python
  
  # Copyright 2016  Xiaohui Zhang
  # Apache 2.0.
  
  from __future__ import print_function
  import argparse
  import sys
  
  def GetArgs():
      """Parse the command line and return the checked arguments.

      Three positional arguments are expected: the input lexicon, the lexicon
      edits file and the output lexicon.  The full command line is echoed to
      stderr, and the parsed arguments are passed through CheckArgs() before
      being returned.

      Returns:
          The argparse namespace returned by CheckArgs().
      """
      # Adjacent string literals are concatenated verbatim, so every fragment
      # below ends with an explicit space to keep words from running together
      # in the --help output.
      parser = argparse.ArgumentParser(
          description = "Apply a lexicon edits file (output from "
                        "steps/dict/select_prons_bayesian.py) to an input lexicon "
                        "to produce a learned lexicon.",
          epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

      parser.add_argument("in_lexicon", metavar='<in-lexicon>', type = str,
                          help = "Input lexicon. Each line must be <word> <phones>.")
      parser.add_argument("lexicon_edits_file", metavar='<lexicon-edits-file>', type = str,
                          help = "Input lexicon edits file containing human-readable & editable "
                                 "pronunciation info.  The info for each word is like: "
                                 "------------ an 4086.0 -------------- "
                                 "R  | Y |  2401.6 |  AH N "
                                 "R  | Y |  640.8 |  AE N "
                                 "P  | Y |  1035.5 |  IH N "
                                 "R(ef), P(hone-decoding) represents the pronunciation source, "
                                 "Y/N means the recommended decision of including this pron or not, "
                                 "and the numbers are soft counts accumulated from lattice-align-word "
                                 "outputs. See steps/dict/select_prons_bayesian.py for more details.")
      parser.add_argument("out_lexicon", metavar='<out-lexicon>', type = str,
                          help = "Output lexicon to this file.")

      # Echo the invocation to stderr.
      print (' '.join(sys.argv), file=sys.stderr)

      args = parser.parse_args()
      args = CheckArgs(args)

      return args
  
  def CheckArgs(args):
      """Open the file handles named by the parsed arguments.

      Adds three attributes to ``args``:
        * in_lexicon_handle: sys.stdin when in_lexicon is "-", otherwise the
          input lexicon opened for reading.
        * lexicon_edits_file_handle: the lexicon edits file opened for reading.
        * out_lexicon_handle: sys.stdout when out_lexicon is "-", otherwise
          the output lexicon opened for writing.

      Args:
          args: namespace with in_lexicon, lexicon_edits_file and out_lexicon
              string attributes.

      Returns:
          The same ``args`` object, with the handle attributes set.

      Raises:
          IOError/OSError: if one of the named files cannot be opened.
      """
      if args.in_lexicon == "-":
          # The handle must be stored as in_lexicon_handle (not in_lexicon):
          # that is the attribute Main() reads the lexicon from.
          args.in_lexicon_handle = sys.stdin
      else:
          args.in_lexicon_handle = open(args.in_lexicon)
      args.lexicon_edits_file_handle = open(args.lexicon_edits_file)

      if args.out_lexicon == "-":
          args.out_lexicon_handle = sys.stdout
      else:
          args.out_lexicon_handle = open(args.out_lexicon, "w")

      return args
  
  def ReadLexicon(lexicon_file_handle):
      """Load a lexicon into a set of (word, pronunciation) pairs.

      Each non-blank line is split on whitespace: the first field is the
      word and the remaining fields, joined by single spaces, form the
      pronunciation.  Blank lines are ignored.

      Args:
          lexicon_file_handle: object providing readlines(); a falsy value
              (e.g. None) yields an empty lexicon.

      Returns:
          A set of (word, phones) tuples of strings.

      Raises:
          Exception: if a non-blank line holds a word but no phones.
      """
      entries = set()
      # Nothing to read from a falsy handle.
      if not lexicon_file_handle:
          return entries
      for raw_line in lexicon_file_handle.readlines():
          fields = raw_line.split()
          if not fields:
              continue
          if len(fields) == 1:
              raise Exception('Invalid format of line ' + raw_line
                                  + ' in lexicon file.')
          entries.add((fields[0], ' '.join(fields[1:])))
      return entries
  
  def ApplyLexiconEdits(lexicon, lexicon_edits_file_handle):
      """Apply accept/reject decisions from a lexicon edits file to a lexicon.

      The edits file consists of word header lines such as
      "---- MICROPHONES 200.0 ----", each followed by pronunciation lines
      such as 'P  | Y |  42.0 |  M AY K R AH F OW N Z'.  A 'Y' decision adds
      (word, pron) to the lexicon and an 'N' decision removes it if present.
      Lines starting with '#' and blank lines are skipped.

      Args:
          lexicon: set of (word, phones) tuples; modified in place.
          lexicon_edits_file_handle: object providing readlines(); a falsy
              value leaves the lexicon untouched.

      Returns:
          The same ``lexicon`` set that was passed in.

      Raises:
          Exception: on a malformed header or pronunciation line, on a
              decision other than 'Y'/'N', or when a pronunciation line
              appears before any word header line.
      """
      if not lexicon_edits_file_handle:
          return lexicon

      # No word is current until the first header line has been read.
      word = None
      for line in lexicon_edits_file_handle.readlines():
          # skip all commented lines, and blank lines (as ReadLexicon does)
          if line.startswith('#') or not line.strip():
              continue
          # read a word from a line like "---- MICROPHONES 200.0 ----".
          if line.startswith('---'):
              splits = line.strip().strip('-').strip().split()
              if len(splits) != 2:
                  print(splits, file=sys.stderr)
                  raise Exception('Invalid format of line ' + line
                                      + ' in lexicon edits file.')
              word = splits[0].strip()
              continue
          # parse the pron and decision 'Y/N' of accepting the pron or not,
          # from a line like: 'P  | Y |  42.0 |  M AY K R AH F OW N Z'
          splits = line.split('|')
          if len(splits) != 4:
              raise Exception('Invalid format of line ' + line
                                  + ' in lexicon edits file.')
          if word is None:
              # Fail with a clear message instead of an unbound-variable error.
              raise Exception('Pronunciation line ' + line
                                  + ' appears before any word header line'
                                  + ' in lexicon edits file.')
          decision = splits[1].strip()
          # Collapse whitespace exactly like ReadLexicon so that edits match
          # the lexicon entries regardless of spacing between phones.
          pron = ' '.join(splits[3].split())
          if decision == 'Y':
              lexicon.add((word, pron))
          elif decision == 'N':
              lexicon.discard((word, pron))
          else:
              raise Exception('Invalid format of line ' + line
                                  + ' in lexicon edits file.')
      return lexicon
  
  
  def WriteLexicon(lexicon, out_lexicon_handle):
      """Write the lexicon as "<word> <pron>" lines and close the handle.

      Entries are written in sorted order so that the output is the same
      from run to run; iterating the set directly gives an arbitrary order.

      Args:
          lexicon: iterable of (word, pron) string tuples.
          out_lexicon_handle: writable text handle; it is closed before
              this function returns, even if a write fails.
      """
      try:
          for word, pron in sorted(lexicon):
              print('{0} {1}'.format(word, pron), file=out_lexicon_handle)
      finally:
          out_lexicon_handle.close()
  
  def Main():
      """Run the tool: read the lexicon, apply the edits, write the result."""
      options = GetArgs()
      entries = ReadLexicon(options.in_lexicon_handle)
      # ApplyLexiconEdits updates the set in place and hands the same set back.
      entries = ApplyLexiconEdits(entries, options.lexicon_edits_file_handle)
      WriteLexicon(entries, options.out_lexicon_handle)
  
  # Script entry point: call Main() only when this file is executed directly,
  # not when it is imported as a module.
  if __name__ == "__main__":
      Main()