Blame view

egs/wsj/s5/utils/lang/internal/modify_unk_pron.py 3.92 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
  #!/usr/bin/env python
  
  # Copyright 2016  Johns Hopkins University (Author: Daniel Povey)
  # Apache 2.0.
  
  from __future__ import print_function
  import sys
  import os
  import argparse
  from collections import defaultdict
  
  # note, this was originally based
  
  # Command-line interface: two positional arguments, the lexicon file
  # (which is read and then overwritten in place) and the printed form of
  # the unknown word.
  parser = argparse.ArgumentParser(
      description="""
  This script replaces the existing pronunciation of the
  unknown word in the provided lexicon, with a pronunciation
  consisting of three disambiguation symbols: #1 followed by #2
  followed by #3.
  The #2 will later be replaced by a phone-level LM by
  apply_unk_lm.sh (called later on by prepare_lang.sh).
  Caution: this script is sensitive to the basename of the
  lexicon: it should be called either lexiconp.txt, in which
  case the format is 'word pron-prob p1 p2 p3 ...'
  or lexiconp_silprob.txt, in which case the format is
  'word pron-prob sil-prob1 sil-prob2 sil-prob3 p1 p2 p3....'.
  It is an error if there is not exactly one pronunciation of
  the unknown word in the lexicon.""",
      epilog="""E.g.: modify_unk_pron.py data/local/lang/lexiconp.txt '<unk>'.
  This script is called from prepare_lang.sh.""",
  )
  
  parser.add_argument(
      'lexicon_file',
      type=str,
      help=('Filename of the lexicon file to operate on (this is '
            'both an input and output of this script).'),
  )
  parser.add_argument(
      'unk_word',
      type=str,
      help="The printed form of the unknown/OOV word, normally '<unk>'.",
  )
  
  # Parsed once at module level; the rest of the script reads
  # args.lexicon_file and args.unk_word.
  args = parser.parse_args()
  
  # The unknown word must be exactly one whitespace-free token.
  unk_word_tokens = args.unk_word.split()
  if len(unk_word_tokens) != 1:
      sys.exit("{0}: invalid unknown-word '{1}'".format(
          sys.argv[0], args.unk_word))
  
  # The lexicon format is deduced from the file's basename, so only the two
  # known names are accepted.
  basename = os.path.basename(args.lexicon_file)
  if basename not in ('lexiconp.txt', 'lexiconp_silprob.txt'):
      sys.exit(("{0}: expected the basename of the lexicon file to be either "
                "'lexiconp.txt' or 'lexiconp_silprob.txt', got: {1}").format(
                    sys.argv[0], args.lexicon_file))
  
  # Number of leading fields that precede the phone sequence on each line:
  #   lexiconp.txt:          word pron-prob p1 p2 p3...          -> 2
  #   lexiconp_silprob.txt:  3 extra real-valued fields follow
  #                          the pron-prob                       -> 5
  if basename == 'lexiconp.txt':
      num_fields_before_pron = 2
  else:
      num_fields_before_pron = 5
  
  # Echo the command line to stderr.
  print(' '.join(sys.argv), file=sys.stderr)
  
  # Read the whole lexicon into memory as lists of whitespace-separated
  # fields, and record which entry holds the unknown word's pronunciation.
  # Exits with an error message if the file cannot be opened, contains a
  # line with too few fields, is empty, or does not contain exactly one
  # pronunciation of the unknown word.
  try:
      lexicon_in = open(args.lexicon_file, 'r')
  except (IOError, OSError):
      sys.exit("{0}: failed to open lexicon file {1}".format(
          sys.argv[0], args.lexicon_file))
  
  split_lines = []
  # Index into split_lines of the unknown word's entry; -1 means "not seen".
  unk_index = -1
  # The 'with' block closes the file even when sys.exit() is raised inside.
  with lexicon_in:
      for line in lexicon_in:
          this_split_line = line.split()
          # Check the field count before indexing into the line, so that a
          # blank or truncated line gives a clear error, not an IndexError.
          if len(this_split_line) <= num_fields_before_pron:
              sys.exit("{0}: input file {1} had a bad line (too few fields): {2}".format(
                  sys.argv[0], args.lexicon_file, line.rstrip('\n')))
          if this_split_line[0] == args.unk_word:
              if unk_index != -1:
                  # {1} is the unknown word and {2} is the lexicon file.
                  sys.exit("{0}: expected there to be exactly one pronunciation of the "
                           "unknown word {1} in {2}, but there are more than one.".format(
                               sys.argv[0], args.unk_word, args.lexicon_file))
              unk_index = len(split_lines)
          split_lines.append(this_split_line)
  
  if len(split_lines) == 0:
      sys.exit("{0}: read no data from lexicon file {1}.".format(
          sys.argv[0], args.lexicon_file))
  
  if unk_index == -1:
      sys.exit("{0}: expected there to be exactly one pronunciation of the "
               "unknown word {1} in {2}, but there are none.".format(
                   sys.argv[0], args.unk_word, args.lexicon_file))
  
  # Now modify the pron: keep the leading fields (the word and its
  # probability fields) and replace the phone sequence with the three
  # disambiguation symbols.
  split_lines[unk_index] = split_lines[unk_index][0:num_fields_before_pron] + ['#1', '#2', '#3']
  
  
  try:
      # write to the same file.
      lexicon_out = open(args.lexicon_file, 'w')
  except (IOError, OSError):
      sys.exit("{0}: failed to open lexicon file {1} for writing (permissions problem?)".format(
          sys.argv[0], args.lexicon_file))
  
  # Write and close failures are reported together: the 'with' block closes
  # the file on every path, and an I/O error may be raised either by a
  # print() call or by the final close.
  try:
      with lexicon_out:
          for split_line in split_lines:
              print(' '.join(split_line), file=lexicon_out)
  except (IOError, OSError):
      sys.exit("{0}: failed to close lexicon file {1} after writing (disk full?)".format(
          sys.argv[0], args.lexicon_file))