Blame view

egs/hub4_spanish/s5/local/prepare_unicode_dict.py 8.9 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
  #!/usr/bin/env python
  
  # Copyright 2016 Johns Hopkins University (Author: Matthew Wiesner)
  # Apache 2.0
  
  # ======= Prepare dictionary directory (e.g. data/local) from lexicon.txt =====
  # This script takes a valid kaldi format lexicon (lexicon.txt) as input and
  # from it creates the rest of the files in the dictionary directory.
  # The lexicon.txt can be created from, 
  # 
  #    local/lexicon/make_unicode_lexicon.py <wordlist> <lexicon> <grapheme_map> 
  #
  # using a list of words found in the training language(s) for example. But any
  # valid kaldi format lexicon should work.
  #
  # The files created are:
  #
  # 1. nonsilence_phones.txt
  # 2. silence_phones.txt
  # 3. optional_silence.txt
  # 4. extra_questions.txt
  #
  # You should probably just create these files in the same directory as you
  # created lexicon.txt (via local/lexicon/make_unicode_lexicon.py), otherwise
  # you will have to copy lexicon.txt into the output directory of this script.
  #
  # Since silence and non-silence phonemes are treated separately, this script
  # requires that the list of words whose pronunciations contain silence phones,
  # (phones that should be in silence_phones.txt), be entered using the 
  # 
  #   --silence-lexicon <path-to-silence-lexicon.txt> 
  #
  # option. If the option is not provided, two dictionary entries are created 
  # automatically: 
  # 1. !SIL SIL
  # 2. <unk> <oov>
  #
  # corresponding to entries for silence and unknown words respectively.
  #
  #
  # Any tokens in lexicon.txt occurring in columns other than the first are
  # considered to represent an acoustic unit. The set of all such tokens, that
  # do not also occur in silence_lexicon.txt (or that are not SIL), are 
  # written to nonsilence_phones.txt. Each line in nonsilence_phones.txt
  # corresponds to an acoustic unit and its tagged versions seen in the lexicon.
  # A tagged acoustic unit is represented in lexicon.txt as a token followed by an
  # underscore and the name of the tag. 
  #
  # Example: a a_tag1 a_tag2 a_tag1_tag2
  # 
  # These tags determine the extra questions
  # to ask in a later tree-building stage and are written to extra_questions.txt.
  #
  # The set of all such tokens that occur in silence_lexicon.txt are written to
  # silence_phones.txt.
  #
  # The acoustic units used in the lexicon can be phonemes,
  # graphemic-acoustic-units (units derived from a word's orthography in segmental
  # writing systems), units discovered from an unsupervised clustering procedure,
  # or other. For the purposes of this script, however, they are all referred to
  # as phonemes.
  #
  # ============================================================================
  
  from __future__ import print_function
  import codecs
  import sys
  import os
  import argparse
  
  
  # Extract a sorted set of distinct phonemes from the lexicon
  def extract_phonemes(lexicon):
      '''
          Collect the distinct phonemes used by the pronunciations in a lexicon.

          Usage: extract_phonemes(dictionary of lexical entries)

          Arguments:
              lexicon -- dictionary mapping each word to its pronunciation,
                         given as a whitespace-separated string of phonemes

          Output:
              phonemes      -- sorted list of the distinct phonemes (tagged and
                               untagged) found in the pronunciations.
              phonemes_dict -- dictionary whose keys are base phonemes (the text
                               before the first underscore) and whose values
                               are the sets of all variants of that base seen
                               in the lexicon, e.g. {a: {a, a_1, a_2},
                                                     b: {b_1, b_3}, ...}
      '''
      variants_by_base = {}
      for pronunciation in lexicon.values():
          for unit in pronunciation.split():
              # Everything before the first underscore names the base unit
              base_unit = unit.split("_", 1)[0]
              variants_by_base.setdefault(base_unit, set()).add(unit)

      # Flatten the per-base sets into one collection of unique phonemes
      all_units = set()
      for variants in variants_by_base.values():
          all_units.update(variants)

      return sorted(all_units), variants_by_base
  
  
  def write_phonemes(phonemes_dict, phonesfile):
      '''
          Write a phone list file with one line per base phoneme.

          Arguments:
              phonemes_dict -- dictionary mapping each base phoneme to the
                               collection of its variants (tagged and untagged)
              phonesfile    -- path of the utf-8 text file to create

          Each line holds the sorted variants of one base phoneme separated by
          single spaces. Lines are ordered by base phoneme.
      '''
      with codecs.open(phonesfile, "w", "utf-8") as fp:
          # Write each base phoneme with all tags on the same line
          for base_phoneme in sorted(phonemes_dict):
              variants = sorted(phonemes_dict[base_phoneme])
              fp.write("%s\n" % " ".join(variants))
  
  
  def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict,
                            sil_phonemes, sil_phonemes_dict,
                            tags, extra_questions):
      '''
          Write the extra questions file.

          Arguments:
              nonsil_phonemes      -- iterable of all nonsilence phonemes
              nonsil_phonemes_dict -- dictionary mapping each nonsilence base
                                      phoneme to the collection of its variants
              sil_phonemes         -- iterable of all silence phonemes
              sil_phonemes_dict    -- dictionary mapping each silence base
                                      phoneme to the collection of its variants
              tags                 -- iterable of tag names (the text following
                                      the base phoneme and an underscore)
              extra_questions      -- path of the utf-8 text file to create

          The file contains, in order: one line with every nonsilence phoneme,
          one line with every silence phoneme, and then one line per tag
          listing every "<base>_<tag>" phoneme present in either dictionary
          (nonsilence entries first). Every phoneme written is followed by a
          single space, so non-empty lines end with a trailing space.
      '''
      with codecs.open(extra_questions, "w", "utf-8") as fp:
          # Write all unique "nonsilence_phones" to a single line.
          fp.write("".join("%s " % p for p in nonsil_phonemes) + "\n")

          # Write all silence phones to a single line.
          fp.write("".join("%s " % p for p in sil_phonemes) + "\n")

          # Write all possible phone_tag combinations that occur in the lexicon
          for tag in tags:
              for phonemes_dict in (nonsil_phonemes_dict, sil_phonemes_dict):
                  for base, variants in phonemes_dict.items():
                      tagged_phoneme = "_".join([base, tag])
                      if tagged_phoneme in variants:
                          fp.write("%s " % tagged_phoneme)
              fp.write("\n")
  
  
  def _read_lexicon(path):
      '''
          Read a kaldi format lexicon into a {word: pronunciation} dictionary.

          Arguments:
              path -- utf-8 text file with one "<word> <pron>" entry per line

          Output:
              lexicon -- dictionary mapping each word to its pronunciation
                         string. When a word is listed more than once, its
                         pronunciations are joined with a space so that the
                         phonemes of every alternative are kept.

          Raises:
              IOError    -- if the file cannot be opened or read
              ValueError -- if a non-blank line has a word but no pronunciation
      '''
      lexicon = {}
      with codecs.open(path, "r", encoding="utf-8") as fi:
          for line_number, line in enumerate(fi, 1):
              fields = line.strip().split(None, 1)
              if not fields:
                  # Blank lines carry no entry; skip them instead of crashing
                  continue
              if len(fields) != 2:
                  raise ValueError("%s:%d: expected '<word> <pron>', got %r"
                                   % (path, line_number, line.strip()))
              word, pron = fields
              if word in lexicon:
                  # Pronunciations are only ever split into phonemes, so
                  # joining the alternatives keeps every phoneme they use.
                  lexicon[word] += " " + pron
              else:
                  lexicon[word] = pron
      return lexicon


  def _load_lexicon_or_exit(path):
      '''
          Read the lexicon at path, or report the problem on stderr and exit
          with status 1 if it cannot be read or is malformed.
      '''
      try:
          return _read_lexicon(path)
      except IOError:
          print("Could not find file", path, file=sys.stderr)
      except ValueError as err:
          print(err, file=sys.stderr)
      sys.exit(1)


  def main():
      '''
          Create the dictionary directory files from a kaldi format lexicon.

          Command line arguments:
              lexicon           -- a kaldi format lexicon
              lexicon_dir       -- directory to which all files are written
                                   (created if it does not exist)
              --silence-lexicon -- optional file with silence words and their
                                   pronunciations. Without it the entries
                                   "!SIL SIL" and "<unk> <oov>" are used.

          Files written to lexicon_dir: silence_phones.txt,
          nonsilence_phones.txt, optional_silence.txt and extra_questions.txt.

          Exits with status 1 when called without arguments or when a lexicon
          file cannot be read or contains a malformed entry.
      '''
      # ----------------- Parse input arguments ---------------------------
      if not sys.argv[1:]:
          print("Usage: local/prepare_unicode_dict.py <lexicon>"
                " <lexicon_dir>", file=sys.stderr)
          sys.exit(1)

      parser = argparse.ArgumentParser()
      parser.add_argument("lexicon", help="A kaldi format lexicon.")
      parser.add_argument("lexicon_dir", help="Directory to which all files"
                          " should be written")
      parser.add_argument("--silence-lexicon", help="File with silence words "
                          "and tab-separated pronunciations", action="store",
                          default=None)
      args = parser.parse_args()

      # ---------------- Prepare the dictionary directory -----------------
      # Create the data/local(/dict) directory for instance if it does not exist
      if not os.path.exists(args.lexicon_dir):
          os.makedirs(args.lexicon_dir)

      # ----------- Extract silence words and phonemes -----------------
      if args.silence_lexicon is None:
          # Default silence token and pron (required for using optional silence)
          # Also default unk token and pron.
          sil_lexicon = {'!SIL': 'SIL', '<unk>': '<oov>'}
      else:
          sil_lexicon = _load_lexicon_or_exit(args.silence_lexicon)

      sil_phonemes, sil_phonemes_dict = extract_phonemes(sil_lexicon)

      # This catches the optional silence symbol, which we want to include.
      # sil_phonemes is a sorted list, so rebuild it rather than calling a set
      # method on it, and add SIL to any existing SIL variants instead of
      # replacing them.
      if 'SIL' not in sil_phonemes:
          sil_phonemes = sorted(set(sil_phonemes) | {'SIL'})
          sil_phonemes_dict.setdefault('SIL', set()).add('SIL')

      # ---------- Extract nonsilence words and phonemes ---------------
      lexicon = _load_lexicon_or_exit(args.lexicon)
      nonsil_lexicon = {word: pron for word, pron in lexicon.items()
                        if word not in sil_lexicon}

      nonsil_phonemes, nonsil_phonemes_dict = extract_phonemes(nonsil_lexicon)

      # Write silence_phones.txt
      write_phonemes(sil_phonemes_dict,
                     os.path.join(args.lexicon_dir, "silence_phones.txt"))

      # Write nonsilence_phones.txt
      write_phonemes(nonsil_phonemes_dict,
                     os.path.join(args.lexicon_dir, "nonsilence_phones.txt"))

      # Write the optional_silence.txt file
      with open(os.path.join(args.lexicon_dir, "optional_silence.txt"), "w") as fp:
          fp.write("SIL\n")

      # ------------------------- Extract tags ---------------------------------
      # A tag is everything after the first underscore of a phoneme. Sorting
      # makes the order of the tag lines in extra_questions.txt reproducible.
      all_phonemes = set(nonsil_phonemes).union(sil_phonemes)
      tags = sorted({p.partition("_")[2] for p in all_phonemes if "_" in p})

      # --------------- Write the extra questions file -------------------------
      write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict,
                            sil_phonemes, sil_phonemes_dict, tags,
                            os.path.join(args.lexicon_dir, "extra_questions.txt"))
  
  
  # Run main() only when this file is executed as a script, not when imported.
  if __name__ == "__main__":
      main()