Blame view
egs/hub4_spanish/s5/local/prepare_unicode_dict.py
8.9 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 |
#!/usr/bin/env python

# Copyright 2016  Johns Hopkins University (Author: Matthew Wiesner)
# Apache 2.0

# ======= Prepare dictionary directory (e.g. data/local) from lexicon.txt =====
# This script takes a valid kaldi format lexicon (lexicon.txt) as input and
# from it creates the rest of the files in the dictionary directory.
# The lexicon.txt can be created from,
#
#    local/lexicon/make_unicode_lexicon.py <wordlist> <lexicon> <grapheme_map>
#
# using a list of words found in the training language(s) for example. But any
# valid kaldi format lexicon should work.
#
# The files created are:
#
# 1. nonsilence_phones.txt
# 2. silence_phones.txt
# 3. optional_silence.txt
# 4. extra_questions.txt
#
# You should probably just create these files in the same directory as you
# created lexicon.txt (via local/lexicon/make_unicode_lexicon.py), otherwise
# you will have to copy lexicon.txt into the output directory of this script.
#
# Since silence and non-silence phonemes are treated separately, this script
# requires that the list of words whose pronunciations contain silence phones,
# (phones that should be in silence_phones.txt), be entered using the
#
#   --silence-lexicon <path-to-silence-lexicon.txt>
#
# option. If the option is not provided, two dictionary entries are created
# automatically:
#   1. !SIL SIL
#   2. <unk> <oov>
#
# corresponding to entries for silence and unknown words respectively.
#
#
# Any tokens in lexicon.txt occurring in columns other than the first are
# considered to represent an acoustic unit. The set of all such tokens, that
# do not also occur in silence_lexicon.txt (or that are not SIL), are
# written to nonsilence_phones.txt. Each line in nonsilence_phones.txt
# corresponds to an acoustic unit and its tagged versions seen in the lexicon.
# A tagged acoustic unit is represented in lexicon.txt as a token followed by an
# underscore and the name of the tag.
#
# Example: a a_tag1 a_tag2 a_tag1_tag2
#
# These tags determine the extra questions
# to ask in a later tree-building stage and are written to extra_questions.txt.
#
# The set of all such tokens that occur in silence_lexicon.txt are written to
# silence_phones.txt.
#
# The acoustic units used in the lexicon can be phonemes,
# graphemic-acoustic-units (units derived from a word's orthography in segmental
# writing systems), units discovered from an unsupervised clustering procedure,
# or other. For the purposes of this script, however, they are all referred to
# as phonemes.
#
#
# ============================================================================

from __future__ import print_function
import codecs
import sys
import os
import argparse


def extract_phonemes(lexicon):
    '''
        Extract a sorted list of distinct phonemes from the lexicon.

        Usage: extract_phonemes(dictionary of lexical entries)

        Arguments:
            lexicon -- dictionary mapping each word to a whitespace-separated
                       pronunciation string

        Output:
            phonemes      -- the sorted list of distinct phonemes that
                             occurred in the lexicon.
            phonemes_dict -- dictionary whose keys are the untagged base
                             phonemes and whose values are the sets of every
                             form (tagged or untagged) of that base phoneme
                             seen in the lexicon,
                             e.g. {a: {a, a_1, a_2}, b: {b_1, b_3}, ...}
    '''
    phonemes_dict = {}
    for pron in lexicon.values():
        for p in pron.split():
            # Everything before the first underscore is the base phoneme.
            base = p.split("_", 1)[0]
            phonemes_dict.setdefault(base, set()).add(p)

    # The union of all per-base sets is the set of distinct phonemes.
    phonemes = sorted(set().union(*phonemes_dict.values()))
    return phonemes, phonemes_dict


def write_phonemes(phonemes_dict, phonesfile):
    '''
        Write one line per base phoneme to phonesfile (utf-8).

        Arguments:
            phonemes_dict -- dictionary of base phoneme -> iterable of all
                             forms of that phoneme (see extract_phonemes)
            phonesfile    -- path of the file to (over)write

        Each line holds the sorted, space-separated forms of one base
        phoneme; lines are ordered by base phoneme.
    '''
    with codecs.open(phonesfile, "w", "utf-8") as fp:
        for base_phoneme in sorted(phonemes_dict):
            fp.write("%s\n" % " ".join(sorted(phonemes_dict[base_phoneme])))


def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict,
                          sil_phonemes, sil_phonemes_dict,
                          tags, extra_questions):
    '''
        Write the extra questions file (utf-8).

        Arguments:
            nonsil_phonemes      -- iterable of all nonsilence phonemes
            nonsil_phonemes_dict -- base phoneme -> forms, for nonsilence
            sil_phonemes         -- iterable of all silence phonemes
            sil_phonemes_dict    -- base phoneme -> forms, for silence
            tags                 -- iterable of tag names
            extra_questions      -- path of the file to (over)write

        The file contains one line with every nonsilence phoneme, one line
        with every silence phoneme, and then one line per tag listing each
        "<base>_<tag>" form that actually occurs in either dictionary.
        Every phoneme on a line is followed by a single space.
    '''
    with codecs.open(extra_questions, "w", "utf-8") as fp:
        # Write all unique "nonsilence_phones" to a single line.
        fp.write("".join("%s " % p for p in nonsil_phonemes))
        fp.write("\n")

        # Write all unique silence phones to a single line.
        fp.write("".join("%s " % p for p in sil_phonemes))
        fp.write("\n")

        # Write all phone_tag combinations that occur in the lexicon, one
        # line per tag. Bases are visited in sorted order so that the output
        # does not depend on dictionary ordering.
        for tag in tags:
            for phonemes_dict in (nonsil_phonemes_dict, sil_phonemes_dict):
                for base in sorted(phonemes_dict):
                    tagged_phoneme = "_".join([base, tag])
                    if tagged_phoneme in phonemes_dict[base]:
                        fp.write("%s " % tagged_phoneme)
            fp.write("\n")


def _read_lexicon(path, skip_words=()):
    '''
        Read a kaldi format lexicon into a dictionary of word -> pronunciation.

        Blank lines are ignored and words found in skip_words are dropped.
        When a word has several pronunciation variants they are joined with
        a space, so that no phoneme is lost when the result is handed to
        extract_phonemes. Raises ValueError for an entry without a
        pronunciation.
    '''
    lexicon = {}
    with codecs.open(path, "r", encoding="utf-8") as fi:
        for line_no, line in enumerate(fi, 1):
            fields = line.strip().split(None, 1)
            if not fields:
                continue
            if len(fields) < 2:
                raise ValueError("%s:%d: entry '%s' has no pronunciation"
                                 % (path, line_no, fields[0]))
            word, pron = fields
            if word in skip_words:
                continue
            if word in lexicon:
                lexicon[word] += " " + pron
            else:
                lexicon[word] = pron
    return lexicon


def main(argv=None):
    '''
        Create the dictionary directory files from a lexicon.

        Arguments:
            argv -- optional list of command line arguments (without the
                    program name); defaults to sys.argv[1:]

        Writes silence_phones.txt, nonsilence_phones.txt,
        optional_silence.txt and extra_questions.txt into the requested
        directory. Exits with status 1 when called without arguments or when
        an input file cannot be read.
    '''
    # ----------------- Parse input arguments ---------------------------
    cli_args = sys.argv[1:] if argv is None else list(argv)
    if not cli_args:
        print("Usage: local/prepare_unicode_dict.py <lexicon>"
              " <lexicon_dir>", file=sys.stderr)
        sys.exit(1)

    parser = argparse.ArgumentParser()
    parser.add_argument("lexicon", help="A kaldi format lexicon.")
    parser.add_argument("lexicon_dir", help="Directory to which all files"
                        " should be written")
    parser.add_argument("--silence-lexicon", help="File with silence words "
                        "and tab-separated pronunciations", action="store",
                        default=None)
    args = parser.parse_args(cli_args)

    # ---------------- Prepare the dictionary directory -----------------
    # Create the data/local(/dict) directory for instance if it does not exist
    if not os.path.exists(args.lexicon_dir):
        os.makedirs(args.lexicon_dir)

    # ----------- Extract silence words and phonemes -----------------
    if args.silence_lexicon is None:
        # Default silence token and pron (required for using optional silence)
        # Also default unk token and pron.
        sil_lexicon = {'!SIL': 'SIL', '<unk>': '<oov>'}
    else:
        try:
            sil_lexicon = _read_lexicon(args.silence_lexicon)
        except IOError:
            print("Could not find file", args.silence_lexicon,
                  file=sys.stderr)
            sys.exit(1)

    sil_phonemes, sil_phonemes_dict = extract_phonemes(sil_lexicon)
    # This catches the optional silence symbol, which we want to include.
    # Any tagged SIL variants already collected are preserved.
    if 'SIL' not in sil_phonemes:
        sil_phonemes = sorted(set(sil_phonemes) | {'SIL'})
        sil_phonemes_dict.setdefault('SIL', set()).add('SIL')

    # ---------- Extract nonsilence words and phonemes ---------------
    try:
        nonsil_lexicon = _read_lexicon(args.lexicon, skip_words=sil_lexicon)
    except TypeError:
        print("Invalid lexicon argument", file=sys.stderr)
        sys.exit(1)
    except IOError:
        print("Could not find file", args.lexicon, file=sys.stderr)
        sys.exit(1)

    nonsil_phonemes, nonsil_phonemes_dict = extract_phonemes(nonsil_lexicon)

    # Write silence_phones.txt
    write_phonemes(sil_phonemes_dict,
                   os.path.join(args.lexicon_dir, "silence_phones.txt"))

    # Write nonsilence_phones.txt
    write_phonemes(nonsil_phonemes_dict,
                   os.path.join(args.lexicon_dir, "nonsilence_phones.txt"))

    # Write the optional_silence.txt file
    with open(os.path.join(args.lexicon_dir, "optional_silence.txt"),
              "w") as fp:
        fp.write("SIL\n")

    # ------------------------- Extract tags ---------------------------------
    # Phonemes are visited in sorted order so that the order of the tag
    # lines in extra_questions.txt is the same on every run.
    tags = []
    for p in sorted(set(nonsil_phonemes).union(sil_phonemes)):
        # Only consider phonemes with tags: the tag is everything after the
        # first underscore.
        _, underscore, tag = p.partition("_")
        if underscore and tag not in tags:
            tags.append(tag)

    # --------------- Write the extra questions file -------------------------
    write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict,
                          sil_phonemes, sil_phonemes_dict, tags,
                          os.path.join(args.lexicon_dir,
                                       "extra_questions.txt"))


if __name__ == "__main__":
    main()