Blame view
egs/wsj/s5/utils/lang/internal/modify_unk_pron.py
3.92 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
#!/usr/bin/env python

# Copyright 2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
import sys
import os
import argparse
from collections import defaultdict

# note, this was originally based


def get_args(argv=None):
    """Parse the command line.

    Args:
        argv: list of argument strings (without the program name), or None
            to let argparse read them from sys.argv.

    Returns:
        The argparse namespace, with attributes `lexicon_file` and `unk_word`.
    """
    parser = argparse.ArgumentParser(
        description="""This script replaces the existing pronunciation of the
        unknown word in the provided lexicon, with a pronunciation consisting
        of three disambiguation symbols: #1 followed by #2 followed by #3.
        The #2 will later be replaced by a phone-level LM by apply_unk_lm.sh
        (called later on by prepare_lang.sh).  Caution: this script is
        sensitive to the basename of the lexicon: it should be called either
        lexiconp.txt, in which case the format is 'word pron-prob p1 p2 p3 ...'
        or lexiconp_silprob.txt, in which case the format is
        'word pron-prob sil-prob1 sil-prob2 sil-prob3 p1 p2 p3....'.
        It is an error if there is not exactly one pronunciation of the
        unknown word in the lexicon.""",
        epilog="""E.g.: modify_unk_pron.py data/local/lang/lexiconp.txt '<unk>'.
        This script is called from prepare_lang.sh.""")

    parser.add_argument('lexicon_file', type=str,
                        help='Filename of the lexicon file to operate on (this is '
                        'both an input and output of this script).')
    parser.add_argument('unk_word', type=str,
                        help="The printed form of the unknown/OOV word, normally '<unk>'.")
    return parser.parse_args(argv)


def read_lexicon(lexicon_file, unk_word, num_fields_before_pron):
    """Read the lexicon and locate the single entry for the unknown word.

    Args:
        lexicon_file: path of the lexicon to read.
        unk_word: the word whose pronunciation is going to be replaced.
        num_fields_before_pron: number of leading fields (word, pron-prob and
            any silence probabilities) that precede the phone sequence.

    Returns:
        A pair (split_lines, unk_index): every line of the lexicon split on
        whitespace, and the index in that list of the unknown word's entry.

    Exits the program (via sys.exit) if the file cannot be opened, is empty,
    contains a line with too few fields, or does not contain exactly one
    pronunciation of the unknown word.
    """
    try:
        lexicon_in = open(lexicon_file, 'r')
    except (IOError, OSError):
        sys.exit("{0}: failed to open lexicon file {1}".format(
            sys.argv[0], lexicon_file))

    split_lines = []
    unk_index = -1
    # 'with' guarantees the handle is closed even when we sys.exit() below.
    with lexicon_in:
        for line in lexicon_in:
            this_split_line = line.split()
            # Check the field count *before* looking at field 0, so that a
            # blank line is reported as a bad line rather than crashing with
            # an IndexError.
            if len(this_split_line) <= num_fields_before_pron:
                sys.exit("{0}: input file {1} had a bad line (too few fields): {2}".format(
                    sys.argv[0], lexicon_file, line.rstrip('\n')))
            if this_split_line[0] == unk_word:
                if unk_index != -1:
                    sys.exit("{0}: expected there to be exactly one pronunciation of the "
                             "unknown word {1} in {2}, but there are more than one.".format(
                                 sys.argv[0], unk_word, lexicon_file))
                unk_index = len(split_lines)
            split_lines.append(this_split_line)

    if len(split_lines) == 0:
        sys.exit("{0}: read no data from lexicon file {1}.".format(
            sys.argv[0], lexicon_file))

    if unk_index == -1:
        sys.exit("{0}: expected there to be exactly one pronunciation of the "
                 "unknown word {1} in {2}, but there are none.".format(
                     sys.argv[0], unk_word, lexicon_file))

    return split_lines, unk_index


def write_lexicon(lexicon_file, split_lines):
    """Write the (already split) lexicon lines to lexicon_file, one per line,
    with fields separated by single spaces.

    Exits the program (via sys.exit) if the file cannot be opened for writing
    or if writing/closing it fails.
    """
    try:
        lexicon_out = open(lexicon_file, 'w')
    except (IOError, OSError):
        sys.exit("{0}: failed to open lexicon file {1} for writing (permissions probleM?)".format(
            sys.argv[0], lexicon_file))

    try:
        # Leaving the 'with' block closes (and therefore flushes) the file;
        # an I/O failure while writing or closing is reported below.
        with lexicon_out:
            for split_line in split_lines:
                print(' '.join(split_line), file=lexicon_out)
    except (IOError, OSError):
        sys.exit("{0}: failed to close lexicon file {1} after writing (disk full?)".format(
            sys.argv[0], lexicon_file))


def main(argv=None):
    """Replace the pronunciation of the unknown word in a lexicon, in place.

    Args:
        argv: list of argument strings (without the program name); defaults
            to sys.argv[1:] when None.

    The lexicon named on the command line is read, the phone sequence of the
    unknown word's single entry is replaced by '#1 #2 #3', and the result is
    written back to the same file.  Any validation or I/O problem terminates
    the program through sys.exit with a descriptive message.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = get_args(argv)

    if len(args.unk_word.split()) != 1:
        sys.exit("{0}: invalid unknown-word '{1}'".format(
            sys.argv[0], args.unk_word))

    basename = os.path.basename(args.lexicon_file)
    if basename != 'lexiconp.txt' and basename != 'lexiconp_silprob.txt':
        sys.exit("{0}: expected the basename of the lexicon file to be either "
                 "'lexiconp.txt' or 'lexiconp_silprob.txt', got: {1}".format(
                     sys.argv[0], args.lexicon_file))

    # the lexiconp.txt format is: word pron-prob p1 p2 p3...
    # lexiconp_silprob.txt has 3 extra real-valued fields after the pron-prob.
    num_fields_before_pron = 2 if basename == 'lexiconp.txt' else 5

    # Echo the command line to stderr, for logging purposes.
    print(' '.join([sys.argv[0]] + list(argv)), file=sys.stderr)

    split_lines, unk_index = read_lexicon(args.lexicon_file, args.unk_word,
                                          num_fields_before_pron)

    # now modify the pron: keep the word and its probabilities, and replace
    # the phone sequence with the three disambiguation symbols.
    split_lines[unk_index] = (split_lines[unk_index][0:num_fields_before_pron]
                              + ['#1', '#2', '#3'])

    # write to the same file.
    write_lexicon(args.lexicon_file, split_lines)


if __name__ == '__main__':
    main()