modify_unk_pron.py
3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
# Copyright 2016 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
from __future__ import print_function
import sys
import os
import argparse
from collections import defaultdict
# note, this was originally based
def get_args(argv=None):
    """Parse the command line.

    Args:
        argv: list of argument strings (without the program name); when
            None, argparse falls back to sys.argv[1:].

    Returns:
        The argparse namespace, with attributes 'lexicon_file' and 'unk_word'.
    """
    parser = argparse.ArgumentParser(description="""
This script replaces the existing pronunciation of the
unknown word in the provided lexicon, with a pronunciation
consisting of three disambiguation symbols: #1 followed by #2
followed by #3.
The #2 will later be replaced by a phone-level LM by
apply_unk_lm.sh (called later on by prepare_lang.sh).
Caution: this script is sensitive to the basename of the
lexicon: it should be called either lexiconp.txt, in which
case the format is 'word pron-prob p1 p2 p3 ...'
or lexiconp_silprob.txt, in which case the format is
'word pron-prob sil-prob1 sil-prob2 sil-prob3 p1 p2 p3....'.
It is an error if there is not exactly one pronunciation of
the unknown word in the lexicon.""",
                                     epilog="""E.g.: modify_unk_pron.py data/local/lang/lexiconp.txt '<unk>'.
This script is called from prepare_lang.sh.""")
    parser.add_argument('lexicon_file', type=str,
                        help='Filename of the lexicon file to operate on (this is '
                        'both an input and output of this script).')
    parser.add_argument('unk_word', type=str,
                        help="The printed form of the unknown/OOV word, normally '<unk>'.")
    return parser.parse_args(argv)


def get_num_fields_before_pron(lexicon_file):
    """Return how many fields precede the phones on each lexicon line.

    The lexiconp.txt format is: word pron-prob p1 p2 p3...  (2 fields);
    lexiconp_silprob.txt has 3 extra real-valued fields after the
    pron-prob (5 fields).  Exits with an error message for any other
    basename.
    """
    basename = os.path.basename(lexicon_file)
    if basename == 'lexiconp.txt':
        return 2
    if basename == 'lexiconp_silprob.txt':
        return 5
    sys.exit("{0}: expected the basename of the lexicon file to be either "
             "'lexiconp.txt' or 'lexiconp_silprob.txt', got: {1}".format(
                 sys.argv[0], lexicon_file))


def read_lexicon(lexicon_file, unk_word, num_fields_before_pron):
    """Read the lexicon and locate the single entry of the unknown word.

    Args:
        lexicon_file: path of the lexicon to read.
        unk_word: printed form of the unknown word.
        num_fields_before_pron: number of fields that precede the phones.

    Returns:
        A pair (split_lines, unk_index): split_lines is a list with one
        list of whitespace-separated fields per input line, and unk_index
        is the position in split_lines of the unknown word's line.

    Exits (via sys.exit, i.e. SystemExit) if the file cannot be opened, is
    empty, contains a line with no pronunciation, or does not contain
    exactly one pronunciation of unk_word.
    """
    try:
        lexicon_in = open(lexicon_file, 'r')
    except (IOError, OSError):
        sys.exit("{0}: failed to open lexicon file {1}".format(
            sys.argv[0], lexicon_file))

    split_lines = []
    unk_index = -1
    # 'with' guarantees the handle is closed even when we bail out early.
    with lexicon_in:
        for line in lexicon_in:
            this_split_line = line.split()
            # Validate the field count before touching this_split_line[0]:
            # a blank line splits to [] and would otherwise raise IndexError.
            if len(this_split_line) <= num_fields_before_pron:
                sys.exit("{0}: input file {1} had a bad line (too few fields): {2}".format(
                    sys.argv[0], lexicon_file, line.rstrip('\n')))
            if this_split_line[0] == unk_word:
                if unk_index != -1:
                    sys.exit("{0}: expected there to be exactly one pronunciation of the "
                             "unknown word {1} in {2}, but there are more than one.".format(
                                 sys.argv[0], unk_word, lexicon_file))
                unk_index = len(split_lines)
            split_lines.append(this_split_line)

    if len(split_lines) == 0:
        sys.exit("{0}: read no data from lexicon file {1}.".format(
            sys.argv[0], lexicon_file))
    if unk_index == -1:
        sys.exit("{0}: expected there to be exactly one pronunciation of the "
                 "unknown word {1} in {2}, but there are none.".format(
                     sys.argv[0], unk_word, lexicon_file))
    return split_lines, unk_index


def replace_unk_pron(split_lines, unk_index, num_fields_before_pron):
    """Replace, in place, the phones of line unk_index with '#1 #2 #3'.

    The leading num_fields_before_pron fields (word and probabilities) of
    that line are kept unchanged.
    """
    prefix = split_lines[unk_index][0:num_fields_before_pron]
    split_lines[unk_index] = prefix + ['#1', '#2', '#3']


def write_lexicon(lexicon_file, split_lines):
    """Write split_lines to lexicon_file, one space-joined line per entry.

    Exits (via sys.exit) if the file cannot be opened for writing, or if
    writing/closing it fails.
    """
    try:
        # write to the same file.
        lexicon_out = open(lexicon_file, 'w')
    except (IOError, OSError):
        sys.exit("{0}: failed to open lexicon file {1} for writing (permissions probleM?)".format(
            sys.argv[0], lexicon_file))
    try:
        # Leaving the 'with' closes (and flushes) the file, so a failure
        # while writing or while closing is reported the same way.
        with lexicon_out:
            for split_line in split_lines:
                print(' '.join(split_line), file=lexicon_out)
    except (IOError, OSError):
        sys.exit("{0}: failed to close lexicon file {1} after writing (disk full?)".format(
            sys.argv[0], lexicon_file))


def main():
    """Entry point: validate arguments, then rewrite the lexicon in place."""
    args = get_args()

    if len(args.unk_word.split()) != 1:
        sys.exit("{0}: invalid unknown-word '{1}'".format(
            sys.argv[0], args.unk_word))

    num_fields_before_pron = get_num_fields_before_pron(args.lexicon_file)

    # Echo the command line to stderr.
    print(' '.join(sys.argv), file=sys.stderr)

    split_lines, unk_index = read_lexicon(args.lexicon_file, args.unk_word,
                                          num_fields_before_pron)
    # now modify the pron.
    replace_unk_pron(split_lines, unk_index, num_fields_before_pron)
    write_lexicon(args.lexicon_file, split_lines)


if __name__ == '__main__':
    main()