get_candidate_prons.pl
7.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env perl
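# This script takes two or three command-line arguments:
# the suffix rules (as output by get_rules.pl), the base dictionary
# (one entry per line: the word, then its pron, separated by whitespace),
# and optionally a file with the words that we want prons to be
# generated for (one per line); if the words argument is omitted, the
# words are read from the standard input.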
# The output consists of candidate generated pronunciations for those words,
# together with information about how we generated those pronunciations.
# This does not do pruning of the candidates using the restriction
# "you can't use a more general rule when a more specific one is applicable".
# That is done by limit_candidate_prons.pl.
# Each line of the output consists of six or seven fields, separated by ";",
# of the form:
# word;pron;base-word;base-pron;rule-name;destress[;rule-score]
# [the last field is only present if you supplied rules with score information].
# where:
# - "word" is the input word that we queried for, e.g. WASTED
# - "pron" is the generated pronunciation, e.g. "W EY1 S T AH0 D"
# - rule-name is a 4-tuple separated by commas that describes the rule, e.g.
# "STED,STING,D,NG",
# - "base-word" is the base-word we're getting the pron from,
# e.g. WASTING
# - "base-pron" is the pron of the base-word, e.g. "W EY1 S T IH0 NG"
# - "destress" is either "yes" or "no" and corresponds to whether we destressed the
# base-word or not [de-stressing currently just means changing any 2's in the
# pron to 1's, although we may extend this in future].
# - "rule-score" is a numeric score of the rule (this field is only present
# if there was score information in your rules).
# ---------------------------------------------------------------------------
# Command line.
# Two mandatory arguments (rules file, base dictionary) and one optional
# argument (words file).  Whatever is left in @ARGV after the two shifts
# below is consumed later by the <> operator.
# ---------------------------------------------------------------------------
(@ARGV == 2 || @ARGV == 3) || die "Usage: get_candidate_prons.pl rules base-dict [ words ]";

$min_prefix_len = 3;   # this should probably match with get_rules.pl
$rules = shift @ARGV;  # Note: rules may be with destress "yes/no" indicators or without...
                       # if without, it's treated as if both "yes" and "no" are present.
$dict = shift @ARGV;

sub process_word;      # forward declaration; the body appears later in this file.

# ---------------------------------------------------------------------------
# Read the rules file.
#
# Each line is  rule[;destress[;score]]  where "rule" is four comma-separated
# fields: suffix,base-suffix,pron-suffix,base-pron-suffix.
#
# Populates these package globals:
#   %isrule                           rule string -> 1 (rules already registered)
#   %suffix2rule                      suffix -> ref to array of refs to
#                                     4-element rule arrays
#   %rule_and_destress_to_rule_score  "rule;yes" / "rule;no" -> score, or -1
#                                     when no score was supplied
# ---------------------------------------------------------------------------

# Three-argument open with a lexical handle; "-" is mapped to standard input
# explicitly so that it keeps the meaning it had with the two-argument form.
my $rules_fh;
if ($rules eq "-") {
    $rules_fh = \*STDIN;
} else {
    open($rules_fh, "<", $rules) || die "Opening rules file: $rules: $!";
}

while (my $line = <$rules_fh>) {
    # chomp (not chop): only strip a trailing newline.  chop would eat a real
    # character -- e.g. the final "," of a rule such as "S,,Z," -- when the
    # last line of the file is not newline-terminated.
    chomp $line;

    # We may have "destress" markings (yes|no) and scores, or we may have just
    # the rule, in which case $destress and $rule_score are undefined.
    my ($rule, $destress, $rule_score) = split(";", $line);

    # "my" gives a new instance of @R on every iteration; this matters because
    # a reference to @R is stored below.  The limit of 4 passed to split stops
    # it from dropping empty trailing fields.
    my @R = split(",", $rule, 4);
    @R == 4 || die "Bad rule $line";

    my $suffix = $R[0];  # Suffix of the word we want a pron for.

    if (!defined $isrule{$rule}) {
        # Register each rule only once (don't repeat for different stresses).
        $isrule{$rule} = 1;
        # push autovivifies the array reference on first use, so there is no
        # need to special-case a suffix that has not been seen before.
        push @{ $suffix2rule{$suffix} }, \@R;
    }

    if (!defined $rule_score) { $rule_score = -1; }  # -1 means we don't have the score info.

    # Record which destress markings (yes|no) this rule is valid for, with the
    # associated score.  A rule given without a destress marking is treated as
    # valid for both.
    if (!defined $destress) {
        $rule_and_destress_to_rule_score{$rule . ";yes"} = $rule_score;
        $rule_and_destress_to_rule_score{$rule . ";no"}  = $rule_score;
    } else {
        $rule_and_destress_to_rule_score{$rule . ";" . $destress} = $rule_score;
    }
}

close($rules_fh) unless $rules eq "-";  # never close the process's STDIN here
# ---------------------------------------------------------------------------
# Read the base dictionary into %word2prons.
#
# Each line is a word followed by its pronunciation, whitespace separated.
# %word2prons maps word -> ref to array of pron strings (phones joined by a
# single space); a word that occurs on several lines gets several prons.
# ---------------------------------------------------------------------------

# Three-argument open with a lexical handle; "-" is mapped to standard input
# explicitly so that it keeps the meaning it had with the two-argument form.
my $dict_fh;
if ($dict eq "-") {
    $dict_fh = \*STDIN;
} else {
    open($dict_fh, "<", $dict) || die "Opening base dictionary: $dict: $!";
}

while (my $entry = <$dict_fh>) {
    # split(" ", ...) discards leading whitespace and the trailing newline.
    my @fields = split(" ", $entry);

    # A blank line has no word; skip it rather than storing an entry under an
    # undefined key.
    next if @fields == 0;

    my $word = shift @fields;
    my $pron = join(" ", @fields);

    # push autovivifies the array reference the first time a word is seen.
    push @{ $word2prons{$word} }, $pron;
}

close($dict_fh) unless $dict eq "-";  # never close the process's STDIN here
# Set up the hash "prefixcount", which says how many times a char-sequence
# is a prefix (not necessarily a strict prefix) of a word in the dict.
#
# Iterate over the *keys* only: a hash evaluated in list context yields its
# values as well, and here the values are array references that would be
# stringified ("ARRAY(0x...)") and counted as if they were dictionary words.
foreach my $dict_word (keys %word2prons) {
    # Prefix lengths run from 0 (the empty string) up to the whole word.
    foreach my $prefix_len (0 .. length($dict_word)) {
        $prefixcount{substr($dict_word, 0, $prefix_len)}++;
    }
}
# ---------------------------------------------------------------------------
# Main loop: read the query words, one per line, from the remaining
# command-line argument (or from standard input when there is none) and
# print the candidate prons for each of them.
#
# Note: no filehandle on the rules file is opened here; nothing in this loop
# or in process_word reads from one.
# ---------------------------------------------------------------------------
while (my $line = <>) {
    # chomp (not chop): only strip a trailing newline, so that a final word
    # on a line without a newline terminator keeps its last character.
    chomp $line;

    # Exactly one whitespace-free token per line is expected.
    $line =~ m/^\S+$/ || die "Bad line in words input (expected exactly one word): '$line'";

    process_word($line);
}
# process_word(WORD)
#
# Prints to the currently selected output handle every candidate
# pronunciation that can be generated for WORD, one per line, in the form
#   word;pron;base-word;base-pron;rule-name;destress[;rule-score]
#
# For every split of WORD into prefix + suffix (prefix at least
# $min_prefix_len characters long), each rule registered for that suffix is
# tried: the rule's base-suffix is appended to the prefix to form a base
# word, and for each pron of that base word that ends in the rule's
# base-pron-suffix, that ending is replaced by the rule's pron-suffix.
#
# Reads these package globals (set up elsewhere in this file):
#   $min_prefix_len, %word2prons, %prefixcount, %suffix2rule,
#   %rule_and_destress_to_rule_score
#
# All working variables are lexical, so a call leaves no other package
# globals modified.  Returns nothing meaningful.
sub process_word {
    my $word = shift @_;
    my $word_len = length($word);

    # $owncount is used in evaluating whether a particular prefix is a prefix
    # of some other word in the dict... if the word itself is in the dict
    # (usually because we're running this on the dict itself), we need to
    # correct for this.
    my $owncount = (defined $word2prons{$word}) ? 1 : 0;

    for (my $prefix_len = $min_prefix_len; $prefix_len <= $word_len; $prefix_len++) {
        my $prefix = substr($word, 0, $prefix_len);
        my $suffix = substr($word, $prefix_len);

        # If this prefix is not a prefix of any (other) word in the dict there
        # is no point checking the rules -- none of them can match.  A prefix
        # that was never counted is treated as a count of zero.
        next if (($prefixcount{$prefix} || 0) - $owncount == 0);

        my $rules_array_ref = $suffix2rule{$suffix};
        next if !defined $rules_array_ref;

        foreach my $rule_ref (@$rules_array_ref) {
            # $rule_ref refers to a 4-element array:
            #   (suffix, base-suffix, pron-suffix, base-pron-suffix).
            # The list assignment copies the fields, so the stored rule is
            # not modified when $base_psuffix is adjusted below.
            my (undef, $base_suffix, $psuffix, $base_psuffix) = @$rule_ref;

            my $base_word = $prefix . $base_suffix;
            my $base_prons_ref = $word2prons{$base_word};
            next if !defined $base_prons_ref;

            if ($base_psuffix ne "") {
                # Include " ", the space between phones, to prevent matching
                # partial phones below.
                $base_psuffix = " " . $base_psuffix;
            }
            my $base_psuffix_len = length($base_psuffix);

            # The rule name as it appears in the output and in the score
            # table; it is the same for every base pron, so build it once.
            my $rule = join(",", @$rule_ref);

            foreach my $base_pron (@$base_prons_ref) {
                # Note: these lengths are in characters, not phones.
                my $base_pron_prefix_len = length($base_pron) - $base_psuffix_len;

                # The base pron must end in the expected base-pron-suffix.
                next unless $base_pron_prefix_len >= 0
                    && substr($base_pron, $base_pron_prefix_len) eq $base_psuffix;

                my $pron_prefix = substr($base_pron, 0, $base_pron_prefix_len);

                # Two versions of each rule: without destressing (0) first,
                # then with (1).  The order matters because destressing
                # rewrites $pron_prefix in place.
                foreach my $destress (0, 1) {
                    if ($destress) { $pron_prefix =~ s/2/1/g; }

                    # $pron is the generated pron.
                    my $pron = ($psuffix ne "")
                        ? $pron_prefix . " " . $psuffix
                        : $pron_prefix;

                    my $destress_mark = ($destress ? "yes" : "no");
                    my $rule_score =
                        $rule_and_destress_to_rule_score{$rule . ";" . $destress_mark};

                    # Undefined means this (rule, destress) combination was
                    # not present in the rules; emit nothing for it.
                    next if !defined $rule_score;

                    my @output = ($word, $pron, $base_word, $base_pron, $rule, $destress_mark);
                    # -1 is the "no score supplied" marker; only real scores
                    # are appended to the output.
                    if ($rule_score != -1) { push @output, $rule_score; }
                    print join(";", @output) . "\n";
                }
            }
        }
    }
}