limit_candidate_prons.pl
3.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env perl
# This program enforces the rule that
# if a "more specific" rule applies, we cannot use the more general rule.
# It takes in tuples generated by get_candidate_prons (one per line, separated
# by ";"), of the form:
# word;pron;base-word;base-pron;rule-name;de-stress[;rule-score]
# [note: we mean that the last element, the numeric score of the rule, is optional]
# and it outputs a (generally shorter) list
# of the same form.
# For each word:
# For each (base-word,base-pron):
# Eliminate "more-general" rules as follows:
# For each pair of rules applying to this (base-word, base-pron):
# If pair is in more-general hash, disallow more general one.
# Let the output be: for each (base-word, base-pron, rule):
# for (destress-prefix) in [yes, no], do:
# print out the word input, the rule-name, [destressed:yes|no], and the new pron.
if (@ARGV != 1 && @ARGV != 2) {
die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons";
}
$hierarchy = shift @ARGV;
open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy";
while(<H>) {
chop;
m:.+;.+: || die "Bad rule-hierarchy line $_";
$hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule
# and $rule21 is that string form of the more general rule, then $hierarchy{$rule1.";".$rule2}
# is defined, else undefined.
}
sub process_word;
undef $cur_word;
@cur_lines = ();
while(<>) {
# input, output is:
# word;pron;base-word;base-pron;rule-name;destress;score
chop;
m:^([^;]+);: || die "Unexpected input: $_";
$word = $1;
if (!defined $cur_word || $word eq $cur_word) {
if (!defined $cur_word) { $cur_word = $word; }
push @cur_lines, $_;
} else {
process_word(@cur_lines); # Process a series of suggested prons
# for a particular word.
$cur_word = $word;
@cur_lines = ( $_ );
}
}
process_word(@cur_lines);
sub process_word {
my %pair2rule_list; # hash from $baseword.";".$baseword to ref
# to array of [ line1, line2, ... ].
my @cur_lines = @_;
foreach my $line (@cur_lines) {
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
my $key = $baseword.";".$basepron;
if (defined $pair2rule_list{$key}) {
push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to
# by the array ref inside {}.
} else {
$pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x)
}
}
while ( my ($key, $value) = each(%pair2rule_list) ) {
my @lines = @$value; # array of lines that are for this (baseword,basepron).
my @stress, @rules; # Arrays of stress markers and rule names, indexed by
# same index that indexes @lines.
for (my $n = 0; $n < @lines; $n++) {
my $line = $lines[$n];
my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
$stress[$n] = $destress;
$rules[$n] = $rulename;
}
for (my $m = 0; $m < @lines; $m++) {
my $ok = 1; # if stays 1, this line is OK.
for (my $n = 0; $n < @lines; $n++) {
if ($m != $n && $stress[$m] eq $stress[$n]) {
if (defined $hierarchy{$rules[$n].";".$rules[$m]}) {
# Note: this "hierarchy" variable is defined if $rules[$n] is a more
# specific instances of $rules[$m], thus invalidating $rules[$m].
$ok = 0;
last; # no point iterating further.
}
}
}
if ($ok != 0) {
print $lines[$m] . "\n";
}
}
}
}