Blame view
egs/sprakbanken/s5/local/dict/limit_candidate_prons.pl
3.52 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
#!/usr/bin/env perl # This program enforces the rule that # if a "more specific" rule applies, we cannot use the more general rule. # It takes in tuples generated by get_candidate_prons (one per line, separated # by ";"), of the form: # word;pron;base-word;base-pron;rule-name;de-stress[;rule-score] # [note: we mean that the last element, the numeric score of the rule, is optional] # and it outputs a (generally shorter) list # of the same form. # For each word: # For each (base-word,base-pron): # Eliminate "more-general" rules as follows: # For each pair of rules applying to this (base-word, base-pron): # If pair is in more-general hash, disallow more general one. # Let the output be: for each (base-word, base-pron, rule): # for (destress-prefix) in [yes, no], do: # print out the word input, the rule-name, [destressed:yes|no], and the new pron. if (@ARGV != 1 && @ARGV != 2) { die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons"; } $hierarchy = shift @ARGV; open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy"; while(<H>) { chop; m:.+;.+: || die "Bad rule-hierarchy line $_"; $hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule # and $rule21 is that string form of the more general rule, then $hierarchy{$rule1.";".$rule2} # is defined, else undefined. } sub process_word; undef $cur_word; @cur_lines = (); while(<>) { # input, output is: # word;pron;base-word;base-pron;rule-name;destress;score chop; m:^([^;]+);: || die "Unexpected input: $_"; $word = $1; if (!defined $cur_word || $word eq $cur_word) { if (!defined $cur_word) { $cur_word = $word; } push @cur_lines, $_; } else { process_word(@cur_lines); # Process a series of suggested prons # for a particular word. $cur_word = $word; @cur_lines = ( $_ ); } } process_word(@cur_lines); sub process_word { my %pair2rule_list; # hash from $baseword.";".$baseword to ref # to array of [ line1, line2, ... ]. my @cur_lines = @_; foreach my $line (@cur_lines) { my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line); my $key = $baseword.";".$basepron; if (defined $pair2rule_list{$key}) { push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to # by the array ref inside {}. } else { $pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x) } } while ( my ($key, $value) = each(%pair2rule_list) ) { my @lines = @$value; # array of lines that are for this (baseword,basepron). my @stress, @rules; # Arrays of stress markers and rule names, indexed by # same index that indexes @lines. for (my $n = 0; $n < @lines; $n++) { my $line = $lines[$n]; my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line); $stress[$n] = $destress; $rules[$n] = $rulename; } for (my $m = 0; $m < @lines; $m++) { my $ok = 1; # if stays 1, this line is OK. for (my $n = 0; $n < @lines; $n++) { if ($m != $n && $stress[$m] eq $stress[$n]) { if (defined $hierarchy{$rules[$n].";".$rules[$m]}) { # Note: this "hierarchy" variable is defined if $rules[$n] is a more # specific instances of $rules[$m], thus invalidating $rules[$m]. $ok = 0; last; # no point iterating further. } } } if ($ok != 0) { print $lines[$m] . " "; } } } } |