Blame view

egs/wsj/s5/local/dict/limit_candidate_prons.pl 3.52 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
  #!/usr/bin/env perl
  
  # This program enforces the rule that
  # if a "more specific" rule applies, we cannot use the more general rule.
  # It takes in tuples generated by get_candidate_prons (one per line, separated
  # by ";"), of the form:
  # word;pron;base-word;base-pron;rule-name;de-stress[;rule-score]
  # [note: we mean that the last element, the numeric score of the rule, is optional]
  # and it outputs a (generally shorter) list
  # of the same form.
  
  
  # For each word:
    # For each (base-word,base-pron):
    #  Eliminate "more-general" rules as follows:
    #    For each pair of rules applying to this (base-word, base-pron):
    #      If pair is in more-general hash, disallow more general one.
    # Let the output be: for each (base-word, base-pron, rule):
    # for (destress-prefix) in [yes, no], do:
    #  print out the word input, the rule-name, [destressed:yes|no], and the new pron.
  
  
  if (@ARGV != 1 && @ARGV != 2) {
    die "Usage: limit_candidate_prons.pl rule_hierarchy [candidate_prons] > limited_candidate_prons";
  }
  
  $hierarchy = shift @ARGV;
  open(H, "<$hierarchy") || die "Opening rule hierarchy $hierarchy";
  
  while(<H>) {
    chop;
    m:.+;.+: || die "Bad rule-hierarchy line $_";
    $hierarchy{$_} = 1; # Format is: if $rule1 is the string form of the more specific rule
    # and $rule21 is that string form of the more general rule, then $hierarchy{$rule1.";".$rule2}
    # is defined, else undefined.
  }
  
  
  sub process_word;
  
  undef $cur_word;
  @cur_lines = ();
  
  while(<>) {
    # input, output is:
    # word;pron;base-word;base-pron;rule-name;destress;score
    chop;
    m:^([^;]+);: || die "Unexpected input: $_";
    $word = $1;
    if (!defined $cur_word || $word eq $cur_word) {
      if (!defined $cur_word) { $cur_word = $word; }
      push @cur_lines, $_;
    } else {
      process_word(@cur_lines); # Process a series of suggested prons
      # for a particular word.
      $cur_word = $word;
      @cur_lines = ( $_ ); 
    }
  }
  process_word(@cur_lines);
    
  sub process_word {
    my %pair2rule_list; # hash from $baseword.";".$baseword to ref
    # to array of [ line1, line2, ... ].
    my @cur_lines = @_;
    foreach my $line (@cur_lines) {
      my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
      my $key = $baseword.";".$basepron;
      if (defined $pair2rule_list{$key}) {
        push @{$pair2rule_list{$key}}, $line; # @{...} derefs the array pointed to 
        # by the array ref inside {}. 
      } else {
        $pair2rule_list{$key} = [ $line ]; # [ $x ] is new anonymous array with 1 elem ($x)
      }
    }
    while ( my ($key, $value) = each(%pair2rule_list) ) {
      my @lines = @$value; # array of lines that are for this (baseword,basepron).
      my @stress, @rules; # Arrays of stress markers and rule names, indexed by
      # same index that indexes @lines.
      for (my $n = 0; $n < @lines; $n++) {
        my $line = $lines[$n];
        my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line);
        $stress[$n] = $destress;
        $rules[$n] = $rulename;
      }
      for (my $m = 0; $m < @lines; $m++) {
        my $ok = 1; # if stays 1, this line is OK.
        for (my $n = 0; $n < @lines; $n++) {
          if ($m != $n && $stress[$m] eq $stress[$n]) {
            if (defined $hierarchy{$rules[$n].";".$rules[$m]}) {
              # Note: this "hierarchy" variable is defined if $rules[$n] is a more
              # specific instances of $rules[$m], thus invalidating $rules[$m].
              $ok = 0;
              last; # no point iterating further.
            }
          }
        }
        if ($ok != 0) {
          print $lines[$m] . "
  ";
        }
      }
    }
  }