Blame view
egs/wsj/s5/local/dict/select_candidate_prons.pl
2.6 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
#!/usr/bin/env perl # This takes the output of e.g. get_candidate_prons.pl or limit_candidate_prons.pl # or reverse_candidates.pl, which is 7-tuples, one per line, of the form: # # word;pron;base-word;base-pron;rule-name;de-stress;rule-score # # and selects the most likely prons for the words based on rule # score. It outputs in the same format as the input (thus, it is # similar to limit_candidates.pl in its input and output format, # except it has a different way of selecting the prons to put out). # # This script will select the $max_prons best pronunciations for # each candidate word, subject to the constraint that no pron should # have a rule score worse than $min_rule_score. # It first merges the candidates by, if there are multiple candidates # generating the same pron, selecting the candidate that had the # best associated score. It then sorts the prons on score and # selects the n best prons (but doesn't print out candidates with # score beneath the threshold). $max_prons = 4; $min_rule_score = 0.35; for ($n = 1; $n <= 3; $n++) { if ($ARGV[0] eq "--max-prons") { shift @ARGV; $max_prons = shift @ARGV; } if ($ARGV[0] eq "--min-rule-score") { shift @ARGV; $min_rule_score = shift @ARGV; } } if (@ARGV != 0 && @ARGV != 1) { die "Usage: select_candidates_prons.pl [candidate_prons] > selected_candidate_prons"; } sub process_word; undef $cur_word; @cur_lines = (); while(<>) { # input, output is: # word;pron;base-word;base-pron;rule-name;destress;score chop; m:^([^;]+);: || die "Unexpected input: $_"; $word = $1; if (!defined $cur_word || $word eq $cur_word) { if (!defined $cur_word) { $cur_word = $word; } push @cur_lines, $_; } else { process_word(@cur_lines); # Process a series of suggested prons # for a particular word. $cur_word = $word; @cur_lines = ( $_ ); } } process_word(@cur_lines); sub process_word { my %pron2rule_score; # hash from generated pron to rule score for that pron. my %pron2line; # hash from generated pron to best line for that pron. my @cur_lines = @_; foreach my $line (@cur_lines) { my ($word, $pron, $baseword, $basepron, $rulename, $destress, $rule_score) = split(";", $line); if (!defined $pron2rule_score{$pron} || $rule_score > $pron2rule_score{$pron}) { $pron2rule_score{$pron} = $rule_score; $pron2line{$pron} = $line; } } my @prons = sort { $pron2rule_score{$b} <=> $pron2rule_score{$a} } keys %pron2rule_score; for (my $n = 0; $n < @prons && $n < $max_prons && $pron2rule_score{$prons[$n]} >= $min_rule_score; $n++) { print $pron2line{$prons[$n]} . " "; } } |