Blame view

egs/wsj/s5/local/dict/get_acronym_prons.pl 3.78 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
  #!/usr/bin/env perl
  
  # Reads a dictionary (for prons of letters), and an OOV list,
  # and puts out candidate pronunciations of words in that list
  # that could plausibly be acronyms.
  # We judge that a word can plausibly be an acronym if it is
  # a sequence of just letters (no non-letter characters such
  # as "'"),  or something like U.K.,
  # and the number of letters is four or less.
  #
  # If the text were not already pre-normalized, there would
  # be other hints such as capitalization.
  
  # This program appends
  # the prons of the individual letters (A., B. and so on) to work out
  # the pron of the acronym.
  # Note: this is somewhat dependent on the convention used in CMUduct, that
  # the individual letters are spelled this way (e.g. "A."). [it seems
  # to also have the separated versions.
  
  if (!(@ARGV == 1 || @ARGV == 2)) { 
    print "Usage: get_acronym_prons.pl dict [oovlist]";
  }
  
  $max_length = 4; # Max #letters in an acronym. (Longer 
   # acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF.
  
  $dict = shift @ARGV;
  open(D, "<$dict") || die "Opening dictionary $dict";
  
  while(<D>) { # Read the dict, to get the prons of the letters.
    chop;
    @A = split(" ", $_);
    $word = shift @A;
    $pron = join(" ", @A);
    if ($word =~ m/^([A-Z])\.$/ ) {
      chop $word; # Remove trailing "." to get just the letter
      $letter = $1;
      if (!defined $letter_prons{$letter} ) { 
        $letter_prons{$letter} = [ ]; # new anonymous array
      }
      $arrayref = $letter_prons{$letter};
      push @$arrayref, $pron;
    } elsif( length($word) <= $max_length ) {
      $pronof{$word . "," . $pron} = 1;
      $isword{$word} = 1;
      #if (!defined $prons{$word} ) {
      #  $prons{$word} = [ ];
      #}
      #  push @{$prons{$word}}, $pron;
    }
  }
  
  sub get_letter_prons;
  
  while(<>) { # Read OOVs.
    # For now, just do the simple cases without "." in 
    # between... things with "." in the OOV list seem to
    # be mostly errors.
    chop;
    $word = $_;
    if ($word =~ m/^[A-Z]{1,5}$/) {
      foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO
        print "$word  $pron
  ";
      }
    } elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K.  Make the final "." optional.
      $letters = $word;
      $letters =~ s:\.::g;
      foreach $pron ( get_letter_prons($letters) ) { 
        print "$word  $pron
  ";
      }
    }
  }
  
  sub get_letter_prons {
    @acronym = split("", shift); # The letters in the word.
    my @prons = ( "" );
    
    while (@acronym > 0) {
      $l = shift @acronym;
      $n = 1; # num-repeats of letter $l.
      while (@acronym > 0 && $acronym[0] eq $l) {
        $n++;
        shift @acronym;
      }
      my $arrayref = $letter_prons{$l};
      my @prons_of_block = ();
      if ($n == 1) { # Just one repeat.
        foreach $lpron ( @$arrayref ) {
          push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
        }
      } elsif ($n == 2) { # Two repeats.  Can be "double a" or "a a"
        foreach $lpron ( @$arrayref ) {
          push @prons_of_block, "D AH1 B AH0 L " . $lpron;
          push @prons_of_block, $lpron . " " . $lpron;
        }
      } elsif ($n == 3) { # can be "triple a" or "a a a"
        foreach $lpron ( @$arrayref ) {
          push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
          push @prons_of_block, "$lpron $lpron $lpron";
        }
      } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
        # not sure really.
        foreach $lpron ( @$arrayref ) {
          $nlpron = $lpron;
          for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
          push @prons_of_block, $nlpron;
        }
      }
      my @new_prons = ();
      foreach $pron (@prons) {
        foreach $pron_of_block(@prons_of_block) {
          if ($pron eq "") {
            push @new_prons, $pron_of_block;
          } else {
            push @new_prons, $pron . " " . $pron_of_block;
          }
        }
      }
      @prons = @new_prons;
    }
    return @prons;
  }