Blame view

egs/sprakbanken/s5/local/dict/find_acronyms.pl 2.78 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
  #!/usr/bin/env perl
  
  # Reads a dictionary, and prints out a list of words that seem to be pronounced
  # as acronyms (not including plurals of acronyms, just acronyms).  Uses
  # the prons of the individual letters (A., B. and so on) to judge this.
  # Note: this is somewhat dependent on the convention used in CMUduct, that
  # the individual letters are spelled this way (e.g. "A.").
  
  $max_length = 6; # Max length of words that might be
   # acronyms.
  
  while(<>) { # Read the dict.
    chop;
    @A = split(" ", $_);
    $word = shift @A;
    $pron = join(" ", @A);
    if ($word =~ m/^([A-Z])\.$/ ) {
      chop $word; # Remove trailing "." to get just the letter
      $letter = $1;
      if (!defined $letter_prons{$letter} ) { 
        $letter_prons{$letter} = [ ]; # new anonymous array
      }
      $arrayref = $letter_prons{$letter};
      push @$arrayref, $pron;
    } elsif( length($word) <= $max_length ) {
      $pronof{$word . "," . $pron} = 1;
      $isword{$word} = 1;
      #if (!defined $prons{$word} ) {
      #  $prons{$word} = [ ];
      #}
      #  push @{$prons{$word}}, $pron;
    }
  }
  
  sub get_letter_prons;
  
  foreach $word (keys %isword) {
    my @letter_prons = get_letter_prons($word);
    foreach $pron (@letter_prons) {
      if (defined $pronof{$word.",".$pron}) {
        print "$word  $pron
  ";
      }
    }
  }
  
  
  sub get_letter_prons {
    @acronym = split("", shift); # The letters in the word.
    my @prons = ( "" );
    
    while (@acronym > 0) {
      $l = shift @acronym;
      $n = 1; # num-repeats of letter $l.
      while (@acronym > 0 && $acronym[0] eq $l) {
        $n++;
        shift @acronym;
      }
      my $arrayref = $letter_prons{$l};
      my @prons_of_block = ();
      if ($n == 1) { # Just one repeat.
        foreach $lpron ( @$arrayref ) {
          push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
        }
      } elsif ($n == 2) { # Two repeats.  Can be "double a" or "a a"
        foreach $lpron ( @$arrayref ) {
          push @prons_of_block, "D AH1 B AH0 L " . $lpron;
          push @prons_of_block, $lpron . $lpron;
        }
      } elsif ($n == 3) { # can be "triple a" or "a a a"
        foreach $lpron ( @$arrayref ) {
          push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
          push @prons_of_block, $lpron . $lpron . $lpron;
        }
      } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
        # not sure really.
        foreach $lpron ( @$arrayref ) {
          $nlpron = "";
          for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; }
          push @prons_of_block, $nlpron;
        }
      }
      my @new_prons = ();
      foreach $pron (@prons) {
        foreach $pron_of_block(@prons_of_block) {
          if ($pron eq "") {
            push @new_prons, $pron_of_block;
          } else {
            push @new_prons, $pron . " " . $pron_of_block;
          }
        }
      }
      @prons = @new_prons;
    }
    return @prons;
  }