Blame view
egs/sprakbanken/s5/local/dict/find_acronyms.pl
2.78 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
#!/usr/bin/env perl # Reads a dictionary, and prints out a list of words that seem to be pronounced # as acronyms (not including plurals of acronyms, just acronyms). Uses # the prons of the individual letters (A., B. and so on) to judge this. # Note: this is somewhat dependent on the convention used in CMUduct, that # the individual letters are spelled this way (e.g. "A."). $max_length = 6; # Max length of words that might be # acronyms. while(<>) { # Read the dict. chop; @A = split(" ", $_); $word = shift @A; $pron = join(" ", @A); if ($word =~ m/^([A-Z])\.$/ ) { chop $word; # Remove trailing "." to get just the letter $letter = $1; if (!defined $letter_prons{$letter} ) { $letter_prons{$letter} = [ ]; # new anonymous array } $arrayref = $letter_prons{$letter}; push @$arrayref, $pron; } elsif( length($word) <= $max_length ) { $pronof{$word . "," . $pron} = 1; $isword{$word} = 1; #if (!defined $prons{$word} ) { # $prons{$word} = [ ]; #} # push @{$prons{$word}}, $pron; } } sub get_letter_prons; foreach $word (keys %isword) { my @letter_prons = get_letter_prons($word); foreach $pron (@letter_prons) { if (defined $pronof{$word.",".$pron}) { print "$word $pron "; } } } sub get_letter_prons { @acronym = split("", shift); # The letters in the word. my @prons = ( "" ); while (@acronym > 0) { $l = shift @acronym; $n = 1; # num-repeats of letter $l. while (@acronym > 0 && $acronym[0] eq $l) { $n++; shift @acronym; } my $arrayref = $letter_prons{$l}; my @prons_of_block = (); if ($n == 1) { # Just one repeat. foreach $lpron ( @$arrayref ) { push @prons_of_block, $lpron; # typically (always?) just one pron of a letter. } } elsif ($n == 2) { # Two repeats. Can be "double a" or "a a" foreach $lpron ( @$arrayref ) { push @prons_of_block, "D AH1 B AH0 L " . $lpron; push @prons_of_block, $lpron . $lpron; } } elsif ($n == 3) { # can be "triple a" or "a a a" foreach $lpron ( @$arrayref ) { push @prons_of_block, "T R IH1 P AH0 L " . $lpron; push @prons_of_block, $lpron . $lpron . $lpron; } } elsif ($n >= 4) { # let's say it can only be that letter repeated $n times.. # not sure really. foreach $lpron ( @$arrayref ) { $nlpron = ""; for ($m = 0; $m < $n; $m++) { $nlpron = $nlpron . $lpron; } push @prons_of_block, $nlpron; } } my @new_prons = (); foreach $pron (@prons) { foreach $pron_of_block(@prons_of_block) { if ($pron eq "") { push @new_prons, $pron_of_block; } else { push @new_prons, $pron . " " . $pron_of_block; } } } @prons = @new_prons; } return @prons; } |