get_acronym_prons.pl
3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env perl
# Reads a dictionary (for prons of letters), and an OOV list,
# and puts out candidate pronunciations of words in that list
# that could plausibly be acronyms.
# We judge that a word can plausibly be an acronym if it is
# a sequence of just letters (no non-letter characters such
# as "'"), or something like U.K.,
# and the number of letters is four or less.
#
# If the text were not already pre-normalized, there would
# be other hints such as capitalization.
# This program appends
# the prons of the individual letters (A., B. and so on) to work out
# the pron of the acronym.
# Note: this is somewhat dependent on the convention used in CMUduct, that
# the individual letters are spelled this way (e.g. "A."). [it seems
# to also have the separated versions.
if (!(@ARGV == 1 || @ARGV == 2)) {
print "Usage: get_acronym_prons.pl dict [oovlist]";
}
$max_length = 4; # Max #letters in an acronym. (Longer
# acronyms tend to have "pseudo-pronunciations", e.g. think about UNICEF.
$dict = shift @ARGV;
open(D, "<$dict") || die "Opening dictionary $dict";
while(<D>) { # Read the dict, to get the prons of the letters.
chop;
@A = split(" ", $_);
$word = shift @A;
$pron = join(" ", @A);
if ($word =~ m/^([A-Z])\.$/ ) {
chop $word; # Remove trailing "." to get just the letter
$letter = $1;
if (!defined $letter_prons{$letter} ) {
$letter_prons{$letter} = [ ]; # new anonymous array
}
$arrayref = $letter_prons{$letter};
push @$arrayref, $pron;
} elsif( length($word) <= $max_length ) {
$pronof{$word . "," . $pron} = 1;
$isword{$word} = 1;
#if (!defined $prons{$word} ) {
# $prons{$word} = [ ];
#}
# push @{$prons{$word}}, $pron;
}
}
sub get_letter_prons;
while(<>) { # Read OOVs.
# For now, just do the simple cases without "." in
# between... things with "." in the OOV list seem to
# be mostly errors.
chop;
$word = $_;
if ($word =~ m/^[A-Z]{1,5}$/) {
foreach $pron ( get_letter_prons($word) ) { # E.g. UNPO
print "$word $pron\n";
}
} elsif ($word =~ m:^(\w\.){1,4}\w\.?$:) { # E.g. U.K. Make the final "." optional.
$letters = $word;
$letters =~ s:\.::g;
foreach $pron ( get_letter_prons($letters) ) {
print "$word $pron\n";
}
}
}
sub get_letter_prons {
@acronym = split("", shift); # The letters in the word.
my @prons = ( "" );
while (@acronym > 0) {
$l = shift @acronym;
$n = 1; # num-repeats of letter $l.
while (@acronym > 0 && $acronym[0] eq $l) {
$n++;
shift @acronym;
}
my $arrayref = $letter_prons{$l};
my @prons_of_block = ();
if ($n == 1) { # Just one repeat.
foreach $lpron ( @$arrayref ) {
push @prons_of_block, $lpron; # typically (always?) just one pron of a letter.
}
} elsif ($n == 2) { # Two repeats. Can be "double a" or "a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "D AH1 B AH0 L " . $lpron;
push @prons_of_block, $lpron . " " . $lpron;
}
} elsif ($n == 3) { # can be "triple a" or "a a a"
foreach $lpron ( @$arrayref ) {
push @prons_of_block, "T R IH1 P AH0 L " . $lpron;
push @prons_of_block, "$lpron $lpron $lpron";
}
} elsif ($n >= 4) { # let's say it can only be that letter repeated $n times..
# not sure really.
foreach $lpron ( @$arrayref ) {
$nlpron = $lpron;
for ($m = 1; $m < $n; $m++) { $nlpron = $nlpron . " " . $lpron; }
push @prons_of_block, $nlpron;
}
}
my @new_prons = ();
foreach $pron (@prons) {
foreach $pron_of_block(@prons_of_block) {
if ($pron eq "") {
push @new_prons, $pron_of_block;
} else {
push @new_prons, $pron . " " . $pron_of_block;
}
}
}
@prons = @new_prons;
}
return @prons;
}