Blame view
egs/sprakbanken/s5/local/dict/get_rules.pl
7.55 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
#!/usr/bin/env perl # This program creates suggested suffix rules from a dictionary. # It outputs quadruples of the form: # suffix,base-suffix,psuffix,base-psuffix # where "suffix" is the suffix of the letters of a word, "base-suffix" is # the suffix of the letters of the base-word, "psuffix" is the suffix of the # pronunciation of the word (a space-separated list of phonemes), and # "base-psuffix" is the suffix of the pronunciation of the baseword. # As far as this program is concerned, there is no distinction between # "word" and "base-word". To simplify things slightly, what it does # is return all tuples (a,b,c,d) [with a != b] such that there are # at least $min_suffix_count instances in the dictionary of # a (word-prefix, pron-prefix) pair where there exists (word,pron) # pairs of the form # ( word-prefix . a, pron-prefix . c) # and # ( word-prefix . b, pron-prefix . d) # For example if (a,b,c,d) equals (USLY,US,S L IY0,S) # then this quadruple will be output as long as there at least # e.g. 30 instances of prefixes like (FAM, F EY1 M AH0) # where there exist (word, pron) pairs like: # FAMOUS, F EY1 M AH0 S # FAMOUSLY F EY1 M AH0 S L IY0 # # There are some modifications to the picture above, for efficiency. # If $disallow_empty_suffix != 0, this program will not output 4-tuples where # the first element (the own-word suffix) is empty, as this would cause # efficiency problems in get_candidate_prons.pl. If # $ignore_prefix_stress != 0, this program will ignore stress markings # while evaluating whether prefixes are the same. # The minimum count for a quadruple to be output is $min_suffix_count # (e.g. 30). # # The function of this program is not to evaluate the accuracy of these rules; # it is mostly a pruning step, where we suggest rules that have large enough # counts to be suitable for our later procedure where we evaluate their # accuracy in predicting prons. $disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is # empty. This is for efficiency in later stages (e.g. get_candidate_prons.pl). $min_prefix_len = 3; # this must match with get_candidate_prons.pl $ignore_prefix_stress = 1; # or 0 to take account of stress in prefix. $min_suffix_count = 20; # Takes in dictionary. print STDERR "Reading dict "; while(<>) { @A = split(" ", $_); my $word = shift @A; my $pron = join(" ", @A); if (!defined $prons{$word}) { $prons{$word} = $pron; push @words, $word; } else { $prons{$word} = $prons{$word} . ";" . $pron; } } # Get common suffixes (e.g., count >100). Include empty suffix. print STDERR "Getting common suffix counts. "; { foreach $word (@words) { $len = length($word); for ($x = $min_prefix_len; $x <= $len; $x++) { $suffix_count{substr($word, $x)}++; } } foreach $suffix (keys %suffix_count) { if ($suffix_count{$suffix} >= $min_suffix_count) { $newsuffix_count{$suffix} = $suffix_count{$suffix}; } } %suffix_count = %newsuffix_count; undef %newsuffix_count; foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) { print STDERR "$suffix_count{$suffix} $suffix "; } } print STDERR "Getting common suffix pairs. "; { print STDERR " Getting map from prefix -> suffix-set. "; # Create map from prefix -> suffix-set. foreach $word (@words) { $len = length($word); for ($x = $min_prefix_len; $x <= $len; $x++) { $prefix = substr($word, 0, $x); $suffix = substr($word, $x); if (defined $suffix_count{$suffix}) { # Suffix is common... if (!defined $suffixes_of{$prefix}) { $suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with # one element. } else { push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto array that the # hash member is a reference . } } } } my %suffix_set_count; print STDERR " Getting map from suffix-set -> count. "; while ( my ($key, $value) = each(%suffixes_of) ) { my @suffixes = sort ( @$value ); $suffix_set_count{join(";", @suffixes)}++; } print STDERR " Getting counts for suffix pairs. "; while ( my ($suffix_set, $count) = each (%suffix_set_count) ) { my @suffixes = split(";", $suffix_set); # Consider pairs to be ordered. This is more convenient # later on. foreach $suffix_a (@suffixes) { foreach $suffix_b (@suffixes) { if ($suffix_a ne $suffix_b) { $suffix_pair = $suffix_a . "," . $suffix_b; $suffix_pair_count{$suffix_pair} += $count; } } } } # To save memory, only keep pairs above threshold in the hash. while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) { if ($count >= $min_suffix_count) { $new_hash{$suffix_pair} = $count; } } %suffix_pair_count = %new_hash; undef %new_hash; # Print out the suffix pairs so the user can see. foreach $suffix_pair ( sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) { print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair "; } } print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples "; { while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) { # Note: suffixes_ref is a reference to an array. We dereference with # @$suffixes_ref. # Consider each pair of suffixes (in each order). foreach my $suffix_a ( @$suffixes_ref ) { foreach my $suffix_b ( @$suffixes_ref ) { # could just used "defined" in next line, but this is for clarity. $suffix_pair = $suffix_a.",".$suffix_b; if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) { foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) { @pron_a = split(" ", $pron_a_str); foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) { @pron_b = split(" ", $pron_b_str); $len_a = @pron_a; # evaluating array as scalar automatically gives length. $len_b = @pron_b; for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) { # $pos is starting-pos of psuffix-pair. $psuffix_a = join(" ", @pron_a[$pos...$#pron_a]); $psuffix_b = join(" ", @pron_b[$pos...$#pron_b]); $quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b; $quadruple_count{$quadruple}++; my $pron_a_pos = $pron_a[$pos], $pron_b_pos = $pron_b[$pos]; if ($ignore_prefix_stress) { $pron_a_pos =~ s/\d//; # e.g convert IH0 to IH. Only affects $pron_b_pos =~ s/\d//; # whether we exit the loop below. } if ($pron_a_pos ne $pron_b_pos) { # This is important: we don't consider a pron suffix-pair to be # valid unless the pron prefix is the same. last; } } } } } } } } # To save memory, only keep pairs above threshold in the hash. while ( my ($quadruple, $count) = each (%quadruple_count) ) { if ($count >= $min_suffix_count) { $new_hash{$quadruple} = $count; } } %quadruple_count = %new_hash; undef %new_hash; # Print out the quadruples for diagnostics. foreach $quadruple ( sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) { print STDERR "$quadruple_count{$quadruple} $quadruple "; } } # Now print out the quadruples; these are the output of this program. foreach $quadruple (keys %quadruple_count) { print $quadruple." "; } |