Blame view
egs/wsj/s5/utils/add_lex_disambig.pl
6.64 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 |
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
#           2013-2016 Johns Hopkins University (author: Daniel Povey)
#           2015 Hainan Xu
#           2015 Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Prints to the standard output the number of the highest-numbered
# disambig symbol that was used.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

use strict;
use warnings;

# Returns the numeric value of a lexicon/option field using Perl's ordinary
# string-to-number conversion.  A missing field counts as 0 and a non-numeric
# one converts silently, so that the range checks below reject such fields
# with their own error message instead of emitting a warning first.
sub as_number {
  my ($field) = @_;
  return 0 unless defined $field;
  no warnings 'numeric';
  return 0 + $field;
}

# Opens $path with the given mode ("<" or ">") and returns the filehandle, or
# nothing on failure (with $! set by open).  The path "-" means standard input
# or standard output respectively, which is what the two-argument form of
# open() did for "<-" and ">-".
sub open_lexicon {
  my ($mode, $path) = @_;
  if ($path eq "-") {
    return $mode eq "<" ? \*STDIN : \*STDOUT;
  }
  open(my $fh, $mode, $path) or return;
  return $fh;
}

my $pron_probs = 0;
my $sil_probs = 0;
my $first_allowed_disambig = 1;

# Consume leading options, in any order, until the first argument that is not
# a recognized option.
while (@ARGV > 0) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  } elsif ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  } elsif ($ARGV[0] eq "--first-allowed-disambig") {
    shift @ARGV;
    # A missing or non-numeric value becomes 0 and is rejected just below.
    $first_allowed_disambig = as_number(shift @ARGV);
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
  } else {
    last;
  }
}

if (@ARGV != 2) {
  die <<'END_USAGE';
Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>
This script adds disambiguation symbols to a lexicon in order to
make decoding graphs determinizable; it adds pseudo-phone
disambiguation symbols #1, #2 and so on at the ends of phones
to ensure that all pronunciations are different, and that none
is a prefix of another.
It prints to the standard output the number of the largest-numbered
disambiguation symbol that was used.

Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field
           --sil-probs        [should be with --pron-probs option]
                              Expect 3 extra fields after the pron-probs, for aspects of
                              the silence probability model
           --first-allowed-disambig <n>  The number of the first disambiguation symbol
                              that this script is allowed to add.  By default this is
                              #1, but you can set this to a larger value using this option.
e.g.:
 add_lex_disambig.pl lexicon.txt lexicon_disambig.txt
 add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt
 add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt
END_USAGE
}

my $lexfn = shift @ARGV;
my $lexoutfn = shift @ARGV;

# Number of fields between the word and its phone sequence: one pron-prob
# and/or three silence-model fields.
my $num_extras = ($pron_probs ? 1 : 0) + ($sil_probs ? 3 : 0);

# (1) Read in the lexicon.  Each line is split on whitespace once and kept as
# a record holding the whitespace-normalized text (for error messages), the
# word, the extra fields, and the phone sequence joined by single spaces.
my $in = open_lexicon("<", $lexfn)
  or die "Error opening lexicon $lexfn: $!";
my @entries = ();
while (my $line = <$in>) {
  my @fields = split(" ", $line);
  my $text = join(" ", @fields);
  my $word = shift @fields;
  my @extras = splice(@fields, 0, $num_extras);
  push @entries, {
    text   => $text,
    word   => $word,
    extras => \@extras,
    phnseq => join(" ", @fields),
  };
}
close($in);

# (2) Validate each entry and work out the count of each phone-sequence in
# the lexicon.  All validation happens before the output file is opened, so a
# bad lexicon never truncates an existing output file.
my %count_of;
foreach my $entry (@entries) {
  my $text = $entry->{text};
  my @extras = @{ $entry->{extras} };
  if ($pron_probs) {
    my $p = as_number(shift @extras);
    if (!($p > 0.0 && $p <= 1.0)) {
      die "Bad lexicon line $text (expecting pron-prob as second field)";
    }
  }
  if ($sil_probs) {
    my $silp = as_number(shift @extras);
    if (!($silp > 0.0 && $silp <= 1.0)) {
      die "Bad lexicon line $text for silprobs";
    }
    # The two remaining silence-model fields only need to be positive.
    for (1 .. 2) {
      my $correction = as_number(shift @extras);
      if ($correction <= 0.0) {
        die "Bad lexicon line $text for silprobs";
      }
    }
  }
  if ($entry->{phnseq} eq "") {
    die "Bad lexicon line $text, no phone in phone list";
  }
  $count_of{ $entry->{phnseq} }++;
}

# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
my %is_prefix;
foreach my $entry (@entries) {
  my @phones = split(" ", $entry->{phnseq});
  while (@phones > 0) {
    pop @phones;  # Remove last phone
    $is_prefix{ join(" ", @phones) } = 1;
  }
}

# (4) For each entry in the lexicon:
#  if the phone sequence is unique and is not a
#  prefix of another word, no disambig symbol.
#  Else output #1, or #2, #3, ... if the same phone-seq
#  has already been assigned a disambig symbol.
my $out = open_lexicon(">", $lexoutfn)
  or die "Opening lexicon file $lexoutfn for writing: $!\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
my $max_disambig = $first_allowed_disambig - 1;
my %last_used_disambig_symbol_of;
my %reserved_for_the_empty_string;

foreach my $entry (@entries) {
  my $phnseq = $entry->{phnseq};
  if (defined $is_prefix{$phnseq} || $count_of{$phnseq} != 1) {
    if ($phnseq eq "") {
      # Need disambig symbols for the empty string that are not used
      # anywhere else.  Step (2) already rejects entries without phones, so
      # this branch is only a defensive fallback.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      my $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++;  # Get a number that has not been used yet for
                          # this phone sequence.
      }
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  # The extra fields are written back only when --pron-probs was given; with
  # --sil-probs alone they are consumed but not printed.
  my @columns = ($entry->{word});
  push @columns, @{ $entry->{extras} } if $pron_probs;
  push @columns, $phnseq;
  print {$out} join("\t", @columns), "\n";
}

# Buffered write errors only surface at close.  Standard output is left open
# because the symbol count is still to be printed on it.
if ($lexoutfn ne "-") {
  close($out) or die "Error closing lexicon file $lexoutfn: $!\n";
}

print $max_disambig . "\n";