Blame view
LIA_kaldiUtils/add_lex_disambig.pl
2.92 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
#!/usr/bin/perl # Copyright 2010-2011 Microsoft Corporation # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, # MERCHANTABLITY OR NON-INFRINGEMENT. # See the Apache 2 License for the specific language governing permissions and # limitations under the License. # Adds disambiguation symbols to a lexicon. # Outputs still in the normal lexicon format. # Disambig syms are numbered #1, #2, #3, etc. (#0 # reserved for symbol in grammar). # Outputs the number of disambig syms to the standard output. if(@ARGV != 2) { die "Usage: add_lex_disambig.pl lexicon.txt lexicon_disambig.txt " } $lexfn = shift @ARGV; $lexoutfn = shift @ARGV; open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; # (1) Read in the lexicon. @L = ( ); while(<L>) { @A = split(" ", $_); push @L, join(" ", @A); } # (2) Work out the count of each phone-sequence in the # lexicon. foreach $l (@L) { @A = split(" ", $l); shift @A; # Remove word. $count{join(" ",@A)}++; } # (3) For each left sub-sequence of each phone-sequence, note down # that exists (for identifying prefixes of longer strings). foreach $l (@L) { @A = split(" ", $l); shift @A; # Remove word. while(@A > 0) { pop @A; # Remove last phone $issubseq{join(" ",@A)} = 1; } } # (4) For each entry in the lexicon: # if the phone sequence is unique and is not a # prefix of another word, no diambig symbol. # Else output #1, or #2, #3, ... if the same phone-seq # has already been assigned a disambig symbol. open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing. "; $max_disambig = 0; foreach $l (@L) { @A = split(" ", $l); $word = shift @A; $phnseq = join(" ",@A); if(!defined $issubseq{$phnseq} && $count{$phnseq}==1) { ; # Do nothing. } else { if($phnseq eq "") { # need disambig symbols for the empty string # that are not use anywhere else. $max_disambig++; $reserved{$max_disambig} = 1; $phnseq = "#$max_disambig"; } else { $curnumber = $disambig_of{$phnseq}; if(!defined{$curnumber}) { $curnumber = 0; } $curnumber++; # now 1 or 2, ... while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols if($curnumber > $max_disambig) { $max_disambig = $curnumber; } $disambig_of{$phnseq} = $curnumber; $phnseq = $phnseq . " #" . $curnumber; } } print O "$word\t$phnseq "; } print $max_disambig . " "; |