Blame view
egs/gp/s1/utils/find_arpa_oovs.pl
2 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This program finds words in the arpa file that are not symbols
# in the OpenFst-format symbol table words.txt.  It prints them
# on the standard output, one per line.
#
# Usage: find_arpa_oovs.pl words.txt [lm.arpa]
#   words.txt  symbol table; every line must have exactly two
#              whitespace-separated fields, the first being the word.
#   lm.arpa    ARPA language model; read from standard input when omitted.
#
# The sentence-boundary symbols <s> and </s> are never reported.
# Malformed lines in the arpa input are reported on standard error.

use strict;
use warnings;

# One or two arguments are accepted; anything else is a usage error.
if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: find_arpa_oovs.pl words.txt [lm.arpa]\n";
}

# --- Load the set of known words from the symbol table. ---
my $symtab = shift @ARGV;
open(my $symtab_fh, '<', $symtab)
    or die "Failed opening symbol table file $symtab\n";

my %seen;
while (my $entry = <$symtab_fh>) {
    my @fields = split(" ", $entry);
    # $entry still carries its own newline, so none is appended here.
    @fields == 2 or die "Bad line in symbol table file: $entry";
    $seen{ $fields[0] } = 1;
}
close($symtab_fh);

# --- Collect the vocabulary of the arpa file. ---
# The input is consumed in a single pass over <>.  Calling <> again after it
# has already returned end-of-file would start a new pass over the (by then
# empty) @ARGV and block reading standard input, so the search for the
# \data\ marker and the n-gram parsing share one loop.
my %in_arpa;
my $found_data = 0;    # true once the \data\ marker has been read
my $curgram    = 0;    # order of the n-gram section being read (0 = none yet)

while (my $line = <>) {
    if (!$found_data) {
        # Skip everything up to and including the \data\ marker.
        $found_data = 1 if $line =~ m:^\\data\\\s*$:;
        next;
    }

    if ($line =~ m/^\\(\d+)-grams:\s*$/) {
        $curgram = $1;
        # This is an optimization as we can get the vocab from the 1-grams.
        last if $curgram > 1;
        next;
    }

    # Lines between \data\ and the first section header (the n-gram counts)
    # carry no words.
    next unless $curgram > 0;

    my @fields = split(" ", $line);
    if (@fields > 1) {
        # Drop the leading field (the log-probability in ARPA format);
        # the next $curgram fields are the words of the n-gram.
        shift @fields;
        for my $n (0 .. $curgram - 1) {
            my $word = $fields[$n];
            if (!defined $word) {
                print STDERR "Unusual line $line (line $.) in arpa file.\n";
                next;    # never record an undefined word
            }
            $in_arpa{$word} = 1;
        }
    }
    elsif (@fields > 0 && $fields[0] !~ m:\\end\\:) {
        # A lone token that is not the \end\ marker is unexpected.
        print STDERR "Unusual line $line (line $.) in arpa file\n";
    }
}

if (!$found_data) {
    print STDERR "No \\data\\ marker found in arpa file\n";
}

# --- Report the out-of-vocabulary words, one per line. ---
# Sorted so that the output is reproducible from run to run.
foreach my $w (sort keys %in_arpa) {
    next if exists $seen{$w};
    next if $w eq "<s>" || $w eq "</s>";
    print "$w\n";
}