Blame view
egs/wsj/s5/utils/lang/check_g_properties.pl
2.59 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
#!/usr/bin/env perl

# This script checks that G.fst in the lang directory is OK with respect to
# certain expected properties, and returns nonzero exit status if a problem
# was detected.  It is called from validate_lang.pl.
# This only checks the properties of G that relate to disambiguation symbols,
# epsilons and forbidden symbols <s> and </s>.
#
# Usage: check_g_properties.pl <lang_directory>
#
# Reads <lang>/G.fst, <lang>/words.txt and, when it exists,
# <lang>/phones/wdisambig_words.int.  Sources ./path.sh (relative to the
# current directory) before running fstprint, fstcompile and fstinfo.
#
# Exit status: 0 when no problem was found; 1 on a usage error, a missing
# G.fst, a failing fstprint, or a violated property.  All diagnostics are
# printed to standard output.

use strict;
use warnings;

use IPC::Open2;

# Returns its argument wrapped in single quotes so that /bin/sh treats it as
# one literal word, whatever characters it contains.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

if (@ARGV != 1) {
    print "Usage: $0 [options] <lang_directory>\n";
    print "e.g.: $0 data/lang\n";
    exit(1);
}

my $lang = shift @ARGV;

if (! -e "$lang/G.fst") {
    print "$0: error: $lang/G.fst does not exist\n";
    exit(1);
}

# Collect the integer ids of the symbols that need special treatment.
my %is_forbidden;      # ids of <s> and </s>
my %is_nonterminal;    # ids of symbols whose name starts with #nonterm
my %is_disambig;       # ids of word-level disambiguation symbols
my $hash_zero = -1;    # id of #0, or -1 if words.txt does not define it

open(my $words_fh, '<', "$lang/words.txt")
    or die "opening $lang/words.txt: $!";
while (my $entry = <$words_fh>) {
    my ($sym, $int) = split(" ", $entry);
    next unless defined $int;    # blank or malformed line: nothing to record
    if ($sym eq "<s>" || $sym eq "</s>") {
        $is_forbidden{$int} = 1;
    }
    if ($sym eq "#0") {
        $hash_zero = $int;
    }
    if ($sym =~ m/^#nonterm/) {
        $is_nonterminal{$int} = 1;
    }
}
close($words_fh);

my $wdisambig_file = "$lang/phones/wdisambig_words.int";
if (-e $wdisambig_file) {
    open(my $disambig_fh, '<', $wdisambig_file)
        or die "opening $wdisambig_file: $!";
    while (my $id = <$disambig_fh>) {
        # chomp, not chop: the last line may lack a trailing newline.
        chomp $id;
        $is_disambig{$id} = 1;
    }
    close($disambig_fh);
}
else {
    # No explicit list: #0 is the only disambiguation symbol.
    $is_disambig{$hash_zero} = 1;
}

# Text form of G.fst, one arc ("src dst ilabel olabel [weight]") per line.
my $input_cmd = ". ./path.sh; fstprint " . shell_quote("$lang/G.fst");
open(my $g_fh, '-|', $input_cmd) or die "running command $input_cmd: $!";

# The arcs whose input is a disambiguation symbol or epsilon are compiled
# into a separate FST; fstinfo then tells us whether that FST is cyclic.
my $info_cmd = ". ./path.sh; fstcompile | fstinfo ";
my ($info_out, $info_in);
my $info_pid = open2($info_out, $info_in, $info_cmd);

my $has_epsilons = 0;

while (my $arc = <$g_fh>) {
    my @fields = split(" ", $arc);
    next if @fields < 4;    # final-state lines carry no labels

    my ($ilabel, $olabel) = @fields[2, 3];
    chomp(my $shown = $arc);    # the line as quoted in error messages

    if ($is_forbidden{$ilabel} || $is_forbidden{$olabel}) {
        print "$0: validating $lang: error: line $shown in G.fst contains forbidden symbol <s> or </s>\n";
        exit(1);
    }
    elsif ($is_disambig{$ilabel}) {
        print {$info_in} $arc;
        if ($olabel != 0) {
            print "$0: validating $lang: error: line $shown in G.fst has disambig on input but no epsilon on output\n";
            exit(1);
        }
    }
    elsif ($ilabel == 0) {
        print {$info_in} $arc;
        $has_epsilons = 1;
    }
    elsif ($ilabel != $olabel && !$is_nonterminal{$ilabel}) {
        print "$0: validating $lang: error: line $shown in G.fst has inputs and outputs different but input is not disambig symbol or nonterminal.\n";
        exit(1);
    }
}

# A failing fstprint produces no arcs at all, which would otherwise look like
# a G.fst without any problem.
if (!close($g_fh)) {
    print "$0: validating $lang: error: command '$input_cmd' failed (exit status " . ($? >> 8) . ")\n";
    exit(1);
}

close($info_in);    # tell 'fstcompile | fstinfo' pipeline that its input is done.

my $has_bad_cycle = 0;
while (my $info_line = <$info_out>) {
    $has_bad_cycle = 1 if $info_line =~ m/cyclic\s+y/;
}
close($info_out);
waitpid($info_pid, 0);    # reap the pipeline so it does not linger as a zombie

if ($has_bad_cycle) {
    print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons. Would cause determinization failure\n";
    exit(1);
}

if ($has_epsilons) {
    print "$0: warning: validating $lang: G.fst has epsilon-input arcs. We don't expect these in most setups.\n";
}

print "--> $0 successfully validated $lang/G.fst\n";
exit(0);