Blame view

egs/wsj/s5/utils/lang/check_g_properties.pl 2.59 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
  #!/usr/bin/env perl
  
  use IPC::Open2;
  
  # Exactly one positional argument is expected: the lang directory to check.
  # The script accepts no options, so the usage line does not advertise any.
  if (@ARGV != 1) {
    print "Usage: $0 <lang_directory>\n";
    print "e.g.:  $0 data/lang\n";
    exit(1);
  }

  $lang = shift @ARGV;
  
  # This script checks that G.fst in the lang directory is OK with respect
  # to certain expected properties, and returns nonzero exit status if a problem
  # is detected.  It is called from validate_lang.pl.
  # This only checks the properties of G that relate to disambiguation symbols,
  # epsilons and forbidden symbols <s> and </s>.
  
  # Without G.fst there is nothing to validate, so report it and fail early.
  if (! -e "$lang/G.fst") {
    print "$0: error: $lang/G.fst does not exist\n";
    exit(1);
  }
  
  # Read words.txt (whitespace-separated "<symbol> <integer-id>" per line) and
  # record the ids of the symbols the later checks care about:
  #   %is_forbidden    ids of <s> and </s>
  #   $hash_zero       id of #0, or -1 if words.txt contains no #0
  #   %is_nonterminal  ids of symbols whose name starts with "#nonterm"
  open(my $words_fh, "<", "$lang/words.txt") || die "opening $lang/words.txt: $!";
  $hash_zero = -1;
  while (my $line = <$words_fh>) {
    my ($sym, $int) = split(" ", $line);
    # Skip blank or malformed lines that lack a symbol or an id, so that they
    # cannot register an empty-string id.
    next unless defined($int);
    if ($sym eq "<s>" || $sym eq "</s>") { $is_forbidden{$int} = 1; }
    if ($sym eq "#0") { $hash_zero = $int; }
    if ($sym =~ m/^#nonterm/) { $is_nonterminal{$int} = 1; }
  }
  close($words_fh);
  
  # Build %is_disambig, the set of word-level disambiguation symbol ids.  The
  # list in phones/wdisambig_words.int (one id per line) is used when that file
  # exists; otherwise the id of #0 is treated as the only such symbol.
  if (-e "$lang/phones/wdisambig_words.int") {
    open(my $disambig_fh, "<", "$lang/phones/wdisambig_words.int")
      || die "opening $lang/phones/wdisambig_words.int: $!";
    while (my $id = <$disambig_fh>) {
      # chomp, unlike chop, strips only a trailing newline, so the last id stays
      # intact even when the file does not end with a newline.
      chomp($id);
      $is_disambig{$id} = 1;
    }
    close($disambig_fh);
  } else {
    $is_disambig{$hash_zero} = 1;
  }
  
  # Start the two helper pipelines used by the checks below:
  #   G      reads the text form of G.fst as printed by fstprint;
  #   I / O  write to / read from a 'fstcompile | fstinfo' pipeline.
  # The G.fst path is single-quoted for the shell (embedded single quotes are
  # escaped) so that a lang directory containing spaces or shell metacharacters
  # reaches fstprint unchanged, matching the path tested with -e earlier.
  (my $quoted_g = "$lang/G.fst") =~ s/'/'\\''/g;
  my $input_cmd = ". ./path.sh; fstprint '$quoted_g'";
  open(G, "-|", $input_cmd) || die "running command $input_cmd: $!";

  my $info_cmd = ". ./path.sh; fstcompile | fstinfo ";
  # open2 does not return false on failure; it raises an exception, so trap it
  # in order to report which command could not be started.
  eval { open2(O, I, $info_cmd); 1 } or die "running command $info_cmd: $@";
  
  # Scan the fstprint output of G.fst line by line.  Lines with at least four
  # fields are treated as arcs whose third and fourth fields are the input and
  # output labels; shorter lines are ignored.  For each arc:
  #   * a forbidden symbol (<s> or </s>) on either side is an error;
  #   * a disambiguation symbol on the input must have epsilon (0) on the
  #     output, and the arc is forwarded to the 'fstcompile | fstinfo' pipeline;
  #   * an epsilon-input arc is forwarded too and remembered in $has_epsilons;
  #   * any other arc must have identical input and output labels unless its
  #     input is a nonterminal symbol.
  $has_epsilons = 0;

  while (my $line = <G>) {
    my @A = split(" ", $line);
    next if @A < 4;
    my ($ilabel, $olabel) = @A[2, 3];
    # Copy of the line without its newline, for use inside error messages
    # (chomp never eats a real character the way chop could).
    chomp(my $arc = $line);
    if ($is_forbidden{$ilabel} || $is_forbidden{$olabel}) {
      print "$0: validating $lang: error: line $arc in G.fst contains forbidden symbol <s> or </s>\n";
      exit(1);
    } elsif ($is_disambig{$ilabel}) {
      print I $line;
      if ($olabel != 0) {
        print "$0: validating $lang: error: line $arc in G.fst has disambig on input but no epsilon on output\n";
        exit(1);
      }
    } elsif ($ilabel == 0) {
      print I $line;
      $has_epsilons = 1;
    } elsif ($ilabel != $olabel && !$is_nonterminal{$ilabel}) {
      print "$0: validating $lang: error: line $arc in G.fst has inputs and outputs different but input is not disambig symbol or nonterminal.\n";
      exit(1);
    }
  }

  # Closing a piped handle returns false when the command exited with nonzero
  # status.  Without this check a failing fstprint would yield no arcs and the
  # validation would silently succeed.
  if (!close(G)) {
    print "$0: validating $lang: error: fstprint failed on $lang/G.fst\n";
    exit(1);
  }
  
  # Finish feeding the 'fstcompile | fstinfo' pipeline and inspect its report:
  # a line reading "cyclic ... y" means the arcs forwarded to it form a cycle.
  close(I);  # tell 'fstcompile | fstinfo' pipeline that its input is done.
  my $num_info_lines = 0;
  while (my $info_line = <O>) {
    $num_info_lines++;
    if ($info_line =~ m/cyclic\s+y/) {
      print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons.  Would cause determinization failure\n";
      exit(1);
    }
  }
  close(O);

  # No output at all means the pipeline itself failed, in which case the cycle
  # check did not actually run and must not be reported as passed.
  if ($num_info_lines == 0) {
    print "$0: validating $lang: error: 'fstcompile | fstinfo' produced no output, so G.fst could not be checked for cycles\n";
    exit(1);
  }
  
  # Epsilon-input arcs are allowed but unusual, so they only produce a warning.
  if ($has_epsilons) {
    print "$0: warning: validating $lang: G.fst has epsilon-input arcs.  We don't expect these in most setups.\n";
  }

  # Reaching this point means none of the checks above reported an error.
  print "--> $0 successfully validated $lang/G.fst\n";
  exit(0);