Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/validate_lang.pl 34.3 KB
  #!/usr/bin/env perl
  
  # Apache 2.0.
  # Copyright  2012   Guoguo Chen
  #            2014   Neil Nelson
  #            2017   Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
  #            2019   Dongji Gao
  #
  # Validation script for data/lang
  
  # this function reads the opened file (supplied as a first
  # parameter) into an array of lines. For each
  # line, it tests whether it's a valid utf-8 compatible
  # line. If all lines are valid utf-8, it returns the lines
  # decoded as utf-8, otherwise it assumes the file's encoding
  # is one of those 1-byte encodings, such as ISO-8859-x
  # or Windows CP-X.
  # Please recall we do not really care about
  # the actually encoding, we just need to
  # make sure the length of the (decoded) string
  # is correct (to make the output formatting looking right).
  sub get_utf8_or_bytestream {
    use Encode qw(decode encode);
    my $is_utf_compatible = 1;
    my @unicode_lines;
    my @raw_lines;
    my $raw_text;
    my $lineno = 0;
    my $file = shift;
  
    while (<$file>) {
      $raw_text = $_;
      last unless $raw_text;
      if ($is_utf_compatible) {
        my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ;
        $is_utf_compatible = $is_utf_compatible && defined($decoded_text);
        push @unicode_lines, $decoded_text;
      } else {
        #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text
  ";
        ;
      }
      push @raw_lines, $raw_text;
      $lineno += 1;
    }
  
    if (!$is_utf_compatible) {
      return (0, @raw_lines);
    } else {
      return (1, @unicode_lines);
    }
  }
  
  # check if the given unicode string contain unicode whitespaces
  # other than the usual four: TAB, LF, CR and SPACE
  sub validate_utf8_whitespaces {
    my $unicode_lines = shift;
    use feature 'unicode_strings';
    for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
      my $current_line = $unicode_lines->[$i];
      if ((substr $current_line, -1) ne "
  "){
        print STDERR "$0: The current line (nr. $i) has invalid newline
  ";
        return 1;
      }
      # we replace TAB, LF, CR, and SPACE
      # this is to simplify the test
      if ($current_line =~ /\x{000d}/) {
        print STDERR "$0: The current line (nr. $i) contains CR (0x0D) character
  ";
        return 1;
      }
      $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
      if ($current_line =~/\s/) {
        return 1;
      }
    }
    return 0;
  }
  
  # checks if the text in the file (supplied as the argument) is utf-8 compatible
  # if yes, checks if it contains only allowed whitespaces. If no, then does not
  # do anything. The function seeks to the original position in the file after
  # reading the text.
  sub check_allowed_whitespace {
    my $file = shift;
    my $pos = tell($file);
    (my $is_utf, my @lines) = get_utf8_or_bytestream($file);
    seek($file, $pos, SEEK_SET);
    if ($is_utf) {
      my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
      print "--> text seems to be UTF-8 or ASCII, checking whitespaces
  ";
      if ($has_invalid_whitespaces) {
        print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)
  ";
        return 0;
      } else {
        print "--> text contains only allowed whitespaces
  ";
      }
    } else {
      print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces
  ";
    }
    return 1;
  }
  
  $skip_det_check = 0;
  $skip_disambig_check = 0;
  $skip_generate_words_check = 0;
  $subword_check = 0;
  
  for ($x=0; $x <= 3; $x++) {
    if (@ARGV > 0 && $ARGV[0] eq "--skip-determinization-check") {
      $skip_det_check = 1;
      shift @ARGV;
    }
    if (@ARGV > 0 && $ARGV[0] eq "--skip-disambig-check") {
      $skip_disambig_check = 1;
      shift @ARGV;
    }
    if (@ARGV > 0 && $ARGV[0] eq "--skip-generate-words-check") {
      $skip_generate_words_check = 1;
      shift @ARGV;
    }
  }
  
  if (@ARGV != 1) {
    print "Usage: $0 [options] <lang_directory>
  ";
    print "e.g.:  $0 data/lang
  ";
    print "Options:
  ";
    print " --skip-det-check                         (this flag causes it to skip a deterministic fst check).
  ";
    print " --skip-determinization-check             (this flag causes it to skip a time consuming check).
  ";
    print " --skip-disambig-check                    (this flag causes it to skip a disambig check in phone bigram models).
  ";
    exit(1);
  }
  
  print "$0 " . join(" ", @ARGV) . "
  ";
  
  $lang = shift @ARGV;
  $exit = 0;
  $warning = 0;
  
  # Checking existence of separator file ------------------
  print "Checking existence of separator file
  ";
  if (!-e "$lang/subword_separator.txt") {
    print "separator file $lang/subword_separator.txt is empty or does not exist, deal in word case.
  ";
  } else {
    if (!open(S, "<$lang/subword_separator.txt")) {
      print "--> ERROR: fail to open $lang/subword_separator.txt
  "; exit 1;
    } else {
      $line_num = `wc -l <$lang/subword_separator.txt`;
      if ($line_num != 1) {
        print "--> ERROR, $lang/subword_separator.txt should only contain one line.
  "; exit 1;
      } else {
        while (<S>) {
          chomp;
          my @col = split(" ", $_);
          if (@col != 1) {
            print "--> ERROR, invalid separator.
  "; exit 1;
          } else {
           $separator = shift @col;
           $separator_length = length $separator;
           $subword_check = 1;
          }
        }
      }
    }
  }
  
  if (!$subword_check) {
    $word_boundary = "word_boundary";
  } else {
    $word_boundary = "word_boundary_moved";
  }
  
  # Checking phones.txt -------------------------------
  print "Checking $lang/phones.txt ...
  ";
  if (-z "$lang/phones.txt") {
    print "--> ERROR: $lang/phones.txt is empty or does not exist
  "; exit 1;
  }
  if (!open(P, "<$lang/phones.txt")) {
    print "--> ERROR: fail to open $lang/phones.txt
  "; exit 1;
  }
  $idx = 1;
  %psymtab = ();
  check_allowed_whitespace(\*P) or exit 1;
  while (<P>) {
    chomp;
    my @col = split(" ", $_);
    if (@col != 2) {
      print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)
  "; exit 1;
    }
    my $phone = shift @col;
    my $id = shift @col;
    $psymtab{$phone} = $id;
    $idx ++;
  }
  close(P);
  %pint2sym = ();
  foreach (keys %psymtab) {
    if ($pint2sym{$psymtab{$_}}) {
      print "--> ERROR: ID \"$psymtab{$_}\" duplicates
  "; exit 1;
    } else {
      $pint2sym{$psymtab{$_}} = $_;
    }
  }
  print "--> $lang/phones.txt is OK
  ";
  print "
  ";
  
  # Check word.txt -------------------------------
  print "Checking words.txt: #0 ...
  ";
  if (-z "$lang/words.txt") {
    print "--> ERROR: $lang/words.txt is empty or does not exist
  "; exit 1;
  }
  if (!open(W, "<$lang/words.txt")) {
    print "--> ERROR: fail to open $lang/words.txt
  "; exit 1;
  }
  $idx = 1;
  %wsymtab = ();
  check_allowed_whitespace(\*W) or exit 1;
  while (<W>) {
    chomp;
    my @col = split(" ", $_);
    if (@col != 2) {
      print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)
  "; exit 1;
    }
    $word = shift @col;
    $id = shift @col;
    $wsymtab{$word} = $id;
    $idx ++;
  }
  close(W);
  %wint2sym = ();
  foreach (keys %wsymtab) {
    if ($wint2sym{$wsymtab{$_}}) {
      print "--> ERROR: ID \"$wsymtab{$_}\" duplicates
  "; exit 1;
    } else {
      $wint2sym{$wsymtab{$_}} = $_;
    }
  }
  print "--> $lang/words.txt is OK
  ";
  print "
  ";
  
  # Checking phones/* -------------------------------
  sub check_txt_int_csl {
    my ($cat, $symtab) = @_;
    print "Checking $cat.\{txt, int, csl\} ...
  ";
    if (!open(TXT, "<$cat.txt")) {
      $exit = 1; return print "--> ERROR: fail to open $cat.txt
  ";
    }
    if (!open(INT, "<$cat.int")) {
      $exit = 1; return print "--> ERROR: fail to open $cat.int
  ";
    }
    if (!open(CSL, "<$cat.csl")) {
      $exit = 1; return print "--> ERROR: fail to open $cat.csl
  ";
    }
    if (-z "$cat.txt") {
      $warning = 1; print "--> WARNING: $cat.txt is empty
  ";
    }
    if (-z "$cat.int") {
      $warning = 1; print "--> WARNING: $cat.int is empty
  ";
    }
    if (-z "$cat.csl") {
      $warning = 1; print "--> WARNING: $cat.csl is empty
  ";
    }
  
    $idx1 = 1;
    check_allowed_whitespace(\*TXT) or $exit = 1;
    while (<TXT>) {
      chomp;
      my @col = split(" ", $_);
      if (@col != 1) {
        $exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)
  ";
      }
      $entry[$idx1] = shift @col;
      $idx1 ++;
    }
    close(TXT); $idx1 --;
    print "--> $idx1 entry/entries in $cat.txt
  ";
  
    $idx2 = 1;
    while (<INT>) {
      chomp;
      my @col = split(" ", $_);
      if (@col != 1) {
        $exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)
  ";
      }
      if ($symtab->{$entry[$idx2]} ne shift @col) {
        $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)
  ";
      }
      $idx2 ++;
    }
    close(INT); $idx2 --;
    if ($idx1 != $idx2) {
      $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")
  ";
    }
    print "--> $cat.int corresponds to $cat.txt
  ";
  
    $num_lines = 0;
    while (<CSL>) {
      chomp;
      my @col = split(":", $_);
      $num_lines++;
      if (@col != $idx1) {
        $exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)
  ";
      }
      foreach (1 .. $idx1) {
        if ($symtab->{$entry[$_]} ne @col[$_-1]) {
          $exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)
  ";
        }
      }
    }
    close(CSL);
    if ($idx1 != 0) {             # nonempty .txt,.int files
      if ($num_lines != 1) {
        $exit = 1;
        return print "--> ERROR: expect 1 line in $cat.csl
  ";
      }
    } else {
      if ($num_lines != 1 && $num_lines != 0) {
        $exit = 1;
        return print "--> ERROR: expect 0 or 1 line in $cat.csl, since empty .txt,int
  ";
      }
    }
    print "--> $cat.csl corresponds to $cat.txt
  ";
  
    return print "--> $cat.\{txt, int, csl\} are OK
  ";
  }
  
  sub check_txt_int {
    my ($cat, $symtab, $sym_check) = @_;
    print "Checking $cat.\{txt, int\} ...
  ";
    if (-z "$cat.txt") {
      $exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist
  ";
    }
    if (-z "$cat.int") {
      $exit = 1; return print "--> ERROR: $cat.int is empty or does not exist
  ";
    }
    if (!open(TXT, "<$cat.txt")) {
      $exit = 1; return print "--> ERROR: fail to open $cat.txt
  ";
    }
    if (!open(INT, "<$cat.int")) {
      $exit = 1; return print "--> ERROR: fail to open $cat.int
  ";
    }
  
    $idx1 = 1;
    check_allowed_whitespace(\*TXT) or $exit = 1;
    while (<TXT>) {
      chomp;
      s/^(shared|not-shared) (split|not-split) //g;
      s/ nonword$//g;
      s/ begin$//g;
      s/ end$//g;
      s/ internal$//g;
      s/ singleton$//g;
      $entry[$idx1] = $_;
      $idx1 ++;
    }
    close(TXT); $idx1 --;
    print "--> $idx1 entry/entries in $cat.txt
  ";
  
    my %used_syms = ();
    $idx2 = 1;
    while (<INT>) {
      chomp;
      s/^(shared|not-shared) (split|not-split) //g;
      s/ nonword$//g;
      s/ begin$//g;
      s/ end$//g;
      s/ internal$//g;
      s/ singleton$//g;
      my @col = split(" ", $_);
      @set = split(" ", $entry[$idx2]);
      if (@set != @col) {
        $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)
  ";
      }
      foreach (0 .. @set-1) {
        if ($symtab->{@set[$_]} ne @col[$_]) {
          $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")
  ";
        }
        if ($sym_check && defined $used_syms{@set[$_]}) {
          $exit = 1; return print "--> ERROR: $cat.txt and $cat.int contain duplicate symbols (break at line $idx2, block " ,$_+1, ")
  ";
        }
        $used_syms{@set[$_]} = 1;
      }
      $idx2 ++;
    }
    close(INT); $idx2 --;
    if ($idx1 != $idx2) {
      $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")
  ";
    }
    print "--> $cat.int corresponds to $cat.txt
  ";
  
    if ($sym_check) {
      while ( my ($key, $value) = each(%silence) ) {
        if (!defined $used_syms{$key}) {
          $exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all silence phones
  ";
        }
      }
      while ( my ($key, $value) = each(%nonsilence) ) {
        if (!defined $used_syms{$key}) {
          $exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all non-silence phones
  ";
        }
      }
    }
  
    return print "--> $cat.\{txt, int\} are OK
  ";
  }
  
  # Check disjoint and summation -------------------------------
  sub intersect {
    my ($a, $b) = @_;
    @itset = ();
    %itset = ();
    foreach (keys %$a) {
      if (exists $b->{$_} and !$itset{$_}) {
        push(@itset, $_);
        $itset{$_} = 1;
      }
    }
    return @itset;
  }
  
  sub check_disjoint {
    print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...
  ";
    if (!open(S, "<$lang/phones/silence.txt")) {
      $exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt
  ";
    }
    if (!open(N, "<$lang/phones/nonsilence.txt")) {
      $exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt
  ";
    }
    if (!$skip_disambig_check && !open(D, "<$lang/phones/disambig.txt")) {
      $exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt
  ";
    }
  
    $idx = 1;
    while (<S>) {
      chomp;
      my @col = split(" ", $_);
      $phone = shift @col;
      if ($silence{$phone}) {
        $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)
  ";
      }
      $silence{$phone} = 1;
      push(@silence, $phone);
      $idx ++;
    }
    close(S);
  
    $idx = 1;
    while (<N>) {
      chomp;
      my @col = split(" ", $_);
      $phone = shift @col;
      if ($nonsilence{$phone}) {
        $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)
  ";
      }
      $nonsilence{$phone} = 1;
      push(@nonsilence, $phone);
      $idx ++;
    }
    close(N);
  
    $idx = 1;
    while (<D>) {
      chomp;
      my @col = split(" ", $_);
      $phone = shift @col;
      if ($disambig{$phone}) {
        $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)
  ";
      }
      $disambig{$phone} = 1;
      $idx ++;
    }
    close(D);
  
    my @itsect1 = intersect(\%silence, \%nonsilence);
    my @itsect2 = intersect(\%silence, \%disambig);
    my @itsect3 = intersect(\%disambig, \%nonsilence);
  
    $success = 1;
    if (@itsect1 != 0) {
      $success = 0;
      $exit = 1; print "--> ERROR: silence.txt and nonsilence.txt have intersection -- ";
      foreach (@itsect1) {
        print $_, " ";
      }
      print "
  ";
    } else {
      print "--> silence.txt and nonsilence.txt are disjoint
  ";
    }
  
    if (@itsect2 != 0) {
      $success = 0;
      $exit = 1; print "--> ERROR: silence.txt and disambig.txt have intersection -- ";
      foreach (@itsect2) {
        print $_, " ";
      }
      print "
  ";
    } else {
      print "--> silence.txt and disambig.txt are disjoint
  ";
    }
  
    if (@itsect3 != 0) {
      $success = 0;
      $exit = 1; print "--> ERROR: disambig.txt and nonsilence.txt have intersection -- ";
      foreach (@itsect1) {
        print $_, " ";
      }
      print "
  ";
    } else {
      print "--> disambig.txt and nonsilence.txt are disjoint
  ";
    }
  
    $success == 0 || print "--> disjoint property is OK
  ";
    return;
  }
  
  sub check_summation {
    print "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...
  ";
    if (scalar(keys %silence) == 0) {
      $exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or does not exist
  ";
    }
    if (scalar(keys %nonsilence) == 0) {
      $exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist
  ";
    }
    if (!$skip_disambig_check && scalar(keys %disambig) == 0) {
      $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist
  ";
    }
  
    %sum = (%silence, %nonsilence, %disambig);
    $sum{"<eps>"} = 1;
  
    my $ok = 1;
    foreach $p (keys %psymtab) {
      if (! defined $sum{$p} && $p !~ m/^#nonterm/) {
        $exit = 1;  $ok = 0;  print("--> ERROR: phone $p is not in silence.txt, nonsilence.txt or disambig.txt...
  ");
      }
    }
  
    if ($ok) {
      print "--> found no unexplainable phones in phones.txt
  ";
    }
    return;
  }
  
  %silence = ();
  @silence = ();
  %nonsilence = ();
  @nonsilence = ();
  %disambig = ();
  check_disjoint; print "
  ";
  check_summation; print "
  ";
  
  @list1 = ("context_indep", "nonsilence", "silence", "optional_silence");
  @list2 = ("roots", "sets");
  if (!$skip_disambig_check) {
      push(@list1, "disambig");
  }
  foreach (@list1) {
    check_txt_int_csl("$lang/phones/$_", \%psymtab); print "
  ";
  }
  foreach (@list2) {
    check_txt_int("$lang/phones/$_", \%psymtab, 1); print "
  ";
  }
  if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) {
    check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "
  ";
  } else {
    print "Checking $lang/phones/extra_questions.\{txt, int\} ...
  ";
    if (!((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int"))) {
      print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)
  
  ";
      $exit = 1;
    }
  }
  if (-e "$lang/phones/$word_boundary.txt") {
    check_txt_int("$lang/phones/$word_boundary", \%psymtab, 0); print "
  ";
  }
  
  # Checking optional_silence.txt -------------------------------
  print "Checking optional_silence.txt ...
  ";
  $idx = 1;
  $success = 1;
  if (-z "$lang/phones/optional_silence.txt") {
    $exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or does not exist
  ";
  }
  if (!open(OS, "<$lang/phones/optional_silence.txt")) {
    $exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt
  ";
  }
  print "--> reading $lang/phones/optional_silence.txt
  ";
  while (<OS>) {
    chomp;
    my @col = split(" ", $_);
    if ($idx > 1 or @col > 1) {
      $exit = 1; print "--> ERROR: only 1 phone expected in $lang/phones/optional_silence.txt
  "; $success = 0;
    } elsif (!$silence{$col[0]}) {
      $exit = 1; print "--> ERROR: phone $col[0] not found in $lang/phones/silence_phones.txt
  "; $success = 0;
    }
    $idx ++;
  }
  close(OS);
  $success == 0 || print "--> $lang/phones/optional_silence.txt is OK
  ";
  print "
  ";
  
  if (!$skip_disambig_check) {
    # Check disambiguation symbols -------------------------------
    print "Checking disambiguation symbols: #0 and #1
  ";
    if (scalar(keys %disambig) == 0) {
      $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist
  ";
    }
    if (exists $disambig{"#0"} and exists $disambig{"#1"}) {
      print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"
  ";
      print "--> $lang/phones/disambig.txt is OK
  
  ";
    } else {
      print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";
  ";
      print "-->          this would not be OK with a conventional ARPA-type language
  ";
      print "-->          model or a conventional lexicon (L.fst)
  ";
      $warning = 1;
    }
  }
  
  
  # Check topo -------------------------------
  print "Checking topo ...
  ";
  if (-z "$lang/topo") {
    $exit = 1; print "--> ERROR: $lang/topo is empty or does not exist
  ";
  }
  if (!open(T, "<$lang/topo")) {
    $exit = 1; print "--> ERROR: fail to open $lang/topo
  ";
  } else {
    $topo_ok = 1;
    $idx = 1;
    %phones_in_topo_int_hash = ( );
    %phones_in_topo_hash = ( );
    while (<T>) {
      chomp;
      next if (m/^<.*>[ ]*$/);
      foreach $i (split(" ", $_)) {
        if (defined $phones_in_topo_int_hash{$i}) {
          $topo_ok = 0;
          $exit = 1; print "--> ERROR: $lang/topo has phone $i twice
  ";
        }
        if (!defined $pint2sym{$i}) {
          $topo_ok = 0;
          $exit = 1; print "--> ERROR: $lang/topo has phone $i which is not in phones.txt
  ";
        }
        $phones_in_topo_int_hash{$i} = 1;
        $phones_in_topo_hash{$pint2sym{$i}} = 1;
      }
    }
    close(T);
    $phones_that_should_be_in_topo_hash = {};
    foreach $p (@silence, @nonsilence) { $phones_that_should_be_in_topo_hash{$p} = 1; }
    foreach $p (keys %phones_that_should_be_in_topo_hash) {
      if ( ! defined $phones_in_topo_hash{$p}) {
        $topo_ok = 0;
        $i = $pint2sym{$p};
        $exit = 1; print "--> ERROR: $lang/topo does not cover phone $p (label = $i)
  ";
      }
    }
    foreach $i (keys %phones_in_topo_int_hash) {
      $p = $pint2sym{$i};
      if ( ! defined $phones_that_should_be_in_topo_hash{$p}) {
        $topo_ok = 0;
        $exit = 1; print "--> ERROR: $lang/topo covers phone $p (label = $i) which is not a real phone
  ";
      }
    }
    if ($topo_ok) {
      "--> $lang/topo is OK
  ";
    }
    print "
  ";
  }
  
  # Check word_boundary -------------------------------
  $nonword   = "";
  $begin     = "";
  $end       = "";
  $internal  = "";
  $singleton = "";
  if (-s "$lang/phones/$word_boundary.txt") {
    print "Checking $word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...
  ";
    if (!open (W, "<$lang/phones/$word_boundary.txt")) {
      $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.txt
  ";
    }
    $idx = 1;
    %wb = ();
    while (<W>) {
      chomp;
      my @col;
      if (m/^.*nonword$/  ) {
        s/ nonword//g;    @col = split(" ", $_); if (@col == 1) {$nonword   .= "$col[0] ";}
      }
      if (m/^.*begin$/    ) {
        s/ begin$//g;     @col = split(" ", $_); if (@col == 1) {$begin     .= "$col[0] ";}
      }
      if (m/^.*end$/      ) {
        s/ end$//g;       @col = split(" ", $_); if (@col == 1) {$end       .= "$col[0] ";}
      }
      if (m/^.*internal$/ ) {
        s/ internal$//g;  @col = split(" ", $_); if (@col == 1) {$internal  .= "$col[0] ";}
      }
      if (m/^.*singleton$/) {
        s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";}
      }
      if (@col != 1) {
        $exit = 1; print "--> ERROR: expect 1 column in $lang/phones/$word_boundary.txt (line $idx)
  ";
      }
      $wb{shift @col} = 1;
      $idx ++;
    }
    close(W);
  
    @itset = intersect(\%disambig, \%wb);
    $success1 = 1;
    if (@itset != 0) {
      $success1 = 0;
      $exit = 1; print "--> ERROR: $lang/phones/$word_boundary.txt has disambiguation symbols -- ";
      foreach (@itset) {
        print "$_ ";
      }
      print "
  ";
    }
    $success1 == 0 || print "--> $lang/phones/$word_boundary.txt doesn't include disambiguation symbols
  ";
  
    %sum = (%silence, %nonsilence);
    @itset = intersect(\%sum, \%wb);
    %itset = (); foreach(@itset) {$itset{$_} = 1;}
    $success2 = 1;
    if (@itset < scalar(keys %sum)) {
      $success2 = 0;
      $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in $word_boundary.txt -- ";
      foreach (keys %sum) {
        if (!$itset{$_}) {
          print "$_ ";
        }
      }
      print "
  ";
    }
    if (@itset < scalar(keys %wb)) {
      $success2 = 0;
      $exit = 1; print "--> ERROR: phones in $word_boundary.txt but not in nonsilence.txt or silence.txt -- ";
      foreach (keys %wb) {
        if (!$itset{$_}) {
          print "$_ ";
        }
      }
      print "
  ";
    }
    $success2 == 0 || print "--> $lang/phones/$word_boundary.txt is the union of nonsilence.txt and silence.txt
  ";
    $success1 != 1 or $success2 != 1 || print "--> $lang/phones/$word_boundary.txt is OK
  ";
    print "
  ";
  }
  
  
  
  {
    print "Checking word-level disambiguation symbols...
  ";
    # This block checks that one of the two following conditions hold:
    # (1) for lang diretories prepared by older versions of prepare_lang.sh:
    #  The symbol  '#0' should appear in words.txt and phones.txt, and should
    # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int
    #  exist, and have the expected properties (see below for details).
  
    # note, %wdisambig_words_hash hashes from the integer word-id of word-level
    # disambiguation symbols, to 1 if the word is a disambig symbol.
  
    if (! -e "$lang/phones/wdisambig.txt") {
      print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)
  ";
      if (exists $wsymtab{"#0"}) {
        print "--> $lang/words.txt has \"#0\"
  ";
        $wdisambig_words_hash{$wsymtab{"#0"}} = 1;
      } else {
        print "--> WARNING: $lang/words.txt doesn't have \"#0\"
  ";
        print "-->          (if you are using ARPA-type language models, you will normally
  ";
        print "-->           need the disambiguation symbol \"#0\" to ensure determinizability)
  ";
      }
    } else {
      print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)
  ";
      if (!open(T, "<$lang/phones/wdisambig.txt")) {
        print "--> ERROR: fail to open $lang/phones/wdisambig.txt
  "; $exit = 1; return;
      }
      chomp(my @wdisambig = <T>);
      close(T);
      if (!open(W, "<$lang/phones/wdisambig_words.int")) {
        print "--> ERROR: fail to open $lang/phones/wdisambig_words.int
  "; $exit = 1; return;
      }
      chomp(my @wdisambig_words = <W>);
      close(W);
      if (!open(P, "<$lang/phones/wdisambig_phones.int")) {
        print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int
  "; $exit = 1; return;
      }
      chomp(my @wdisambig_phones = <P>);
      close(P);
      my $len = @wdisambig, $len2;
      if (($len2 = @wdisambig_words) != $len) {
        print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths
  ";
        $exit = 1; return;
      }
      if (($len2 = @wdisambig_phones) != $len) {
        print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths
  ";
        $exit = 1; return;
      }
      for (my $i = 0; $i < $len; $i++) {
        if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) {
          my $ii = $i + 1;
          print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch
  ";
          $exit = 1; return;
        }
      }
      for (my $i = 0; $i < $len; $i++) {
        if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) {
          my $ii = $i + 1;
          print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch
  ";
          $exit = 1; return;
        }
      }
      foreach my $i ( @wdisambig_words ) {
        $wdisambig_words_hash{$i} = 1;
      }
    }
  }
  
  # Check validity of L.fst, L_disambig.fst, and word_boundary.int.
  # First we generate a random word/subword sequence. We then compile it into fst and compose it with L.fst/L_disambig.fst.
  # For subword case the last subword of the sequence must be a end-subword 
  # (i.e. the subword can only be at the end of word or is a single word itself) 
  # to guarantee the composition would not fail.
  # We then get the corresponging phones sequence and apply a transition matrix on it to get the number of valid boundaries.
  # In word case, the number of valid boundaries should be equal to the number of words.
  # In subword case, the number of valid boundaries should be equal to the number of end-subwords.
  if (-s "$lang/phones/$word_boundary.int") {
    print "Checking $word_boundary.int and disambig.int
  ";
    if (!open (W, "<$lang/phones/$word_boundary.int")) {
      $exit = 1; print "--> ERROR: fail to open $lang/phones/$word_boundary.int
  ";
    }
    while (<W>) {
      @A = split;
      if (@A != 2) {
        $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/$word_boundary.int
  ";
      }
      $wbtype{$A[0]} = $A[1];
    }
    close(W);
    if (!open (D, "<$lang/phones/disambig.int")) {
      $exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int
  ";
    }
    while (<D>) {
      @A = split;
      if (@A != 1) {
        $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int
  ";
      }
      $is_disambig{$A[0]} = 1;
    }
  
    $text = `. ./path.sh`;
    if ($text ne "") {
      print "*** This script cannot continue because your path.sh or bash profile prints something: $text" .
        "*** Please fix that and try again.
  ";
      exit(1);
    }
  
    foreach $fst ("L.fst", "L_disambig.fst") {
      if ($skip_generate_words_check) {
        next;
      }
      $wlen = int(rand(100)) + 1;
      $end_subword = 0;
      print "--> generating a $wlen word/subword sequence
  ";
      $wordseq = "";
      $sid = 0;
      $wordseq_syms = "";
      # exclude disambiguation symbols, BOS and EOS, epsilon, and
      # grammar-related symbols from the word sequence.
      while ($sid < ($wlen - 1)) {
        $id = int(rand(scalar(keys %wint2sym)));
        while (defined $wdisambig_words_hash{$id} or
             $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
             $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
          $id = int(rand(scalar(keys %wint2sym)));
        }
        $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
        $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0
  ";
        $sid ++;
  
        if ($subword_check) {
          $subword = $wint2sym{$id};
          $suffix = substr($subword, -$separator_length, $separator_length);
          if ($suffix ne $separator) {
            $end_subword ++;
          }
        }
      } 
  
      # generate the last word (subword)
      $id = int(rand(scalar(keys %wint2sym)));
      if ($subword_check) {
        $subword = $wint2sym{$id};
        $suffix = substr($subword, -$separator_length, $separator_length);
        # the last subword can not followed by separator  
        while (defined $wdisambig_words_hash{$id} or
             $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
             $wint2sym{$id} =~ m/^#nonterm/ or $id == 0 or $suffix eq $separator) {
          $id = int(rand(scalar(keys %wint2sym)));
          $subword = $wint2sym{$id};
          $suffix = substr($subword, -$separator_length, $separator_length);
        }
        $end_subword ++;
      } else {
        while (defined $wdisambig_words_hash{$id} or
             $wint2sym{$id} eq "<s>" or $wint2sym{$id} eq "</s>" or
             $wint2sym{$id} =~ m/^#nonterm/ or $id == 0) {
         $id = int(rand(scalar(keys %wint2sym)));
        }
      }
      $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " ";
      $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0
  ";
      $sid ++;
  
      $wordseq = $wordseq . "$sid 0";
      $phoneseq = `. ./path.sh; echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`;
      $transition = { }; # empty assoc. array of allowed transitions between phone types.  1 means we count a word,
      # 0 means transition is allowed.  bos and eos are added as extra symbols here.
      foreach $x ("bos", "nonword", "end", "singleton") {
        $transition{$x, "nonword"} = 0;
        $transition{$x, "begin"} = 1;
        $transition{$x, "singleton"} = 1;
        $transition{$x, "eos"} = 0;
      }
      $transition{"begin", "end"} = 0;
      $transition{"begin", "internal"} = 0;
      $transition{"internal", "internal"} = 0;
      $transition{"internal", "end"} = 0;
  
      $cur_state = "bos";
      $num_words = 0;
      foreach $phone (split (" ", "$phoneseq <<eos>>")) {
        # Note: now that we support unk-LMs (see the --unk-fst option to
        # prepare_lang.sh), the regular L.fst may contain some disambiguation
        # symbols.
        if (! defined $is_disambig{$phone}) {
          if ($phone eq "<<eos>>") {
            $state = "eos";
          } elsif ($phone == 0) {
            $exit = 1; print "--> ERROR: unexpected phone sequence=$phoneseq, wordseq=$wordseq
  "; last;
          } else {
            $state = $wbtype{$phone};
          }
          if (!defined $state) {
            $exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/$word_boundary.int
  ";
            last;
          } elsif (!defined $transition{$cur_state, $state}) {
            $exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in $word_boundary.int or L.fst
  ";
            last;
          } else {
            $num_words += $transition{$cur_state, $state};
            $cur_state = $state;
          }
        }
      }
      if (!$exit) {
        if ($subword_check) { 
          $wlen = $end_subword;
        }
        if ($num_words != $wlen) {
          $phoneseq_syms = "";
          foreach my $id (split(" ", $phoneseq)) { $phoneseq_syms = $phoneseq_syms . " " . $pint2sym{$id}; }
          $exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or $word_boundary.int.  phoneseq = $phoneseq_syms, wordseq = $wordseq_syms
  ";
        } else {
          print "--> resulting phone sequence from $fst corresponds to the word sequence
  ";
          print "--> $fst is OK
  ";
        }
      }
    }
    print "
  ";
  }
  
  # Check oov -------------------------------
  check_txt_int("$lang/oov", \%wsymtab, 0); print "
  ";
  
  # Check if L.fst is olabel sorted.
  if (-e "$lang/L.fst") {
    $cmd = "fstinfo $lang/L.fst | grep -E 'output label sorted.*y' > /dev/null";
    $res = system(". ./path.sh; $cmd");
    if ($res == 0) {
      print "--> $lang/L.fst is olabel sorted
  ";
    } else {
      print "--> ERROR: $lang/L.fst is not olabel sorted
  ";
      $exit = 1;
    }
  }
  
  # Check if L_disambig.fst is olabel sorted.
  if (-e "$lang/L_disambig.fst") {
    $cmd = "fstinfo $lang/L_disambig.fst | grep -E 'output label sorted.*y' > /dev/null";
    $res = system(". ./path.sh; $cmd");
    if ($res == 0) {
      print "--> $lang/L_disambig.fst is olabel sorted
  ";
    } else {
      print "--> ERROR: $lang/L_disambig.fst is not olabel sorted
  ";
      $exit = 1;
    }
  }
  
  if (-e "$lang/G.fst") {
    # Check that G.fst is ilabel sorted and nonempty.
    $text = `. ./path.sh; fstinfo $lang/G.fst`;
    if ($? != 0) {
      print "--> ERROR: fstinfo failed on $lang/G.fst
  ";
      $exit = 1;
    }
    if ($text =~ m/input label sorted\s+y/) {
      print "--> $lang/G.fst is ilabel sorted
  ";
    } else {
      print "--> ERROR: $lang/G.fst is not ilabel sorted
  ";
      $exit = 1;
    }
    if ($text =~ m/# of states\s+(\d+)/) {
      $num_states = $1;
      if ($num_states == 0) {
        print "--> ERROR: $lang/G.fst is empty
  ";
        $exit = 1;
      } else {
        print "--> $lang/G.fst has $num_states states
  ";
      }
    }
  
    # Check that G.fst is determinizable.
    if (!$skip_det_check) {
      # Check determinizability of G.fst
      # fstdeterminizestar is much faster, and a more relevant test as it's what
      # we do in the actual graph creation recipe.
      if (-e "$lang/G.fst") {
        $cmd = "fstdeterminizestar $lang/G.fst /dev/null";
        $res = system(". ./path.sh; $cmd");
        if ($res == 0) {
          print "--> $lang/G.fst is determinizable
  ";
        } else {
          print "--> ERROR: fail to determinize $lang/G.fst
  ";
          $exit = 1;
        }
      }
    }
  
    # Check that G.fst does not have cycles with only disambiguation symbols or
    # epsilons on the input, or the forbidden symbols <s> and </s> (and a few
    # related checks
  
    if (-e "$lang/G.fst") {
      system("utils/lang/check_g_properties.pl $lang");
      if ($? != 0) {
        print "--> ERROR: failure running check_g_properties.pl
  ";
        $exit = 1;
      } else {
        print("--> utils/lang/check_g_properties.pl succeeded.
  ");
      }
    }
  }
  
  
  if (!$skip_det_check) {
    if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") {
      print "--> Testing determinizability of L_disambig . G
  ";
      $output = `. ./path.sh; fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar | fstinfo 2>&1 `;
      if ($output =~ m/# of states\s*[1-9]/) {
        print "--> L_disambig . G is determinizable
  ";
      } else {
        print "--> ERROR: fail to determinize L_disambig . G.  Output is:
  ";
        print "$output
  ";
        $exit = 1;
      }
    }
  }
  
  if ($exit == 1) {
    print "--> ERROR (see error messages above)
  "; exit 1;
  } else {
    if ($warning == 1) {
      print "--> WARNING (check output above for warnings)
  "; exit 0;
    } else {
      print "--> SUCCESS [validating lang directory $lang]
  "; exit 0;
    }
  }