Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/validate_dict_dir.pl 18 KB
  #!/usr/bin/env perl
  
  # Apache 2.0.
  # Copyright  2012 Guoguo Chen
  #            2015 Daniel Povey
  #            2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
  #
  # Validation script for 'dict' directories (e.g. data/local/dict)
  
  # Reads every remaining line from an already-opened filehandle and reports
  # whether the whole input is valid UTF-8.
  #
  # Argument:  a filehandle (glob reference or lexical handle).
  # Returns:   a list whose first element is a flag and whose remaining
  #            elements are the lines that were read, terminators included:
  #              (1, @decoded_lines)  if every line decoded as UTF-8;
  #              (0, @raw_lines)      otherwise (the undecoded bytes).
  #
  # We do not really care about the actual encoding: in the non-UTF-8 case the
  # file is assumed to be in one of the 1-byte encodings (ISO-8859-x, Windows
  # CP-x).  Decoding only matters so that the length of the (decoded) strings
  # is correct and Unicode-aware checks can be applied.
  sub get_utf8_or_bytestream {
    use Encode qw(decode encode);
    my $file = shift;
    my $is_utf_compatible = 1;
    my @unicode_lines;
    my @raw_lines;

    # A lexical loop variable keeps the caller's $_ intact, and the explicit
    # defined() test means a final line consisting of just "0" (no newline) is
    # not mistaken for the end of the input.
    while (defined(my $raw_text = <$file>)) {
      if ($is_utf_compatible) {
        # LEAVE_SRC is needed here: with a CHECK value such as FB_CROAK,
        # decode() otherwise overwrites its source string in place, which
        # would leave damaged entries in @raw_lines.
        my $decoded_text = eval {
          decode("UTF-8", $raw_text,
                 Encode::FB_CROAK() | Encode::LEAVE_SRC())
        };
        if (defined $decoded_text) {
          push @unicode_lines, $decoded_text;
        } else {
          # One undecodable line settles the verdict; from here on we only
          # keep collecting the raw bytes.
          $is_utf_compatible = 0;
        }
      }
      push @raw_lines, $raw_text;
    }

    return $is_utf_compatible ? (1, @unicode_lines) : (0, @raw_lines);
  }
  
  # Checks decoded (Unicode) lines for whitespace other than TAB, LF and SPACE.
  # A CR anywhere in a line, or a line that does not end in LF, is reported on
  # STDERR; any other whitespace character is rejected silently (the caller
  # prints the error).
  #
  # Argument:  reference to an array of decoded lines, terminators included.
  #            The array is not modified.
  # Returns:   0 if every line is clean, 1 at the first offending line.
  sub validate_utf8_whitespaces {
    my $unicode_lines = shift;
    use feature 'unicode_strings';
    my $line_number = 0;  # 1-based, for the diagnostics below
    foreach my $line (@{$unicode_lines}) {
      $line_number++;
      if (substr($line, -1) ne "\n") {
        print STDERR "$0: The current line (nr. $line_number) has invalid newline\n";
        return 1;
      }
      if ($line =~ /\x{000d}/) {
        print STDERR "$0: The current line (nr. $line_number) contains CR (0x0D) character\n";
        return 1;
      }
      # Mask the allowed whitespace (TAB, LF, SPACE) in a copy of the line;
      # whatever still matches \s afterwards is a disallowed whitespace
      # character.
      (my $masked = $line) =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
      return 1 if $masked =~ /\s/;
    }
    return 0;
  }
  
  # Checks whether the text readable from the given filehandle is UTF-8
  # compatible and, if so, whether it contains only allowed whitespace.  Text
  # that is not UTF-8 compatible is not checked at all.  The handle is moved
  # back to the position it had on entry before the function returns.
  #
  # Argument:  an open, seekable filehandle.
  # Returns:   0 if disallowed whitespace was found, 1 otherwise (including
  #            the case where the text is not UTF-8 and no check was done).
  sub check_allowed_whitespace {
    # Import SEEK_SET properly; as an unimported bareword it only worked
    # because the string "SEEK_SET" happens to numify to 0.
    use Fcntl qw(SEEK_SET);
    my $file = shift;
    my $pos = tell($file);
    my ($is_utf, @lines) = get_utf8_or_bytestream($file);
    seek($file, $pos, SEEK_SET);

    if (!$is_utf) {
      print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
      return 1;
    }

    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
    if (validate_utf8_whitespaces(\@lines)) {
      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
      return 0;
    }
    print "--> text contains only allowed whitespaces\n";
    return 1;
  }
  
  
  # Exactly one command-line argument is expected: the dict directory.
  if (@ARGV != 1) {
    die "Usage: validate_dict_dir.pl <dict-dir>\n" .
        "e.g.: validate_dict_dir.pl data/local/dict\n";
  }

  # Directory being validated; one trailing slash is dropped so that the
  # "$dict/<file>" paths built below do not contain "//".
  $dict = shift @ARGV;
  $dict =~ s:/$::;

  $exit = 0;     # set to 1 by set_to_fail(); decides the final exit status.
  $success = 1;  # this is re-set each time we read a file.
  
  # Records a validation failure: marks the whole run as failed ($exit) and
  # the file currently being checked as not OK ($success).
  sub set_to_fail {
    $exit = 1;
    $success = 0;
  }
  
  # Checking silence_phones.txt -------------------------------
  # Every phone found in the file is recorded as a key of %silence, which the
  # later checks consult.  An empty or unreadable file aborts the script.
  print "Checking $dict/silence_phones.txt ...\n";
  if (-z "$dict/silence_phones.txt") {
    print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n";
    exit 1;
  }
  if (!open(S, "<", "$dict/silence_phones.txt")) {
    print "--> ERROR: fail to open $dict/silence_phones.txt\n";
    exit 1;
  }
  $idx = 1;       # current line number, used in the messages
  %silence = ();
  $crlf = 1;      # 1 until the carriage-return error has been reported once

  print "--> reading $dict/silence_phones.txt\n";
  check_allowed_whitespace(\*S) || set_to_fail();
  while (<S>) {
    # Strip the line terminator; a line without one can only be the last line.
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
      set_to_fail();
    }
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    my @col = split(" ", $_);
    if (@col == 0) {
      set_to_fail();
      print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
    }
    foreach my $p (@col) {
      if ($silence{$p}) {
        set_to_fail();
        print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
      } else {
        $silence{$p} = 1;
      }
      # phones that start with the pound sign/hash may be mistaken for
      # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
      # problems with word-position-dependent systems, and <eps> is obviously
      # confusable with epsilon.
      if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>") {
        set_to_fail();
        print "--> ERROR: phone \"$p\" has disallowed written form\n";
      }
    }
    $idx++;
  }
  close(S);
  print "--> $dict/silence_phones.txt is OK\n" if $success != 0;
  print "\n";
  
  # Checking optional_silence.txt -------------------------------
  # The file must hold exactly one phone, and that phone must already be a key
  # of %silence (i.e. be listed in silence_phones.txt).  An empty or unreadable
  # file aborts the script.
  print "Checking $dict/optional_silence.txt ...\n";
  if (-z "$dict/optional_silence.txt") {
    print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n";
    exit 1;
  }
  if (!open(OS, "<", "$dict/optional_silence.txt")) {
    print "--> ERROR: fail to open $dict/optional_silence.txt\n";
    exit 1;
  }
  $idx = 1;       # current line number
  $success = 1;
  $crlf = 1;      # 1 until the carriage-return error has been reported once
  print "--> reading $dict/optional_silence.txt\n";
  check_allowed_whitespace(\*OS) or exit 1;
  while (<OS>) {
    chomp;
    my @col = split(" ", $_);
    if ($idx > 1 or @col > 1) {
      # More than one line, or more than one token on the line.
      set_to_fail();
      print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
    } elsif (!$silence{$col[0]}) {
      set_to_fail();
      print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
    }
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    $idx++;
  }
  close(OS);
  print "--> $dict/optional_silence.txt is OK\n" if $success != 0;
  print "\n";
  
  # Checking nonsilence_phones.txt -------------------------------
  # Every phone found in the file is recorded as a key of %nonsilence, which
  # the later checks consult.  An empty or unreadable file aborts the script.
  print "Checking $dict/nonsilence_phones.txt ...\n";
  if (-z "$dict/nonsilence_phones.txt") {
    print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n";
    exit 1;
  }
  if (!open(NS, "<", "$dict/nonsilence_phones.txt")) {
    print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n";
    exit 1;
  }
  $idx = 1;       # current line number, used in the messages
  %nonsilence = ();
  $success = 1;
  $crlf = 1;      # 1 until the carriage-return error has been reported once
  print "--> reading $dict/nonsilence_phones.txt\n";
  check_allowed_whitespace(\*NS) or set_to_fail();
  while (<NS>) {
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    # Strip the line terminator; a line without one can only be the last line.
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $_);
    if (@col == 0) {
      set_to_fail();
      print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
    }
    foreach my $p (@col) {
      if ($nonsilence{$p}) {
        set_to_fail();
        print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
      } else {
        $nonsilence{$p} = 1;
      }
      # phones that start with the pound sign/hash may be mistaken for
      # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
      # problems with word-position-dependent systems, and <eps> is obviously
      # confusable with epsilon.
      if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>") {
        set_to_fail();
        print "--> ERROR: phone \"$p\" has disallowed written form\n";
      }
    }
    $idx++;
  }
  close(NS);
  print "--> $dict/nonsilence_phones.txt is OK\n" if $success != 0;
  print "\n";
  
  # Checking disjoint -------------------------------
  # Returns the keys that exist in both hashes.
  #
  # Arguments: two hash references.
  # Returns:   the list of keys of the first hash that also exist in the
  #            second one (values are irrelevant).  The order follows the key
  #            iteration order of the first hash, so it is unspecified.
  sub intersect {
    # Lexicals only: no package-global scratch variables, and no $a/$b, which
    # are reserved for sort().
    my ($first, $second) = @_;
    return grep { exists $second->{$_} } keys %$first;
  }
  
  # A phone may not be listed both as a silence and as a nonsilence phone.
  print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
  @itset = intersect(\%silence, \%nonsilence);
  if (@itset == 0) {
    print "--> disjoint property is OK.\n";
  } else {
    set_to_fail();
    print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
    print "$_ " foreach @itset;
    print "\n";
  }
  print "\n";
  
  
  # Validates one lexicon file.  Each line is expected to look like
  #   <word> <prob>{num_prob_cols} <ignored>{num_skipped_cols} <phone> ...
  #
  # Arguments:
  #   $lex               path of the lexicon file
  #   $num_prob_cols     number of columns after the word that must hold a
  #                      value in (0, 1]
  #   $num_skipped_cols  number of further columns that are skipped unchecked
  #
  # Problems are printed and recorded through set_to_fail(); nothing useful is
  # returned.  Reads %silence and %nonsilence, and resets the globals $idx,
  # $success and $crlf.
  sub check_lexicon {
    my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
    print "Checking $lex\n";
    # Reset the per-file state before opening, so that an open failure is not
    # overwritten by the reset.
    $idx = 1; $success = 1; $crlf = 1;
    my $lex_fh;
    if (!open($lex_fh, "<", $lex)) {
      # Written as separate statements on purpose: "print MSG && f()" would
      # print the value of f() instead of MSG.
      print "--> ERROR: fail to open $lex\n";
      set_to_fail();
      print "\n";
      return;
    }
    my %seen_line = ();
    print "--> reading $lex\n";
    check_allowed_whitespace($lex_fh) or set_to_fail();
    while (<$lex_fh>) {
      if ($crlf == 1 && m/\r/) {
        print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
        set_to_fail();
        $crlf = 0;
      }
      # Duplicate detection uses the raw line, terminator included.
      if (defined $seen_line{$_}) {
        print "--> ERROR: line '$_' of $lex is repeated\n";
        set_to_fail();
      }
      $seen_line{$_} = 1;
      if (! s/\n$//) {
        print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
        set_to_fail();
      }
      my @col = split(" ", $_);
      my $word = shift @col;
      if (!defined $word) {
        print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
      }
      if (defined $word &&
          ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0")) {
        print "--> ERROR: lexicon.txt contains forbidden word $word\n";
        set_to_fail();
      }
      for (my $n = 0; $n < $num_prob_cols; $n++) {
        my $prob = shift @col;
        # A missing column counts as a bad probability too.
        if (!(defined $prob && $prob > 0.0 && $prob <= 1.0)) {
          print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
          set_to_fail();
        }
      }
      for (my $n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
      # Whatever is left is the pronunciation.
      if (@col == 0) {
        print "--> ERROR: lexicon.txt contains word $word with empty ";
        print "pronunciation.\n";
        set_to_fail();
      }
      foreach my $phone (@col) {
        if (!$silence{$phone} and !$nonsilence{$phone}) {
          print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt ";
          print "(line $idx)\n";
          set_to_fail();
        }
      }
      $idx++;
    }
    close($lex_fh);
    print "--> $lex is OK\n" if $success != 0;
    print "\n";
  }
  
  # Check each lexicon variant that is present.  The numbers are the count of
  # probability columns and of skipped columns that follow the word.
  if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
  if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
  if (-f "$dict/lexiconp_silprob.txt") {
    # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
    # exist.
    check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
    if (-f "$dict/silprob.txt") {
      if (!open(SP, "<", "$dict/silprob.txt")) {
        # Separate statements on purpose: "print MSG && f()" would print the
        # value of f() instead of MSG.
        print "--> ERROR: fail to open $dict/silprob.txt\n";
        set_to_fail();
      } else {
        $crlf = 1;
        while (<SP>) {
          if ($crlf == 1 && m/\r/) {
            print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
            set_to_fail();
            $crlf = 0;
          }
          chomp; my @col = split;
          # A line that is not "<key> <value>" is fatal.
          @col == 2 or die "--> ERROR: bad line \"$_\"\n";
          if ($col[0] eq "<s>" || $col[0] eq "overall") {
            # These two entries must be probabilities in (0, 1].
            if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
              set_to_fail();
              print "--> ERROR: bad probability in $dict/silprob.txt \"$_\"\n";
            }
          } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
            # These two entries must be strictly positive.
            if ($col[1] <= 0.0) {
              set_to_fail();
              print "--> ERROR: bad correction term in $dict/silprob.txt \"$_\"\n";
            }
          } else {
            print "--> ERROR: unexpected line in $dict/silprob.txt \"$_\"\n";
            set_to_fail();
          }
        }
        close(SP);
      }
    } else {
      set_to_fail();
      print "--> ERROR: expecting $dict/silprob.txt to exist\n";
    }
  }

  if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
    print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
    set_to_fail();
  }
  
  # Checks that two lexicon files describe the same entries in the same order:
  # line by line, the word and the pronunciation (what is left after dropping
  # the probability and skipped columns) must be identical, and both files must
  # have the same number of lines.
  #
  # Arguments: for each of the two files, its path, its number of probability
  #            columns and its number of skipped columns.
  #
  # Problems are printed and recorded through set_to_fail().  The outcome is
  # tracked in a local flag, so it does not depend on what earlier checks left
  # in the global $success / $exit variables.
  sub check_lexicon_pair {
    my ($lex1, $num_prob_cols1, $num_skipped_cols1,
        $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
    print "Checking lexicon pair $lex1 and $lex2\n";
    # We have checked individual lexicons already.
    my ($fh1, $fh2);
    if (!open($fh1, "<", $lex1)) {
      print "--> ERROR: fail to open $lex1\n";
      set_to_fail();
      return;
    }
    if (!open($fh2, "<", $lex2)) {
      print "--> ERROR: fail to open $lex2\n";
      set_to_fail();
      close($fh1);
      return;
    }

    my $pair_ok = 1;
    my $line_num = 0;
    while (defined(my $line_a = <$fh1>)) {
      $line_num++;
      my $line_b = <$fh2>;
      if (!defined $line_b) {
        print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
        set_to_fail(); $pair_ok = 0; last;
      }
      my @a = split(" ", $line_a);
      my @b = split(" ", $line_b);
      # Check if the word matches (an empty line yields an empty word).
      my $word_a = shift @a;
      my $word_b = shift @b;
      $word_a = "" unless defined $word_a;
      $word_b = "" unless defined $word_b;
      if ($word_a ne $word_b) {
        print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
        set_to_fail(); $pair_ok = 0; last;
      }
      # Drop the probability and skipped columns of each file.
      splice(@a, 0, $num_prob_cols1 + $num_skipped_cols1);
      splice(@b, 0, $num_prob_cols2 + $num_skipped_cols2);
      # Check if the pronunciation matches
      if (join(" ", @a) ne join(" ", @b)) {
        print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
        set_to_fail(); $pair_ok = 0; last;
      }
    }
    # Lines left over in the second file mean the files differ in length.
    # Only look when nothing has been reported yet.
    if ($pair_ok && defined(my $extra = <$fh2>)) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail(); $pair_ok = 0;
    }
    close($fh1);
    close($fh2);
    print "--> lexicon pair $lex1 and $lex2 match\n\n" if $pair_ok;
  }
  
  # When several lexicon variants are present they must agree with each other:
  # the user may have overwritten one of them, in which case the other needs to
  # be regenerated, and we cannot tell which one is stale.  Each entry below is
  # the argument list for check_lexicon_pair(); a pair is only compared when
  # both of its files exist.
  foreach my $pair_args (
      [ "$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0 ],
      [ "$dict/lexiconp.txt", 1, 0, "$dict/lexiconp_silprob.txt", 2, 2 ]) {
    my ($first_lex, $second_lex) = @{$pair_args}[0, 3];
    next unless -f $first_lex && -f $second_lex;
    check_lexicon_pair(@{$pair_args});
  }
  
  # Checking extra_questions.txt -------------------------------
  %distinguished = (); # Keep track of all phone-pairs including nonsilence that
                       # are distinguished (split apart) by extra_questions.txt,
                       # as $distinguished{$p1,$p2} = 1.  This will be used to
                       # make sure that we don't have pairs of phones on the same
                       # line in nonsilence_phones.txt that can never be
                       # distinguished from each other by questions.  (If any two
                       # phones appear on the same line in nonsilence_phones.txt,
                       # they share a tree root, and since the automatic
                       # question-building treats all phones that appear on the
                       # same line of nonsilence_phones.txt as being in the same
                       # group, we can never distinguish them without resorting to
                       # questions in extra_questions.txt.)
  print "Checking $dict/extra_questions.txt ...\n";
  if (-s "$dict/extra_questions.txt") {
    $idx = 1;      # current line number, used in the messages
    $success = 1;
    $crlf = 1;     # 1 until the carriage-return error has been reported once
    if (!open(EX, "<", "$dict/extra_questions.txt")) {
      set_to_fail();
      print "--> ERROR: fail to open $dict/extra_questions.txt\n";
    } else {
      print "--> reading $dict/extra_questions.txt\n";
      check_allowed_whitespace(\*EX) or set_to_fail();
      while (<EX>) {
        if ($crlf == 1 && m/\r/) {
          print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
          set_to_fail();
          $crlf = 0;
        }
        if (! s/\n$//) {
          print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
          set_to_fail();
        }
        my @col = split(" ", $_);
        if (@col == 0) {
          set_to_fail();
          print "--> ERROR: empty line in $dict/extra_questions.txt\n";
        }
        # "block" in the message is the 1-based position of the phone on the line.
        foreach my $block (0 .. $#col) {
          my $phone = $col[$block];
          if (!$silence{$phone} and !$nonsilence{$phone}) {
            set_to_fail();
            print "--> ERROR: phone \"$phone\" is not in {, non}silence_phones.txt (line $idx, block ", $block + 1, ")\n";
          }
        }
        # Update %distinguished hash: for each p1 in this question and p2 not
        # in this question (and in nonsilence phones), mark p1,p2 as being
        # split apart.
        my %col_hash = map { $_ => 1 } @col;
        my @outside = grep { !defined $col_hash{$_} } keys %nonsilence;
        foreach my $p1 (@col) {
          foreach my $p2 (@outside) {
            $distinguished{$p1,$p2} = 1;
            $distinguished{$p2,$p1} = 1;
          }
        }
        # One increment per line, so the reported line numbers are correct.
        $idx++;
      }
      close(EX);
      print "--> $dict/extra_questions.txt is OK\n" if $success != 0;
    }
  } else {
    print "--> $dict/extra_questions.txt is empty (this is OK)\n";
  }
  
  # nonterminals.txt is optional.  When present, every line must consist of
  # exactly one token that starts with "#nonterm:", and no token may occur
  # twice.  The first bad line aborts the script.
  if (-f "$dict/nonterminals.txt") {
    open(NT, "<", "$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
    my %nonterminals = ();
    my $line_number = 1;
    while (<NT>) {
      # chomp, not chop: a last line without a newline must keep its final
      # character.
      chomp;
      my @line = split(" ", $_);
      if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
        print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n";
        exit 1;
      }
      $nonterminals{$line[0]} = 1;
      $line_number++;
    }
    close(NT);
    print "--> $dict/nonterminals.txt is OK\n";
  }
  
  
  # check nonsilence_phones.txt again for phone-pairs that are never
  # distinguishable.  (note: this situation is normal and expected for silence
  # phones, so we don't check it.)  Two different phones on the same line must
  # have an entry in %distinguished; otherwise the run is marked as failed.
  if (!open(NS, "<", "$dict/nonsilence_phones.txt")) {
    print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n";
    exit 1;
  }

  $num_warn_nosplit = 0;         # number of offending pairs seen so far
  $num_warn_nosplit_limit = 10;  # at most this many pairs are printed
  while (<NS>) {
    my @col = split(" ", $_);
    foreach my $p1 (@col) {
      foreach my $p2 (@col) {
        next if $p1 eq $p2 || $distinguished{$p1,$p2};
        set_to_fail();
        if ($num_warn_nosplit < $num_warn_nosplit_limit) {
          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
        } elsif ($num_warn_nosplit == $num_warn_nosplit_limit) {
          # Printed once, in place of the first suppressed message.
          print "... Not warning any more times about this issue.\n";
        }
        if ($num_warn_nosplit == 0) {
          print "    (note: we started checking for this only recently.  You can still build a system but\n";
          print "     phones $p1 and $p2 will be acoustically indistinguishable).\n";
        }
        $num_warn_nosplit++;
      }
    }
  }
  close(NS);
  
  
  # Final verdict: $exit is 1 if any check above called set_to_fail().
  if ($exit == 1) {
    print "--> ERROR validating dictionary directory $dict (see detailed error ";
    print "messages above)\n\n";
    exit 1;
  } else {
    print "--> SUCCESS [validating dictionary directory $dict]\n\n";
  }

  exit 0;