Blame view

egs/wsj/s5/utils/lang/validate_disambig_sym_file.pl 2.38 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
  #!/usr/bin/env perl
  
  # Copyright 2016 FAU Erlangen (Author: Axel Horndasch)
  # Apache 2.0.
  #
  # Concept: Dan Povey
  
  use strict;
  use warnings;
  use Getopt::Long;
  
  # Help text for this script.  It is passed to die() further down whenever
  # the command line cannot be parsed or does not contain exactly one
  # positional argument.  It documents the validity rules enforced by the
  # main loop and the single supported option, --allow-numeric.
  my $Usage = <<EOU;
  Usage:  validate_disambig_sym_file.pl [options] disambig_syms.txt
  
  This scripts checks if the entries of a file containing disambiguation symbols
  (word or phone level) are all valid. To be valid the symbols
  - must start with the hash mark '#',
  - must not contain any whitespace,
  - must not be equal to '#-1' (disallowed because it is used internally in some
    FST stuff).
  
  In case the option '--allow-numeric' is used with 'false', the symbols must
  also be non-numeric (to avoid overlap with the automatically generated symbols).
  
  Allowed options:
    --allow-numeric (true|false) : Default true. If false, disallow numeric
                                   disambiguation symbols like #0, #1 and so on.
  
  EOU
  
  # Command line options.  --allow-numeric is a string option that takes the
  # literal value "true" or "false"; it is not a boolean flag.
  my $allow_numeric = "true";

  # Get the optional command line options; an unknown or malformed option
  # aborts the script with the usage text.
  GetOptions(
      "allow-numeric=s" => \$allow_numeric,
      ) or die ($Usage);

  # Only the two documented values are accepted.  Without this check any
  # value other than the exact string "false" (for instance "False" or "0")
  # would silently behave like "true" and skip the numeric-symbol test.
  if ($allow_numeric ne "true" && $allow_numeric ne "false") {
    die("$0: Invalid value \"$allow_numeric\" for option --allow-numeric, " .
        "expected \"true\" or \"false\"\n\n" . $Usage);
  }

  # Exactly one positional argument is expected: the file to validate.
  if (@ARGV != 1) {
    die($Usage);
  }

  my $disambig_sym_file = shift @ARGV;
  
  # Announce which file is being checked.  The line ending is written as an
  # escape sequence so the output does not depend on how this source file is
  # laid out.
  print "$0: Checking validity of file \"$disambig_sym_file\" ...\n";

  # -z on its own is false for a file that does not exist, so existence is
  # tested explicitly; the special "_" handle reuses the stat result of the
  # preceding -e test instead of querying the file system a second time.
  if ((! -e $disambig_sym_file) || (-z _)) {
    print "$0: The file \"$disambig_sym_file\" is empty or does not exist, exiting ...\n";
    exit 1;
  }
  
  # Open the symbol file read-only.  The three-argument form with a lexical
  # handle is used so that characters in the file name (such as a leading '>'
  # or a trailing '|') can never be interpreted as an open mode or a command.
  my $syms_fh;
  if (not open($syms_fh, '<', $disambig_sym_file)) {
    print "$0: Could not open file \"$disambig_sym_file\", exiting ...\n";
    exit 1;
  }

  # Go through the file containing disambiguation symbols line by line.  The
  # first invalid entry is reported on standard output and terminates the
  # script with exit status 1.
  while (my $symbol = <$syms_fh>) {
    chomp $symbol;

    if ($symbol =~ /^#(.*)$/) {
      # Everything after the leading '#'.
      my $sympart = $1;
      if ($sympart eq "") {
        print "$0: Only \"$symbol\" is not allowed as a disambiguation symbol, exiting ...\n";
        exit 1;
      }
      if ($sympart =~ /\s/) {
        print "$0: The disambiguation symbol \"$symbol\" contains whitespace, exiting ...\n";
        exit 1;
      }
      # '#-1' is rejected explicitly (see the usage text).
      if ($sympart eq "-1") {
        print "$0: The disambiguation symbol \"$symbol\" is not allowed, exiting ...\n";
        exit 1;
      }
      # Purely numeric symbols are only rejected when the caller asked for it.
      if ($allow_numeric eq "false" && $sympart =~ /^[0-9]+$/) {
        print "$0: Since \"$symbol\" is supposed to be an extra disambiguation symbol, it must not be numeric, exiting ...\n";
        exit 1;
      }
    } else {
      print "$0: The disambiguation symbol \"$symbol\" does not start with a '#', exiting ...\n";
      exit 1;
    }
  }

  # All lines were read; release the handle.
  close($syms_fh);
  
  # Reached only when no check above called exit: report success and return
  # status 0.  The line ending is written as an escape sequence so that no
  # source indentation leaks into the output.
  print "--> SUCCESS [validating disambiguation symbol file \"$disambig_sym_file\"]\n";
  exit 0;