Blame view
egs/wsj/s5/utils/lang/validate_disambig_sym_file.pl
2.38 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
#!/usr/bin/env perl
# Copyright 2016  FAU Erlangen (Author: Axel Horndasch)
# Apache 2.0.
#
# Concept: Dan Povey
#
# Validates a file of disambiguation symbols (one symbol per line).
# Every line must start with '#', must have something after the '#', must
# not contain whitespace and must not be '#-1'.  With --allow-numeric=false,
# purely numeric symbols such as '#0' are rejected as well.
#
# Progress and error messages are printed to standard output.  The script
# exits with status 0 if every line is valid and with status 1 at the first
# invalid line or if the file is missing, empty or cannot be opened.
# Command line errors terminate the script via die() with the usage text.

use strict;
use warnings;
use Getopt::Long;

my $Usage = <<'EOU';
Usage: validate_disambig_sym_file.pl [options] disambig_syms.txt

This scripts checks if the entries of a file containing disambiguation symbols
(word or phone level) are all valid. To be valid the symbols
- must start with the hash mark '#',
- must not contain any whitespace,
- must not be equal to '#-1' (disallowed because it is used internally in some FST stuff).
In case the option '--allow-numeric' is used with 'false', the symbols must also be
non-numeric (to avoid overlap with the automatically generated symbols).

Allowed options:
  --allow-numeric (true|false) : Default true. If false, disallow numeric disambiguation
                                 symbols like #0, #1 and so on.

EOU

# Report a validation failure on standard output and stop with exit status 1.
#   $message : text placed between the "$0: " prefix and ", exiting ...".
sub fail {
  my ($message) = @_;
  print "$0: $message, exiting ...\n";
  exit 1;
}

# Command line options
my $allow_numeric = "true";

# Get the optional command line options
GetOptions(
  "allow-numeric=s" => \$allow_numeric,
) or die($Usage);

# Only the two documented values are accepted.  Without this check any other
# string (e.g. a typo such as "flase") would silently behave like "true" and
# the numeric check would be skipped.
if ($allow_numeric ne "true" && $allow_numeric ne "false") {
  die("$0: Invalid value \"$allow_numeric\" for --allow-numeric "
      . "(expected 'true' or 'false')\n\n$Usage");
}

if (@ARGV != 1) {
  die($Usage);
}

my $disambig_sym_file = shift @ARGV;

print "$0: Checking validity of file \"$disambig_sym_file\" ...\n";

# -s is false for a missing file as well as for a zero-length one, which is
# what the message promises (-z alone is false when the file does not exist).
if (! -s $disambig_sym_file) {
  fail("The file \"$disambig_sym_file\" is empty or does not exist");
}

# Three-argument open with a lexical handle: the file name is never
# interpreted as part of the open mode.
open(my $syms_fh, '<', $disambig_sym_file)
  or fail("Could not open file \"$disambig_sym_file\" ($!)");

# Go through the file containing disambiguation symbols line by line
while (my $symbol = <$syms_fh>) {
  chomp $symbol;

  # $sympart is everything after the leading '#'; it stays undefined when
  # the line does not start with '#'.
  my ($sympart) = $symbol =~ /^#(.*)$/;

  if (!defined $sympart) {
    fail("The disambiguation symbol \"$symbol\" does not start with a '#'");
  }
  if ($sympart eq "") {
    fail("Only \"$symbol\" is not allowed as a disambiguation symbol");
  }
  if ($sympart =~ /\s/) {
    fail("The disambiguation symbol \"$symbol\" contains whitespace");
  }
  if ($sympart eq "-1") {
    fail("The disambiguation symbol \"$symbol\" is not allowed");
  }
  if ($allow_numeric eq "false" && $sympart =~ /\A[0-9]+\z/) {
    fail("Since \"$symbol\" is supposed to be an extra disambiguation symbol, "
         . "it must not be numeric");
  }
}

close($syms_fh);

print "--> SUCCESS [validating disambiguation symbol file \"$disambig_sym_file\"]\n";
exit 0;