Blame view

LIA_kaldiUtils/sym2int.pl 2.98 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  #!/usr/bin/perl
  # Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  
  # Parse the leading command-line options and remove them from @ARGV.
  #
  #   --map-oov <oov-symbol>   stored as given in $map_oov
  #   -f <field-range>         1-based range such as 4-5, 4-, -5 or 1; stored
  #                            zero-based in $field_begin / $field_end
  #
  # $map_oov, $field_begin and $field_end are package globals read by the rest
  # of the script; each stays undefined when the corresponding option (or that
  # end of the range) is not given.  Two passes are made over the option
  # checks, which lets the two options appear in either order.
  for (my $pass = 0; $pass < 2; $pass++) {
    if (@ARGV && $ARGV[0] eq "--map-oov") {
      shift @ARGV;
      @ARGV || die "Option --map-oov requires an argument";
      $map_oov = shift @ARGV;
    }
    if (@ARGV && $ARGV[0] eq "-f") {
      shift @ARGV;
      @ARGV || die "Option -f requires an argument";
      my $field_spec = shift @ARGV;
      my ($begin, $end);
      if ($field_spec =~ m/^(\d+)$/) {
        # A single field number selects exactly that field.
        $begin = $1 - 1;
        $end = $1 - 1;
      } elsif ($field_spec =~ m/^(\d*)[-:](\d*)$/) {
        # Accept e.g. 1:10 as a courtesy (properly, 1-10).  The pattern is
        # anchored at both ends so trailing garbage is rejected.
        my ($first, $last) = ($1, $2);
        if ($first ne "") {
          $begin = $first - 1;  # Change to zero-based indexing.
        }
        if ($last ne "") {
          $end = $last - 1;     # Change to zero-based indexing.
        }
      }
      if (!defined $begin && !defined $end) {
        die "Bad argument to -f option: $field_spec";
      }
      # Assign both bounds together so an open-ended range never inherits a
      # stale bound from an earlier -f.
      ($field_begin, $field_end) = ($begin, $end);
    }
  }
  
  # Take the symbol-table file name from the next command-line argument and
  # load the table into %sym2int (symbol => integer id).  Each line of the
  # file must consist of exactly two whitespace-separated fields: the symbol
  # and its id.  With no file name the usage message is printed and the
  # script exits with status 1 instead of going on to open an undefined name.
  $symtab = shift @ARGV;
  if (!defined $symtab) {
    print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
      "options: [--map-oov <oov-symbol> ]  [-f <field-range> ]\n" .
      "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
    exit(1);
  }
  # Three-argument open: the file name is never interpreted as a mode.
  open(my $symtab_fh, "<", $symtab) || die "Error opening symbol table file $symtab: $!";
  while (my $line = <$symtab_fh>) {
    my @fields = split(" ", $line);
    @fields == 2 || die "bad line in symbol table file: $line";
    $sym2int{$fields[0]} = $fields[1] + 0;  # "+ 0" stores the id as a number.
  }
  close($symtab_fh);
  
  # A --map-oov value made up only of digits is used as the integer id as is.
  # Any other value is taken to be a symbol and is replaced by its id from
  # %sym2int; it is a fatal error if the table has no entry for it.
  my $oov_is_symbol = defined $map_oov && $map_oov !~ m/^\d+$/;
  if ($oov_is_symbol) {
    my $oov_id = $sym2int{$map_oov};
    die "OOV symbol $map_oov not defined." unless defined $oov_id;
    $map_oov = $oov_id;
  }
  
  # Convert the transcriptions read through <> (the remaining file arguments,
  # or standard input) and print one output line per input line.
  #
  # Each line is split on whitespace.  A token whose zero-based position lies
  # within [$field_begin, $field_end] (an undefined bound leaves that side
  # open) is replaced by its integer id from %sym2int; tokens outside the
  # range are copied through unchanged.  A token missing from the table is
  # replaced by $map_oov when that is defined, with a warning on STDERR for
  # the first $max_warning occurrences; otherwise it is a fatal error.  An
  # input line with no tokens is also fatal.
  my $num_warning = 0;
  my $max_warning = 20;

  while (my $line = <>) {
    my @tokens = split(" ", $line);
    if (@tokens == 0) {
      die "Empty line in transcriptions input.";
    }
    my @converted = ();
    for (my $n = 0; $n < @tokens; $n++) {
      my $token = $tokens[$n];
      if ((!defined $field_begin || $n >= $field_begin)
          && (!defined $field_end || $n <= $field_end)) {
        my $id = $sym2int{$token};
        if (!defined $id) {
          if (defined $map_oov) {
            if ($num_warning++ < $max_warning) {
              print STDERR "sym2int.pl: replacing $token with $map_oov\n";
              if ($num_warning == $max_warning) {
                print STDERR "sym2int.pl: not warning for OOVs any more times\n";
              }
            }
            $id = $map_oov;
          } else {
            my $pos = $n + 1;  # Report the position one-based, as in -f.
            # The trailing newline keeps Perl from appending "at ... line N".
            die "sym2int.pl: undefined symbol $token (in position $pos)\n";
          }
        }
        $token = $id;
      }
      push @converted, $token;
    }
    # Terminate each line with a bare newline so no output line starts with
    # stray spaces.
    print join(" ", @converted), "\n";
  }
  
  # Every failure path above ends in die, so reaching this line means success.
  exit 0;