Blame view

LIA_kaldiUtils/add_lex_disambig.pl 2.92 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  #!/usr/bin/perl
  # Copyright 2010-2011 Microsoft Corporation
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  
  # Adds disambiguation symbols to a lexicon.
  # Outputs still in the normal lexicon format.
  # Disambig syms are numbered #1, #2, #3, etc. (#0 
  # reserved for symbol in grammar).
  # Outputs the number of disambig syms to the standard output.
  
  if(@ARGV != 2) {
      die "Usage: add_lex_disambig.pl  lexicon.txt lexicon_disambig.txt "
  }
  
  
  $lexfn = shift @ARGV;
  $lexoutfn = shift @ARGV;
  
  open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
  
  # (1)  Read in the lexicon.
  @L = ( );
  while(<L>) {
      @A = split(" ", $_);
      push @L, join(" ", @A);
  }
  
  # (2) Work out the count of each phone-sequence in the
  # lexicon.
  
  foreach $l (@L) {
      @A = split(" ", $l);
      shift @A; # Remove word.
      $count{join(" ",@A)}++;
  }
  
  # (3) For each left sub-sequence of each phone-sequence, note down
  # that exists (for identifying prefixes of longer strings).
  
  foreach $l (@L) {
      @A = split(" ", $l);
      shift @A; # Remove word.
      while(@A > 0) {
          pop @A;  # Remove last phone
          $issubseq{join(" ",@A)} = 1;
      }
  }
  
  # (4) For each entry in the lexicon:
  #  if the phone sequence is unique and is not a
  #  prefix of another word, no diambig symbol.
  #  Else output #1, or #2, #3, ... if the same phone-seq
  #  has already been assigned a disambig symbol.
  
  
  open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.
  ";
  
  $max_disambig = 0;
  foreach $l (@L) {
      @A = split(" ", $l);
      $word = shift @A;
      $phnseq = join(" ",@A);
      if(!defined $issubseq{$phnseq}
         && $count{$phnseq}==1) {
          ; # Do nothing.
      } else {
          if($phnseq eq "") { # need disambig symbols for the empty string
              # that are not use anywhere else.
              $max_disambig++;
              $reserved{$max_disambig} = 1;
              $phnseq = "#$max_disambig";
          } else {
              $curnumber = $disambig_of{$phnseq};
              if(!defined{$curnumber}) { $curnumber = 0; }
              $curnumber++; # now 1 or 2, ... 
              while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols
              if($curnumber > $max_disambig) {
                  $max_disambig = $curnumber;
              }
              $disambig_of{$phnseq} = $curnumber;
              $phnseq = $phnseq . " #" . $curnumber;
           }
      }
      print O "$word\t$phnseq
  ";
  }
  
  print $max_disambig . "
  ";