Blame view

egs/wsj/s5/utils/map_arpa_lm.pl 3.36 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
  #!/usr/bin/env perl
  
  # Copyright 2014  Guoguo Chen
  #           2014  Johns Hopkins University (author: Daniel Povey)
  # Apache 2.0.
  #
  use strict;
  use warnings;
  use Getopt::Long;
  
  # Help text printed when the script is called with the wrong number of
  # positional arguments or with options that cannot be parsed.  It is built
  # from a list of lines instead of a here-document: a plain <<EOU terminator
  # is only recognised at the very start of a line, so a here-document whose
  # terminator is indented would never be closed.  The two trailing empty
  # strings make the text end with a blank line.
  my $Usage = join("\n",
    'This script reads the Arpa format language model, and maps the words into',
    'integers or vice versa. It ignores the words that are not in the symbol table,',
    'and updates the head information.',
    '',
    'It will be used joinly with lmbin/arpa-to-const-arpa to build ConstArpaLm format',
    'language model. We first map the words in an Arpa format language model to',
    'integers, and then use lmbin/arpa-to-const-arpa to build a ConstArpaLm format',
    'language model.',
    '',
    'Usage: utils/map_arpa_lm.pl [options] <vocab-file> < input-arpa >output-arpa',
    ' e.g.: utils/map_arpa_lm.pl words.txt <arpa_lm.txt >arpa_lm.int',
    '',
    'Allowed options:',
    '  --sym2int   : If true, maps words to integers, other wise maps integers to',
    '                words. (boolean, default = true)',
    '',
    '',
  );

  # --sym2int selects the mapping direction.  It is parsed as a string and must
  # be exactly "true" or "false".  GetOptions returns false when the command
  # line cannot be parsed, in which case the usage text is shown.
  my $sym2int = "true";
  GetOptions('sym2int=s' => \$sym2int) || die $Usage;

  ($sym2int eq "true" || $sym2int eq "false") ||
    die "$0: Bad value for option --sym2int\n";

  # Exactly one positional argument (the symbol table) is expected.
  if (@ARGV != 1) {
    die $Usage;
  }
  
  # Gets parameters.  The symbol table is the only positional argument, so no
  # further file names are taken from @ARGV.
  my $symtab = shift @ARGV;

  # Opens files.  The three-argument form keeps characters in the file name
  # from being interpreted as part of the open mode.  The bareword handle M is
  # kept because the script closes M by that name at its very end.
  open(M, '<', $symtab) || die "$0: Fail to open $symtab: $!\n";

  # Reads in the mapper.  Every line must hold exactly two whitespace-separated
  # fields.  With --sym2int=true the first field is the key and the second the
  # value; with --sym2int=false the roles are swapped.
  my %mapper;
  while (<M>) {
    chomp;
    # split(" ") skips leading whitespace, so an indented line does not produce
    # an empty first field.
    my @col = split(" ", $_);
    @col == 2 || die "$0: Bad line in mapper file \"$_\"\n";

    my ($from, $to) = @col;
    if ($sym2int ne "true") {
      ($from, $to) = ($to, $from);
    }

    # A key may appear only once in the chosen direction.
    if (defined($mapper{$from})) {
      die "$0: Duplicate entry \"$from\"\n";
    }
    $mapper{$from} = $to;
  }
  
  # Counts every n-gram line dropped because one of its words is missing from
  # the mapper; only the first $max_oov_warn of them are reported one by one.
  my $num_oov_lines = 0;
  my $max_oov_warn = 20;

  # Parses Arpa n-gram language model from STDIN and prints the mapped model to
  # STDOUT.  $current_order tracks the section being read: -1 before "\data\",
  # 0 inside the header, and N inside the "\N-grams:" section.
  my $current_order = -1;
  while (<STDIN>) {
    chomp;
    my @col = split(" ", $_);

    # Skips everything that precedes the "\data\" marker.
    if ($current_order == -1 and ! m/^\\data\\$/) {
      next;
    }

    if (m/^\\data\\$/) {
      print STDERR "$0: Processing \"\\data\\\"\n";
      print "$_\n";
      $current_order = 0;
    } elsif (m/^\\([0-9]+)-grams:$/) {
      # At least one digit is required so that $current_order is always
      # numeric in the comparisons below.
      $current_order = $1;
      print "$_\n";
      print STDERR "$0: Processing \"\\$current_order-grams:\"\n";
    } elsif (m/^\\end\\/) {
      print "$_\n";
    } elsif ($_ eq "") {
      # Blank lines are reproduced only once an n-gram section has started;
      # blank lines inside the header are dropped.
      if ($current_order >= 1) {
        print "\n";
      }
    } else {
      if ($current_order == 0) {
        # echo head section.
        # NOTE: the header lines are copied unchanged, even though n-gram
        # lines containing OOVs are dropped further down.
        print "$_\n";
      } else {
        # Parses n-gram section.  A line holds one leading field, then
        # $current_order words, then at most one trailing field.
        if (@col > 2 + $current_order || @col < 1 + $current_order) {
          die "$0: Bad line in arpa lm \"$_\"\n";
        }
        my $prob = shift @col;
        my $is_oov = 0;
        for my $i (0 .. $current_order - 1) {
          my $temp = $mapper{$col[$i]};
          if (!defined($temp)) {
            # One unknown word is enough to drop the whole line.
            $is_oov = 1;
            $num_oov_lines++;
            last;
          }
          $col[$i] = $temp;
        }
        if (!$is_oov) {
          my $rest_of_line = join(" ", @col);
          print "$prob\t$rest_of_line\n";
        } elsif ($num_oov_lines <= $max_oov_warn) {
          # The counter was already incremented for this line, so "<=" lets
          # exactly $max_oov_warn warnings through.
          print STDERR "$0: Warning: OOV line $_\n";
        }
      }
    }
  }
  
  # Reports on STDERR how many lines were dropped because they contained OOVs.
  if ($num_oov_lines > 0) {
    print STDERR "$0: $num_oov_lines lines of the Arpa file contained OOVs and ",
                 "were not printed.\n";
  }

  # Closes the symbol-table handle M.
  close(M);