Blame view
egs/wsj/s5/utils/map_arpa_lm.pl
3.36 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
#!/usr/bin/env perl

# Copyright 2014  Guoguo Chen
#           2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.
#
# Filter that rewrites the word fields of an Arpa language model read from
# STDIN using a two-column symbol table (<word> <integer> per line), and
# writes the result to STDOUT.  With --sym2int=true words become integers;
# with --sym2int=false integers become words.  N-gram lines containing a
# token that is missing from the table are dropped and reported on STDERR.
#
# NOTE: the "ngram N=count" lines of the \data\ header are echoed verbatim;
# they are NOT recomputed after n-grams with OOV tokens have been dropped.
use strict;
use warnings;
use Getopt::Long;

my $Usage = <<EOU;
This script reads the Arpa format language model, and maps the words into
integers or vice versa. It ignores the words that are not in the symbol table,
and updates the head information.

It will be used joinly with lmbin/arpa-to-const-arpa to build ConstArpaLm format
language model. We first map the words in an Arpa format language model to
integers, and then use lmbin/arpa-to-const-arpa to build a ConstArpaLm format
language model.

Usage: utils/map_arpa_lm.pl [options] <vocab-file> < input-arpa >output-arpa
 e.g.: utils/map_arpa_lm.pl words.txt <arpa_lm.txt >arpa_lm.int

Allowed options:
  --sym2int   : If true, maps words to integers, other wise maps integers to
                words. (boolean, default = true)

EOU

my $sym2int = "true";
GetOptions('sym2int=s' => \$sym2int);

($sym2int eq "true" || $sym2int eq "false") ||
  die "$0: Bad value for option --sym2int\n";

if (@ARGV != 1) {
  die $Usage;
}

# The only positional argument is the symbol table; the Arpa LM itself is
# streamed from STDIN to STDOUT.
my $symtab = shift @ARGV;

# Three-argument open with a lexical handle, so that unusual characters in
# the file name cannot be misread as part of the open mode.
open(my $symtab_fh, '<', $symtab) || die "$0: Fail to open $symtab\n";

# Reads in the mapper.  Keys are the tokens expected in the input LM, values
# are what they get replaced with; the direction depends on --sym2int.
my %mapper;
while (my $entry = <$symtab_fh>) {
  chomp $entry;
  my @col = split(/[\s]+/, $entry);
  @col == 2 || die "$0: Bad line in mapper file \"$entry\"\n";
  my ($from, $to) = ($sym2int eq "true") ? ($col[0], $col[1])
                                         : ($col[1], $col[0]);
  if (exists $mapper{$from}) {
    die "$0: Duplicate entry \"$from\"\n";
  }
  $mapper{$from} = $to;
}
close($symtab_fh);

my $num_oov_lines = 0;
my $max_oov_warn = 20;   # Print at most this many per-line OOV warnings.

# Parses Arpa n-gram language model.
#   $current_order == -1 : "\data\" has not been seen yet (lines are skipped)
#   $current_order ==  0 : inside the "\data\" header
#   $current_order >=  1 : inside the "\N-grams:" section of that order
my $current_order = -1;
while (my $line = <STDIN>) {
  chomp $line;

  # Anything before the "\data\" marker is discarded.
  if ($current_order == -1 && $line !~ m/^\\data\\$/) {
    next;
  }

  if ($line =~ m/^\\data\\$/) {
    print STDERR "$0: Processing \"\\data\\\"\n";
    print "$line\n";
    $current_order = 0;
  } elsif ($line =~ m/^\\([0-9]+)-grams:$/) {
    # At least one digit is required, so $current_order is always numeric.
    $current_order = $1;
    print "$line\n";
    print STDERR "$0: Processing \"\\$current_order-grams:\\\"\n";
  } elsif ($line =~ m/^\\end\\/) {
    print "$line\n";
  } elsif ($line eq "") {
    # Blank lines are only kept inside n-gram sections, not in the header.
    if ($current_order >= 1) {
      print "\n";
    }
  } elsif ($current_order == 0) {
    # Echo head section.
    print "$line\n";
  } else {
    # Parses n-gram section: <prob> <w1> ... <wN> [<backoff>].
    my @col = split(" ", $line);
    if (@col > 2 + $current_order || @col < 1 + $current_order) {
      die "$0: Bad line in arpa lm \"$line\"\n";
    }
    my $prob = shift @col;

    # Map the N word fields in place; give up on the first unknown token.
    my $is_oov = 0;
    for my $i (0 .. $current_order - 1) {
      my $mapped = $mapper{$col[$i]};
      if (!defined($mapped)) {
        $is_oov = 1;
        $num_oov_lines++;
        last;
      }
      $col[$i] = $mapped;
    }

    if (!$is_oov) {
      my $rest_of_line = join(" ", @col);
      print "$prob\t$rest_of_line\n";
    } elsif ($num_oov_lines <= $max_oov_warn) {
      # The counter was already incremented for this line, hence "<=" so
      # that exactly $max_oov_warn warnings are printed.
      print STDERR "$0: Warning: OOV line $line\n";
    }
  }
}

if ($num_oov_lines > 0) {
  print STDERR "$0: $num_oov_lines lines of the Arpa file contained OOVs and ";
  print STDERR "were not printed.\n";
}