adjust_unk_arpa.pl
2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env perl
# Copyright 2018 Xiaohui Zhang
# Apache 2.0.
#
use strict;
use warnings;
use Getopt::Long;
# Help text for this script. It is emitted via die when the command line is
# malformed (e.g. the number of positional arguments is not exactly two).
my $Usage = <<EOU;
# This is a simple script to set/scale the prob of n-grams where the OOV dict entry is the predicted word, in an ARPA lm file.
Usage: utils/lang/adjust_unk_arpa.pl [options] <oov-dict-entry> <unk-scale> <input-arpa >output-arpa
Allowed options:
--fixed-value (true|false) : If true, interpret the unk-scale as a fixed value we'll set to
the unigram prob of the OOV dict entry, rather than using it to
scale the probs. In this case higher order n-grams containing
the OOV dict entry remain untouched. This is useful when the OOV
dict entry doesn't appear in n-grams (n>1) as the predicted word.
EOU
# Command-line handling: one optional flag (--fixed-value) followed by exactly
# two positional arguments, <oov-dict-entry> and <unk-scale>.  The ARPA LM is
# read from STDIN and written to STDOUT, so no file names are taken here.
use Scalar::Util qw(looks_like_number);

my $fixed_value = "false";
# GetOptions returns false on unknown/malformed options; treat that as a usage
# error instead of silently carrying on.
GetOptions('fixed-value=s' => \$fixed_value) or die $Usage;
($fixed_value eq "true" || $fixed_value eq "false") ||
  die "$0: Bad value for option --fixed-value\n";

if (@ARGV != 2) {
  die $Usage;
}

# Gets parameters.
my $unk_word = shift @ARGV;   # the OOV dict entry, compared against n-gram words
my $unk_scale = shift @ARGV;  # scale factor, or the fixed prob with --fixed-value true

# The value is later passed to log(), so it must be a strictly positive number.
# Checking that it is numeric first rejects inputs such as "1abc", which would
# otherwise be accepted as 1 with only a warning.
(looks_like_number($unk_scale) && $unk_scale > 0.0) || die "Bad unk_scale";

if ($fixed_value eq "true") {
  print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n";
} else {
  print STDERR "$0: Scaling the probs of ngrams where $unk_word is the predicted word in LM file by $unk_scale.\n";
}
# Stream the ARPA LM from STDIN to STDOUT.  While inside an "\N-grams:" section,
# lines whose word at column N (the predicted word) equals the unk-word get
# their first column (the log10 prob) rewritten; every other line is copied
# through unchanged.
#   - default mode:       log10(unk_scale) is added to the prob, for all orders.
#   - --fixed-value true: the unigram prob is set to log10(unk_scale); entries
#                         of higher order are left untouched, as the usage
#                         message documents.
my $ngram = 0;  # the order of ngram we are visiting (0: not in any section yet)
my $log10_unk_scale = log($unk_scale) / log(10.0);  # loop-invariant, computed once

while (<STDIN>) {
  # Section header, e.g. "\3-grams:".  Any order is accepted, not only 1..5.
  if (m/^\\(\d+)-grams:$/) { $ngram = int($1); }

  my @col = split(" ", $_);

  # Requiring more than $ngram columns guarantees $col[$ngram] exists, so
  # header, blank and short lines never trigger an undef comparison.
  if ($ngram > 0 && @col > $ngram && $col[$ngram] eq $unk_word) {
    if ($fixed_value eq "true") {
      if ($ngram != 1) {
        # Fixed-value mode only touches the unigram entry.
        print;
        next;
      }
      $col[0] = $log10_unk_scale;
    } else {
      $col[0] += $log10_unk_scale;
    }
    # Modified lines are re-emitted tab-separated.
    print join("\t", @col), "\n";
  } else {
    print;
  }
}
exit 0;