Blame view

egs/wsj/s5/utils/lang/add_unigrams_arpa.pl 2.31 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
  #!/usr/bin/env perl
  
  # Copyright 2018  Xiaohui Zhang
  # Apache 2.0.
  #
  use strict;
  use warnings;
  use Getopt::Long;
  
  my $Usage = <<EOU;
  # This is a simple script to add unigrams to an ARPA lm file.
  Usage: utils/lang/add_unigrams_arpa.pl [options] <oov-prob-file> <scale> <input-arpa >output-arpa
  <oov-prob-file> contains a list of words and their probabilities, e.g. "jack 0.2". All probs will be
  scaled by a positive scalar <scale> and then be used as the unigram prob. of the added word.
  The scale should approximiately relect the OOV rate of the language in concern.
  EOU
  
  my @F;
  my @OOVS;
  
  if (@ARGV != 2) {
    die $Usage;
  }
  
  # Gets parameters.
  my $oov_prob_file = shift @ARGV;
  my $scale = shift @ARGV;
  my $arpa_in = shift @ARGV;
  my $arpa_out = shift @ARGV;
  
  # Opens files.
  open(F, "<$oov_prob_file") || die "$0: Fail to open $oov_prob_file
  ";
  while (<F>) { push @OOVS, $_; }
  my $num_oovs = @OOVS;
  
  $scale > 0.0 || die "Bad scale";
  print STDERR "$0: Creating LM file with additional unigrams, using $oov_prob_file
  ";
  
  my %vocab;
  my $unigram = 0;
  my $num_unigrams = 0;
  my @lines;
  
  # Parse and record the head and unigrams in the ARPA LM.
  while(<STDIN>) {
    if (m/^ngram 1=(\d+)/) { $num_unigrams = $1; }
    
    if (m/^\\2-grams:$/) { last; }
    if (m/^\\1-grams:$/) { $unigram = 1; push(@lines, $_); next; }
    if (m/^\\2-grams:$/) { $unigram = 0; }
  
    my @col = split(" ", $_);
    if ( $unigram == 1 ) {
      # Record in-vocab words into a map.
      if ( @col > 0 ) {
        my $word = $col[1];
        $vocab{$word} = 1;
        push(@lines, $_);
      } else {
        # Insert out-of-vocab words and their probs into the unigram list.
        foreach my $l (@OOVS) {
          my @A = split(" ", $l);
          @A == 2 || die "bad line in oov2prob: $_;";
          my $word = $A[0];
          my $prob = $A[1];
          if (exists($vocab{$word})) { next; }
          $num_unigrams ++;
          my $log10prob = (log($prob * $scale) / log(10.0));
          $vocab{$word} = 1;
          my $line = sprintf("%.6f\t$word
  ", $log10prob);
          push(@lines, $line);
        }
      }
    } else { push(@lines, $_); }
  }
  
  # Print the head and unigrams, with the updated # unigrams in the head.
  foreach my $l (@lines) {
    if ($l =~ m/ngram 1=/) {
      print "ngram 1=$num_unigrams
  ";
    } else {
      print $l;
    }
  }
  
  # Print the left fields.
  print "
  \\2-grams:
  ";
  while(<STDIN>) {
    print;
  }
  
  close(F);
  exit 0