Blame view

egs/wsj/s5/steps/cleanup/make_utterance_fsts.pl 1.67 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
  #!/usr/bin/env perl
  use strict;
  use warnings; #sed replacement for -w perl parameter

  # Makes unigram decoding-graph FSTs specific to each utterance, combining the
  # supplied top-n-words list with the supervision text of that utterance.
  #
  # Inputs:
  #   - top-words file (the single command-line argument): one
  #     "<probability> <word-id>" pair per line.  The probability must be > 0
  #     and the word-id must consist only of digits.  Probabilities of a
  #     word-id that appears on several lines are summed.
  #   - stdin: one utterance per line, "<utterance-id> <word-id> <word-id> ...".
  #     Blank lines are skipped; an utterance with no words is a fatal error.
  #
  # Output (stdout), per utterance, in the text-archive format:
  #   <utterance-id>
  #   0 0 <word-id> <word-id> <cost>     (one line per word, cost = -log(prob))
  #   0 <final-cost>                     (final-cost = -log(1 / num-words))
  #   <empty line>
  # where prob is the top-words probability of the word (if any) plus
  # 1/num-words for every occurrence of the word in the utterance.

  if (@ARGV != 1) {
    print STDERR "** Warning: this script is deprecated and will be removed.  See\n" .
                 "** steps/cleanup/make_biased_lm_graphs.sh.\n" .
                 "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" .
                 "e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" .
                 "  make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n";
    exit(1);
  }

  my ($top_words_file) = @ARGV;

  # Three-argument open with a lexical handle: the file name can never be
  # misread as an open mode, and the OS error is reported on failure.
  open(my $top_words_fh, '<', $top_words_file)
    or die "opening $top_words_file: $!";

  # Maps word-id -> accumulated probability from the top-words file.
  my %top_word_probs = ( );

  while (my $line = <$top_words_fh>) {
    my @fields = split(' ', $line);
    (@fields == 2 && $fields[0] > 0.0) || die "Bad line $line in $top_words_file";
    $fields[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $line\n";
    $top_word_probs{$fields[1]} += $fields[0];
  }
  close($top_words_fh);

  while (my $line = <STDIN>) {
    my @words = split(' ', $line);
    next unless @words;  # Skip blank lines instead of emitting a broken record.
    my $utterance_id = shift @words;
    my $num_words = scalar @words;

    # Without this check the 1/num_words terms below would die with an
    # uninformative "Illegal division by zero" after a partial record was printed.
    $num_words > 0 || die "Utterance $utterance_id has no words, cannot create its FST\n";

    # Start from the shared top-words distribution and add this utterance's
    # own unigram counts.  All word-ids are validated before anything is
    # printed, so a bad line never leaves a half-written FST on stdout.
    my %word_probs = %top_word_probs;
    foreach my $w (@words) {
      $w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $line";
      $word_probs{$w} += 1.0 / $num_words;
    }

    print "$utterance_id\n";
    # Sort numerically so that the output is reproducible from run to run
    # (plain hash order is randomized by perl).
    foreach my $w (sort { $a <=> $b } keys %word_probs) {
      my $prob = $word_probs{$w};
      $prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n";
      my $cost = -log($prob);
      print "0 0 $w $w $cost\n";
    }
    my $final_cost = -log(1.0 / $num_words);
    print "0 $final_cost\n";
    print "\n"; # Empty line terminates the FST in the text-archive format.
  }