  #!/bin/bash
  
  # Copyright 2017  Hainan Xu
  #           2017  Szu-Jui Chen
  
  # This script is very similar to rnnlm/compute_sentence_scores.sh, except that it
  # performs the same procedure on reversed data. It computes log-likelihoods from
  # a Kaldi-RNNLM model rather than from Mikolov's RNNLM. Because Kaldi-RNNLM uses
  # letter-features, which do not need an <OOS> symbol, we no longer need the
  # "unk.probs" file to add a penalty term to the sentence likelihoods.
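  #
  # Example invocation (hypothetical paths, for illustration only):
  #   rnnlm/compute_sentence_scores_back.sh exp/rnnlm_lstm_1a_backward \
  #     exp/rescore/temp exp/rescore/text exp/rescore/sentence_scores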
  
  ensure_normalized_probs=false  # If true then the probabilities computed by the
                                 # RNNLM will be correctly normalized. Note it is
                                 # OK to set it to false because Kaldi-RNNLM is
                                 # trained in a way that ensures the sum of probabilities
                                 # is close to 1.
  
  . ./path.sh || exit 1;
  . utils/parse_options.sh
  
  if [ $# != 4 ]; then
    echo "Usage: $0 <rnn-dir> <temp-dir> <input-text> <output-scores>"
    exit 1;
  fi
  
  dir=$1
  tempdir=$2
  text_in=$3
  scores_out=$4
  
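  # Use the pre-computed word-embedding matrix if it is present; otherwise derive
  # it on the fly from the feature embedding with rnnlm-get-word-embedding
  # (typically the case when letter-features are used).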
  if [ -f $dir/word_embedding.final.mat ]; then
    word_embedding=$dir/word_embedding.final.mat
  else
    [ ! -f $dir/feat_embedding.final.mat ] &&
               echo "$0: expected file $dir/feat_embedding.final.mat to exist" && exit 1;
    word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.final.mat -|"
  fi
  
  for x in final.raw config/words.txt; do
    if [ ! -f $dir/$x ]; then 
      echo "$0: expected file $dir/$x to exist."
      exit 1;
    fi
  done
  
  mkdir -p $tempdir
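  # Map the words to integer ids and, on each line, print the utterance-id (field 1)
  # followed by the remaining word-ids in reverse order, so the sentences are
  # presented right-to-left to the backward RNNLM.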
  cat $text_in | sym2int.pl -f 2- $dir/config/words.txt | \
      awk '{printf("%s ",$1);for(i=NF;i>1;i--) printf("%s ",$i); print""}' > $tempdir/text.int
      
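  # special_symbol_opts.txt was written when the RNNLM directory was prepared; it
  # typically holds options such as the BOS/EOS symbol ids for rnnlm-sentence-probs.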
  special_symbol_opts=$(cat ${dir}/special_symbol_opts.txt)
  
  rnnlm-sentence-probs --normalize-probs=$ensure_normalized_probs \
         $special_symbol_opts $dir/final.raw "$word_embedding" $tempdir/text.int > $tempdir/loglikes.rnn
  # Now $tempdir/loglikes.rnn has the following structure
  # utt-id log P(word1 | <s>) log P(word2 | <s> word1) ... log P(</s> | all word histories)
  # for example,
  #
  # en_4156-A_058697-058813-2 -3.57205 -2.70411 -4.29876 -3.63707 -6.00299 -2.11093 -2.03955
  # en_4156-A_058697-058813-3 -6.6074 -1.21244 -3.89991 -3.23747 -5.35102 -1.90448 -1.77809
  # en_4156-A_058697-058813-4 -5.09022 -1.24148 -4.76337 -4.75594 -5.77118 -2.08555 -2.18403
  # en_4156-A_058697-058813-5 -4.54489 -2.97485 -3.93646 -3.28041 -5.18779 -2.83356 -1.72601
  # en_4156-A_058697-058813-6 -2.31464 -3.74738 -4.03309 -3.22942 -5.66818 -2.0396 -1.64734
  # en_4156-A_058697-058813-7 -5.0728 -2.96303 -4.6539 -3.20266 -5.40682 -2.10625 -1.90956
  
  [ $(cat $tempdir/loglikes.rnn | wc -l) -ne $(cat $tempdir/text.int | wc -l) ] && \
    echo "$0: rnnlm rescoring failed" && exit 1;
    
  # Sum the per-word log-probabilities and negate the total, so each output line is
  # "utt-id  <negative sentence log-likelihood>".
  cat $tempdir/loglikes.rnn | awk '{sum=0;for(i=2;i<=NF;i++)sum-=$i; print $1,sum}' >$scores_out