  #!/bin/bash
  
  # Compute scores from RNNLM.  This script takes a directory
  # $dir (e.g. dir=local/rnnlm/rnnlm.voc30.hl30 ),
  # where it expects the files:
  #  rnnlm  wordlist.rnn  unk.probs,
  # and also an input file location where it can get the sentences to score, and
  # an output file location to put the scores (negated logprobs) for each
  # sentence.  This script uses the Kaldi-style "archive" format, so the input and
  # output files will have a first field that corresponds to some kind of
  # utterance-id or, in practice, utterance-id-1, utterance-id-2, etc., for the
  # N-best list.
  #
  # Here, "wordlist.rnn" is the set of words, like a vocabulary,
  # that the RNN was trained on (note, it won't include <s> or </s>),
  # plus <RNN_UNK> which is a kind of class where we put low-frequency
  # words; unk.probs gives the probs for words given this class, and it
  # has, on each line, "word prob".
  
  rnnlm_ver=rnnlm-0.3e
  ensure_normalized_probs=false  # if true then we add the neccesary options to
                                 # normalize the probabilities of RNNLM
                                 # e.g. when using faster-rnnlm in the nce mode
  
  . ./path.sh || exit 1;
  . utils/parse_options.sh
  
  rnnlm=$KALDI_ROOT/tools/$rnnlm_ver/rnnlm
  
  [ ! -f $rnnlm ] && echo No such program $rnnlm && exit 1;
  
  if [ $# != 4 ]; then
    echo "Usage: rnnlm_compute_scores.sh <rnn-dir> <temp-dir> <input-text> <output-scores>"
    exit 1;
  fi
  
  dir=$1
  tempdir=$2
  text_in=$3
  scores_out=$4
  
  for x in rnnlm wordlist.rnn unk.probs; do
    if [ ! -f $dir/$x ]; then 
      echo "rnnlm_compute_scores.sh: expected file $dir/$x to exist."
      exit 1;
    fi
  done
  
  mkdir -p $tempdir
  cat $text_in | awk '{for (x=2;x<=NF;x++) {printf("%s ", $x)} printf("
  ");}' >$tempdir/text
  cat $text_in | awk '{print $1}' > $tempdir/ids # e.g. utterance ids.
  cat $tempdir/text | awk -v voc=$dir/wordlist.rnn -v unk=$dir/unk.probs \
    -v logprobs=$tempdir/loglikes.oov \
   'BEGIN{ while((getline<voc)>0) { invoc[$1]=1; } while ((getline<unk)>0){ unkprob[$1]=$2;} }
    { logprob=0;
      if (NF==0) { printf "<RNN_UNK>"; logprob = log(1.0e-07);
        print "Warning: empty sequence." | "cat 1>&2"; }
      for (x=1;x<=NF;x++) { w=$x;  
      if (invoc[w]) { printf("%s ",w); } else {
        printf("<RNN_UNK> ");
        if (unkprob[w] != 0) { logprob += log(unkprob[w]); }
        else { print "Warning: unknown word ", w | "cat 1>&2"; logprob += log(1.0e-07); }}}
      printf("
  "); print logprob > logprobs } ' > $tempdir/text.nounk
  
  # OK, now we compute the scores on the text with OOVs replaced
  # with <RNN_UNK>
  
  if [ $rnnlm_ver == "faster-rnnlm" ]; then
    extra_options=
    if [ "$ensure_normalized_probs" = true ]; then
      extra_options="--nce-accurate-test 1"
    fi
    $rnnlm $extra_options -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \
       awk '{print $1*log(10);}' > $tempdir/loglikes.rnn
  else
    # add the utterance_id as required by Mikolove's rnnlm
    paste $tempdir/ids $tempdir/text.nounk > $tempdir/id_text.nounk
  
    $rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/id_text.nounk -nbest -debug 0 | \
       awk '{print $1*log(10);}' > $tempdir/loglikes.rnn
  fi
  
  [ `cat $tempdir/loglikes.rnn | wc -l` -ne `cat $tempdir/loglikes.oov | wc -l` ] && \
    echo "rnnlm rescoring failed" && exit 1;
  
  paste $tempdir/loglikes.rnn $tempdir/loglikes.oov | awk '{print -($1+$2);}' >$tempdir/scores
  
  # scores out, with utterance-ids.
  paste $tempdir/ids $tempdir/scores  > $scores_out