compute_sentence_scores_back.sh
#!/bin/bash

# Copyright 2017  Hainan Xu
#           2017  Szu-Jui Chen

# This script is very similar to rnnlm/compute_sentence_scores.sh, except that it
# performs the same procedure on reversed data, and it computes log-likelihoods
# from a Kaldi-RNNLM model instead of from Mikolov's RNNLM. Because Kaldi-RNNLM
# uses letter-features, which do not need an <OOS> symbol, we no longer need the
# "unk.probs" file to add a penalty term to the sentence likelihoods.
ensure_normalized_probs=false  # If true, the probabilities computed by the
                               # RNNLM will be correctly normalized. Note that
                               # it is OK to set it to false, because Kaldi-RNNLM
                               # is trained in a way that ensures the sum of the
                               # probabilities is close to 1.
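# Since utils/parse_options.sh is sourced below, the option above can be
# overridden from the command line, e.g. (illustrative invocation):
#   compute_sentence_scores_back.sh --ensure-normalized-probs true ...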
. ./path.sh || exit 1;
. utils/parse_options.sh

if [ $# != 4 ]; then
  echo "Usage: $0 <rnn-dir> <temp-dir> <input-text> <output-scores>"
  exit 1;
fi
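# Example (the paths are hypothetical, for illustration only):
#   rnnlm/compute_sentence_scores_back.sh exp/rnnlm_lstm_1a exp/rnnlm_lstm_1a/tmp \
#     exp/nbest/text exp/rnnlm_lstm_1a/tmp/scores.back
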
dir=$1
tempdir=$2
text_in=$3
scores_out=$4

if [ -f $dir/word_embedding.final.mat ]; then
  word_embedding=$dir/word_embedding.final.mat
else
  [ ! -f $dir/feat_embedding.final.mat ] && \
    echo "$0: expected file $dir/feat_embedding.final.mat to exist" && exit 1;
  word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.final.mat -|"
fi
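# Note: the trailing "|" above makes $word_embedding a piped Kaldi rxfilename,
# so binaries that read it will obtain the word-embedding matrix from the
# output of the rnnlm-get-word-embedding command rather than from a file on disk.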

for x in final.raw config/words.txt; do
  if [ ! -f $dir/$x ]; then
    echo "$0: expected file $dir/$x to exist."
    exit 1;
  fi
done
mkdir -p $tempdir

cat $text_in | sym2int.pl -f 2- $dir/config/words.txt | \
  awk '{printf("%s ",$1); for(i=NF;i>1;i--) printf("%s ",$i); print "";}' > $tempdir/text.int
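# The awk command above keeps the utterance-id in field 1 and prints the
# remaining fields in reverse order, so the backward RNNLM scores each
# sentence right-to-left. For example, with made-up word-ids:
#   utt1 15 7 243   ->   utt1 243 7 15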

special_symbol_opts=$(cat ${dir}/special_symbol_opts.txt)

rnnlm-sentence-probs --normalize-probs=$ensure_normalized_probs \
  $special_symbol_opts $dir/final.raw "$word_embedding" $tempdir/text.int > $tempdir/loglikes.rnn
# Now $tempdir/loglikes.rnn has the following structure
# utt-id log P(word1 | <s>) log P(word2 | <s> word1) ... log P(</s> | all word histories)
# for example,
#
# en_4156-A_058697-058813-2 -3.57205 -2.70411 -4.29876 -3.63707 -6.00299 -2.11093 -2.03955
# en_4156-A_058697-058813-3 -6.6074 -1.21244 -3.89991 -3.23747 -5.35102 -1.90448 -1.77809
# en_4156-A_058697-058813-4 -5.09022 -1.24148 -4.76337 -4.75594 -5.77118 -2.08555 -2.18403
# en_4156-A_058697-058813-5 -4.54489 -2.97485 -3.93646 -3.28041 -5.18779 -2.83356 -1.72601
# en_4156-A_058697-058813-6 -2.31464 -3.74738 -4.03309 -3.22942 -5.66818 -2.0396 -1.64734
# en_4156-A_058697-058813-7 -5.0728 -2.96303 -4.6539 -3.20266 -5.40682 -2.10625 -1.90956

[ $(cat $tempdir/loglikes.rnn | wc -l) -ne $(cat $tempdir/text.int | wc -l) ] && \
  echo "$0: rnnlm rescoring failed" && exit 1;

# The downstream rescoring code expects costs, so we sum each utterance's
# per-word log-probabilities and negate the total, giving one sentence-level
# negative log-likelihood per line.
cat $tempdir/loglikes.rnn | awk '{sum=0; for(i=2;i<=NF;i++) sum-=$i; print $1, sum;}' > $scores_out
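# For the first example line shown above, this yields approximately:
#   en_4156-A_058697-058813-2 24.3655
# i.e. -(-3.57205 - 2.70411 - 4.29876 - 3.63707 - 6.00299 - 2.11093 - 2.03955) = 24.36546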