scripts/rnnlm/compute_sentence_scores_back.sh
#!/bin/bash

# Copyright 2017  Hainan Xu
#           2017  Szu-Jui Chen

# This script is very similar to rnnlm/compute_sentence_scores.sh, except that it
# performs the same procedure on reversed data, and it computes log-likelihoods
# from a Kaldi-RNNLM model instead of from Mikolov's RNNLM. Because Kaldi-RNNLM
# uses letter-features, which do not need an <OOS> symbol, we no longer need the
# "unk.probs" file as a penalty term in the sentence likelihoods.

ensure_normalized_probs=false  # If true, the probabilities computed by the
                               # RNNLM will be correctly normalized. Note that it
                               # is OK to set it to false because Kaldi-RNNLM is
                               # trained in a way that ensures the sum of the
                               # probabilities is close to 1.

. ./path.sh || exit 1;
. utils/parse_options.sh

if [ $# != 4 ]; then
  echo "Usage: $0 <rnn-dir> <temp-dir> <input-text> <output-scores>"
  exit 1;
fi

dir=$1
tempdir=$2
text_in=$3
scores_out=$4

if [ -f $dir/word_embedding.final.mat ]; then
  word_embedding=$dir/word_embedding.final.mat
else
  [ ! -f $dir/feat_embedding.final.mat ] && \
    echo "$0: expected file $dir/feat_embedding.final.mat to exist." && exit 1;
  word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.final.mat -|"
fi

for x in final.raw config/words.txt; do
  if [ ! -f $dir/$x ]; then
    echo "$0: expected file $dir/$x to exist."
    exit 1;
  fi
done

mkdir -p $tempdir

# Map words to integer ids, then reverse the word order of each utterance
# (keeping the utterance id in field 1), since this script scores reversed data.
cat $text_in | sym2int.pl -f 2- $dir/config/words.txt | \
  awk '{printf("%s ",$1);for(i=NF;i>1;i--) printf("%s ",$i); print""}' > $tempdir/text.int

special_symbol_opts=$(cat ${dir}/special_symbol_opts.txt)

rnnlm-sentence-probs --normalize-probs=$ensure_normalized_probs \
  $special_symbol_opts $dir/final.raw "$word_embedding" \
  $tempdir/text.int > $tempdir/loglikes.rnn

# Now $tempdir/loglikes.rnn has the following structure:
# utt-id log P(word1 | <s>) log P(word2 | <s> word1) ... log P(</s> | all word histories)
# for example,
#
# en_4156-A_058697-058813-2 -3.57205 -2.70411 -4.29876 -3.63707 -6.00299 -2.11093 -2.03955
# en_4156-A_058697-058813-3 -6.6074 -1.21244 -3.89991 -3.23747 -5.35102 -1.90448 -1.77809
# en_4156-A_058697-058813-4 -5.09022 -1.24148 -4.76337 -4.75594 -5.77118 -2.08555 -2.18403
# en_4156-A_058697-058813-5 -4.54489 -2.97485 -3.93646 -3.28041 -5.18779 -2.83356 -1.72601
# en_4156-A_058697-058813-6 -2.31464 -3.74738 -4.03309 -3.22942 -5.66818 -2.0396 -1.64734
# en_4156-A_058697-058813-7 -5.0728 -2.96303 -4.6539 -3.20266 -5.40682 -2.10625 -1.90956

[ $(cat $tempdir/loglikes.rnn | wc -l) -ne $(cat $tempdir/text.int | wc -l) ] && \
  echo "$0: rnnlm rescoring failed" && exit 1;

# We need the negative log-probabilities, so sum the per-word log-likelihoods
# of each utterance and negate the total.
cat $tempdir/loglikes.rnn | awk '{sum=0;for(i=2;i<=NF;i++)sum-=$i; print $1,sum}' >$scores_out
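
# As a worked example of the final score computation: for the first example line
# of $tempdir/loglikes.rnn shown above, the score is the negated sum
#   -(-3.57205 - 2.70411 - 4.29876 - 3.63707 - 6.00299 - 2.11093 - 2.03955) = 24.36546
# which awk prints under its default %.6g number formatting as
#   en_4156-A_058697-058813-2 24.3655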
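
# Similarly, a minimal illustration of the reversal step above, assuming some
# hypothetical integer word ids: a line "utt1 42 7 96" coming out of sym2int.pl
# is written to $tempdir/text.int as "utt1 96 7 42" (the utterance id stays in
# field 1; only the word ids are reversed).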
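
# Example invocation, matching the usage message above (all paths are
# hypothetical placeholders, not fixed names):
#   scripts/rnnlm/compute_sentence_scores_back.sh exp/rnnlm_lstm_1a \
#     exp/rnnlm_lstm_1a/tmp data/rescore/text exp/rnnlm_lstm_1a/tmp/scores.txt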