Blame view
scripts/rnnlm/lmrescore_back.sh
4.97 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
#!/bin/bash

# Copyright 2017  Hainan Xu
# Apache 2.0

# This script rescores lattices with KALDI RNNLM trained on reversed text.
# The input directory should already be rescored with a forward RNNLM, preferably
# with the pruned algorithm, since smaller lattices make rescoring much faster.
# An example of the forward pruned rescoring is at
# egs/swbd/s5c/local/rnnlm/run_tdnn_lstm.sh
# One example script for backward RNNLM rescoring is at
# egs/swbd/s5c/local/rnnlm/run_tdnn_lstm_back.sh

# Begin configuration section.
cmd=run.pl
skip_scoring=false
max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order
                  # if it's set, it merges histories in the lattice if they share
                  # the same ngram history and this prevents the lattice from
                  # exploding exponentially. Details of the n-gram approximation
                  # method are described in section 2.3 of the paper
                  # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf
weight=0.5  # Interpolation weight for RNNLM.
normalize=false # If true, we add a normalization step to the output of the RNNLM
                # so that it adds up to *exactly* 1. Note that this is not necessary
                # as in our RNNLM setup, a properly trained network would automatically
                # have its normalization term close to 1. The details of this
                # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf
# End configuration section.

echo "$0 $@"  # Print the command line for logging

. ./utils/parse_options.sh

if [ $# != 5 ]; then
  echo "Does language model rescoring of lattices (remove old LM, add new LM)"
  echo "with Kaldi RNNLM trained on reversed text. See comments in file for details"
  echo ""
  echo "Usage: $0 [options] <old-lang-dir> <rnnlm-dir> \\"
  echo "                    <data-dir> <input-decode-dir> <output-decode-dir>"
  echo " e.g.: $0 data/lang_tg exp/rnnlm_lstm/ data/test \\"
  echo "          exp/tri3/test_rnnlm_forward exp/tri3/test_rnnlm_bidirection"
  echo "options: [--cmd (run.pl|queue.pl [queue opts])]"
  exit 1;
fi

[ -f path.sh ] && . ./path.sh;

oldlang=$1
rnnlm_dir=$2
data=$3
indir=$4
outdir=$5

# Prefer an FST-format old LM; fall back to const-arpa (G.carpa) if absent.
oldlm=$oldlang/G.fst
if [ ! -f $oldlm ]; then
  echo "$0: file $oldlm not found; using $oldlang/G.carpa"
  oldlm=$oldlang/G.carpa
fi

[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1;
[ ! -f $rnnlm_dir/final.raw ] && echo "$0: Missing file $rnnlm_dir/final.raw" && exit 1;
# Either a precomputed word embedding or a feature embedding (from which the
# word embedding can be derived) must be present.
[ ! -f $rnnlm_dir/feat_embedding.final.mat ] && [ ! -f $rnnlm_dir/word_embedding.final.mat ] && \
  echo "$0: Missing word embedding file" && exit 1;

[ ! -f $oldlang/words.txt ] &&\
  echo "$0: Missing file $oldlang/words.txt" && exit 1;
! ls $indir/lat.*.gz >/dev/null &&\
  echo "$0: No lattices input directory $indir" && exit 1;
# Validate the interpolation weight in awk (shell has no float comparison).
awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) {
  print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \
  || exit 1;

normalize_opt=
if $normalize; then
  normalize_opt="--normalize-probs=true"
fi
oldlm_command="fstproject --project_output=true $oldlm |"
special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt)

# If no precomputed word embedding exists, build it on the fly from the word
# features; the embedded single quotes keep the pipe intact when $cmd
# re-evaluates the command line.
word_embedding=
if [ -f $rnnlm_dir/word_embedding.final.mat ]; then
  word_embedding=$rnnlm_dir/word_embedding.final.mat
else
  word_embedding="'rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|'"
fi

mkdir -p $outdir/log
nj=$(cat $indir/num_jobs) || exit 1;
cp $indir/num_jobs $outdir

# In order to rescore with a backward RNNLM, we first remove the original LM
# scores with lattice-lmrescore, before reversing the lattices.
# The negative weight subtracts (rather than adds) the old LM scores.
oldlm_weight=$(perl -e "print -1.0 * $weight;")
if [ "$oldlm" == "$oldlang/G.fst" ]; then
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \
    lattice-reverse ark:- ark:- \| \
    lattice-lmrescore-kaldi-rnnlm --lm-scale=$weight $special_symbol_opts \
    --max-ngram-order=$max_ngram_order $normalize_opt \
    $word_embedding "$rnnlm_dir/final.raw" ark:- ark:- \| \
    lattice-reverse ark:- "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
else
  $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \
    lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \
    "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \
    lattice-reverse ark:- ark:- \| \
    lattice-lmrescore-kaldi-rnnlm --lm-scale=$weight $special_symbol_opts \
    --max-ngram-order=$max_ngram_order $normalize_opt \
    $word_embedding "$rnnlm_dir/final.raw" ark:- ark:- \| \
    lattice-reverse ark:- "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1;
fi

if ! $skip_scoring ; then
  err_msg="$0: Not scoring because local/score.sh does not exist or not executable."
  [ ! -x local/score.sh ] && echo "$err_msg" && exit 1;
  echo local/score.sh --cmd "$cmd" $data $oldlang $outdir
  local/score.sh --cmd "$cmd" $data $oldlang $outdir
else
  echo "$0: Not scoring because --skip-scoring was specified."
fi

exit 0;