Blame view
egs/fisher_swbd/s5/local/score_sclite.sh
5.65 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.
#
# Scores a decoding directory with hubscr.pl from sctk.
#
# Stages:
#   0: for every LM weight / word insertion penalty pair, turn the lattices
#      into a 1-best ctm file in <decode-dir>/score_<lmwt>_<wip>/.
#   1: strip tokens we do not score from the ctm files and, for eval2000*
#      and rt03* data, run the acronym-mapping python scripts.
#   2: score the complete set.
#   3: for eval2000* and rt03* data, additionally score the subsets.

# begin configuration section.
cmd=run.pl
stage=0
min_lmwt=5
max_lmwt=17
word_ins_penalty=0.0,0.5,1.0
#end configuration section.

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2|3)               # start scoring script from part-way through."
  echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
  echo "    --word_ins_penalty <list>       # comma-separated word insertion penalties "
  exit 1;
fi

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
dir=$3

model=$dir/../final.mdl # assume model one level up from decoding dir.

hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
hubdir=$(dirname $hubscr)

for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
     $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do
  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

# Work out the frame shift to hand to nbest-to-ctm.  When neither file
# exists the option stays empty and nbest-to-ctm uses its own default.
if [ -f $dir/../frame_shift ]; then
  frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)"
  echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
elif [ -f $dir/../frame_subsampling_factor ]; then
  factor=$(cat $dir/../frame_subsampling_factor) || exit 1
  # 0.01 * factor; this matches the former "0.0$factor" string for factors
  # 1..9 and stays correct for factors of two or more digits.
  frame_shift=$(awk -v f="$factor" 'BEGIN { printf "%g", 0.01 * f }') || exit 1
  frame_shift_opt="--frame-shift=$frame_shift"
  echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
fi

name=$(basename $data); # e.g. eval2000

# Space-separated list of word insertion penalties, split once up front.
wips=${word_ins_penalty//,/ }

# Scores one subset of the data for every LM weight and insertion penalty.
# Globals:   cmd, min_lmwt, max_lmwt, dir, data, name, hubscr, hubdir, wips
# Arguments: $1 - subset tag, used as suffix of the stm/ctm files and in the
#                 log file name (e.g. swbd)
#            $2 - grep pattern of the stm/ctm lines to drop for this subset
# Exits the script with status 1 if any scoring job fails.
score_subset() {
  local subset=$1
  local exclude=$2
  local wip
  for wip in $wips; do
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.$subset.LMWT.${wip}.log \
      grep -v "$exclude" $data/stm '>' $dir/score_LMWT_${wip}/stm.$subset '&&' \
      grep -v "$exclude" $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.$subset '&&' \
      $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.$subset $dir/score_LMWT_${wip}/${name}.ctm.$subset || exit 1;
  done
}

mkdir -p $dir/scoring/log

if [ $stage -le 0 ]; then
  for wip in $wips; do
    # $frame_shift_opt is deliberately unquoted: it may be empty.
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
      mkdir -p $dir/score_LMWT_${wip}/ '&&' \
      lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
      lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
      lattice-1best ark:- ark:- \| \
      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
      nbest-to-ctm $frame_shift_opt ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt  \| \
      utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
      '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
  done
fi

if [ $stage -le 1 ]; then
  # Remove some stuff we don't want to score, from the ctm.
  for x in $dir/score_*/$name.ctm; do
    cp $x $dir/tmpf;
    # The alternation is grouped so that only the bracketed tokens [NOISE],
    # [LAUGHTER] and [VOCALIZED-NOISE] are removed; without the group, every
    # line containing "laughter" anywhere (e.g. a real word) was dropped too.
    grep -i -v -E '\[(NOISE|LAUGHTER|VOCALIZED-NOISE)\]' $dir/tmpf | \
      grep -i -v -E '<UNK>' > $x;
#      grep -i -v -E '<UNK>|%HESITATION' > $x;  # hesitation is scored
    case "$name" in
      eval2000* )
        python local/format_acronyms_ctm_eval2000.py -i $x -o $x.mapped || exit 1
        ;;
      rt03* )
        python local/format_acronyms_ctm_rt03.py -i $x -o $x.mapped || exit 1
        ;;
    esac
    cp $x $x.bk
    # Only the data sets handled above produce a mapped ctm; for any other
    # set keep the filtered ctm as it is.
    if [ -f $x.mapped ]; then
      mv $x.mapped $x
    fi
  done
  rm -f $dir/tmpf
fi

# Score the set...
if [ $stage -le 2 ]; then
  for wip in $wips; do
    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
      cp $data/stm $dir/score_LMWT_${wip}/ '&&' \
      $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm || exit 1;
  done
fi

# For eval2000 and rt03, score the subsets
if [ $stage -le 3 ]; then
  case "$name" in
    eval2000* )
      # Score only the swbd part (drop the en_ lines)...
      score_subset swbd '^en_'
      # Score only the callhome part (drop the sw_ lines)...
      score_subset callhm '^sw_'
      ;;
    rt03* )
      # Score only the swbd part (drop the fsh_ lines)...
      score_subset swbd '^fsh_'
      # Score only the fisher part (drop the sw_ lines)...
      score_subset fsh '^sw_'
      ;;
  esac
fi

exit 0