Blame view
egs/aspire/s5/local/multi_condition/get_ctm.sh
3.89 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# Copyright 2015 Johns Hopkins University (Authors: Vijayaditya Peddinti). Apache 2.0.
#
# Produces a CTM file from the lattices in <decode-dir> for one
# (LMWT, word-insertion-penalty) operating point, and optionally scores it.
#
# Steps:
#   1. Scale, penalize, prune and word-align the lattices, then write a
#      per-segment CTM (ctm.overlapping).
#   2. Optionally resolve overlaps between segments (ctm.merged).
#   3. Map word ids to symbols and convert to recording-relative times (ctm).
#   4. Filter the CTM (ctm.filt).
#   5. If --stm is given, score ctm.filt with sctk's hubscr.pl (needs --glm).
#
# All outputs go to <decode-dir>/score_<LMWT>/penalty_<word-ins-penalty>/.
#
# Requires bash (pipefail, [[ ]], arithmetic for-loops) and must be run from a
# directory containing utils/ (paths such as utils/int2sym.pl are relative).

set -e

# Defaults; each may be overridden on the command line via parse_options.sh.
beam=7
decode_mbr=true
filter_ctm_command=cp
glm=
stm=
resolve_overlaps=true

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1

echo "$*"

if [ $# -ne 6 ]; then
  echo "Usage: $0 [options] <LMWT> <word-ins-penalty> <lang-dir> <data-dir> <model> <decode-dir>"
  echo " e.g.: $0 --decode-mbr true --beam 7 --glm data/dev_aspire/glm \\"
  echo " --stm data/dev_aspire/stm\\"
  echo " 12 1.5 data/lang data/dev_aspire_hires \\"
  echo " exp/nnet2_multicondition/nnet_ms_a/final.mdl \\"
  echo " exp/nnet2_multicondition/nnet_ms_a/decode_dev_aspire"
  echo "main options (for others, see top of script file)"
  echo " --beam <beam> # Decoding beam; default 7.0"
  echo " --decode-mbr <true|false> # do mbr decoding; default true"
  echo " --filter_ctm_command <string> # command for ctm filtering ;default cp"
  echo " --resolve-overlaps <true|false> # resolve overlaps between segments; default true"
  echo " --stm <stm-file> # stm file, will score if provided"
  echo " --glm <glm-file> # glm file, needs to be specified along with stm"
  exit 1
fi

LMWT=$1
wip=$2
lang=$3
data_dir=$4
model=$5
decode_dir=$6

set -o pipefail

# Number of lattice archives (lat.1.gz .. lat.<nj>.gz) written by the decoder.
if [ ! -f "$decode_dir/num_jobs" ]; then
  echo "$0: expected file $decode_dir/num_jobs to exist" >&2
  exit 1
fi
nj=$(cat "$decode_dir/num_jobs") || exit 1
if ! [[ "$nj" =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: invalid number of jobs '$nj' in $decode_dir/num_jobs" >&2
  exit 1
fi

score_dir=$decode_dir/score_$LMWT/penalty_$wip
mkdir -p "$score_dir"

# Work out the frame shift to pass to lattice-to-ctm-conf.  Initialised to
# empty so that a stray value inherited from the environment is never used.
frame_shift_opt=
if [ -f "$decode_dir/../frame_shift" ]; then
  frame_shift_opt="--frame-shift=$(cat "$decode_dir/../frame_shift")"
  echo "$0: $decode_dir/../frame_shift exists, using $frame_shift_opt"
elif [ -f "$decode_dir/../frame_subsampling_factor" ]; then
  factor=$(cat "$decode_dir/../frame_subsampling_factor") || exit 1
  if ! [[ "$factor" =~ ^[1-9][0-9]*$ ]]; then
    echo "$0: invalid frame subsampling factor '$factor'" >&2
    exit 1
  fi
  # 0.01 * factor, computed numerically.  This yields the same text as the
  # former "0.0$factor" concatenation for factors 1-9 and is also correct for
  # factors >= 10.  NOTE(review): presumes a 0.01s base frame shift, as the
  # previous concatenation implicitly did.
  frame_shift_opt="--frame-shift=$(awk -v f="$factor" 'BEGIN { printf "%g", 0.01 * f }')"
  echo "$0: $decode_dir/../frame_subsampling_factor exists, using $frame_shift_opt"
fi

# Space-separated list of lattice archives.  It is embedded in the
# "ark:gunzip -c ...|" input specifier below, so it has to be one string.
lat_files=
for ((job = 1; job <= nj; job++)); do
  lat_files+="${lat_files:+ }$decode_dir/lat.$job.gz"
done

# ${frame_shift_opt:+...} drops the argument entirely when no frame shift
# information was found, instead of passing an empty string.
lattice-scale --inv-acoustic-scale="$LMWT" "ark:gunzip -c $lat_files|" ark:- \
  | lattice-add-penalty --word-ins-penalty="$wip" ark:- ark:- \
  | lattice-prune --beam="$beam" ark:- ark:- \
  | lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true \
      --max-expand=10.0 --test=false \
      "$lang/phones/align_lexicon.int" "$model" ark:- ark:- \
  | lattice-to-ctm-conf ${frame_shift_opt:+"$frame_shift_opt"} \
      --decode-mbr="$decode_mbr" ark:- "$score_dir/ctm.overlapping" || exit 1

ctm=$score_dir/ctm.overlapping

# combine the segment-wise ctm files, while resolving overlaps
if [[ "$resolve_overlaps" == true ]]; then
  utils/ctm/resolve_ctm_overlaps.py "$data_dir/segments" \
    "$score_dir/ctm.overlapping" \
    "$score_dir/ctm.merged" || exit 1
  ctm=$score_dir/ctm.merged
fi

# Map integer word ids (field 5) to symbols, convert the entries using the
# segments and reco2file_and_channel files, and sort.
utils/int2sym.pl -f 5 "$lang/words.txt" < "$ctm" \
  | utils/convert_ctm.pl "$data_dir/segments" "$data_dir/reco2file_and_channel" \
  | sort -k1,1 -k2,2 -k3,3nb > "$score_dir/ctm" || exit 1

# Remove some stuff we don't want to score, from the ctm.
# $filter_ctm_command is deliberately unquoted: it may be a command plus
# arguments and relies on word splitting.
# shellcheck disable=SC2086
$filter_ctm_command "$score_dir/ctm" "$score_dir/ctm.temp" || exit 1

# awk: keep only entries whose 4th field is below 0.75 + 0.2 * length of the
#      5th field (presumably duration and word in the converted ctm).
# perl: for words containing "._", remove every "._", strip a trailing ".",
#       and turn the first remaining ".s" into "s".  The dot is escaped so
#       that only a literal "." is removed; an unescaped "." would delete an
#       arbitrary character preceding the first "s".
#       Each entry is written tab-separated and terminated by a newline.
awk '$4 < 0.75 + 0.2*length($5)' < "$score_dir/ctm.temp" \
  | perl -ane '@A = split;
      $word = $A[4];
      if ($word =~ s/\._//g) { $word =~ s/\.$//; $word =~ s/\.s/s/; }
      $A[4] = $word;
      print join("\t", @A), "\n";' \
  > "$score_dir/ctm.filt" || exit 1
rm -- "$score_dir/ctm.temp"

if [ -n "$stm" ]; then
  if [ -z "$glm" ]; then
    echo "glm file needs to be specified "
    exit 1
  fi
  echo "Scoring the ctm file locally as we have the transcripts."
  # Score against a copy of the stm placed next to the ctm.
  cp -- "$stm" "$score_dir/"
  stm=$score_dir/$(basename "$stm")
  hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
  if [ ! -f "$hubscr" ]; then
    echo "Cannot find scoring program at $hubscr"
    exit 1
  fi
  hubdir=$(dirname "$hubscr")
  "$hubscr" -p "$hubdir" -V -l english -h hub5 -g "$glm" -r "$stm" \
    "$score_dir/ctm.filt" || exit 1
fi