Blame view
egs/iam/v1/local/score.sh
6.18 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
#!/bin/bash
# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
# Apache 2.0

# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the <unk>'s
# using local/unk_arc_post_to_transcription.py and also it calls
# steps/scoring/score_kaldi_cer.sh at the end.
#
# Usage: local/score.sh [options] <data-dir> <lang-dir|graph-dir> <decode-dir>
#
# Outputs (all under <decode-dir>):
#   scoring_kaldi/test_filt.txt            filtered reference text
#   scoring_kaldi/penalty_<wip>/<lmwt>.txt hypotheses per penalty / LM weight
#   wer_<lmwt>_<wip>                       compute-wer output per combination
#   scoring_kaldi/best_wer                 output of utils/best_wer.sh
#   scoring_kaldi/wer_details/*            per-utt/per-spk/ops stats (if --stats true)

[ -f ./path.sh ] && . ./path.sh

# begin configuration section.
cmd=run.pl
stage=0
decode_mbr=false
stats=true
beam=6                           # only used by lattice-prune in the MBR branch
word_ins_penalty=0.0,0.5,1.0     # comma-separated list; every value is scored
min_lmwt=3
max_lmwt=13
# NOTE(review): 'iter' is accepted as an option but is not referenced anywhere
# below; the model path is always $model_path/final.mdl.
iter=final
#end configuration section.

echo "$0 $@"  # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
  echo " Options:"
  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
  echo "    --decode_mbr (true/false)       # maximum bayes risk decoding (confusion network)."
  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
  echo "    --word_ins_penalty <list>       # comma-separated word insertion penalties to try."
  echo "    --beam <float>                  # pruning beam used before MBR decoding."
  echo "    --stats (true/false)            # write detailed WER statistics for the best setting."
  exit 1;
fi

data=$1
lang_or_graph=$2
dir=$3
# The model is looked up in the parent directory of the decode directory.
model_path=$(dirname "$dir")

symtab=$lang_or_graph/words.txt

for f in "$symtab" "$dir/lat.1.gz" "$data/text"; do
  [ ! -f "$f" ] && echo "score.sh: no such file $f" && exit 1;
done

# Pick the filters applied to reference and hypothesis text. The more specific
# filter (wer_ref_filter / wer_hyp_filter) wins over the shared wer_output_filter
# because it is tested last; "cat" is the no-op fallback.
ref_filtering_cmd="cat"
[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
hyp_filtering_cmd="cat"
[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"

if $decode_mbr ; then
  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
else
  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
fi

mkdir -p "$dir/scoring_kaldi"
$ref_filtering_cmd < "$data/text" > "$dir/scoring_kaldi/test_filt.txt" || exit 1;

# Stage 0: produce hypotheses and WER files for every (penalty, LM weight) pair.
# Arguments after the log file are handed to $cmd, which builds the pipeline
# itself; that is why '|', '>' and ';' are escaped or quoted. $cmd and the
# filter commands are deliberately left unquoted so they may carry options.
if [ "$stage" -le 0 ]; then

  # Unquoted on purpose: split the comma-separated list into separate words.
  for wip in ${word_ins_penalty//,/ }; do
    mkdir -p "$dir/scoring_kaldi/penalty_$wip/log"

    if $decode_mbr ; then
      $cmd LMWT=$min_lmwt:$max_lmwt "$dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log" \
        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
        lattice-prune --beam=$beam ark:- ark:- \| \
        lattice-mbr-decode --word-symbol-table="$symtab" \
        ark:- ark,t:- \| \
        utils/int2sym.pl -f 2- "$symtab" \| \
        $hyp_filtering_cmd '>' "$dir/scoring_kaldi/penalty_$wip/LMWT.txt" || exit 1;

    else
      # NOTE(review): data/lang_unk/oov.int is relative to the current working
      # directory, not to <lang-dir|graph-dir>.
      $cmd LMWT=$min_lmwt:$max_lmwt "$dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log" \
        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
        lattice-1best ark:- ark:- \| \
        lattice-align-words "$lang_or_graph/phones/word_boundary.int" "$model_path/final.mdl" ark:- ark:- \| \
        lattice-arc-post "$model_path/final.mdl" ark:- - \| \
        local/unk_arc_post_to_transcription.py "$lang_or_graph/phones.txt" "$lang_or_graph/words.txt" data/lang_unk/oov.int \| \
        $hyp_filtering_cmd '>' "$dir/scoring_kaldi/penalty_$wip/LMWT.txt" || exit 1;
    fi

    $cmd LMWT=$min_lmwt:$max_lmwt "$dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log" \
      cat "$dir/scoring_kaldi/penalty_$wip/LMWT.txt" \| \
      compute-wer --text --mode=present \
      "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" "$dir/wer_LMWT_$wip" || exit 1;

  done
fi

# Stage 1: find the best (LM weight, penalty) pair and optionally compute
# detailed statistics for it.
if [ "$stage" -le 1 ]; then

  for wip in ${word_ins_penalty//,/ }; do
    for lmwt in $(seq "$min_lmwt" "$max_lmwt"); do
      # adding /dev/null to the command list below forces grep to output the filename
      grep WER "$dir/wer_${lmwt}_${wip}" /dev/null
    done
  done | utils/best_wer.sh >& "$dir/scoring_kaldi/best_wer" || exit 1

  # The last field of best_wer is the winning file name, wer_<lmwt>_<wip>;
  # the last two '_'-separated fields give the LM weight and the penalty.
  best_wer_file=$(awk '{print $NF}' "$dir/scoring_kaldi/best_wer")
  best_wip=$(echo "$best_wer_file" | awk -F_ '{print $NF}')
  best_lmwt=$(echo "$best_wer_file" | awk -F_ '{N=NF-1; print $N}')

  if [ -z "$best_lmwt" ]; then
    echo "$0: we could not get the details of the best WER from the file $dir/wer_*.  Probably something went wrong."
    exit 1;
  fi

  if $stats; then
    mkdir -p "$dir/scoring_kaldi/wer_details"
    echo "$best_lmwt" > "$dir/scoring_kaldi/wer_details/lmwt" # record best language model weight
    echo "$best_wip" > "$dir/scoring_kaldi/wer_details/wip" # record best word insertion penalty

    $cmd "$dir/scoring_kaldi/log/stats1.log" \
      cat "$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" \| \
      align-text --special-symbol="'***'" "ark:$dir/scoring_kaldi/test_filt.txt" ark:- ark,t:- \| \
      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee "$dir/scoring_kaldi/wer_details/per_utt" \| \
      utils/scoring/wer_per_spk_details.pl "$data/utt2spk" \> "$dir/scoring_kaldi/wer_details/per_spk" || exit 1;

    $cmd "$dir/scoring_kaldi/log/stats2.log" \
      cat "$dir/scoring_kaldi/wer_details/per_utt" \| \
      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> "$dir/scoring_kaldi/wer_details/ops" || exit 1;

    $cmd "$dir/scoring_kaldi/log/wer_bootci.log" \
      compute-wer-bootci --mode=present \
      "ark:$dir/scoring_kaldi/test_filt.txt" "ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" \
      '>' "$dir/scoring_kaldi/wer_details/wer_bootci" || exit 1;

  fi
fi

# Character error rate scoring, started at its stage 2.
# NOTE(review): the exit status of this call is not checked, so the script
# still exits 0 if CER scoring fails -- confirm whether that is intended.
steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt "$min_lmwt" \
  --max-lmwt "$max_lmwt" --word-ins-penalty "$word_ins_penalty" \
  "$data" "$lang_or_graph" "$dir"

# If we got here, the scoring was successful.
# As a small aid to prevent confusion, we remove all wer_{?,??} files;
# these originate from the previous version of the scoring files.
# Both statements are kept here because dropping one could cause confusion
# about the capabilities of the script (we don't do cer in the script).
# Errors are silenced because the files usually do not exist.
rm "$dir"/wer_{?,??} 2>/dev/null
rm "$dir"/cer_{?,??} 2>/dev/null

exit 0;