Blame view
egs/aspire/s5/local/score_aspire.sh
6.54 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2016.  Apache 2.0.

# This script generates the ctm files, filters and scores them if an stm file is available.
# It is used for the ASpIRE challenge data sets: it tunes (or reuses) the LM weight and
# word-insertion penalty, converts lattices to ctm, filters the ctm and scores it.

set -e
set -x   # trace commands; deliberate, so grid logs show the exact calls

# Option defaults; all of these can be overridden on the command line
# thanks to utils/parse_options.sh below.
iter=final                               # model iteration to use: $decode_dir/../$iter.mdl
min_lmwt=1                               # lowest LM weight tried during tuning
max_lmwt=20                              # highest LM weight tried during tuning
default_lmwt=12                          # see tune_hyper description for more info
word_ins_penalties=0.0,0.25,0.5,0.75,1.0 # comma-separated WIP grid for tuning
default_wip=0.0                          # fallback word-insertion penalty
ctm_beam=6                               # lattice prune beam used in ctm generation
decode_mbr=true                          # use minimum-Bayes-risk decoding for the ctm
cmd=run.pl                               # job-dispatch wrapper (run.pl/queue.pl)
stage=1                                  # 1: tune, 2: generate ctm, 3: write submission
resolve_overlaps=true                    # resolve overlapping segments in get_ctm.sh
tune_hyper=true  # if true:
                 #   if the data set is "dev_aspire" we check for the
                 #   best lmwt and word_insertion_penalty,
                 #   else we try to find the best values from dev_aspire decodes;
                 #   if not found we use the default values

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh || exit 1;

if [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <lang-dir> <decode-dir> <actual-data-set> <segmented-data-set> <output-ctm-file>"
  echo " Options:"
  echo "    --stage (1|2|3)                 # start scoring script from part-way through."
  echo "e.g.:"
  # Example now matches the 5-argument usage above (the previous example
  # showed three unrelated directories).
  echo "$0 data/lang exp/nnet3/tdnn/decode_dev_aspire_whole dev_aspire dev_aspire_segmented dev_aspire.ctm"
  exit 1;
fi

lang=$1                 # lang directory (words.txt etc.)
decode_dir=$2           # decode directory holding the lattices
act_data_set=$3         # actual data set name (dev_aspire/test_aspire/eval_aspire)
segmented_data_set=$4   # segmented data set name; data/${segmented_data_set}_hires is used
out_file=$5             # output ctm file; $out_file.submission is also written

model=$decode_dir/../$iter.mdl # assume model one level up from decoding dir.
mkdir -p $decode_dir/scoring

# Extract, from the GLM file, the tokens that are mapped to null strings
# (lines of the form "TOKEN => /..." with an empty right-hand side); these
# must be filtered out of the ctm before scoring.
# NOTE: print() form works under both python2 and python3 (the original
# python2-only "print x" statement crashes under python3).
python -c "
import sys, re
lines = map(lambda x: x.strip(), open('data/${act_data_set}/glm').readlines())
patterns = []
for line in lines:
  if re.search('=>', line) is not None:
    parts = re.split('=>', line.split('/')[0])
    if parts[1].strip() == '':
      patterns.append(parts[0].strip())
print('|'.join(patterns))
" > $decode_dir/scoring/glm_ignore_patterns

ignore_patterns=$(cat $decode_dir/scoring/glm_ignore_patterns)
echo "$0: Ignoring these patterns from the ctm ", $ignore_patterns

# Write a small python script that drops ctm lines whose word (5th field)
# is in the ignore set: the GLM-nulled tokens plus noise/silence markers
# which are not accepted by the scoring server.  The heredoc is unquoted on
# purpose so $ignore_patterns is baked into the generated script.
cat << EOF > $decode_dir/scoring/filter_ctm.py
import sys
file = open(sys.argv[1])
out_file = open(sys.argv[2], 'w')
ignore_set = "$ignore_patterns".split("|")
ignore_set.append("[noise]")
ignore_set.append("[laughter]")
ignore_set.append("[vocalized-noise]")
ignore_set.append("!SIL")
ignore_set.append("<unk>")
ignore_set.append("%hesitation")
ignore_set = set(ignore_set)
print(ignore_set)
for line in file:
  if line.split()[4] not in ignore_set:
    out_file.write(line)
out_file.close()
EOF
filter_ctm_command="python $decode_dir/scoring/filter_ctm.py "

# Fallback hyper-parameter values.  These are overridden by the tuning
# logic below when results are available; initializing them here fixes the
# case where tune_hyper is false, or the script is resumed at --stage 2/3,
# and LMWT/word_ins_penalty would otherwise be unset (empty paths).
LMWT=$default_lmwt
word_ins_penalty=$default_wip

if $tune_hyper ; then
  # Find the best lmwt and word_insertion_penalty based on the transcripts
  # provided for dev_aspire; for other data sets just copy the values from
  # dev_aspire decode directories, or use the default values.
  if [ $stage -le 1 ]; then
    if [[ "$act_data_set" =~ "dev_aspire" ]]; then
      # Grid search: outer job array over word-insertion penalties, inner
      # job array over LM weights; each combination runs get_ctm.sh and
      # scores against the dev stm.  The \&\& / \$ escapes keep the inner
      # command literal until it is expanded inside the dispatched job.
      wip_string=$(echo $word_ins_penalties | sed 's/,/ /g')
      temp_wips=($wip_string)
      $cmd WIP=1:${#temp_wips[@]} $decode_dir/scoring/log/score.wip.WIP.log \
        wips=\(0 $wip_string\) \&\& \
        wip=\${wips[WIP]} \&\& \
        echo \$wip \&\& \
        $cmd LMWT=$min_lmwt:$max_lmwt $decode_dir/scoring/log/score.LMWT.\$wip.log \
          local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
            --beam $ctm_beam --decode-mbr $decode_mbr \
            --resolve-overlaps $resolve_overlaps \
            --glm data/${act_data_set}/glm --stm data/${act_data_set}/stm \
            LMWT \$wip $lang data/${segmented_data_set}_hires $model $decode_dir || exit 1;

      # Print the best WER for the user, then parse the winning
      # score_<LMWT>/penalty_<wip> path components and persist them so
      # test/eval runs can reuse the tuned values.  eval is needed so the
      # {min..max} and {a,b,c} brace expansions happen after substitution.
      eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys"|utils/best_wer.sh 2>/dev/null
      eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys" | \
        utils/best_wer.sh 2>/dev/null | python -c "import sys, re
line = sys.stdin.readline()
file_name=line.split()[-1]
parts=file_name.split('/')
penalty = re.sub('penalty_','',parts[-2])
lmwt = re.sub('score_','', parts[-3])
lmfile=open('$decode_dir/scoring/bestLMWT','w')
lmfile.write(str(lmwt))
lmfile.close()
wipfile=open('$decode_dir/scoring/bestWIP','w')
wipfile.write(str(penalty))
wipfile.close()
" || exit 1;
      LMWT=$(cat $decode_dir/scoring/bestLMWT)
      word_ins_penalty=$(cat $decode_dir/scoring/bestWIP)
    fi
  fi

  if [[ "$act_data_set" =~ "test_aspire" ]] || [[ "$act_data_set" =~ "eval_aspire" ]]; then
    # Blind sets have no stm: reuse the values tuned on the corresponding
    # dev_aspire_whole decode, falling back to the defaults when absent.
    dev_decode_dir=$(echo $decode_dir|sed "s/test_aspire/dev_aspire_whole/g; s/eval_aspire/dev_aspire_whole/g")
    if [ -f $dev_decode_dir/scoring/bestLMWT ]; then
      LMWT=$(cat $dev_decode_dir/scoring/bestLMWT)
      echo "Using the bestLMWT $LMWT value found in $dev_decode_dir"
    else
      LMWT=$default_lmwt # default LMWT in case hyper-parameter tuning results are not available
      echo "Unable to find the bestLMWT in the dev decode dir $dev_decode_dir"
      echo "Keeping the default value $LMWT"
    fi
    if [ -f $dev_decode_dir/scoring/bestWIP ]; then
      word_ins_penalty=$(cat $dev_decode_dir/scoring/bestWIP)
      echo "Using the bestWIP $word_ins_penalty value found in $dev_decode_dir"
    else
      word_ins_penalty=$default_wip # default WIP in case hyper-parameter tuning results are not available
      echo "Unable to find the bestWIP in the dev decode dir $dev_decode_dir"
      echo "Keeping the default/user-specified value $word_ins_penalty"
    fi
  else
    echo "Using the default values for LMWT and word_ins_penalty"
  fi
fi

# lattice to ctm conversion and scoring.
if [ $stage -le 2 ]; then
  echo "Generating CTMs with LMWT $LMWT and word insertion penalty of $word_ins_penalty"
  local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
    --beam $ctm_beam --decode-mbr $decode_mbr \
    $LMWT $word_ins_penalty $lang data/${segmented_data_set}_hires $model $decode_dir \
    2>$decode_dir/scoring/finalctm.LMWT$LMWT.WIP$word_ins_penalty.log || exit 1;
fi

# copy the ctms to the specified output files
if [ $stage -le 3 ]; then
  # Rewrite "<recording>-..." utterance ids as "<recording> 1 <start> <dur> <word>",
  # one ctm entry per line (channel fixed at 1), as the submission format expects.
  cat $decode_dir/score_$LMWT/penalty_$word_ins_penalty/ctm.filt | \
    awk '{split($1, parts, "-"); printf("%s 1 %s %s %s\n", parts[1], $3, $4, $5)}' > $out_file
  # One recording name per line, extracted from wav.scp keys.
  cat data/${segmented_data_set}_hires/wav.scp | \
    awk '{split($1, parts, "-"); printf("%s\n", parts[1])}' > $decode_dir/score_$LMWT/penalty_$word_ins_penalty/recording_names
  # Pad the submission with placeholder entries for recordings that produced
  # no ctm output, so every recording in wav.scp is covered.
  local/multi_condition/fill_missing_recordings.py \
    $out_file $out_file.submission \
    $decode_dir/score_$LMWT/penalty_$word_ins_penalty/recording_names
  echo "Generated the ctm @ $out_file.submission from the ctm file $decode_dir/score_${LMWT}/penalty_$word_ins_penalty/ctm.filt"
fi