Blame view

egs/mgb5/s5/local/score.sh 5.59 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
  #!/bin/bash
  # Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
  # Apache 2.0
  
  # See the script steps/scoring/score_kaldi_cer.sh in case you need to evalutate CER
  
  [ -f ./path.sh ] && . ./path.sh
  
  # begin configuration section.
  cmd=run.pl
  stage=0
  decode_mbr=false
  stats=true
  beam=6
  word_ins_penalty=0.0,0.5,1.0
  min_lmwt=7
  max_lmwt=17
  iter=final
  #end configuration section.
  
  echo "$0 $@"  # Print the command line for logging
  [ -f ./path.sh ] && . ./path.sh
  . parse_options.sh || exit 1;
  
  if [ $# -ne 3 ]; then
    echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
    echo " Options:"
    echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
    echo "    --stage (0|1|2)                 # start scoring script from part-way through."
    echo "    --decode_mbr (true/false)       # maximum bayes risk decoding (confusion network)."
    echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
    echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
    exit 1;
  fi
  
  data=$1
  lang_or_graph=$2
  dir=$3
  
  symtab=$lang_or_graph/words.txt
  
  for f in $symtab $dir/lat.1.gz $data/text; do
    [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
  done
  
  
  ref_filtering_cmd="cat"
  [ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
  [ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
  hyp_filtering_cmd="cat"
  [ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
  [ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"
  
  
  if $decode_mbr ; then
    echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
  else
    echo "$0: scoring with word insertion penalty=$word_ins_penalty"
  fi
  
  
  mkdir -p $dir/scoring_kaldi
  cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
  if [ $stage -le 0 ]; then
  
    for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
      mkdir -p $dir/scoring_kaldi/penalty_$wip/log
  
      if $decode_mbr ; then
        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
          acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
          lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
          lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
          lattice-prune --beam=$beam ark:- ark:- \| \
          lattice-mbr-decode  --word-symbol-table=$symtab \
          ark:- ark,t:- \| \
          utils/int2sym.pl -f 2- $symtab \| \
          $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
  
      else
        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
          lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
          lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
          lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \
          utils/int2sym.pl -f 2- $symtab \| \
          $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
      fi
  
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
        cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
        compute-wer --text --mode=present \
        ark:$dir/scoring_kaldi/test_filt.txt  ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
  
    done
  fi
  
  
  
  if [ $stage -le 1 ]; then
  
    for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
      for lmwt in $(seq $min_lmwt $max_lmwt); do
        # adding /dev/null to the command list below forces grep to output the filename
        grep WER $dir/wer_${lmwt}_${wip} /dev/null
      done
    done | utils/best_wer.sh  >& $dir/scoring_kaldi/best_wer || exit 1
  
    best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
    best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
    best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')
  
    if [ -z "$best_lmwt" ]; then
      echo "$0: we could not get the details of the best WER from the file $dir/wer_*.  Probably something went wrong."
      exit 1;
    fi
  
    if $stats; then
      mkdir -p $dir/scoring_kaldi/wer_details
      echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
      echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
  
      $cmd $dir/scoring_kaldi/log/stats1.log \
        cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
        align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \|  \
        utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
         utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
  
      $cmd $dir/scoring_kaldi/log/stats2.log \
        cat $dir/scoring_kaldi/wer_details/per_utt \| \
        utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
        sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
  
      $cmd $dir/scoring_kaldi/log/wer_bootci.log \
        compute-wer-bootci --mode=present \
          ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
          '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
  
    fi
  fi
  
  # If we got here, the scoring was successful.
  # As a  small aid to prevent confusion, we remove all wer_{?,??} files;
  # these originate from the previous version of the scoring files
  # i keep both statement here because it could lead to confusion about
  # the capabilities of the script (we don't do cer in the script)
  rm $dir/wer_{?,??} 2>/dev/null
  rm $dir/cer_{?,??} 2>/dev/null
  
  exit 0;