Blame view

egs/reverb/s5/local/score.sh 6.09 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
  #!/bin/bash
  # Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
  # Apache 2.0
  
  # See the script steps/scoring/score_kaldi_cer.sh in case you need to evaluate CER
  
  # Load the recipe environment when the working directory provides a path.sh.
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi

  # begin configuration section.
  # Defaults.  The usage text shows that at least some of them can be set with
  # --<name> <value> options (presumably handled by parse_options.sh below).
  cmd=run.pl                    # job launcher (run.pl, queue.pl, ...)
  stage=0                       # first stage to run
  iter=final                    # not referenced elsewhere in this script
  stats=true                    # true: also write detailed WER statistics
  decode_mbr=false              # true: MBR decoding instead of best path
  beam=6                        # lattice-prune beam, only used when decode_mbr is true
  min_lmwt=7                    # smallest LM weight that is scored
  max_lmwt=17                   # largest LM weight that is scored
  word_ins_penalty=0.0,0.5,1.0  # comma-separated word insertion penalties
  #end configuration section.

  echo "$0 $@"  # Print the command line for logging

  # path.sh is sourced a second time right before the option parser is loaded.
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi
  . parse_options.sh || exit 1;
  
  # Exactly three positional arguments are required; anything else prints the
  # usage text and aborts with status 1.
  if [ $# -ne 3 ]; then
    echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
    echo " Options:"
    echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
    echo "    --stage (0|1|2)                 # start scoring script from part-way through."
    echo "    --decode_mbr (true/false)       # minimum bayes risk decoding (confusion network)."
    echo "    --stats (true/false)            # write detailed WER statistics for the best setting."
    echo "    --beam <float>                  # lattice pruning beam (only used with --decode_mbr true)."
    echo "    --word_ins_penalty <list>       # comma-separated word insertion penalties to try."
    echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
    echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
    exit 1;
  fi
  
  # Positional arguments.
  data=$1            # data directory; its "text" file is the reference transcript
  lang_or_graph=$2   # lang or graph directory; must contain words.txt
  dir=$3             # decode directory holding the lattices (lat.*.gz)

  symtab=$lang_or_graph/words.txt

  # Fail early when a required input is missing.  The expansions are quoted so
  # that a path containing whitespace or glob characters is tested as a single
  # file name instead of being split or expanded by the shell.
  for f in "$symtab" "$dir/lat.1.gz" "$data/text"; do
    if [ ! -f "$f" ]; then
      echo "score.sh: no such file $f"
      exit 1
    fi
  done
  
  
  # Choose the commands that reference and hypothesis text are piped through
  # before scoring.  Precedence, lowest to highest:
  #   cat  <  local/wer_output_filter  <  local/wer_ref_filter | local/wer_hyp_filter
  # A filter is only picked up when it exists and is executable.
  ref_filtering_cmd="cat"
  hyp_filtering_cmd="cat"
  if [ -x local/wer_output_filter ]; then
    # Shared filter: applies to both sides unless a specific one overrides it.
    ref_filtering_cmd="local/wer_output_filter"
    hyp_filtering_cmd="local/wer_output_filter"
  fi
  if [ -x local/wer_ref_filter ]; then
    ref_filtering_cmd="local/wer_ref_filter"
  fi
  if [ -x local/wer_hyp_filter ]; then
    hyp_filtering_cmd="local/wer_hyp_filter"
  fi
  
  
  # Log which decoding mode is scored.  $decode_mbr is executed as a command,
  # so its value is expected to be "true" or "false".
  if ! $decode_mbr ; then
    echo "$0: scoring with word insertion penalty=$word_ins_penalty"
  else
    echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
  fi
  
  
  mkdir -p $dir/scoring_kaldi

  # Pick the tasks to score from the name of the data directory: "real" is
  # tested first, then "cln"; any other name gets all six near/far tasks.
  case $data in
    *real*) tasks="near_room1 far_room1" ;;
    *cln*)  tasks="cln_room1 cln_room2 cln_room3" ;;
    *)      tasks="near_room1 far_room1 near_room2 far_room2 near_room3 far_room3" ;;
  esac

  # Write one filtered reference transcript per task: the lines of $data/text
  # that match the task name, passed through the reference filter.
  for task in $tasks; do
    ref_out=$dir/scoring_kaldi/test_filt_${task}.txt
    grep $task $data/text | $ref_filtering_cmd > $ref_out || exit 1;
  done
  
  # Stage 0: for every word insertion penalty, produce one hypothesis file per
  # LM weight (penalty_<wip>/LMWT.txt) and score it against each task's filtered
  # reference, writing the result to $dir/wer_<lmwt>_<wip>_<task>.
  # The "LMWT=min:max" argument and the LMWT placeholders are handed to $cmd
  # unchanged; presumably $cmd (run.pl-style) runs the command once per value
  # and substitutes it -- confirm against the job launcher in use.
  if [ $stage -le 0 ]; then

    for wip in ${word_ins_penalty//,/ }; do
      mkdir -p $dir/scoring_kaldi/penalty_$wip/log

      if $decode_mbr ; then
        # MBR branch: scale, add the penalty, prune with $beam, then MBR-decode.
        # acwt is assigned but not referenced by the commands that follow it.
        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
          acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
          lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
          lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
          lattice-prune --beam=$beam ark:- ark:- \| \
          lattice-mbr-decode  --word-symbol-table=$symtab \
          ark:- ark,t:- \| \
          utils/int2sym.pl -f 2- $symtab \| \
          $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;

      else
        # Best-path branch: scale, add the penalty, then take the single best path.
        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
          lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
          lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
          lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \
          utils/int2sym.pl -f 2- $symtab \| \
          $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
      fi
      for task in ${tasks}; do
        # The log name carries the task so that the tasks scored in this loop
        # do not overwrite each other's score logs.
        $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score_${task}.LMWT.log \
          grep $task $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
          compute-wer --text --mode=present \
          ark:$dir/scoring_kaldi/test_filt_${task}.txt  ark,p:- ">&" $dir/wer_LMWT_${wip}_${task} || exit 1;
      done
    done
  fi
  
  
  
  # Stage 1: for every task pick the best (LM weight, word insertion penalty)
  # pair from the wer_* files and, when $stats is true, write detailed
  # statistics for that pair.
  if [ $stage -le 1 ]; then
    for task in ${tasks}; do
      for wip in ${word_ins_penalty//,/ }; do
        for lmwt in $(seq $min_lmwt $max_lmwt); do
          # adding /dev/null to the command list below forces grep to output the filename
          grep WER $dir/wer_${lmwt}_${wip}_${task} /dev/null
        done
      done | utils/best_wer.sh  >& $dir/scoring_kaldi/best_wer_${task} || exit 1

      # The last field of the best_wer file is taken to be the winning file
      # name, $dir/wer_<lmwt>_<wip>_<task>.  Every task name contains exactly
      # one underscore, so counting "_"-separated fields from the end the
      # penalty is field NF-2 and the LM weight is field NF-3.
      best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer_${task})
      best_wip=$(echo $best_wer_file | awk -F_ '{N=NF-2; print $N}')
      best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-3; print $N}')

      if [ -z "$best_lmwt" ]; then
        echo "$0: we could not get the details of the best WER from the file $dir/wer_*.  Probably something went wrong."
        exit 1;
      fi
      if $stats; then
        # NOTE(review): wer_details is shared by all tasks, so after the loop it
        # only holds the files of the last task.  The paths are left unchanged
        # here because other tools may read them; confirm whether per-task
        # directories are wanted.
        mkdir -p $dir/scoring_kaldi/wer_details
        echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
        echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty

        # The log names below carry the task so that the tasks processed in
        # this loop do not overwrite each other's logs.
        $cmd $dir/scoring_kaldi/log/stats1_${task}.log \
          cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
          align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt_${task}.txt ark:- ark,t:- \|  \
          utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
           utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;

        $cmd $dir/scoring_kaldi/log/stats2_${task}.log \
          cat $dir/scoring_kaldi/wer_details/per_utt \| \
          utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
          sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;

        $cmd $dir/scoring_kaldi/log/wer_bootci_${task}.log \
          compute-wer-bootci --mode=present \
            ark:$dir/scoring_kaldi/test_filt_${task}.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
            '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;

      fi
    done
  fi
  
  # Reaching this point means the scoring succeeded.
  # To avoid confusion, delete wer_* files whose suffix is one or two
  # characters long; they come from a previous version of the scoring scripts.
  # The same is done for cer_* files even though this script computes no CER,
  # so that stale files do not suggest otherwise.  Error output from rm is
  # discarded and its exit status is ignored.
  rm $dir/wer_? $dir/wer_?? 2>/dev/null
  rm $dir/cer_? $dir/cer_?? 2>/dev/null

  exit 0;