Blame view

egs/fisher_swbd/s5/local/score_sclite.sh 5.65 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
  #!/bin/bash
  # Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.
  
  # begin configuration section.
  # These defaults are plain assignments made before parse_options.sh is
  # sourced below. Presumably that helper overrides them from matching
  # --<name> command-line options (the usage text lists --cmd, --stage,
  # --min_lmwt and --max_lmwt) -- confirm against parse_options.sh.
  # Launcher prefixed to every scoring command in the stages below.
  cmd=run.pl
  # First stage to run; each stage N below is guarded by "$stage -le N".
  stage=0
  # Lower and upper ends of the LMWT range handed to $cmd as LMWT=min:max.
  min_lmwt=5
  max_lmwt=17
  # Comma-separated list; the stages loop over each value separately.
  word_ins_penalty=0.0,0.5,1.0
  #end configuration section.
  
  # Source the environment setup only if it exists in the current directory.
  [ -f ./path.sh ] && . ./path.sh
  # Abort with status 1 if parse_options.sh cannot be sourced or returns non-zero.
  . parse_options.sh || exit 1;
  
  # Exactly three positional arguments are required: <data-dir>,
  # <lang-dir|graph-dir> and <decode-dir>. Anything else prints the usage
  # text and exits with status 1.
  # The usage text lists every variable of the configuration section; the
  # script has stages 0 through 3.
  if [ $# -ne 3 ]; then
    echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
    echo " Options:"
    echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
    echo "    --stage (0|1|2|3)               # start scoring script from part-way through."
    echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
    echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
    echo "    --word_ins_penalty <p1,p2,...>  # comma-separated word insertion penalties to score "
    exit 1;
  fi
  
  # Positional arguments.
  data=$1
  lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
  dir=$3
  
  model=$dir/../final.mdl # assume model one level up from decoding dir.
  
  # Scoring driver from the sctk tools under $KALDI_ROOT (KALDI_ROOT is
  # presumably exported by path.sh -- confirm). hubdir is passed to the
  # driver via -p in the scoring stages.
  hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
  if [ ! -f "$hubscr" ]; then
    echo "Cannot find scoring program at $hubscr"
    exit 1
  fi
  hubdir=$(dirname "$hubscr")
  
  # Every input the later stages read must already exist; stop at the first
  # one that is missing. Expansions are quoted so that a path is always
  # tested as a single word.
  for f in "$data/stm" "$data/glm" "$lang/words.txt" "$lang/phones/word_boundary.int" \
       "$model" "$data/segments" "$data/reco2file_and_channel" "$dir/lat.1.gz"; do
    if [ ! -f "$f" ]; then
      echo "$0: expecting file $f to exist"
      exit 1
    fi
  done
  
  # Print the frame shift in seconds for an integer frame-subsampling factor,
  # i.e. factor * 0.01 with two decimals (3 -> 0.03, 12 -> 0.12).
  # For factors 1-9 this is the same text as the former "0.0$factor"
  # concatenation, which produced 0.012 instead of 0.12 for factor 12.
  # Arguments: $1 - subsampling factor (non-negative decimal integer;
  #                 surrounding whitespace is ignored)
  # Outputs:   the frame shift on stdout
  # Returns:   1 (printing nothing) if $1 is not a non-negative integer
  subsampling_factor_to_frame_shift() {
    local factor=${1//[[:space:]]/}
    case "$factor" in
      ''|*[!0-9]*) return 1 ;;
    esac
    # 10# forces base 10 so values such as "08" are not parsed as octal.
    printf '%d.%02d\n' "$(( 10#$factor / 100 ))" "$(( 10#$factor % 100 ))"
  }
  
  # Option passed to nbest-to-ctm. Start from empty so that a value inherited
  # from the environment is never used; it stays empty when neither file exists.
  frame_shift_opt=
  if [ -f "$dir/../frame_shift" ]; then
    # An explicit frame shift next to the model takes precedence.
    frame_shift=$(cat "$dir/../frame_shift") || exit 1
    frame_shift_opt="--frame-shift=$frame_shift"
    echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
  elif [ -f "$dir/../frame_subsampling_factor" ]; then
    factor=$(cat "$dir/../frame_subsampling_factor") || exit 1
    if ! frame_shift=$(subsampling_factor_to_frame_shift "$factor"); then
      echo "$0: invalid frame subsampling factor '$factor' in $dir/../frame_subsampling_factor"
      exit 1
    fi
    frame_shift_opt="--frame-shift=$frame_shift"
    echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
  fi
  
  # Dataset name is the last path component of the data dir, e.g. eval2000.
  name=$(basename $data)
  
  mkdir -p $dir/scoring/log
  
  # Stage 0: for every word insertion penalty, launch through $cmd a pipeline
  # that turns the lattices in $dir/lat.*.gz into a CTM file:
  # scale LM scores, add the insertion penalty, take the 1-best path, align it
  # to words, dump it as CTM, map field 5 through words.txt and convert it
  # with the segments / reco2file_and_channel files.
  # The shell operators are quoted or escaped ('&&', \|, '>') so that they reach
  # $cmd as literal arguments instead of being interpreted here. The LMWT
  # token is presumably replaced by $cmd with each value of the
  # $min_lmwt:$max_lmwt range -- confirm against the launcher script.
  if [ $stage -le 0 ]; then
    for wip in ${word_ins_penalty//,/ }; do
      ctm_dir=$dir/score_LMWT_${wip}
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
        mkdir -p $ctm_dir/ '&&' \
        lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
        lattice-1best ark:- ark:- \| \
        lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
        nbest-to-ctm $frame_shift_opt ark:- - \| \
        utils/int2sym.pl -f 5 $lang/words.txt  \| \
        utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
        '>' $ctm_dir/$name.ctm || exit 1;
    done
  fi
  
  # Filter a CTM stream (stdin -> stdout): drop every line that contains one of
  # the bracketed tokens [NOISE], [LAUGHTER] or [VOCALIZED-NOISE], or the token
  # <UNK>, matched case-insensitively.
  # The alternation is grouped inside the brackets. Without the group the
  # pattern means "[NOISE" or "LAUGHTER" or "VOCALIZED-NOISE]", which also
  # removes ordinary words that merely contain "laughter".
  # %HESITATION is deliberately kept: hesitations are scored.
  # The exit status is that of the last grep (1 when no line survives), so
  # callers should not treat it as an error indicator.
  filter_ctm_nonscored() {
    grep -i -v -E '\[(NOISE|LAUGHTER|VOCALIZED-NOISE)\]' | grep -i -v -E '<UNK>'
  }
  
  if [ "$stage" -le 1 ]; then
  # Remove some stuff we don't want to score, from the ctm.
    for x in "$dir"/score_*/"$name".ctm; do
      # An unmatched glob is left as the literal pattern; skip it.
      [ -f "$x" ] || continue
      # Filter through a scratch copy, because $x is both input and output.
      cp "$x" "$dir/tmpf" || exit 1
      filter_ctm_nonscored < "$dir/tmpf" > "$x"
      # Acronym mapping exists only for the eval2000 and rt03 test sets.
      mapper=
      case "$name" in
        eval2000* ) mapper=local/format_acronyms_ctm_eval2000.py ;;
        rt03* ) mapper=local/format_acronyms_ctm_rt03.py ;;
      esac
      if [ -n "$mapper" ]; then
        python "$mapper" -i "$x" -o "$x.mapped" || exit 1
      fi
      # Keep the filtered but unmapped ctm as a backup.
      cp "$x" "$x.bk"
      # Install the mapped ctm only when a mapper actually produced one.
      if [ -n "$mapper" ]; then
        mv "$x.mapped" "$x" || exit 1
      fi
    done
    rm -f "$dir/tmpf"
  fi
  
  # Stage 2: score the complete test set. For every word insertion penalty,
  # $cmd copies the reference stm into the score directory and then runs
  # hubscr.pl on that copy and the ctm found there. '&&' is quoted so that it
  # is handed to $cmd as an argument instead of being interpreted here.
  if [ $stage -le 2 ]; then
    for wip in ${word_ins_penalty//,/ }; do
      score_dir=$dir/score_LMWT_${wip}
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
        cp $data/stm $score_dir/ '&&' \
          $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $score_dir/stm $score_dir/${name}.ctm || exit 1;
    done
  fi
  
  # Stage 3 helper: score one subset of the test set.
  # For every word insertion penalty, $cmd removes from both the reference stm
  # and the hypothesis ctm all lines matching the given regex (grep -v), writes
  # the remainder to files carrying the subset tag as suffix, and runs
  # hubscr.pl on that pair.
  # Arguments: $1 - subset tag used in the log name and the output suffixes
  #            $2 - regex of the lines to drop, i.e. the other subset's prefix
  # Does nothing unless $stage is 3 or lower; exits the script with status 1
  # when $cmd fails.
  score_subset() {
    local tag=$1 drop_re=$2 wip sdir
    [ $stage -le 3 ] || return 0
    for wip in ${word_ins_penalty//,/ }; do
      sdir=$dir/score_LMWT_${wip}
      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.$tag.LMWT.${wip}.log \
        grep -v "$drop_re" $data/stm '>' $sdir/stm.$tag '&&' \
        grep -v "$drop_re" $sdir/${name}.ctm '>' $sdir/${name}.ctm.$tag '&&' \
        $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $sdir/stm.$tag $sdir/${name}.ctm.$tag || exit 1;
    done
  }
  
  # Subsets are only defined for the eval2000 and rt03 test sets.
  case "$name" in
    eval2000* )
      # Switchboard part: everything except the en_ lines.
      score_subset swbd '^en_'
      # CallHome part: everything except the sw_ lines.
      score_subset callhm '^sw_'
      ;;
    rt03* )
      # Switchboard part: everything except the fsh_ lines.
      score_subset swbd '^fsh_'
      # Fisher part: everything except the sw_ lines.
      score_subset fsh '^sw_'
      ;;
  esac
  
  exit 0