Blame view

egs/aspire/s5/local/score_aspire.sh 6.54 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
  #!/bin/bash
  
  # Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2016.  Apache 2.0.
  # This script generates the ctm files, filters them and, if an stm file is
  # available, scores them.
  
  # Abort on the first failing command and trace every command that is run.
  set -e
  set -x
  
  # Default settings.
  # NOTE(review): presumably each variable below can be overridden on the
  # command line as --<name> <value> through utils/parse_options.sh (sourced
  # further down, not shown here) -- confirm against that script.

  # Model iteration; the model is read from <decode-dir>/../$iter.mdl.
  iter=final
  # Range of LM weights explored while tuning on dev_aspire.
  min_lmwt=1
  max_lmwt=20
  default_lmwt=12 # see tune_hyper description for more info
  # Comma-separated list of word insertion penalties explored while tuning.
  word_ins_penalties=0.0,0.25,0.5,0.75,1.0
  # Word insertion penalty used when no tuned value is found.
  default_wip=0.0
  # Forwarded to local/multi_condition/get_ctm.sh as --beam.
  ctm_beam=6
  # Forwarded to local/multi_condition/get_ctm.sh as --decode-mbr.
  decode_mbr=true
  # Command used to launch the tuning jobs.
  cmd=run.pl
  # First stage to run: 1 = tune on dev_aspire, 2 = generate the final ctm,
  # 3 = write the output ctm and the .submission file.
  stage=1
  # Forwarded to get_ctm.sh as --resolve-overlaps in the tuning pass.
  resolve_overlaps=true
  tune_hyper=true # if true:
                  #    if the data set is "dev_aspire" we search for the
                  #       best lmwt and word_insertion_penalty,
                  #    else we try to find the best values from the dev_aspire decodes;
                  #         if they are not found we use the default values
  
  # NOTE(review): cmd.sh and path.sh are not shown here; presumably they set up
  # job-submission commands and the PATH -- confirm.
  . ./cmd.sh
  . ./path.sh
  . utils/parse_options.sh || exit 1;
  
  # Exactly five positional arguments are required; otherwise print the usage
  # message and stop.
  if [ $# -ne 5 ]; then
    echo "Usage: $0 [options] <lang-dir> <decode-dir> <actual-data-set> <segmented-data-set> <output-ctm-file>"
    echo " Options:"
    echo "    --stage (1|2|3)  # start scoring script from part-way through."
    echo "e.g.:"
    echo "$0 data/lang exp/nnet3/tdnn/decode_dev_aspire_whole dev_aspire dev_aspire_whole exp/nnet3/tdnn/decode_dev_aspire_whole/ctm"
    exit 1;
  fi
  
  # Language directory, forwarded to local/multi_condition/get_ctm.sh.
  lang=$1
  # Decode directory; scoring/ and score_<LMWT>/ sub-directories live under it.
  decode_dir=$2
  # Data set whose glm and stm files are read from data/<actual-data-set>/.
  act_data_set=$3
  # Segmented data set; data/<segmented-data-set>_hires is what gets scored.
  segmented_data_set=$4
  # Output ctm file; <output-ctm-file>.submission is written next to it.
  out_file=$5
  
  model=$decode_dir/../$iter.mdl # assume model one level up from decoding dir.
  
  mkdir -p "$decode_dir/scoring"
  # Collect the glm entries whose right-hand side is empty, i.e. labels that are
  # mapped to null strings in the glm or which are not accepted by the scoring
  # server. For each line containing '=>', only the text before the first '/'
  # is considered. The left-hand sides are joined with '|' into one line.
  # print() is called as a function so the snippet runs under Python 2 and 3.
  python -c "
  import sys, re
  lines = map(lambda x: x.strip(), open('data/${act_data_set}/glm').readlines())
  patterns = []
  for line in lines:
    if re.search('=>', line) is not None:
      parts = re.split('=>', line.split('/')[0])
      if parts[1].strip() == '':
        patterns.append(parts[0].strip())
  print('|'.join(patterns))
  " > "$decode_dir/scoring/glm_ignore_patterns"
  
  ignore_patterns=$(cat "$decode_dir/scoring/glm_ignore_patterns")
  # Quoted so that patterns such as [laughter] are not glob-expanded by the shell.
  echo "$0: Ignoring these patterns from the ctm: $ignore_patterns"
  
  # Generate the ctm filter script. The here-document is deliberately unquoted
  # so that the ignore patterns are baked into the generated Python source.
  # NOTE(review): a pattern containing a double quote or a backslash would break
  # the generated string literal -- confirm the glm never contains such tokens.
  # The script copies its first argument to its second argument, dropping every
  # line whose fifth whitespace-separated field is in the ignore set.
  cat << EOF > "$decode_dir/scoring/filter_ctm.py"
  import sys
  file = open(sys.argv[1])
  out_file = open(sys.argv[2], 'w')
  ignore_set = "$ignore_patterns".split("|")
  ignore_set.append("[noise]")
  ignore_set.append("[laughter]")
  ignore_set.append("[vocalized-noise]")
  ignore_set.append("!SIL")
  ignore_set.append("<unk>")
  ignore_set.append("%hesitation")
  ignore_set = set(ignore_set)
  print(ignore_set)
  for line in file:
    if line.split()[4] not in ignore_set:
      out_file.write(line)
  out_file.close()
  EOF
  
  # Kept as a single string: it is handed to get_ctm.sh via --filter-ctm-command.
  filter_ctm_command="python $decode_dir/scoring/filter_ctm.py "
  
  # Start from the defaults so that LMWT and word_ins_penalty are always defined
  # for the later stages, including when tune_hyper is false, when the data set
  # is neither dev, test nor eval, or when no tuned values can be found.
  LMWT=$default_lmwt
  word_ins_penalty=$default_wip
  
  if  $tune_hyper ; then
    # find the best lmwt and word_insertion_penalty based on the transcripts
    # provided for dev_aspire, for other data sets just copy the values from dev_aspire decode directories
    # or use the default values
  
    if [[ "$act_data_set" =~ "dev_aspire" ]]; then
      if [ $stage -le 1 ]; then
        wip_string=$(echo $word_ins_penalties | sed 's/,/ /g')
        temp_wips=($wip_string)
        # The outer $cmd is given one WIP index per word insertion penalty; the
        # escaped part is evaluated inside that job. The leading 0 in wips is a
        # dummy element so that index WIP (starting at 1) selects the WIP-th
        # penalty. Each of those jobs launches an inner $cmd over the LMWT range
        # that runs get_ctm.sh with the glm and stm files of the data set.
        $cmd WIP=1:${#temp_wips[@]} $decode_dir/scoring/log/score.wip.WIP.log \
          wips=\(0 $wip_string\) \&\& \
          wip=\${wips[WIP]} \&\& \
          echo \$wip \&\& \
          $cmd LMWT=$min_lmwt:$max_lmwt $decode_dir/scoring/log/score.LMWT.\$wip.log \
            local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
              --beam $ctm_beam --decode-mbr $decode_mbr \
              --resolve-overlaps $resolve_overlaps \
              --glm data/${act_data_set}/glm --stm data/${act_data_set}/stm \
            LMWT \$wip $lang data/${segmented_data_set}_hires $model $decode_dir || exit 1;
  
        # Show the best result, then record the LMWT and penalty it came from.
        # Both are parsed out of the path of the selected .sys file
        # (.../score_<LMWT>/penalty_<WIP>/<name>.sys).
        eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys"|utils/best_wer.sh 2>/dev/null
        eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys" | \
         utils/best_wer.sh 2>/dev/null | python -c "import sys, re
  line = sys.stdin.readline()
  file_name=line.split()[-1]
  parts=file_name.split('/')
  penalty = re.sub('penalty_','',parts[-2])
  lmwt = re.sub('score_','', parts[-3])
  lmfile=open('$decode_dir/scoring/bestLMWT','w')
  lmfile.write(str(lmwt))
  lmfile.close()
  wipfile=open('$decode_dir/scoring/bestWIP','w')
  wipfile.write(str(penalty))
  wipfile.close()
  " || exit 1;
      fi
  
      # Pick up the tuned values, either written just above or left behind by an
      # earlier run when this script is restarted with --stage greater than 1.
      if [ -f "$decode_dir/scoring/bestLMWT" ] && [ -f "$decode_dir/scoring/bestWIP" ]; then
        LMWT=$(cat "$decode_dir/scoring/bestLMWT")
        word_ins_penalty=$(cat "$decode_dir/scoring/bestWIP")
        echo "Using the tuned values LMWT $LMWT and word_ins_penalty $word_ins_penalty found in $decode_dir"
      else
        echo "Unable to find tuned values in $decode_dir/scoring"
        echo "Keeping the default values LMWT $LMWT and word_ins_penalty $word_ins_penalty"
      fi
    elif [[ "$act_data_set" =~ "test_aspire" ]] || [[ "$act_data_set" =~ "eval_aspire" ]]; then
      # check for the best values from dev_aspire decodes
      dev_decode_dir=$(echo $decode_dir|sed "s/test_aspire/dev_aspire_whole/g; s/eval_aspire/dev_aspire_whole/g")
      if [ -f $dev_decode_dir/scoring/bestLMWT ]; then
        LMWT=$(cat $dev_decode_dir/scoring/bestLMWT)
        echo "Using the bestLMWT $LMWT value found in  $dev_decode_dir"
      else
        LMWT=$default_lmwt # default LMWT in case hyper-parameter tuning results are not available
        echo "Unable to find the bestLMWT in the  dev decode dir $dev_decode_dir"
        echo "Keeping the default value $LMWT"
      fi
      if [ -f $dev_decode_dir/scoring/bestWIP ]; then
        word_ins_penalty=$(cat $dev_decode_dir/scoring/bestWIP)
        echo "Using the bestWIP $word_ins_penalty value found in  $dev_decode_dir"
      else
        word_ins_penalty=$default_wip # default WIP in case hyper-parameter tuning results are not available
        echo "Unable to find the bestWIP in the  dev decode dir $dev_decode_dir"
        echo "Keeping the default/user-specified value $word_ins_penalty"
      fi
    else
      echo "Using the default values for LMWT and word_ins_penalty"
    fi
  
  fi
  
  # lattice to ctm conversion with the selected LMWT and word insertion penalty.
  # NOTE(review): unlike the tuning pass, this call passes neither
  # --resolve-overlaps nor --glm/--stm to get_ctm.sh, so the resolve_overlaps
  # option has no effect on the final ctm -- confirm whether that is intended.
  if [ $stage -le 2 ]; then
    # Fail early: an empty value would silently shift the positional arguments
    # of get_ctm.sh.
    if [ -z "$LMWT" ] || [ -z "$word_ins_penalty" ]; then
      echo "$0: LMWT ('$LMWT') and/or word_ins_penalty ('$word_ins_penalty') is not set" 1>&2
      exit 1
    fi
    echo "Generating CTMs with LMWT $LMWT and word insertion penalty of $word_ins_penalty"
    local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
      --beam $ctm_beam --decode-mbr $decode_mbr \
      $LMWT $word_ins_penalty $lang data/${segmented_data_set}_hires $model $decode_dir 2>$decode_dir/scoring/finalctm.LMWT$LMWT.WIP$word_ins_penalty.log || exit 1;
  fi
  
  
  # Helper: reads ctm lines on stdin and writes "<id> 1 <f3> <f4> <f5>" lines,
  # where <id> is the part of the first field before the first '-' and
  # <f3>..<f5> are the third to fifth input fields. The second output field is
  # always the literal 1.
  ctm_to_submission() {
    awk '{split($1, parts, "-"); printf("%s 1 %s %s %s\n", parts[1], $3, $4, $5)}'
  }
  
  # Helper: reads wav.scp lines on stdin and writes, for each line, the part of
  # the first field before the first '-'. Duplicates are not removed.
  recording_names_from_scp() {
    awk '{split($1, parts, "-"); printf("%s\n", parts[1])}'
  }
  
  # copy the ctms to the specified output files
  if [ "$stage" -le 3 ]; then
    final_score_dir=$decode_dir/score_$LMWT/penalty_$word_ins_penalty
    # Without this check a missing ctm would only produce an empty output file.
    if [ ! -f "$final_score_dir/ctm.filt" ]; then
      echo "$0: expected ctm file $final_score_dir/ctm.filt does not exist" 1>&2
      exit 1
    fi
  
    ctm_to_submission < "$final_score_dir/ctm.filt" > "$out_file"
  
    recording_names_from_scp < "data/${segmented_data_set}_hires/wav.scp" \
      > "$final_score_dir/recording_names"
  
    # NOTE(review): fill_missing_recordings.py is not shown here; presumably it
    # adds entries for recordings listed in recording_names that are absent
    # from the ctm -- confirm.
    local/multi_condition/fill_missing_recordings.py \
      "$out_file" "$out_file.submission" \
      "$final_score_dir/recording_names"
  
    echo "Generated the ctm @ $out_file.submission from the ctm file $decode_dir/score_${LMWT}/penalty_$word_ins_penalty/ctm.filt"
  fi