Blame view

egs/timit/s5/local/score_combine.sh 4.22 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
  #!/bin/bash
  
  # Copyright 2013  Arnab Ghoshal
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  
  # Script for system combination using minimum Bayes risk decoding.
  # This calls lattice-combine to create a union of lattices that have been 
  # normalized by removing the total forward cost from them. The resulting lattice
  # is used as input to lattice-mbr-decode. This should not be put in steps/ or 
  # utils/ since the scores on the combined lattice must not be scaled.
  
  # begin configuration section.
  cmd=run.pl
  min_lmwt=1
  max_lmwt=10
  lat_weights=
  stage=0
  #end configuration section.
  
  help_message="Usage: "$(basename $0)" [options] <data-dir> <graph-dir|lang-dir> <decode-dir1> <decode-dir2> [decode-dir3 ... ] <out-dir>
  Options:
    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes.
    --min-lmwt INT                  # minumum LM-weight for lattice rescoring 
    --max-lmwt INT                  # maximum LM-weight for lattice rescoring
    --lat-weights STR               # colon-separated string of lattice weights
  ";
  
  [ -f ./path.sh ] && . ./path.sh
  . parse_options.sh || exit 1;
  
  if [ $# -lt 5 ]; then
    printf "$help_message
  ";
    exit 1;
  fi
  
  data=$1
  graphdir=$2
  odir=${@: -1}  # last argument to the script
  shift 2;
  decode_dirs=( $@ )  # read the remaining arguments into an array
  unset decode_dirs[${#decode_dirs[@]}-1]  # 'pop' the last argument which is odir
  num_sys=${#decode_dirs[@]}  # number of systems to combine
  
  model=${decode_dirs[0]}/../final.mdl # assume model one level up from decoding dir.
  
  hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl 
  [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
  hubdir=`dirname $hubscr`
  
  phonemap="conf/phones.60-48-39.map"
  
  symtab=$graphdir/words.txt
  [ ! -f $symtab ] && echo "$0: missing word symbol table '$symtab'" && exit 1;
  [ ! -f $data/text ] && echo "$0: missing reference '$data/text'" && exit 1;
  
  
  mkdir -p $odir/log
  
  for i in `seq 0 $[num_sys-1]`; do
    model=${decode_dirs[$i]}/../final.mdl  # model one level up from decode dir
    for f in $model ${decode_dirs[$i]}/lat.1.gz ; do
      [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
    done
    #lats[$i]="\"ark:gunzip -c ${decode_dirs[$i]}/lat.*.gz |\""
    lats[$i]="'ark:gunzip -c ${decode_dirs[$i]}/lat.*.gz |'"
  done
  
  mkdir -p $odir/scoring/log
  
  # Map reference to 39 phone classes, the silence is optional (.):
  local/timit_norm_trans.pl -i $data/stm -m $phonemap -from 48 -to 39 | \
   sed 's: sil: (sil):g' > $odir/scoring/stm_39phn
  cp $data/glm $odir/scoring/glm_39phn
  
  if [ $stage -le 0 ]; then
    if [ -z "$lat_weights" ]; then
      $cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \
        lattice-combine --inv-acoustic-scale=LMWT ${lats[@]} ark:- \| \
        lattice-to-ctm-conf ark:- - \| \
        utils/int2sym.pl -f 5 $symtab '>' $odir/scoring/LMWT.ctm || exit 1;
    else
      $cmd LMWT=$min_lmwt:$max_lmwt $odir/log/combine_lats.LMWT.log \
        lattice-combine --inv-acoustic-scale=LMWT --lat-weights=$lat_weights \
        ${lats[@]} ark:- \| \
        lattice-to-ctm-conf ark:- - \| \
        utils/int2sym.pl -f 5 $symtab '>' $odir/scoring/LMWT.ctm || exit 1;
    fi
  fi
  
  if [ $stage -le 1 ]; then
    $cmd LMWT=$min_lmwt:$max_lmwt $odir/scoring/log/ctm_convert.LMWT.log \
      local/timit_norm_trans.pl -i $odir/scoring/LMWT.ctm -m $phonemap -from 48 -to 39 \
      '>' $odir/scoring/LMWT.ctm_39phn || exit 1;
  fi
  
  # Score the set...
  $cmd LMWT=$min_lmwt:$max_lmwt $odir/scoring/log/score.LMWT.log \
    mkdir $odir/score_LMWT ';' \
    cp $odir/scoring/stm_39phn $odir/score_LMWT/ '&&' cp $odir/scoring/LMWT.ctm_39phn $odir/score_LMWT/ctm_39phn '&&' \
    $hubscr -p $hubdir -V -l english -h hub5 -g $odir/scoring/glm_39phn -r $odir/score_LMWT/stm_39phn $odir/score_LMWT/ctm_39phn || exit 1;
  
  
  exit 0