Blame view

egs/aspire/s5/local/multi_condition/prep_test_aspire.sh 14.8 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
  #!/bin/bash
  # Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2015.  Apache 2.0.
  # This script generates the ctm files for dev_aspire, test_aspire and eval_aspire 
  # for scoring with ASpIRE scoring server.
  # It also provides the WER for dev_aspire data.
  
  # ---- Default option values -------------------------------------------------
  # NOTE(review): these are presumably overridable as --<name> <value> through
  # parse_options.sh, which is sourced at the end of this block -- confirm.

  # Model iteration: passed as --iter to the decode/ctm scripts and used to
  # select $iter.mdl for the final CTM generation.
  iter=final
  # Directory handed to make_mfcc.sh / compute_cmvn_stats.sh as their last argument.
  mfccdir=mfcc_reverb_submission
  # First stage to run; each section below is gated by "[ $stage -le N ]".
  stage=0
  # --nj value for local/multi_condition/decode.sh.
  decode_num_jobs=200
  # Overwritten further down with the number of lines in wav.scp.
  num_jobs=30
  # LM weight and word insertion penalty used for the final CTM unless tuned
  # values are found (see the tune_hyper section).
  LMWT=12
  word_ins_penalty=0
  # Range of LM weights and comma-separated list of word insertion penalties
  # explored while tuning on dev_aspire.
  min_lmwt=9
  max_lmwt=20
  word_ins_penalties=0.0,0.25,0.5,0.75,1.0
  # Passed to get_ctm.sh as --decode-mbr.
  decode_mbr=true
  # --acwt and --lattice-beam for the second-pass decode.
  acwt=0.1
  lattice_beam=8
  # Passed to get_ctm.sh as --beam.
  ctm_beam=6
  # When true, the data is cut into uniform segments (window/overlap below).
  do_segmentation=true
  max_count=100 # parameter for extract_ivectors.sh
  # --sub-speaker-frames for the second-pass i-vector extraction.
  sub_speaker_frames=1500
  # --overlap / --window for create_uniform_segments.py (also forwarded to
  # get_ctm.sh during tuning).
  overlap=5
  window=30
  # Optional tag inserted into output directory and file names.
  affix=
  # If different from 1 / 1.0, the stage-1 i-vectors are scaled by this factor.
  ivector_scale=1.0
  pad_frames=0  # this did not seem to be helpful but leaving it as an option.
  # When true, LMWT / word insertion penalty are tuned on dev_aspire or read
  # back from a previous dev_aspire run.
  tune_hyper=true
  # Extra options appended to the second-pass decode.sh command line.
  pass2_decode_opts=
  # When true, per-frame weights for the second-pass i-vector extraction are
  # derived from the stage-1 CTM (or taken from weights_file if that is set).
  filter_ctm=true
  weights_file=
  # Weight given to frames not covered by a retained CTM word; also passed as
  # --silence-weight to extract_ivectors.sh.
  silence_weight=0.00001
  # NOTE(review): cmd.sh presumably defines $train_cmd and $decode_cmd, which
  # are used below -- confirm.
  . ./cmd.sh
  
  [ -f ./path.sh ] && . ./path.sh
  . parse_options.sh || exit 1;
  
  # Exactly three positional arguments must remain at this point; anything
  # else prints the usage text and aborts with status 1.
  [ $# -eq 3 ] || {
    printf '%s\n' \
      "Usage: $0 [options] <data-dir> <lang-dir> <model-dir>" \
      " Options:" \
      "    --stage (0|1|2)                 # start scoring script from part-way through." \
      "e.g.:" \
      "$0 data/train data/lang exp/nnet2_multicondition/nnet_ms_a"
    exit 1;
  }
  
  # Positional arguments.
  data_dir=$1 #select from {dev_aspire, test_aspire, eval_aspire}
  lang=$2 # data/lang
  dir=$3 # exp/nnet2_multicondition/nnet_ms_a

  # Name fragments derived from the model directory and the --affix/--iter options.
  model_affix=$(basename $dir)
  ivector_dir=$(dirname $dir)
  ivector_affix=${affix:+_$affix}_${model_affix}_iter$iter
  affix=_${affix}_iter${iter}
  # Remember the data set name as given; data_dir itself may get a suffix below.
  act_data_dir=${data_dir}

  # Choose the name of the output ctm. For any data set other than
  # test_aspire / eval_aspire, a "<data_dir>_whole" directory holding the
  # unsegmented recordings is built first (stage <= 1) and used from then on.
  case "$data_dir" in
    test_aspire)
      out_file=single_dev_test${affix}_$model_affix.ctm
      ;;
    eval_aspire)
      out_file=single_eval${affix}_$model_affix.ctm
      ;;
    *)
      if [ $stage -le 1 ]; then
        echo "Creating the data dir with whole recordings without segmentation"
        # a copy of the data directory that has no segments file
        unseg_dir=data/${data_dir}_whole
        src_dir=data/$data_dir
        mkdir -p $unseg_dir
        echo "Creating the $unseg_dir/wav.scp file"
        cp $src_dir/wav.scp $unseg_dir

        echo "Creating the $unseg_dir/reco2file_and_channel file"
        # every recording id doubles as file name, utterance id and speaker id
        awk '{print $1, $1, "A";}' < $unseg_dir/wav.scp > $unseg_dir/reco2file_and_channel
        awk '{print $1, $1;}' < $unseg_dir/wav.scp > $unseg_dir/utt2spk
        utils/utt2spk_to_spk2utt.pl $unseg_dir/utt2spk > $unseg_dir/spk2utt

        steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" --mfcc-config conf/mfcc_hires.conf $unseg_dir exp/make_mfcc_reverb/${data_dir}_whole $mfccdir || exit 1;
        steps/compute_cmvn_stats.sh $unseg_dir exp/make_mfcc_reverb/${data_dir}_whole $mfccdir || exit 1;
      fi
      data_dir=${data_dir}_whole
      out_file=single_dev${affix}_${model_affix}.ctm
      ;;
  esac
  
  # Number of recordings in the original (un-suffixed) data directory.
  num_jobs=$(wc -l < data/${act_data_dir}/wav.scp)

  # Name of the data directory the rest of the script works on: the input
  # directory itself, or a uniformly segmented copy of it.
  segmented_data_dir=${data_dir}
  if $do_segmentation; then
    segmented_data_dir=${data_dir}_uniformsegmented_win${window}_over${overlap}
  fi

  # Stage 2: build the uniformly segmented copy.
  # This must only run when segmentation is requested: without it
  # $segmented_data_dir is the input directory itself, and the "rm -rf" below
  # would delete the input data before trying to copy from it.
  if [ $stage -le 2 ]; then
    if $do_segmentation; then
      echo "Generating uniform segments with length $window and overlap $overlap."
      rm -rf data/$segmented_data_dir
      copy_data_dir.sh --validate-opts "--no-text" data/$data_dir data/$segmented_data_dir || exit 1;
      cp data/$data_dir/reco2file_and_channel data/$segmented_data_dir/ || exit 1;
      python local/multi_condition/create_uniform_segments.py --overlap $overlap --window $window data/$segmented_data_dir  || exit 1;
      # features copied from the unsegmented directory are dropped here;
      # stage 3 extracts them again for the segmented data
      for file in cmvn.scp feats.scp; do
        rm -f data/$segmented_data_dir/$file
      done
      utils/validate_data_dir.sh --no-text --no-feats data/$segmented_data_dir || exit 1;
    else
      echo "$0: segmentation disabled, using data/$data_dir without re-segmenting."
    fi
  fi
  
  # Stage 3: create data/<segmented_data_dir>_hires and compute features and
  # CMVN stats for it. Every step is checked so that a broken data directory
  # stops the script here instead of failing later in decoding.
  if [ $stage -le 3 ]; then
    echo "Extracting features for the segments"
    # extract the features/i-vectors once again so that they are indexed by utterance and not by recording
    rm -rf data/${segmented_data_dir}_hires
    copy_data_dir.sh --validate-opts "--no-text " data/${segmented_data_dir} data/${segmented_data_dir}_hires || exit 1;
    steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
        --cmd "$train_cmd" data/${segmented_data_dir}_hires \
        exp/make_reverb_hires/${segmented_data_dir} $mfccdir || exit 1;
    steps/compute_cmvn_stats.sh data/${segmented_data_dir}_hires exp/make_reverb_hires/${segmented_data_dir} $mfccdir || exit 1;
    utils/fix_data_dir.sh data/${segmented_data_dir}_hires || exit 1;
    utils/validate_data_dir.sh --no-text data/${segmented_data_dir}_hires || exit 1;
  fi
  
  # Stage 4: first-pass i-vectors for the hires data, written to
  # <ivector_dir>/ivectors_<data><ivector_affix>_stage1.
  if [ $stage -le 4 ]; then
    echo "Extracting i-vectors, stage 1"
    steps/online/nnet2/extract_ivectors_online.sh \
      --cmd "$train_cmd" --nj 20 --max-count $max_count \
      data/${segmented_data_dir}_hires \
      $ivector_dir/extractor \
      $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}_stage1 || exit 1;
  fi

  # Directory-name suffix that marks scaled i-vectors; it stays empty when the
  # scale is 1 / 1.0, in which case no scaled copy is made.
  ivector_scale_affix=
  if [ $ivector_scale != 1.0 ] && [ $ivector_scale != 1 ]; then
    ivector_scale_affix=_scale$ivector_scale
  fi

  # Stage 5: write a scaled copy of the stage-1 i-vectors (only if scaling was requested).
  if [ $stage -le 5 ] && [ -n "$ivector_scale_affix" ]; then
    echo "$0: Scaling iVectors, stage 1"
    srcdir=$ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}_stage1
    outdir=$ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}${ivector_scale_affix}_stage1
    mkdir -p $outdir
    copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \
      | copy-feats --compress=true ark:-  ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp \
      || exit 1;
    cp $srcdir/ivector_period $outdir/ivector_period
  fi
  
  # Common prefix of all decode directories created below
  # (suffixes: _stage1, _tg, _fg).
  decode_dir=$dir/decode_${segmented_data_dir}${affix}_pp

  # Stage 6: first-pass lattices, decoded with the stage-1 i-vectors.
  if [ $stage -le 6 ]; then
    echo "Generating lattices, stage 1"
    stage1_decode_opts=(
      --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config
      --online-ivector-dir $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}${ivector_scale_affix}_stage1
      --skip-scoring true --iter $iter
    )
    local/multi_condition/decode.sh "${stage1_decode_opts[@]}" \
      exp/tri5a/graph_pp data/${segmented_data_dir}_hires ${decode_dir}_stage1 || exit 1;
  fi

  # Stage 7: CTM from the first-pass lattices.
  if [ $stage -le 7 ]; then
    echo "$0: generating CTM from stage-1 lattices"
    stage1_ctm_opts=(--cmd "$decode_cmd" --use-segments false --iter $iter)
    local/multi_condition/get_ctm_conf.sh "${stage1_ctm_opts[@]}" \
      data/${segmented_data_dir}_hires ${lang} ${decode_dir}_stage1 || exit 1;
  fi
  
  # Stage 8 (part 1): decide what the second-pass i-vector extraction gets as
  # its weights/alignment input ($ivector_extractor_input):
  #   - $weights_file, if one was supplied;
  #   - otherwise per-frame weights derived from the stage-1 CTM;
  #   - or, when filter_ctm is false, the stage-1 decode directory itself.
  if [ $stage -le 8 ]; then
    if $filter_ctm; then
      if [ -n "$weights_file" ]; then
        echo "$0: Using provided weights file $weights_file"
        ivector_extractor_input=$weights_file
      else
        ctm=${decode_dir}_stage1/score_10/${segmented_data_dir}_hires.ctm
        echo "$0: generating weights file from stage-1 ctm $ctm"
        # check the input before doing any work
        if [ ! -f $ctm ]; then  echo "$0: stage 8: expected ctm to exist: $ctm"; exit 1; fi

        feat-to-len scp:data/${segmented_data_dir}_hires/feats.scp ark,t:- \
          >${decode_dir}_stage1/utt.lengths.$affix || exit 1;

        # The CTM is filtered to entries whose 6th field equals 1.0
        # (presumably the confidence -- confirm) and whose 4th field (the
        # length used below) is under 1.0; entries for mm, mhm, [noise],
        # [laughter] and <unk> are dropped.
        # The perl script then starts every frame of every utterance at
        # $silence_weight and sets to 1 the frames covered by a kept entry.
        # Begin/length values are multiplied by 100 to get frame indices, and
        # each entry is widened by $pad_frames on both sides.
        # Output: one line per utterance, "<utt> [ w0 w1 ... ]", gzipped.
        cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \
          grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \
          grep -v -F '[laughter]' | grep -v -F '<unk>' | \
          perl -e ' $lengths=shift @ARGV;  $pad_frames=shift @ARGV; $silence_weight=shift @ARGV;
           $pad_frames >= 0 || die "bad pad-frames value $pad_frames";
           open(L, "<$lengths") || die "opening lengths file";
           @all_utts = ();
           %utt2ref = ();
           while (<L>) {
             ($utt, $len) = split(" ", $_);
             push @all_utts, $utt;
             $array_ref = [ ];
             for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; }
             $utt2ref{$utt} = $array_ref;
           }
           while (<STDIN>) {
             @A = split(" ", $_);
             @A == 6 || die "bad ctm line $_";
             $utt = $A[0]; $beg = $A[2]; $len = $A[3];
             $beg_int = int($beg * 100) - $pad_frames;
             $len_int = int($len * 100) + 2*$pad_frames;
             $array_ref = $utt2ref{$utt};
             !defined $array_ref  && die "No length info for utterance $utt";
             for ($t = $beg_int; $t < $beg_int + $len_int; $t++) {
               if ($t >= 0 && $t < @$array_ref) {
                 ${$array_ref}[$t] = 1;
               }
             }
           }
           foreach $utt (@all_utts) {  $array_ref = $utt2ref{$utt};
             print $utt, " [ ", join(" ", @$array_ref), " ]\n";
           } ' ${decode_dir}_stage1/utt.lengths.$affix $pad_frames $silence_weight | \
          gzip -c >${decode_dir}_stage1/weights${affix}.gz
        ivector_extractor_input=${decode_dir}_stage1/weights${affix}.gz
      fi
    else
      ivector_extractor_input=${decode_dir}_stage1
    fi
  fi
  
  # Stage 8 (part 2): second-pass i-vector extraction. It shares the stage
  # number with the block above because it consumes $ivector_extractor_input,
  # which is only set when that block has run.
  if [ $stage -le 8 ]; then
    echo "Extracting i-vectors, stage 2 with input $ivector_extractor_input"
    # $ivector_extractor_input is one of: the supplied $weights_file, the
    # gzipped per-frame weights generated from the stage-1 ctm, or (when
    # filter_ctm is false) the stage-1 decode directory.
    # Frames given a low weight contribute little to the estimate; the value
    # used for them is also passed on as --silence-weight.
    # the --sub-speaker-frames is optional; if provided, it will divide each speaker
    # up into "sub-speakers" of at least that many frames... can be useful if
    # acoustic conditions drift over time within the speaker's data.
    steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \
      --silence-weight $silence_weight \
      --sub-speaker-frames $sub_speaker_frames --max-count $max_count \
      data/${segmented_data_dir}_hires $lang $ivector_dir/extractor \
      $ivector_extractor_input $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix} || exit 1;
  fi
  
  # Stage 9: second-pass lattices, decoded with the second-pass i-vectors.
  # A failure is recorded in <decode_dir>_tg/.error, as before, but the exit
  # no longer depends on that marker being writable: if the decode directory
  # was never created, "touch" fails too, and the error used to go unnoticed.
  if [ $stage -le 9 ]; then
    echo "Generating lattices, stage 2 with --acwt $acwt"
    rm -f ${decode_dir}_tg/.error
    decode_failed=false
    local/multi_condition/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config $pass2_decode_opts \
        --skip-scoring true --iter $iter --acwt $acwt --lattice-beam $lattice_beam \
        --online-ivector-dir $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix} \
       exp/tri5a/graph_pp data/${segmented_data_dir}_hires ${decode_dir}_tg || decode_failed=true
    if $decode_failed; then
      # best effort: leave the marker behind for anyone inspecting the directory
      mkdir -p ${decode_dir}_tg && touch ${decode_dir}_tg/.error
    fi
    if $decode_failed || [ -f ${decode_dir}_tg/.error ]; then
      echo "$0: Error decoding"
      exit 1;
    fi
  fi
  
  # Stage 10: rescore the _tg lattices into the _fg decode directory, going
  # from the <lang>_pp_test language model directory to <lang>_pp_test_fg.
  if [ $stage -le 10 ]; then
    echo "Rescoring lattices"
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" --skip-scoring true \
      ${lang}_pp_test ${lang}_pp_test_fg \
      data/${segmented_data_dir}_hires \
      ${decode_dir}_tg ${decode_dir}_fg || exit 1;
  fi
  
  # Everything from here on (tuning of LMWT / WIP, ctm filtering, final ctm)
  # operates on the rescored lattices.
  decode_dir=${decode_dir}_fg
  # The model is assumed to sit one level above the decoding dir; an empty
  # iter selects final.mdl.
  model=$decode_dir/../${iter:-final}.mdl

  mkdir -p $decode_dir/scoring
  # Collect the labels that the glm maps to an empty string: these, plus a
  # fixed list of noise/hesitation tokens, are removed from the ctm by the
  # generated filter script below.
  # The embedded Python only uses constructs valid in both Python 2 and 3
  # (print as a function call, files closed via "with").
  python -c "
  import sys, re
  patterns = []
  with open('data/${act_data_dir}/glm') as glm:
    for line in glm:
      line = line.strip()
      if re.search('=>', line) is not None:
        # only the part before the first '/' is considered
        parts = re.split('=>', line.split('/')[0])
        # skip lines whose '=>' only appears after a '/'
        if len(parts) > 1 and parts[1].strip() == '':
          patterns.append(parts[0].strip())
  print('|'.join(patterns))
  " > $decode_dir/scoring/glm_ignore_patterns || exit 1;

  ignore_patterns=$(cat $decode_dir/scoring/glm_ignore_patterns)
  echo "$0: Ignoring these patterns from the ctm: $ignore_patterns"

  # Write the filter script. The here-document is expanded by the shell, so
  # the pattern list is baked into the generated file.
  # Usage of the generated script: filter_ctm.py <ctm-in> <ctm-out>
  cat << EOF > $decode_dir/scoring/filter_ctm.py || exit 1;
  import sys
  ignore_set = set("$ignore_patterns".split("|"))
  ignore_set.update([
    "[noise]",
    "[laughter]",
    "[vocalized-noise]",
    "!SIL",
    "<unk>",
    "%hesitation",
  ])
  print(ignore_set)
  with open(sys.argv[1]) as ctm_in, open(sys.argv[2], 'w') as ctm_out:
    for line in ctm_in:
      # field 5 of a ctm line is the word
      if line.split()[4] not in ignore_set:
        ctm_out.write(line)
  EOF

  filter_ctm_command="python $decode_dir/scoring/filter_ctm.py "
  
  # Hyper-parameter selection (only when tune_hyper is true).
  #  1. On dev_aspire (stage <= 11) every combination of LMWT in
  #     [min_lmwt, max_lmwt] and word insertion penalty in word_ins_penalties
  #     is scored; the best pair is stored in <decode_dir>/scoring/bestLMWT
  #     and bestWIP.
  #  2. The stored pair is then read back:
  #       - dev_aspire reads it from its own decode dir. This happens even
  #         when the tuning stage is skipped, so that resuming at a later
  #         stage uses the tuned values, not the defaults;
  #       - test_aspire / eval_aspire read it from the matching
  #         dev_aspire_whole decode dir;
  #       - any other data set keeps the default/user-specified values.
  if  $tune_hyper ; then
    if [ $stage -le 11 ] && [ "$act_data_dir" == "dev_aspire" ]; then
      wip_string=$(echo $word_ins_penalties | sed 's/,/ /g')
      temp_wips=($wip_string)
      # Outer job array over the penalties, inner job array over the LM
      # weights; the escaped pieces are evaluated inside the outer job.
      $decode_cmd WIP=1:${#temp_wips[@]} $decode_dir/scoring/log/score.wip.WIP.log \
        wips=\(0 $wip_string\) \&\& \
        wip=\${wips[WIP]} \&\& \
        echo \$wip \&\& \
        $decode_cmd LMWT=$min_lmwt:$max_lmwt $decode_dir/scoring/log/score.LMWT.\$wip.log \
          local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
            --window $window --overlap $overlap \
            --beam $ctm_beam --decode-mbr $decode_mbr \
            --glm data/${act_data_dir}/glm --stm data/${act_data_dir}/stm \
          LMWT \$wip $lang data/${segmented_data_dir}_hires $model $decode_dir || exit 1;

      # Show the best result, then record the LMWT / penalty it came from.
      # Both are taken from the path of the winning .sys file:
      # .../score_<LMWT>/penalty_<WIP>/<name>.sys
      eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys"|utils/best_wer.sh 2>/dev/null
      eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys" | \
       utils/best_wer.sh 2>/dev/null | python -c "import sys, re
  line = sys.stdin.readline()
  file_name=line.split()[-1]
  parts=file_name.split('/')
  penalty = re.sub('penalty_','',parts[-2])
  lmwt = re.sub('score_','', parts[-3])
  lmfile=open('$decode_dir/scoring/bestLMWT','w')
  lmfile.write(str(lmwt))
  lmfile.close()
  wipfile=open('$decode_dir/scoring/bestWIP','w')
  wipfile.write(str(penalty))
  wipfile.close()
  " || exit 1;
    fi

    # Directory holding the tuned values for this data set (empty: none).
    case "$act_data_dir" in
      dev_aspire)
        dev_decode_dir=$decode_dir
        ;;
      test_aspire|eval_aspire)
        dev_decode_dir=$(echo $decode_dir|sed "s/test_aspire/dev_aspire_whole/g; s/eval_aspire/dev_aspire_whole/g")
        ;;
      *)
        dev_decode_dir=
        ;;
    esac

    if [ -n "$dev_decode_dir" ]; then
      if [ -f $dev_decode_dir/scoring/bestLMWT ]; then
        LMWT=$(cat $dev_decode_dir/scoring/bestLMWT)
        echo "Using the bestLMWT $LMWT value found in  $dev_decode_dir"
      else
        echo "Unable to find the bestLMWT in the  dev decode dir $dev_decode_dir"
        echo "Keeping the default/user-specified value"
      fi
      if [ -f $dev_decode_dir/scoring/bestWIP ]; then
        word_ins_penalty=$(cat $dev_decode_dir/scoring/bestWIP)
        echo "Using the bestWIP $word_ins_penalty value found in  $dev_decode_dir"
      else
        echo "Unable to find the bestWIP in the  dev decode dir $dev_decode_dir"
        echo "Keeping the default/user-specified value"
      fi
    else
      echo "Using the default/user-specified values for LMWT and word_ins_penalty"
    fi
  fi
  
  # Stage 12: produce the final ctm from the rescored lattices, using the
  # selected LM weight and word insertion penalty. stderr of get_ctm.sh is
  # kept in a log file under <decode_dir>/scoring.
  if [ $stage -le 12 ]; then
    echo "Generating CTMs with LMWT $LMWT and word insertion penalty of $word_ins_penalty"
    final_ctm_log=$decode_dir/scoring/finalctm.LMWT$LMWT.WIP$word_ins_penalty.log
    local/multi_condition/get_ctm.sh \
      --filter-ctm-command "$filter_ctm_command" \
      --beam $ctm_beam \
      --decode-mbr $decode_mbr \
      $LMWT $word_ins_penalty $lang data/${segmented_data_dir}_hires $model $decode_dir \
      2>$final_ctm_log || exit 1;
  fi
  
  # Stage 13: turn the filtered ctm into the submission file.
  #  - the first field of every ctm line is cut at the first "-" to get the
  #    recording name, and the channel is forced to 1;
  #  - the list of recording names is derived the same way from wav.scp;
  #  - fill_missing_recordings.py gets the ctm, the output path and that list.
  # The printf formats end in an explicit "\n" escape: awk does not accept a
  # raw line break inside a string constant.
  if [ $stage -le 13 ]; then
    awk '{split($1, parts, "-"); printf("%s 1 %s %s %s\n", parts[1], $3, $4, $5)}' \
      < $decode_dir/score_$LMWT/penalty_$word_ins_penalty/ctm.filt > $out_file || exit 1;
    awk '{split($1, parts, "-"); printf("%s\n", parts[1])}' \
      < data/${segmented_data_dir}_hires/wav.scp \
      > $decode_dir/score_$LMWT/penalty_$word_ins_penalty/recording_names || exit 1;
    local/multi_condition/fill_missing_recordings.py $out_file $out_file.submission $decode_dir/score_$LMWT/penalty_$word_ins_penalty/recording_names || exit 1;
    echo "Generated the ctm @ $out_file.submission from the ctm file $decode_dir/score_${LMWT}/penalty_$word_ins_penalty/ctm.filt"
  fi