egs/wsj/s5/local/online/run_nnet2.sh
  #!/bin/bash
  
  # This is our online-nnet2 build.  It's a "multi-splice" system (i.e. we have
  # splicing at various layers), with p-norm nonlinearities.  We use the "accel2"
  # script, which uses between 2 and 14 GPUs depending on how far through training
  # it is.  You can safely reduce the --num-jobs-final to however many GPUs you have
  # on your system.
  
  # For joint training with RM, this script is run using the following command line;
  # note that the --stage 8 option is only needed if you have already run the
  # earlier stages.
  # local/online/run_nnet2.sh --stage 8 --dir exp/nnet2_online/nnet_ms_a_partial --exit-train-stage 15
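  #
  # For example, if you only have 4 GPUs, you would edit the
  # steps/nnet2/train_multisplice_accel2.sh call in stage 8 below to say
  # "--num-jobs-final 4" instead of "--num-jobs-final 14" (4 is just an
  # illustrative value here, not a tuned setting).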
  
  
  
  stage=0
  train_stage=-10
  use_gpu=true
  dir=exp/nnet2_online/nnet_ms_a
  exit_train_stage=-100
  . ./cmd.sh
  . ./path.sh
  . ./utils/parse_options.sh
  
  if $use_gpu; then
    if ! cuda-compiled; then
    cat <<EOF && exit 1
  This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
  If you want to use GPUs (and have them), go to src/, and configure and make on a machine
  where "nvcc" is installed.  Otherwise, call this script with --use-gpu false.
  EOF
    fi
    parallel_opts="--gpu 1" 
    num_threads=1
    minibatch_size=512
    # the _a is in case I want to change the parameters.
  else
    num_threads=16
    minibatch_size=128
    parallel_opts="--num-threads $num_threads" 
  fi
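  # For example, to run this script without GPUs (typically much slower), you can
  # invoke it as:
  #   local/online/run_nnet2.sh --use-gpu false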
  
  local/online/run_nnet2_common.sh --stage $stage || exit 1;
  
  if [ $stage -le 8 ]; then
    # the last splicing was previously "layer3/-4:2".
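    # A note on --splice-indexes (as we understand the nnet2 multisplice setup):
    # "layer0/-1:0:1" splices frames at offsets -1, 0, +1 at the input,
    # "layer1/-2:1" splices offsets -2 and +1 after the first hidden layer, and
    # "layer2/-4:2" splices offsets -4 and +2 after the second hidden layer.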
    steps/nnet2/train_multisplice_accel2.sh --stage $train_stage \
      --exit-stage $exit_train_stage \
      --num-epochs 8 --num-jobs-initial 2 --num-jobs-final 14 \
      --num-hidden-layers 4 \
      --splice-indexes "layer0/-1:0:1 layer1/-2:1 layer2/-4:2" \
      --feat-type raw \
      --online-ivector-dir exp/nnet2_online/ivectors_train_si284 \
      --cmvn-opts "--norm-means=false --norm-vars=false" \
      --num-threads "$num_threads" \
      --minibatch-size "$minibatch_size" \
      --parallel-opts "$parallel_opts" \
      --io-opts "--max-jobs-run 12" \
      --initial-effective-lrate 0.005 --final-effective-lrate 0.0005 \
      --cmd "$decode_cmd" \
      --pnorm-input-dim 2000 \
      --pnorm-output-dim 250 \
      --mix-up 12000 \
      data/train_si284_hires data/lang exp/tri4b_ali_si284 $dir  || exit 1;
  fi
  
  if [ $stage -le 9 ]; then
    # If this setup used PLP features, we'd have to give the option --feature-type plp
    # to the script below.
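    # If training was exited early (exit_train_stage > 0), we pass --iter so that
    # the model from that iteration (presumably N.mdl rather than final.mdl) is
    # used when preparing the online-decoding setup.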
    iter_opt=
    [ $exit_train_stage -gt 0 ] && iter_opt="--iter $exit_train_stage"
    steps/online/nnet2/prepare_online_decoding.sh $iter_opt --mfcc-config conf/mfcc_hires.conf \
      data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
  fi
  
  if [ $exit_train_stage -gt 0 ]; then
    echo "$0: not testing since you only ran partial training (presumably in preparation"
    echo " for multilingual training"
    exit 0;
  fi
  
  if [ $stage -le 10 ]; then
    # this does offline decoding that should give the same results as the real
    # online decoding.
    for lm_suffix in tgpr bd_tgpr; do
      graph_dir=exp/tri4b/graph_${lm_suffix}
      # use already-built graphs.
      for year in eval92 dev93; do
        steps/nnet2/decode.sh --nj 8 --cmd "$decode_cmd" \
            --online-ivector-dir exp/nnet2_online/ivectors_test_$year \
           $graph_dir data/test_${year}_hires $dir/decode_${lm_suffix}_${year} || exit 1;
      done
    done
  fi
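  # After any of the decoding stages, the best WER for each decode directory can be
  # summarized with something like:
  #   for d in $dir/decode_*; do grep WER $d/wer_* | utils/best_wer.sh; done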
  
  if [ $stage -le 11 ]; then
    # do the actual online decoding with iVectors, carrying info forward from 
    # previous utterances of the same speaker.
    for lm_suffix in tgpr bd_tgpr; do
      graph_dir=exp/tri4b/graph_${lm_suffix}
      for year in eval92 dev93; do
        steps/online/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \
          "$graph_dir" data/test_${year} ${dir}_online/decode_${lm_suffix}_${year} || exit 1;
      done
    done
  fi
  
  if [ $stage -le 12 ]; then
    # this version of the decoding treats each utterance separately
    # without carrying forward speaker information.
    for lm_suffix in tgpr bd_tgpr; do
      graph_dir=exp/tri4b/graph_${lm_suffix}
      for year in eval92 dev93; do
        steps/online/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \
          --per-utt true \
          "$graph_dir" data/test_${year} ${dir}_online/decode_${lm_suffix}_${year}_utt || exit 1;
      done
    done
  fi
  
  if [ $stage -le 13 ]; then
    # this version of the decoding treats each utterance separately
    # without carrying forward speaker information.  By setting --online false we
    # let it estimate the iVector from the whole utterance; it's then given to all
    # frames of the utterance.  So it's not really online.
    for lm_suffix in tgpr bd_tgpr; do
      graph_dir=exp/tri4b/graph_${lm_suffix}
      for year in eval92 dev93; do
        steps/online/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \
          --per-utt true --online false \
          "$graph_dir" data/test_${year} ${dir}_online/decode_${lm_suffix}_${year}_utt_offline || exit 1;
      done
    done
  fi
  
  if [ $stage -le 14 ]; then
    # this does offline decoding, as in stage 10, except we estimate the iVectors per
    # speaker, excluding silence (based on alignments from a GMM decoding), with a
    # different script.  This is just to demonstrate that script.
    # the --sub-speaker-frames option is optional; if provided, it will divide each
    # speaker up into "sub-speakers" of at least that many frames... this can be
    # useful if acoustic conditions drift over time within the speaker's data.
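    # The two test sets are processed in parallel below: each background job touches
    # exp/nnet2_online/.error on failure, and after 'wait' we check for that file to
    # decide whether to abort.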
    rm exp/nnet2_online/.error 2>/dev/null
    for year in eval92 dev93; do
      steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 8 \
        --sub-speaker-frames 1500 \
        data/test_${year}_hires data/lang exp/nnet2_online/extractor \
        exp/tri4b/decode_tgpr_$year exp/nnet2_online/ivectors_spk_test_${year} || touch exp/nnet2_online/.error &
    done
    wait
    [ -f exp/nnet2_online/.error ] && echo "$0: Error getting iVectors" && exit 1;
  
    for lm_suffix in bd_tgpr; do # just use the bd decoding, to avoid wasting time.
      graph_dir=exp/tri4b/graph_${lm_suffix}
      # use already-built graphs.
      for year in eval92 dev93; do
        steps/nnet2/decode.sh --nj 8 --cmd "$decode_cmd" \
            --online-ivector-dir exp/nnet2_online/ivectors_spk_test_$year \
           $graph_dir data/test_${year}_hires $dir/decode_${lm_suffix}_${year}_spk || touch exp/nnet2_online/.error &
      done
    done
    wait
    [ -f exp/nnet2_online/.error ] && echo "$0: Error decoding" && exit 1;
  fi
  
  
  
  
  exit 0;
  
  # Here are results.
  
  # First, this is the baseline.  We choose as a baseline our best fMLLR+p-norm system
  # trained on si284, so this is a very good baseline.  For other results, see ../RESULTS.
  
  
  # %WER 7.13 [ 587 / 8234, 72 ins, 93 del, 422 sub ] exp/nnet5d_gpu/decode_bd_tgpr_dev93/wer_13
  # %WER 4.06 [ 229 / 5643, 31 ins, 16 del, 182 sub ] exp/nnet5d_gpu/decode_bd_tgpr_eval92/wer_14
  # %WER 9.35 [ 770 / 8234, 161 ins, 78 del, 531 sub ] exp/nnet5d_gpu/decode_tgpr_dev93/wer_12
  # %WER 6.59 [ 372 / 5643, 91 ins, 15 del, 266 sub ] exp/nnet5d_gpu/decode_tgpr_eval92/wer_12
  
  
  # Here is the offline decoding of our system (note: it still has the iVectors estimated frame
  # by frame, and for each utterance independently).
  
  for x in exp/nnet2_online/nnet_a_gpu/decode_*; do grep WER $x/wer_* | utils/best_wer.sh; done | grep -v utt
  %WER 7.53 [ 620 / 8234, 63 ins, 105 del, 452 sub ] exp/nnet2_online/nnet_a_gpu/decode_bd_tgpr_dev93/wer_12
  %WER 4.47 [ 252 / 5643, 27 ins, 22 del, 203 sub ] exp/nnet2_online/nnet_a_gpu/decode_bd_tgpr_eval92/wer_13
  %WER 9.91 [ 816 / 8234, 164 ins, 90 del, 562 sub ] exp/nnet2_online/nnet_a_gpu/decode_tgpr_dev93/wer_12
  %WER 7.12 [ 402 / 5643, 91 ins, 22 del, 289 sub ] exp/nnet2_online/nnet_a_gpu/decode_tgpr_eval92/wer_13
  
   # Here is the version of the above without iVectors, as done by
   # ./run_nnet2_baseline.sh.  It's about 0.5% absolute worse.
   # There is also an _online version of that decode directory, which is
   # essentially the same (we don't show the results here, as it's not really interesting).
   for x in exp/nnet2_online/nnet_a_gpu_baseline/decode_*; do grep WER $x/wer_* | utils/best_wer.sh; done
   %WER 8.03 [ 661 / 8234, 80 ins, 105 del, 476 sub ] exp/nnet2_online/nnet_a_gpu_baseline/decode_bd_tgpr_dev93/wer_11
   %WER 5.10 [ 288 / 5643, 43 ins, 22 del, 223 sub ] exp/nnet2_online/nnet_a_gpu_baseline/decode_bd_tgpr_eval92/wer_11
   %WER 10.51 [ 865 / 8234, 177 ins, 95 del, 593 sub ] exp/nnet2_online/nnet_a_gpu_baseline/decode_tgpr_dev93/wer_11
   %WER 7.34 [ 414 / 5643, 88 ins, 25 del, 301 sub ] exp/nnet2_online/nnet_a_gpu_baseline/decode_tgpr_eval92/wer_13
  
  # Next, truly-online decoding.
  # The results below are not quite as good as those in nnet_a_gpu, but I believe
  # the difference is that in this setup we're not using config files, and the
  # default beams/lattice-beams in the scripts are slightly different: 15.0/8.0
  # above, and 13.0/6.0 below.
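  # (To make the comparison closer, one could presumably pass matching values such as
  # --beam 15.0 --lattice-beam 8.0 to steps/online/nnet2/decode.sh in the stages above,
  # assuming your version of that script exposes those options.)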
  for x in exp/nnet2_online/nnet_a_gpu_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh; done | grep -v utt
  %WER 7.53 [ 620 / 8234, 74 ins, 97 del, 449 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_bd_tgpr_dev93/wer_11
  %WER 4.45 [ 251 / 5643, 35 ins, 19 del, 197 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_bd_tgpr_eval92/wer_12
  %WER 10.02 [ 825 / 8234, 166 ins, 88 del, 571 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_tgpr_dev93/wer_12
  %WER 6.91 [ 390 / 5643, 103 ins, 15 del, 272 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_tgpr_eval92/wer_10
  
  # Below is as above, but decoding each utterance separately.  It actually seems slightly better,
  # which is counterintuitive.
  for x in exp/nnet2_online/nnet_a_gpu_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh; done | grep  utt
  %WER 7.55 [ 622 / 8234, 57 ins, 109 del, 456 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_bd_tgpr_dev93_utt/wer_13
  %WER 4.43 [ 250 / 5643, 27 ins, 21 del, 202 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_bd_tgpr_eval92_utt/wer_13
  %WER 9.98 [ 822 / 8234, 179 ins, 80 del, 563 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_tgpr_dev93_utt/wer_11
  %WER 7.12 [ 402 / 5643, 98 ins, 18 del, 286 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_tgpr_eval92_utt/wer_12