  #!/bin/bash
  
  # 2016 Modified by Takafumi Moriya at Tokyo Institute of Technology
  # for Japanese speech recognition using CSJ.
  
  # Copyright 2012-2014  Brno University of Technology (Author: Karel Vesely)
  # Apache 2.0
  
  # This example script trains a bottleneck feature extractor with the
  # 'Universal Context' topology invented by Frantisek Grezl;
  # the network is trained on top of FBANK+f0 features.
  
  . ./cmd.sh
  . ./path.sh
  
  # Config:
  stage=0 # resume training with --stage=N
  use_dev=false
  # End of config.
  . utils/parse_options.sh || exit 1;
  #
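  # Example invocations (a sketch added for illustration, not part of the original
  # recipe; '--stage' and '--use-dev' are mapped to the variables above by parse_options.sh):
  #   local/nnet/run_dnn_tandem_uc.sh --stage 7        # resume at the tandem GMM stages
  #   local/nnet/run_dnn_tandem_uc.sh --use-dev true   # also carve out a held-out dev set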
  
  [ ! -e data-fbank/train ] && if [ $stage -le 1 ]; then
      # prepare the FBANK+f0 features
      # all evaluation sets
      for eval_num in eval1 eval2 eval3 ;do
  	dir=data-fbank/$eval_num; srcdir=data/$eval_num
  	(mkdir -p $dir; cp $srcdir/* $dir; )
  	steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 $dir $dir/log $dir/data || exit 1;
  	steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1;
      done
      # training set
      dir=data-fbank/train; srcdir=data/train
      (mkdir -p $dir; cp $srcdir/* $dir; )
      steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 $dir $dir/log $dir/data || exit 1;
      steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1;
  fi
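
  # Optional sanity check (a sketch, not part of the original recipe): print the
  # dimensionality of the generated FBANK+f0 features. 'feat-to-dim' is a standard
  # Kaldi binary; the exact value depends on conf/fbank.conf and conf/pitch.conf.
  #   feat-to-dim scp:data-fbank/train/feats.scp -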
  
  if [ $stage -le 2 ]; then
  # Prepare the same subsets as in the main MFCC-GMM recipe; these will be used
  # when building the GMM system from flat-start, and later in the Tandem recipe.
    data=data-fbank
  
    if $use_dev ;then
      dev_set=train_dev
    # Use the first 4k sentences as the dev set.
      utils/subset_data_dir.sh --first $data/train 4000 $data/$dev_set # 6hr 31min
      n=$[`cat data/train/segments | wc -l` - 4000]
      utils/subset_data_dir.sh --last $data/train $n $data/train_nodev
    else
      cp -r $data/train $data/train_nodev
    fi
    
    # Prepare data for training mono
  # Take a 30k subset of the 100k shortest utterances.
    utils/subset_data_dir.sh --shortest $data/train_nodev 100000 $data/train_100kshort
    utils/subset_data_dir.sh $data/train_100kshort 30000 $data/train_30kshort
  
    # Take the first 100k utterances (just under half the data); we'll use
    # this for later stages of training.
    utils/subset_data_dir.sh --first $data/train_nodev 100000 $data/train_100k
    local/remove_dup_utts.sh 200 $data/train_100k $data/train_100k_nodup
  
    # Full training dataset,
    local/remove_dup_utts.sh 300 $data/train_nodev $data/train_nodup
  
    # split the data : 90% train 10% cross-validation (held-out)
    dir=$data/train_nodup
    utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1
  fi
  
  #########################################################################################
  # Let's build the universal-context bottleneck network
  # - The universal-context MLP is a hierarchy of two bottleneck neural networks
  # - The first network sees a limited range of frames (11 frames)
  # - The second network sees a concatenation of the bottleneck outputs of the
  #   first network, with temporal shifts -10 -5 0 5 10 (in total a range of
  #   31 frames in the original feature space)
  # - This structure has been reported to produce superior performance
  #   compared to a network with a single bottleneck
  #
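  # A quick check of the context arithmetic above (a sketch added for clarity):
  #   1st network window : 2*5 + 1         = 11 frames
  #   2nd network shifts : -10 -5 0 5 10   -> outermost window centre at +/-10
  #   overall window     : 2*(10 + 5) + 1  = 31 frames, e.g. echo $(( 2*(10+5)+1 ))
  #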
  if [ $stage -le 3 ]; then
    # 1st network, overall context +/-5 frames
    # - the topology is 90_1500_1500_80_1500_NSTATES, linear bottleneck
    dir=exp/nnet5b_uc-part1
    ali=exp/tri4_ali_nodup
    $cuda_cmd $dir/log/train_nnet.log \
      steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 80 --apply-cmvn true \
        --copy-feats false \
        --feat-type traps --splice 5 --traps-dct-basis 6 --learn-rate 0.008 \
      data-fbank/train_nodup_tr90 data-fbank/train_nodup_cv10 data/lang ${ali} ${ali} $dir || exit 1;
  fi
  if [ $stage -le 4 ]; then
  # Compose the feature_transform for the next stage,
  # - the remaining part of the first network is kept fixed
    dir=exp/nnet5b_uc-part1
    feature_transform=$dir/final.feature_transform.part1
    nnet-concat $dir/final.feature_transform \
      "nnet-copy --remove-last-layers=4 --binary=false $dir/final.nnet - |" \
      "utils/nnet/gen_splice.py --fea-dim=80 --splice=2 --splice-step=5 |" \
      $feature_transform || exit 1
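
  # Optionally inspect the composed transform (a sketch; 'nnet-info' is a
  # standard Kaldi nnet1 binary that prints a per-component summary):
  #   nnet-info $feature_transform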
    
    # 2nd network, overall context +/-15 frames
    # - the topology will be 400_1500_1500_30_1500_NSTATES, again, the bottleneck is linear
    dir=exp/nnet5b_uc-part2
    ali=exp/tri4_ali_nodup
    $cuda_cmd $dir/log/train_nnet.log \
      steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 30 --apply-cmvn true \
      --feature-transform $feature_transform --learn-rate 0.008 \
      data-fbank/train_nodup_tr90 data-fbank/train_nodup_cv10 data/lang ${ali} ${ali} $dir || exit 1;
  fi
  #
  #########################################################################################
  
  if [ $stage -le 5 ]; then
    # Store the BN-features
    data=data-bn/nnet5b_uc-part2 
    srcdata=data-fbank
    nnet=exp/nnet5b_uc-part2
    
    # all evaluation sets
    for eval_num in eval1 eval2 eval3 $dev_set ;do
        steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 10 $data/$eval_num $srcdata/$eval_num \
  	  $nnet $data/$eval_num/log $data/$eval_num/data || exit 1
    done
  # training data (full set)
    steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 10 $data/train $srcdata/train \
      $nnet $data/train/log $data/train/data || exit 1
  
    # Compute CMVN of the BN-features
    dir=data-bn/nnet5b_uc-part2/train
    steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1;
  
    for eval_num in eval1 eval2 eval3 $dev_set ;do
        dir=data-bn/nnet5b_uc-part2/$eval_num
        steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1;
    done
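
  # Optional sanity check (a sketch): with --bn-dim 30 in the second network,
  # the stored bottleneck features should be 30-dimensional.
  #   feat-to-dim scp:data-bn/nnet5b_uc-part2/train/feats.scp -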
  fi
  
  if [ $stage -le 6 ]; then
  # Prepare the same BN-feature subsets as for the MFCCs in run.sh
    data=data-bn/nnet5b_uc-part2
    
    if $use_dev ;then
        dev_set=train_dev
        # Use the first 4k sentences as dev set.
        utils/subset_data_dir.sh --first $data/train 4000 $data/$dev_set # 6hr 31min
        n=$[`cat data/train/segments | wc -l` - 4000]
        utils/subset_data_dir.sh --last $data/train $n $data/train_nodev
    else
        cp -r $data/train $data/train_nodev
    fi
  
    # Prepare data for training mono
  # Take a 30k subset of the 100k shortest utterances.
    utils/subset_data_dir.sh --shortest $data/train_nodev 100000 $data/train_100kshort
    utils/subset_data_dir.sh $data/train_100kshort 30000 $data/train_30kshort
  
    # Take the first 100k utterances (just under half the data); we'll use
    # this for later stages of training.
    utils/subset_data_dir.sh --first $data/train_nodev 100000 $data/train_100k
    local/remove_dup_utts.sh 200 $data/train_100k $data/train_100k_nodup
  
    # Full dataset
    local/remove_dup_utts.sh 300 $data/train_nodev $data/train_nodup
  fi
  
  
  # Start building the tandem GMM system
  # - train from mono to tri4, then run bMMI training
  bndata=data-bn/nnet5b_uc-part2
  
  if [ $stage -le 7 ]; then
    steps/tandem/train_mono.sh --nj 10 --cmd "$train_cmd" \
      data/train_30kshort $bndata/train_30kshort data/lang exp/tandem2uc-mono0a || exit 1;
  
    steps/tandem/align_si.sh --nj 10 --cmd "$train_cmd" \
       data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-mono0a exp/tandem2uc-mono0a_ali || exit 1;
  
    steps/tandem/train_deltas.sh --cmd "$train_cmd" \
        3200 30000 data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-mono0a_ali exp/tandem2uc-tri1 || exit 1;
     
    utils/mkgraph.sh data/lang_csj_tg exp/tandem2uc-tri1 exp/tandem2uc-tri1/graph_csj_tg
  
    for eval_num in eval1 eval2 eval3 $dev_set ;do
        steps/tandem/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_tandem.config \
  	  exp/tandem2uc-tri1/graph_csj_tg data/$eval_num $bndata/$eval_num exp/tandem2uc-tri1/decode_${eval_num}_csj
    done
  fi
  
  if [ $stage -le 8 ]; then
    steps/tandem/align_si.sh --nj 10 --cmd "$train_cmd" \
       data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-tri1 exp/tandem2uc-tri1_ali || exit 1;
  
    steps/tandem/train_deltas.sh --cmd "$train_cmd" \
       4000 70000 data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-tri1_ali exp/tandem2uc-tri2 || exit 1;
  
    utils/mkgraph.sh data/lang_csj_tg exp/tandem2uc-tri2 exp/tandem2uc-tri2/graph_csj_tg || exit 1;
    for eval_num in eval1 eval2 eval3 $dev_set ;do
        steps/tandem/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_tandem.config \
  	  exp/tandem2uc-tri2/graph_csj_tg data/$eval_num $bndata/$eval_num exp/tandem2uc-tri2/decode_${eval_num}_csj || exit 1;
    done
  fi
  
  if [ $stage -le 9 ]; then
    steps/tandem/align_si.sh --nj 10 --cmd "$train_cmd" \
      data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri2 exp/tandem2uc-tri2_ali || exit 1;
  
    # Train tri3, which is LDA+MLLT, on train_nodup data.
    steps/tandem/train_lda_mllt.sh --cmd "$train_cmd" \
       --splice-opts "--left-context=3 --right-context=3" \
       6000 140000 data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri2_ali exp/tandem2uc-tri3 || exit 1;
  
    utils/mkgraph.sh data/lang_csj_tg exp/tandem2uc-tri3 exp/tandem2uc-tri3/graph_csj_tg || exit 1;
    for eval_num in eval1 eval2 eval3 $dev_set ;do
        steps/tandem/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_tandem.config \
  	  exp/tandem2uc-tri3/graph_csj_tg data/$eval_num $bndata/$eval_num exp/tandem2uc-tri3/decode_${eval_num}_csj || exit 1;
    done
  fi
  
  if [ $stage -le 10 ]; then
  # From now on, we start building a more serious system (with SAT),
  # and we'll do the alignment with fMLLR.
    steps/tandem/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
      data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri3 exp/tandem2uc-tri3_ali_nodup || exit 1;
  
    steps/tandem/train_sat.sh  --cmd "$train_cmd" \
      11500 200000 data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri3_ali_nodup exp/tandem2uc-tri4 || exit 1;
  
    utils/mkgraph.sh data/lang_csj_tg exp/tandem2uc-tri4 exp/tandem2uc-tri4/graph_csj_tg || exit 1
    for eval_num in eval1 eval2 eval3 $dev_set ;do
        steps/tandem/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_tandem.config \
  	  exp/tandem2uc-tri4/graph_csj_tg data/$eval_num $bndata/$eval_num exp/tandem2uc-tri4/decode_${eval_num}_csj || exit 1
    done
  fi
  
  # bMMI training, starting from the system in tandem2uc-tri4 and using the full dataset.
  if [ $stage -le 11 ]; then
    steps/tandem/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
      data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4 exp/tandem2uc-tri4_ali || exit 1;
    steps/tandem/make_denlats.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tandem2uc-tri4_ali \
      --sub-split 100 data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4 exp/tandem2uc-tri4_denlats || exit 1;
  fi
  if [ $stage -le 12 ]; then
    steps/tandem/train_mmi.sh --cmd "$decode_cmd" --boost 0.1 --acwt 0.039 \
      data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4_{ali,denlats} exp/tandem2uc-tri4_mmi_b0.1 || exit 1;
  
    for eval_num in eval1 eval2 eval3 $dev_set ;do
        steps/tandem/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_tandem.config \
  	  --transform-dir exp/tandem2uc-tri4/decode_${eval_num}_csj \
  	  exp/tandem2uc-tri4/graph_csj_tg data/$eval_num $bndata/$eval_num exp/tandem2uc-tri4_mmi_b0.1/decode_${eval_num}_csj || exit 1;
    done
  fi
  
  echo success
  exit 0
  
  # Getting results [see RESULTS file]
  # for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
  # We use the config parameters of the swbd recipe.
  :<<EOF
  === evaluation set 1 ===
  %WER 16.46 [ 4285 / 26028, 556 ins, 910 del, 2819 sub ] exp/tandem2uc-tri1/decode_eval1_csj/wer_20_0.0
  %WER 15.32 [ 3987 / 26028, 523 ins, 825 del, 2639 sub ] exp/tandem2uc-tri2/decode_eval1_csj/wer_20_0.0
  %WER 14.28 [ 3718 / 26028, 475 ins, 744 del, 2499 sub ] exp/tandem2uc-tri3/decode_eval1_csj/wer_20_0.0
  %WER 13.51 [ 3517 / 26028, 450 ins, 738 del, 2329 sub ] exp/tandem2uc-tri4/decode_eval1_csj/wer_20_0.5
  %WER 14.93 [ 3885 / 26028, 584 ins, 711 del, 2590 sub ] exp/tandem2uc-tri4/decode_eval1_csj.si/wer_20_0.0
  %WER 12.42 [ 3232 / 26028, 399 ins, 671 del, 2162 sub ] exp/tandem2uc-tri4_mmi_b0.1/decode_eval1_csj/wer_20_1.0
  === evaluation set 2 ===
  %WER 12.54 [ 3343 / 26661, 474 ins, 525 del, 2344 sub ] exp/tandem2uc-tri1/decode_eval2_csj/wer_20_0.0
  %WER 12.19 [ 3250 / 26661, 371 ins, 596 del, 2283 sub ] exp/tandem2uc-tri2/decode_eval2_csj/wer_20_1.0
  %WER 11.19 [ 2984 / 26661, 354 ins, 511 del, 2119 sub ] exp/tandem2uc-tri3/decode_eval2_csj/wer_20_0.5
  %WER 9.96 [ 2655 / 26661, 349 ins, 427 del, 1879 sub ] exp/tandem2uc-tri4/decode_eval2_csj/wer_20_0.5
  %WER 11.96 [ 3188 / 26661, 504 ins, 427 del, 2257 sub ] exp/tandem2uc-tri4/decode_eval2_csj.si/wer_20_0.0
  %WER 9.30 [ 2480 / 26661, 312 ins, 387 del, 1781 sub ] exp/tandem2uc-tri4_mmi_b0.1/decode_eval2_csj/wer_20_1.0
  === evaluation set 3 ===
  %WER 18.19 [ 3127 / 17189, 555 ins, 510 del, 2062 sub ] exp/tandem2uc-tri1/decode_eval3_csj/wer_20_0.5
  %WER 17.80 [ 3060 / 17189, 522 ins, 535 del, 2003 sub ] exp/tandem2uc-tri2/decode_eval3_csj/wer_20_1.0
  %WER 15.88 [ 2729 / 17189, 520 ins, 423 del, 1786 sub ] exp/tandem2uc-tri3/decode_eval3_csj/wer_20_0.5
  %WER 14.88 [ 2557 / 17189, 556 ins, 359 del, 1642 sub ] exp/tandem2uc-tri4/decode_eval3_csj/wer_20_0.5
  %WER 17.03 [ 2927 / 17189, 592 ins, 417 del, 1918 sub ] exp/tandem2uc-tri4/decode_eval3_csj.si/wer_20_1.0
  %WER 13.44 [ 2311 / 17189, 430 ins, 340 del, 1541 sub ] exp/tandem2uc-tri4_mmi_b0.1/decode_eval3_csj/wer_20_1.0
  EOF