egs/rm/s5/local/nnet/run_dummy_ivec.sh

  #!/bin/bash
  
  # Copyright 2015  Brno University of Technology (Author: Karel Vesely)
  # Apache 2.0
  
  # This example demonstrates how to add an i-vector to the DNN input (or any other side-info).
  # A fixed vector is pasted to all the frames of an utterance and forwarded to the nn-input `as-is',
  # bypassing both the feature transform and the global CMVN normalization.
  #
  # The i-vector is simulated by a dummy vector [ 0 0 0 ];
  # note that all the scripts get an extra option '--ivector'.
  #
  # First we train the NN without RBM pre-training, then we do the full recipe:
  # RBM pre-training, per-frame training, and sequence-discriminative training.
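  #
  # To swap in real i-vectors you would point $ivector (set below) at a Kaldi
  # rspecifier holding one vector per utterance, covering both the train and test
  # keys just like the dummy ark does; the path here is only a hypothetical example:
  #   ivector=scp:exp/ivectors/ivector.scp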
  
  # Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2;
  # the value 0.1 is better both for decoding and sMBR.
  
  . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
             ## This relates to the queue.
  
  . ./path.sh ## Source the tools/utils (import the queue.pl)
  
  dev=data-fbank/test
  train=data-fbank/train
  
  dev_original=data/test
  train_original=data/train
  
  gmm=exp/tri3b
  
  stage=0
  . utils/parse_options.sh
  
  set -uexo pipefail
  
  # Make the FBANK features (the whole block is skipped if $dev already exists),
  [ ! -e $dev ] && if [ $stage -le 0 ]; then
    # Dev set
    utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp
    steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
       $dev $dev/log $dev/data
    steps/compute_cmvn_stats.sh $dev $dev/log $dev/data
    # Training set
    utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp
    steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \
       $train $train/log $train/data
    steps/compute_cmvn_stats.sh $train $train/log $train/data
    # Split the training set
    utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
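    # Optional sanity check of the freshly made dirs; utils/validate_data_dir.sh
    # is part of the standard Kaldi utils/ (uncomment to enable):
    # utils/validate_data_dir.sh ${train}_tr90
    # utils/validate_data_dir.sh ${train}_cv10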
  fi
  
  # Create the ark with dummy i-vectors, one [ 0 0 0 ] per utterance,
  [ ! -e data/dummy_ivec.ark ] && cat {$train,$dev}/feats.scp | awk '{ print $1, "[ 0 0 0 ]"; }' >data/dummy_ivec.ark
  ivector=ark:data/dummy_ivec.ark
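  # The ark is plain text, one line per utterance in the form "<utt-id> [ 0 0 0 ]";
  # for a quick peek: head -n 3 data/dummy_ivec.ark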
  
  # 1) Build NN, no pre-training (script test),
  if [ $stage -le 1 ]; then
    # Train the DNN optimizing per-frame cross-entropy.
    dir=exp/dnn4h-dummy-ivec
    ali=${gmm}_ali
    # Train
    $cuda_cmd $dir/log/train_nnet.log \
      steps/nnet/train.sh --hid-layers 4 --hid-dim 1024 --learn-rate 0.008 \
      --ivector $ivector \
      --cmvn-opts "--norm-means=true --norm-vars=true" \
      --delta-opts "--delta-order=2" --splice 5 \
      ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir
    # Decode (reuse HCLG graph)
    steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
      --ivector $ivector \
      $gmm/graph $dev $dir/decode
  fi
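  # A quick look at the result of this system (same pattern as the RESULTS scan
  # at the bottom of this file):
  # grep WER exp/dnn4h-dummy-ivec/decode/wer_* | utils/best_wer.sh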
  
  # 2) Build NN, with pre-training (script test),
  if [ $stage -le 2 ]; then
    # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN)
    dir=exp/dnn4h-dummy-ivec_pretrain-dbn
    $cuda_cmd $dir/log/pretrain_dbn.log \
      steps/nnet/pretrain_dbn.sh \
        --ivector $ivector \
        --cmvn-opts "--norm-means=true --norm-vars=true" \
        --delta-opts "--delta-order=2" --splice 5 \
        --hid-dim 1024 --rbm-iter 20 $train $dir
  fi
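  # Stage 3 below picks up $dir/final.feature_transform and $dir/6.dbn produced here.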
  
  if [ $stage -le 3 ]; then
    # Train the DNN optimizing per-frame cross-entropy.
    dir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn
    ali=${gmm}_ali
    feature_transform=exp/dnn4h-dummy-ivec_pretrain-dbn/final.feature_transform
    dbn=exp/dnn4h-dummy-ivec_pretrain-dbn/6.dbn
    # Train
    $cuda_cmd $dir/log/train_nnet.log \
      steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
      --ivector $ivector \
      ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir
    # Decode (reuse HCLG graph)
    steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
      --ivector $ivector \
      $gmm/graph $dev $dir/decode
  fi
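  # Comparing $dir/decode with exp/dnn4h-dummy-ivec/decode shows what the RBM
  # pre-training buys on this setup.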
  
  
  # Sequence-discriminative training using the sMBR criterion; we do stochastic-GD with per-utterance updates.
  # Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2;
  # the value 0.1 is better both for decoding and sMBR.
  dir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr
  srcdir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn
  acwt=0.1
  
  if [ $stage -le 4 ]; then
    # First we generate lattices and alignments:
    steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \
      --ivector $ivector \
      $train data/lang $srcdir ${srcdir}_ali
    steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
      --ivector $ivector \
      $train data/lang $srcdir ${srcdir}_denlats
  fi
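  # The alignments (${srcdir}_ali) and denominator lattices (${srcdir}_denlats)
  # generated above are the inputs to the sMBR re-training in the next stage.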
  
  if [ $stage -le 5 ]; then
  # Re-train the DNN with 6 iterations of sMBR,
    steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \
      --ivector $ivector \
      $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
    # Decode
    for ITER in 1 3 6; do
      steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \
        --ivector $ivector \
        --nnet $dir/${ITER}.nnet --acwt $acwt \
        $gmm/graph $dev $dir/decode_it${ITER} || exit 1
    done 
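    # The decode_it{1,3,6} dirs let you track WER across the sMBR iterations;
    # the RESULTS scan at the bottom picks the best LMWT in each decode dir.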
  fi
  
  echo Success
  exit 0
  
  # Getting results [see RESULTS file]
  # for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done