#!/bin/bash
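# run_nnet2_perturbed.sh: trains an online-nnet2 pnorm network for the RM setup on
# perturbed copies of the training features (via steps/nnet2/get_perturbed_feats.sh),
# using online iVector adaptation, and then decodes online.  Results at the bottom of
# this file compare it against the unperturbed baseline from ./run_nnet2.sh.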
. ./cmd.sh
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_perturbed
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
  if ! cuda-compiled; then
    cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.  Otherwise, call this script with --use-gpu false.
EOF
  fi
  parallel_opts="--gpu 1"
  num_threads=1
  minibatch_size=512
else
  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
  # almost the same, but this may be a little slower.
  num_threads=16
  minibatch_size=128
  parallel_opts="--num-threads $num_threads"
fi
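# Example invocation on a machine without a GPU (the script path below is an assumption;
# adjust it to wherever this script is checked in, and run it from the egs/rm/s5 directory):
#   ./run_nnet2_perturbed.sh --use-gpu false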
if [ $stage -le 1 ]; then
  # Note: if you've already run run_online_decoding_nnet2.sh, you can
  # skip this stage.
  mkdir -p exp/nnet2_online
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
    data/train 256 exp/tri3b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 2 ]; then
  # Note: if you've already run run_online_decoding_nnet2.sh, you can
  # skip this stage.
  # Use a smaller iVector dim (50) than the default (100) because RM has a very
  # small amount of data.
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
    --ivector-dim 50 \
    data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
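# For example, if the diag-UBM and iVector extractor above already exist from
# run_online_decoding_nnet2.sh, you could resume at the feature-perturbation stage
# (again, the script path is an assumption):
#   ./run_nnet2_perturbed.sh --stage 3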
if [ $stage -le 3 ]; then
  # Dump perturbed versions of the features and store them in a
  # sub-directory of the experiment directory.
  featdir=exp/perturbed_mfcc/feats; mkdir -p $featdir
  if [ $USER == dpovey ]; then  # this shows how you can split data across multiple filesystems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$featdir $featdir/storage
  fi
  # We can afford to run 80 jobs since we have 4 separate machines for storage.
  steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc --nj 80 \
    conf/mfcc.conf "$featdir" exp/perturbed_mfcc data/train data/train_perturbed_mfcc
fi
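# Note on utils/create_split_dir.pl (added explanation; this is my understanding of the
# tool): it creates the real directories named by its initial arguments and fills the
# final "storage" directory with numbered symlinks pointing at them, so jobs that write
# under $featdir/storage/ get spread across the filesystems.  A hypothetical two-disk
# example:
#   utils/create_split_dir.pl /mnt/disk1/$USER/feats /mnt/disk2/$USER/feats $featdir/storage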
if [ $stage -le 4 ]; then
  # Align the perturbed features.
  steps/align_fmllr.sh --nj 80 --cmd "$train_cmd" \
    data/train_perturbed_mfcc data/lang exp/tri3b exp/tri3b_ali_perturbed_mfcc
fi
ivectordir=exp/nnet2_online/ivectors_perturbed_mfcc
if [ $stage -le 5 ]; then
  # Extract iVectors for the perturbed features.
  if [ $USER == dpovey ]; then  # this shows how you can split data across multiple filesystems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$ivectordir $ivectordir/storage
  fi
  # Below, setting --utts-per-spk-max to a non-integer helps to randomize the division
  # of speakers into "fake speakers" of about 2 utterances each, by randomly making
  # some have 2 and some 3 utterances; this randomness will be different in different
  # copies of the data (see the worked example after this block).
  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
    data/train_perturbed_mfcc_max2.5
  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
    data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
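# Worked example of the --utts-per-spk-max 2.5 setting above (illustrative, not part of
# the original recipe): a real speaker with 10 utterances would be split into "fake
# speakers" of 2 or 3 utterances each, e.g. 3+2+3+2 in one perturbed copy of the data and
# 2+3+3+2 in another, whereas an integer value like 2 would presumably give the same
# 2+2+2+2+2 split every time.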
if [ $stage -le 6 ]; then
  if [ $USER == dpovey ]; then  # this shows how you can split data across multiple filesystems.
    # $dir is the neural-net training dir.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$dir/egs $dir/egs/storage
  fi
  # The --max-jobs-run 15 allows more of the dump-egs jobs than the default (5), since we
  # have 4 filesystems to access.  We reduce the number of epochs since we have more data
  # and don't want to slow down the training too much, and we also reduce the final
  # learning rate (when we have a lot of data we like a ratio of 10 between the initial
  # and final learning rates).  We also set --add-layers-period 2, which is typical when
  # we have enough data, and increase the number of hidden layers and pnorm dimensions
  # vs. run_online_decoding_nnet2.sh since we have more data.
  steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
    --splice-width 7 \
    --feat-type raw \
    --online-ivector-dir exp/nnet2_online/ivectors_perturbed_mfcc \
    --cmvn-opts "--norm-means=false --norm-vars=false" \
    --num-threads "$num_threads" \
    --minibatch-size "$minibatch_size" \
    --parallel-opts "$parallel_opts" \
    --io-opts "--max-jobs-run 15" \
    --num-jobs-nnet 4 \
    --num-epochs 5 --num-epochs-extra 2 \
    --add-layers-period 2 \
    --num-hidden-layers 3 \
    --mix-up 4000 \
    --initial-learning-rate 0.02 --final-learning-rate 0.002 \
    --cmd "$decode_cmd" \
    --pnorm-input-dim 1200 \
    --pnorm-output-dim 200 \
    data/train_perturbed_mfcc data/lang exp/tri3b_ali_perturbed_mfcc $dir || exit 1;
fi
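# Sanity check on the learning-rate schedule mentioned above: 0.02 / 0.002 = 10, i.e. the
# initial-to-final learning-rate ratio is exactly the factor of 10 we aim for when there
# is a lot of data.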
# This time we don't bother testing with offline decoding, only with online.
if [ $stage -le 7 ]; then
  # If this setup used PLP features, we'd have to give the option --feature-type plp
  # to the script below.
  steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
    "$dir" ${dir}_online || exit 1;
fi
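# For a PLP-based setup the call above would, as the comment notes, need --feature-type
# plp, i.e. something along the lines of:
#   steps/online/nnet2/prepare_online_decoding.sh --feature-type plp data/lang \
#     exp/nnet2_online/extractor "$dir" ${dir}_online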
if [ $stage -le 8 ]; then
  # Do the actual online decoding with iVectors.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph data/test ${dir}_online/decode &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
  wait
fi
if [ $stage -le 9 ]; then
  # This version of the decoding treats each utterance separately,
  # without carrying forward speaker information.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph data/test ${dir}_online/decode_per_utt &
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_per_utt || exit 1;
  wait
fi
exit 0;
# Results of the experiment (with GPU).
# Per-speaker decoding (carrying adaptation info forward):
#for x in exp/nnet2_online/nnet_gpu_perturbed_online/decode*; do grep WER $x/wer_* | utils/best_wer.sh ; done
#%WER 1.62 [ 203 / 12533, 20 ins, 41 del, 142 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode/wer_5
#%WER 8.97 [ 1124 / 12533, 87 ins, 204 del, 833 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug/wer_11
# Note: this is the baseline with no feature perturbation, from ./run_nnet2.sh
# (with a different hidden-layer configuration, though):
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
# Per-utterance decoding:
#%WER 1.85 [ 232 / 12533, 23 ins, 45 del, 164 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_per_utt/wer_5
#%WER 9.17 [ 1149 / 12533, 118 ins, 174 del, 857 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug_per_utt/wer_9
# This is the per-utterance baseline with no feature perturbation, from ./run_nnet2.sh
# (with a different hidden-layer configuration, though):
#%WER 2.21 [ 277 / 12533, 45 ins, 48 del, 184 sub ] exp/nnet2_online/nnet_gpu_online/decode_per_utt/wer_4
#%WER 10.27 [ 1287 / 12533, 142 ins, 186 del, 959 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug_per_utt/wer_10
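# Summary of the comparison above (arithmetic added here): per-speaker decoding improves
# from %WER 2.20 to 1.62 (about 26% relative) on the main graph and from 10.14 to 8.97
# (about 12% relative) on the unigram graph; per-utterance decoding improves from 2.21
# to 1.85 and from 10.27 to 9.17.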