Blame view
egs/rm/s5/local/online/run_nnet2_perturbed.sh
7.28 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
#!/bin/bash

# Trains and tests an nnet2 p-norm system for online decoding on feature-
# perturbed MFCC data.  Stages, in order:
#   1  train a diagonal UBM
#   2  train the iVector extractor (dim 50)
#   3  dump perturbed MFCC features
#   4  align the perturbed features with exp/tri3b
#   5  extract online iVectors for the perturbed data
#   6  train the neural net (steps/nnet2/train_pnorm_fast.sh)
#   7  prepare the online-decoding directory
#   8  online decoding, per-speaker
#   9  online decoding, per-utterance
#
# The variables below are defaults set before utils/parse_options.sh is
# sourced; presumably each can be overridden on the command line (the CUDA
# error message below refers to "--use-gpu false").

stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_perturbed

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if $use_gpu; then
  if ! cuda-compiled; then
    cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
  fi
  parallel_opts="--gpu 1"
  num_threads=1
  minibatch_size=512
else
  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
  # almost the same, but this may be a little bit slow.
  num_threads=16
  minibatch_size=128
  parallel_opts="--num-threads $num_threads"
fi

if [ "$stage" -le 1 ]; then
  # Note: if you've already run run_online_decoding_nnet2.sh you can
  # skip this stage.
  mkdir -p exp/nnet2_online
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
    data/train 256 exp/tri3b exp/nnet2_online/diag_ubm || exit 1;
fi

if [ "$stage" -le 2 ]; then
  # Note: if you've already run run_online_decoding_nnet2.sh you can
  # skip this stage.
  # use a smaller iVector dim (50) than the default (100) because RM has a very
  # small amount of data.
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
    --ivector-dim 50 \
    data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi

if [ "$stage" -le 3 ]; then
  # Dump perturbed versions of the features.
  # store them in a sub-directory of the experimental directory.
  featdir=exp/perturbed_mfcc/feats; mkdir -p $featdir
  # ${USER:-} keeps the test well-formed when USER is unset or empty.
  if [ "${USER:-}" == dpovey ]; then # this shows how you can split across multiple file-systems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$featdir $featdir/storage
  fi
  # We can afford to run 80 jobs as we have 4 separate machines for storage.
  steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc --nj 80 \
    conf/mfcc.conf "$featdir" exp/perturbed_mfcc data/train data/train_perturbed_mfcc || exit 1;
fi

if [ "$stage" -le 4 ]; then
  # Align the perturbed features.
  steps/align_fmllr.sh --nj 80 --cmd "$train_cmd" \
    data/train_perturbed_mfcc data/lang exp/tri3b exp/tri3b_ali_perturbed_mfcc || exit 1;
fi

# Defined outside the stage guards because both stage 5 (extraction) and
# stage 6 (training) refer to it.
ivectordir=exp/nnet2_online/ivectors_perturbed_mfcc

if [ "$stage" -le 5 ]; then
  # Extract iVectors for the perturbed features.
  if [ "${USER:-}" == dpovey ]; then # this shows how you can split across multiple file-systems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$ivectordir $ivectordir/storage
  fi
  # Below, setting --utts-per-spk-max to a noninteger helps to randomize the division
  # of speakers into "fake-speakers" with about 2 utterances each, by randomly making
  # some have 2 and some 3 utterances... this randomness will be different in different
  # copies of the data.
  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
    data/train_perturbed_mfcc_max2.5 || exit 1;

  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
    data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
fi

if [ "$stage" -le 6 ]; then
  if [ "${USER:-}" == dpovey ]; then # this shows how you can split across multiple file-systems.
    # dir is the neural-net training dir.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$dir/egs $dir/egs/storage
  fi

  # the --max-jobs-run 15 allows more of the dump_egs jobs than the default (5), since we
  # have 4 filesystems to access. We reduce the number of epochs since we have
  # more data and we don't want to slow down the training too much, and we also
  # reduce the final learning rate (when we have a lot of data we like a ratio of 10
  # between the initial and final learning rate). I also have --add-layers-period 2
  # which is typical when we have enough data, and increase the number of hidden layers
  # and pnorm dimensions vs. run_online_decoding_nnet2.sh since we have more data.
  steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
    --splice-width 7 \
    --feat-type raw \
    --online-ivector-dir "$ivectordir" \
    --cmvn-opts "--norm-means=false --norm-vars=false" \
    --num-threads "$num_threads" \
    --minibatch-size "$minibatch_size" \
    --parallel-opts "$parallel_opts" \
    --io-opts "--max-jobs-run 15" \
    --num-jobs-nnet 4 \
    --num-epochs 5 --num-epochs-extra 2 \
    --add-layers-period 2 \
    --num-hidden-layers 3 \
    --mix-up 4000 \
    --initial-learning-rate 0.02 --final-learning-rate 0.002 \
    --cmd "$decode_cmd" \
    --pnorm-input-dim 1200 \
    --pnorm-output-dim 200 \
    data/train_perturbed_mfcc data/lang exp/tri3b_ali_perturbed_mfcc $dir || exit 1;
fi

# This time we don't bother testing with offline decoding, only with online.

if [ "$stage" -le 7 ]; then
  # If this setup used PLP features, we'd have to give the option --feature-type plp
  # to the script below.
  steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
    "$dir" ${dir}_online || exit 1;
fi

if [ "$stage" -le 8 ]; then
  # do the actual online decoding with iVectors.
  # The first decode runs in the background; its PID is kept so that its exit
  # status can be checked (a bare "wait" would discard it).
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph data/test ${dir}_online/decode &
  decode_pid=$!
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
  wait "$decode_pid" || exit 1
fi

if [ "$stage" -le 9 ]; then
  # this version of the decoding treats each utterance separately
  # without carrying forward speaker information.
  # Same background/foreground pattern as stage 8.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph data/test ${dir}_online/decode_per_utt &
  decode_pid=$!
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
    --per-utt true \
    exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_per_utt || exit 1;
  wait "$decode_pid" || exit 1
fi

exit 0;

# NOTE(review): the results below were recorded under
# exp/nnet2_online/nnet_gpu_perturbed_online, whereas the default $dir above
# yields exp/nnet2_online/nnet_perturbed_online -- adjust the glob if you
# re-run the scoring loop.

# the experiment (with GPU)

# per-speaker (carrying adaptation info forward):
#for x in exp/nnet2_online/nnet_gpu_perturbed_online/decode*; do grep WER $x/wer_* | utils/best_wer.sh ; done
#%WER 1.62 [ 203 / 12533, 20 ins, 41 del, 142 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode/wer_5
#%WER 8.97 [ 1124 / 12533, 87 ins, 204 del, 833 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug/wer_11

# Note, this is the baseline with no perturbing of features, from ./run_nnet2.sh
# [different hidden-layer configuration though.]
#%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
#%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11

# per-utterance:
#%WER 1.85 [ 232 / 12533, 23 ins, 45 del, 164 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_per_utt/wer_5
#%WER 9.17 [ 1149 / 12533, 118 ins, 174 del, 857 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug_per_utt/wer_9

# this is the per-utterance baseline with no perturbing of features, from ./run_nnet2.sh
# [different hidden-layer configuration though]
#%WER 2.21 [ 277 / 12533, 45 ins, 48 del, 184 sub ] exp/nnet2_online/nnet_gpu_online/decode_per_utt/wer_4
#%WER 10.27 [ 1287 / 12533, 142 ins, 186 del, 959 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug_per_utt/wer_10