  #!/bin/bash
  
  stage=1
  train_stage=-10
  use_gpu=true
  dir=exp/nnet2_online/nnet_perturbed
  
  
  . ./cmd.sh
  . ./path.sh
  . ./utils/parse_options.sh
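
  # This script can be re-run from any stage via the options above, e.g.
  # (illustrative invocation):
  # local/online/run_nnet2_perturbed.sh --stage 3 --use-gpu false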
  
  
  
  if $use_gpu; then
    if ! cuda-compiled; then
      cat <<EOF && exit 1
  This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
  If you want to use GPUs (and have them), go to src/ and run "configure" and "make" on a
  machine where "nvcc" is installed.  Otherwise, call this script with --use-gpu false.
  EOF
    fi
    parallel_opts="--gpu 1"
    num_threads=1
    minibatch_size=512
  else
    # Use 4 nnet jobs just as in run_4d_gpu.sh, so the results should be
    # almost the same, though training may be a bit slower.
    num_threads=16
    minibatch_size=128
    parallel_opts="--num-threads $num_threads"
  fi
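
  # Note: $parallel_opts is passed through to the job-submission wrapper named in
  # $train_cmd (run.pl or queue.pl); e.g. with queue.pl, "--gpu 1" requests a GPU
  # slot for each training job, and "--num-threads N" requests N CPU slots.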
  
  
  if [ $stage -le 1 ]; then
    # Note: if you've already run run_online_decoding_nnet2.sh you can
    # skip this stage.
    mkdir -p exp/nnet2_online
    steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
      data/train 256 exp/tri3b exp/nnet2_online/diag_ubm
  fi
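
  # Optional sanity check: print a summary of the trained UBM, e.g.:
  # gmm-global-info exp/nnet2_online/diag_ubm/final.dubm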
  
  if [ $stage -le 2 ]; then
    # Note: if you've already run run_online_decoding_nnet2.sh you can
    # skip this stage.
    # Use a smaller iVector dim (50) than the default (100) because RM has a
    # very small amount of data.
    steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
      --ivector-dim 50 \
     data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
  fi
  
  if [ $stage -le 3 ]; then
    # Dump perturbed versions of the features and store them in a
    # sub-directory of the experiment directory.
    featdir=exp/perturbed_mfcc/feats; mkdir -p $featdir
    if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
      utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$featdir $featdir/storage
    fi
    # We can afford to run 80 jobs as we have 4 separate machines for storage.
    steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc --nj 80 \
      conf/mfcc.conf "$featdir" exp/perturbed_mfcc data/train data/train_perturbed_mfcc
  fi
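
  # Optional sanity checks: the perturbed data directory should validate, and since
  # get_perturbed_feats.sh creates several perturbed copies of the data, its feats.scp
  # should be a few times larger than the original:
  # utils/validate_data_dir.sh data/train_perturbed_mfcc
  # wc -l data/train/feats.scp data/train_perturbed_mfcc/feats.scp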
  
  
  if [ $stage -le 4 ]; then
    # Align the perturbed features.
    steps/align_fmllr.sh --nj 80 --cmd "$train_cmd" \
      data/train_perturbed_mfcc data/lang exp/tri3b exp/tri3b_ali_perturbed_mfcc
  fi
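
  # A rough check of alignment quality is the per-frame log-likelihood reported
  # in the alignment logs, e.g.:
  # grep 'Overall log-likelihood' exp/tri3b_ali_perturbed_mfcc/log/align_pass2.*.log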
  
  ivectordir=exp/nnet2_online/ivectors_perturbed_mfcc
  if [ $stage -le 5 ]; then
    # Extract iVectors for the perturbed features.
    if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
      utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$ivectordir $ivectordir/storage
    fi
    # Below, setting --utts-per-spk-max to a non-integer value helps to randomize the
    # division of speakers into "fake speakers" of about 2 utterances each, by randomly
    # making some have 2 and some 3 utterances; this randomness will differ between
    # the different perturbed copies of the data.
    steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
       data/train_perturbed_mfcc_max2.5
  
    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
      data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
  fi
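
  # Optional check: the extracted online iVectors should have the dimension
  # configured above (50), e.g.:
  # feat-to-dim scp:$ivectordir/ivector_online.scp -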
  
  
  if [ $stage -le 6 ]; then
    if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
      # dir is the neural-net training dir.
      utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$dir/egs $dir/egs/storage
    fi
    # The --max-jobs-run 15 option allows more of the egs-dumping jobs than the default
    # (5), since we have 4 filesystems to access.  We reduce the number of epochs since
    # we have more data and we don't want to slow down the training too much, and we
    # also reduce the final learning rate (when we have a lot of data we like a ratio of
    # 10 between the initial and final learning rates).  We also set --add-layers-period
    # 2, which is typical when we have enough data, and increase the number of hidden
    # layers and pnorm dimensions vs. run_online_decoding_nnet2.sh since we have more data.
    steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
      --splice-width 7 \
      --feat-type raw \
      --online-ivector-dir exp/nnet2_online/ivectors_perturbed_mfcc \
      --cmvn-opts "--norm-means=false --norm-vars=false" \
      --num-threads "$num_threads" \
      --minibatch-size "$minibatch_size" \
      --parallel-opts "$parallel_opts" \
      --io-opts "--max-jobs-run 15" \
      --num-jobs-nnet 4 \
      --num-epochs 5 --num-epochs-extra 2 \
      --add-layers-period 2 \
      --num-hidden-layers 3 \
      --mix-up 4000 \
      --initial-learning-rate 0.02 --final-learning-rate 0.002 \
      --cmd "$decode_cmd" \
      --pnorm-input-dim 1200 \
      --pnorm-output-dim 200 \
      data/train_perturbed_mfcc data/lang exp/tri3b_ali_perturbed_mfcc $dir  || exit 1;
  fi
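
  # While this runs, training progress can be monitored from the logs, e.g. the
  # validation-set objective per iteration:
  # tail $dir/log/compute_prob_valid.*.log
  # and the structure of the final network can be printed with:
  # nnet-am-info $dir/final.mdl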
  
  # This time we don't bother testing offline decoding, only online.
  
  if [ $stage -le 7 ]; then
    # If this setup used PLP features, we'd have to give the option --feature-type plp
    # to the script below.
    steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
      "$dir" ${dir}_online || exit 1;
  fi
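
  # At this point ${dir}_online contains the final model together with
  # conf/online_nnet2_decoding.conf, which records the feature and iVector
  # extraction settings; the decode script below reads this directory directly.
  # As a rough sketch (illustrative paths, single test recording), a recording
  # can also be decoded from the command line like this:
  # online2-wav-nnet2-latgen-faster \
  #   --config=${dir}_online/conf/online_nnet2_decoding.conf \
  #   ${dir}_online/final.mdl exp/tri3b/graph/HCLG.fst \
  #   'ark:echo spk1 utt1|' 'scp:echo utt1 /path/to/some.wav|' ark:/dev/null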
  
  if [ $stage -le 8 ]; then
    # Do the actual online decoding with iVectors.
    steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
      exp/tri3b/graph data/test ${dir}_online/decode &
    steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
      exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
    wait
  fi
  
  if [ $stage -le 9 ]; then
    # This version of the decoding treats each utterance separately,
    # without carrying forward speaker adaptation information.
    steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
      --per-utt true \
      exp/tri3b/graph data/test ${dir}_online/decode_per_utt &
    steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
      --per-utt true \
      exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_per_utt || exit 1;
    wait
  fi
  
  exit 0;
  
  # Results of the experiment (with GPU):
  
  # per-speaker (carrying adaptation info forward):
  #for x in exp/nnet2_online/nnet_gpu_perturbed_online/decode*; do grep WER $x/wer_* | utils/best_wer.sh ; done
  #%WER 1.62 [ 203 / 12533, 20 ins, 41 del, 142 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode/wer_5
  #%WER 8.97 [ 1124 / 12533, 87 ins, 204 del, 833 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug/wer_11
  
   # Note: this is the baseline without feature perturbation, from ./run_nnet2.sh
   # (though with a different hidden-layer configuration).
   #%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
   #%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
  
  
  # per-utterance:
  #%WER 1.85 [ 232 / 12533, 23 ins, 45 del, 164 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_per_utt/wer_5
  #%WER 9.17 [ 1149 / 12533, 118 ins, 174 del, 857 sub ] exp/nnet2_online/nnet_gpu_perturbed_online/decode_ug_per_utt/wer_9
  
   # This is the per-utterance baseline without feature perturbation, from ./run_nnet2.sh
   # (though with a different hidden-layer configuration).
   #%WER 2.21 [ 277 / 12533, 45 ins, 48 del, 184 sub ] exp/nnet2_online/nnet_gpu_online/decode_per_utt/wer_4
   #%WER 10.27 [ 1287 / 12533, 142 ins, 186 del, 959 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug_per_utt/wer_10