Yannick Estève / ONTRAC-Kaldi

Blame view

egs/tunisian_msa/s5/local/nnet3/run_ivector_common.sh 5.12 KB
  #!/bin/bash
  
  set -euo pipefail
  
  # This script is called from local/nnet3/run_tdnn.sh and
  # local/chain/run_tdnn.sh (and may eventually be called by more
  # scripts).  It contains the common feature preparation and
  # iVector-related parts of the script.  See those scripts for examples
  # of usage.
  
  stage=0
  train_set=train
  test_sets="devtest test"
  gmm=tri3b
  
  nnet3_affix=
  
  . ./cmd.sh
  . ./path.sh
  . utils/parse_options.sh
  
  gmm_dir=exp/${gmm}
  ali_dir=exp/${gmm}_ali_${train_set}_sp
  
  for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
    if [ ! -f $f ]; then
      echo "$0: expected file $f to exist"
      exit 1
    fi
  done
  
  if [ $stage -le 1 ]; then
      # perturb data to get alignments
      # nnet will be trained by high resolution data
      # _sp stands for speed-perturbed
      echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
      utils/data/perturb_data_dir_speed_3way.sh \
  	data/${train_set} \
  	data/${train_set}_sp
      echo "$0: making mfcc features for low-resolution speed-perturbed data"
      steps/make_mfcc.sh \
  	--cmd "$train_cmd" \
  	--nj 10 \
  	data/${train_set}_sp
      steps/compute_cmvn_stats.sh \
  	data/${train_set}_sp
      utils/fix_data_dir.sh \
  	data/${train_set}_sp
  fi
  
  if [ $stage -le 2 ]; then
      echo "$0: aligning with the perturbed low-resolution data"
      steps/align_fmllr.sh \
  	--nj 20 \
  	--cmd "$train_cmd" \
  	data/${train_set}_sp \
  	data/lang \
  	$gmm_dir \
  	$ali_dir
  fi
  
  if [ $stage -le 3 ]; then
      # Create high-resolution MFCC features (with 40 cepstra instead of 13).
  
      echo "$0: creating high-resolution MFCC features"
      mfccdir=data/${train_set}_sp_hires/data
      for datadir in ${train_set}_sp ${test_sets}; do
  	utils/copy_data_dir.sh \
  	    data/$datadir \
  	    data/${datadir}_hires
      done
  
      # do volume-perturbation on the training data prior to extracting hires
      # features; this helps make trained nnets more invariant to test data volume.
      utils/data/perturb_data_dir_volume.sh \
  	data/${train_set}_sp_hires
  
      for datadir in ${train_set}_sp ${test_sets}; do
  	steps/make_mfcc.sh \
  	    --nj 10 \
  	    --mfcc-config conf/mfcc_hires.conf \
  	    --cmd "$train_cmd" \
  	    data/${datadir}_hires
  	steps/compute_cmvn_stats.sh \
  	    data/${datadir}_hires
  	utils/fix_data_dir.sh \
  	    data/${datadir}_hires
      done
  fi
  
  if [ $stage -le 4 ]; then
      echo "$0: computing a subset of data to train the diagonal UBM."
      # We'll use about a quarter of the data.
      mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
      temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
  
      num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
      num_utts=$[$num_utts_total/4]
      utils/data/subset_data_dir.sh \
  	data/${train_set}_sp_hires \
  	$num_utts \
  	${temp_data_root}/${train_set}_sp_hires_subset
  
      echo "$0: computing a PCA transform from the hires data."
      steps/online/nnet2/get_pca_transform.sh \
  	--cmd "$train_cmd" \
  	--splice-opts "--left-context=3 --right-context=3" \
  	--max-utts 10000 \
  	--subsample 2 \
  	${temp_data_root}/${train_set}_sp_hires_subset \
  	exp/nnet3${nnet3_affix}/pca_transform
  
      echo "$0: training the diagonal UBM."
      # Use 512 Gaussians in the UBM.
      steps/online/nnet2/train_diag_ubm.sh \
  	--cmd "$train_cmd" \
  	--nj 20 \
  	--num-frames 700000 \
  	--num-threads 8 \
  	${temp_data_root}/${train_set}_sp_hires_subset \
  	512 \
  	exp/nnet3${nnet3_affix}/pca_transform \
  	exp/nnet3${nnet3_affix}/diag_ubm
  fi
  
  if [ $stage -le 5 ]; then
      # Train the iVector extractor.
      # Use all the speed-perturbed data .
      # iVector extractors can be sensitive to the amount of data.
      # The script defaults to an iVector dimension of 100.
      echo "$0: training the iVector extractor"
      steps/online/nnet2/train_ivector_extractor.sh \
  	--cmd "$train_cmd" \
  	--nj 10 \
  	data/${train_set}_sp_hires \
  	exp/nnet3${nnet3_affix}/diag_ubm \
  	exp/nnet3${nnet3_affix}/extractor
  fi
  
  # combine   and train system on short segments.
  # extract iVectors on speed-perturbed training data
  # With --utts-per-spk-max 2, script pairs  utterances into twos.
  # Treats each  pair as one speaker.
  # Gives more diversity in iVectors.
  # Extracted online.
  
  # note: extract  ivectors from max2 data
  # Why is max2 not encoded in ivectordir name?
  # valid for non-max2 data
  #  utterance list is the same.
  
  # having a larger number of speakers is helpful for generalization, and to
  # handle per-utterance decoding well (iVector starts at zero).
  
  if [ $stage -le 6 ]; then
      ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
      temp_data_root=${ivectordir}
      utils/data/modify_speaker_info.sh \
  	--utts-per-spk-max 2 \
  	data/${train_set}_sp_hires \
  	${temp_data_root}/${train_set}_sp_hires_max2
  
      steps/online/nnet2/extract_ivectors_online.sh \
  	--cmd "$train_cmd" \
  	--nj 20 \
  	${temp_data_root}/${train_set}_sp_hires_max2 \
  	exp/nnet3${nnet3_affix}/extractor \
  	$ivectordir
  fi
  
  # Also extract iVectors for test data.
  # No need for speed perturbation (sp).
  
  if [ $stage -le 7 ]; then
      for data in $test_sets; do
  	steps/online/nnet2/extract_ivectors_online.sh \
  	    --cmd "$train_cmd" \
  	    --nj 1 \
  	    data/${data}_hires \
  	    exp/nnet3${nnet3_affix}/extractor \
  	    exp/nnet3${nnet3_affix}/ivectors_${data}_hires
      done
  fi
  
  exit 0