#!/bin/bash

set -euo pipefail

# This script is called from local/nnet3/run_tdnn.sh and
# local/chain/run_tdnn.sh (and may eventually be called by more
# scripts).  It contains the common feature preparation and
# iVector-related parts of the script.  See those scripts for examples
# of usage.

stage=0
train_set=train
test_sets="devtest test"
gmm=tri3b

nnet3_affix=

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp

for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [ $stage -le 1 ]; then
    # Perturb the data to generate alignments; the nnet will later be trained
    # on high-resolution features extracted from this perturbed data.
    # The _sp suffix stands for "speed-perturbed".
    echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
    utils/data/perturb_data_dir_speed_3way.sh \
	data/${train_set} \
	data/${train_set}_sp
    echo "$0: making mfcc features for low-resolution speed-perturbed data"
    steps/make_mfcc.sh \
	--cmd "$train_cmd" \
	--nj 10 \
	data/${train_set}_sp
    steps/compute_cmvn_stats.sh \
	data/${train_set}_sp
    utils/fix_data_dir.sh \
	data/${train_set}_sp
fi
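
# For intuition: perturb_data_dir_speed_3way.sh keeps the original utterances
# and adds copies at 0.9x and 1.1x speed by prepending a sox pipe to each
# wav.scp entry, conceptually like the (illustrative, not executed) line:
#
#   sox -t wav original.wav -t wav - speed 0.9 |
#
# so data/${train_set}_sp ends up with roughly 3x the utterances, the
# perturbed copies carrying sp0.9-/sp1.1- style prefixes on their IDs.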

if [ $stage -le 2 ]; then
    echo "$0: aligning with the perturbed low-resolution data"
    steps/align_fmllr.sh \
	--nj 20 \
	--cmd "$train_cmd" \
	data/${train_set}_sp \
	data/lang \
	$gmm_dir \
	$ali_dir
fi

if [ $stage -le 3 ]; then
    # Create high-resolution MFCC features (with 40 cepstra instead of 13).

    echo "$0: creating high-resolution MFCC features"
    for datadir in ${train_set}_sp ${test_sets}; do
	utils/copy_data_dir.sh \
	    data/$datadir \
	    data/${datadir}_hires
    done

    # do volume-perturbation on the training data prior to extracting hires
    # features; this helps make trained nnets more invariant to test data volume.
    utils/data/perturb_data_dir_volume.sh \
	data/${train_set}_sp_hires
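
    # For reference, perturb_data_dir_volume.sh rescales each recording by a
    # random volume factor, again by editing the wav.scp pipe; conceptually
    # (illustrative, not executed):
    #
    #   sox --vol 1.6 -t wav original.wav -t wav - |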

    for datadir in ${train_set}_sp ${test_sets}; do
	steps/make_mfcc.sh \
	    --nj 10 \
	    --mfcc-config conf/mfcc_hires.conf \
	    --cmd "$train_cmd" \
	    data/${datadir}_hires
	steps/compute_cmvn_stats.sh \
	    data/${datadir}_hires
	utils/fix_data_dir.sh \
	    data/${datadir}_hires
    done
fi

if [ $stage -le 4 ]; then
    echo "$0: computing a subset of data to train the diagonal UBM."
    # We'll use about a quarter of the data.
    mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
    temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

    num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
    num_utts=$((num_utts_total / 4))
    utils/data/subset_data_dir.sh \
	data/${train_set}_sp_hires \
	$num_utts \
	${temp_data_root}/${train_set}_sp_hires_subset
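
    # Purely informational: log how large the UBM subset is.
    echo "$0: using $num_utts of $num_utts_total utterances for the UBM subset"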

    echo "$0: computing a PCA transform from the hires data."
    steps/online/nnet2/get_pca_transform.sh \
	--cmd "$train_cmd" \
	--splice-opts "--left-context=3 --right-context=3" \
	--max-utts 10000 \
	--subsample 2 \
	${temp_data_root}/${train_set}_sp_hires_subset \
	exp/nnet3${nnet3_affix}/pca_transform
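
    # With --left-context=3 --right-context=3 each frame is spliced together
    # with its 3 neighbours on either side, so the PCA input is 7 frames x
    # 40 hires MFCCs = 280 dimensions (assuming the usual 40-cepstra
    # conf/mfcc_hires.conf, as noted in stage 3).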

    echo "$0: training the diagonal UBM."
    # Use 512 Gaussians in the UBM.
    steps/online/nnet2/train_diag_ubm.sh \
	--cmd "$train_cmd" \
	--nj 20 \
	--num-frames 700000 \
	--num-threads 8 \
	${temp_data_root}/${train_set}_sp_hires_subset \
	512 \
	exp/nnet3${nnet3_affix}/pca_transform \
	exp/nnet3${nnet3_affix}/diag_ubm
fi

if [ $stage -le 5 ]; then
    # Train the iVector extractor.  We use all of the speed-perturbed data,
    # since iVector extractors can be sensitive to the amount of training
    # data.  The script defaults to an iVector dimension of 100.
    echo "$0: training the iVector extractor"
    steps/online/nnet2/train_ivector_extractor.sh \
	--cmd "$train_cmd" \
	--nj 10 \
	data/${train_set}_sp_hires \
	exp/nnet3${nnet3_affix}/diag_ubm \
	exp/nnet3${nnet3_affix}/extractor
fi
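
# If you need a different iVector dimension, train_ivector_extractor.sh takes
# an --ivector-dim option; illustrative only, since 100 is already the
# default:
#
#   steps/online/nnet2/train_ivector_extractor.sh --ivector-dim 100 ...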

# We extract iVectors on the speed-perturbed training data; these are what
# the system will be trained on.  With --utts-per-spk-max 2, the script
# pairs the utterances into twos and treats each pair as one speaker; this
# gives more diversity in the iVectors (see the illustration just below).
# Note that the iVectors are extracted 'online'.

# Note: although we extract the iVectors from the max2 data, we don't encode
# 'max2' in the ivectordir name, because the iVectors are equally valid for
# the non-max2 data: the utterance list is the same.

# Having a larger number of (pseudo-)speakers is helpful for generalization,
# and for handling per-utterance decoding well (the iVector starts at zero).
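
# Illustration with hypothetical IDs: if speaker s1 has utterances u1 u2 u3,
# then after modify_speaker_info.sh --utts-per-spk-max 2 the spk2utt file
# contains pseudo-speakers along the lines of:
#
#   s1-0001 u1 u2
#   s1-0002 u3
#
# (the exact pseudo-speaker naming is the script's own convention; this just
# shows the pairing).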

if [ $stage -le 6 ]; then
    ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
    temp_data_root=${ivectordir}
    utils/data/modify_speaker_info.sh \
	--utts-per-spk-max 2 \
	data/${train_set}_sp_hires \
	${temp_data_root}/${train_set}_sp_hires_max2

    steps/online/nnet2/extract_ivectors_online.sh \
	--cmd "$train_cmd" \
	--nj 20 \
	${temp_data_root}/${train_set}_sp_hires_max2 \
	exp/nnet3${nnet3_affix}/extractor \
	$ivectordir
fi

# Also extract iVectors for the test data; speed perturbation (_sp) is not
# needed there.

if [ $stage -le 7 ]; then
    for data in $test_sets; do
	steps/online/nnet2/extract_ivectors_online.sh \
	    --cmd "$train_cmd" \
	    --nj 1 \
	    data/${data}_hires \
	    exp/nnet3${nnet3_affix}/extractor \
	    exp/nnet3${nnet3_affix}/ivectors_${data}_hires
    done
fi
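
# If a test set has several speakers, --nj can be raised (up to the number of
# speakers) for parallelism; a sketch, assuming the standard spk2utt layout:
#
#   nspk=$(wc -l <data/${data}_hires/spk2utt)
#   ... --nj "$nspk" ...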

exit 0