egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh
#!/bin/bash

set -e -o pipefail

# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh
# (and may eventually be called by more scripts).  It contains the common feature
# preparation and iVector-related parts of those scripts.  See them for examples
# of usage.

stage=0
nj=30
train_set=train_cleaned   # you might set this to e.g. train.
gmm=tri3_cleaned          # This specifies a GMM-dir from the features of the type you're training the system on;
                          # it should contain alignments for 'train_set'.

num_threads_ubm=32
nnet3_affix=_cleaned      # affix for exp/nnet3 directory to put iVector stuff in, so it
                          # becomes exp/nnet3_cleaned or whatever.

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp

for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then
  echo "$0: data/${train_set}_sp_hires/feats.scp already exists."
  echo " ... Please either remove it, or rerun this script with stage > 2."
  exit 1
fi

if [ $stage -le 1 ]; then
  echo "$0: preparing directory for speed-perturbed data"
  utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
fi

if [ $stage -le 2 ]; then
  echo "$0: creating high-resolution MFCC features"

  # this shows how you can split across multiple file-systems: we'll split the
  # MFCC dir across multiple locations.  You might want to be careful here, if you
  # have multiple copies of Kaldi checked out and run the same recipe, not to let
  # them overwrite each other.
  mfccdir=data/${train_set}_sp_hires/data
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
  fi

  for datadir in ${train_set}_sp dev test; do
    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
  done

  # do volume-perturbation on the training data prior to extracting hires
  # features; this helps make trained nnets more invariant to test data volume.
  utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires

  for datadir in ${train_set}_sp dev test; do
    steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \
      --cmd "$train_cmd" data/${datadir}_hires
    steps/compute_cmvn_stats.sh data/${datadir}_hires
    utils/fix_data_dir.sh data/${datadir}_hires
  done
fi

if [ $stage -le 3 ]; then
  echo "$0: computing a subset of data to train the diagonal UBM."

  mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm

  # train a diagonal UBM using a subset of about a quarter of the data
  num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
  num_utts=$[$num_utts_total/4]
  utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
    $num_utts ${temp_data_root}/${train_set}_sp_hires_subset

  echo "$0: computing a PCA transform from the hires data."
  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" \
    --max-utts 10000 --subsample 2 \
    ${temp_data_root}/${train_set}_sp_hires_subset \
    exp/nnet3${nnet3_affix}/pca_transform

  echo "$0: training the diagonal UBM."
  # Use 512 Gaussians in the UBM.
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
    --num-frames 700000 \
    --num-threads $num_threads_ubm \
    ${temp_data_root}/${train_set}_sp_hires_subset 512 \
    exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi

if [ $stage -le 4 ]; then
  # Train the iVector extractor.  Use all of the speed-perturbed data, since
  # iVector extractors can be sensitive to the amount of data.  The script
  # defaults to an iVector dimension of 100.
  echo "$0: training the iVector extractor"
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
    data/${train_set}_sp_hires exp/nnet3${nnet3_affix}/diag_ubm \
    exp/nnet3${nnet3_affix}/extractor || exit 1;
fi

if [ $stage -le 5 ]; then
  # Note: we don't encode the 'max2' in the name of the ivectordir even though
  # that's the data we extract the iVectors from, as it's still going to be
  # valid for the non-'max2' data; the utterance list is the same.
  ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/ivectors/tedlium-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
  fi

  # We now extract iVectors on the speed-perturbed training data.  With
  # --utts-per-spk-max 2, the script pairs the utterances into twos and treats
  # each of these pairs as one speaker; this gives more diversity in iVectors.
  # Note that these are extracted 'online' (they vary within the utterance).
  # Having a larger number of speakers is helpful for generalization, and to
  # handle per-utterance decoding well (the iVector starts at zero at the
  # beginning of each pseudo-speaker).
  temp_data_root=${ivectordir}
  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
    data/${train_set}_sp_hires ${temp_data_root}/${train_set}_sp_hires_max2

  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
    ${temp_data_root}/${train_set}_sp_hires_max2 \
    exp/nnet3${nnet3_affix}/extractor $ivectordir

  # Also extract iVectors for the test data; in this case we don't need the speed
  # perturbation (sp) or small-segment concatenation (comb).
  for data in dev test; do
    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj "$nj" \
      data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
      exp/nnet3${nnet3_affix}/ivectors_${data}_hires
  done
fi

if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 9 ]; then
  echo "$0: data/${train_set}_sp/feats.scp already exists.  Refusing to overwrite the features "
  echo " to avoid wasting time.  Please remove the file and continue if you really mean this."
  exit 1;
fi

if [ $stage -le 6 ]; then
  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
  utils/data/perturb_data_dir_speed_3way.sh \
    data/${train_set} data/${train_set}_sp
fi

if [ $stage -le 7 ]; then
  echo "$0: making MFCC features for low-resolution speed-perturbed data"
  steps/make_mfcc.sh --nj $nj \
    --cmd "$train_cmd" data/${train_set}_sp
  steps/compute_cmvn_stats.sh data/${train_set}_sp
  echo "$0: fixing input data-dir to remove nonexistent features, in case some "
  echo " ... speed-perturbed segments were too short."
  utils/fix_data_dir.sh data/${train_set}_sp
fi

if [ $stage -le 8 ]; then
  if [ -f $ali_dir/ali.1.gz ]; then
    echo "$0: alignments in $ali_dir appear to already exist.  Please either remove them "
    echo " ... or use a later --stage option."
    exit 1
  fi
  echo "$0: aligning with the perturbed low-resolution data"
  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
    data/${train_set}_sp data/lang $gmm_dir $ali_dir
fi

exit 0;
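
# ---------------------------------------------------------------------------
# Example invocation (an illustrative sketch, not part of the recipe itself).
# Per the header comment, a top-level script such as local/chain/run_tdnn.sh
# is expected to call this script, roughly as shown below.  The option values
# here are simply the defaults declared at the top of this file, and
# utils/parse_options.sh maps each --option-name flag onto the matching
# option_name variable.
#
#   local/nnet3/run_ivector_common.sh --stage 0 --nj 30 \
#     --train-set train_cleaned --gmm tri3_cleaned \
#     --num-threads-ubm 32 --nnet3-affix "_cleaned"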