#!/bin/bash
# Copyright 2017      Johns Hopkins University (Author: Daniel Garcia-Romero)
#           2017      Johns Hopkins University (Author: Daniel Povey)
#           2017-2018 David Snyder
#           2018      Ewald Enzinger
# Apache 2.0.
#
# See ../README.txt for more info on the data required.
# Results (mostly equal error rates) are inline in comments below.

. ./cmd.sh
. ./path.sh
set -e
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc

# The trials file is downloaded by local/make_voxceleb1_v2.pl.
voxceleb1_trials=data/voxceleb1_test/trials
voxceleb1_root=/export/corpora/VoxCeleb1
voxceleb2_root=/export/corpora/VoxCeleb2
nnet_dir=exp/xvector_nnet_1a
musan_root=/export/corpora/JHU/musan

stage=0

if [ $stage -le 0 ]; then
  local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
  local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
  # This script creates data/voxceleb1_test and data/voxceleb1_train for the
  # latest version of VoxCeleb1.  Our evaluation set is the test portion of
  # VoxCeleb1.
  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
  # If you downloaded the dataset soon after it was released, you will want to
  # use the make_voxceleb1.pl script instead:
  # local/make_voxceleb1.pl $voxceleb1_root data
  # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
  # This should give 7,323 speakers and 1,276,888 utterances.
  utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
fi

if [ $stage -le 1 ]; then
  # Make MFCCs and compute the energy-based VAD for each dataset.
  for name in train voxceleb1_test; do
    steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf \
      --nj 40 --cmd "$train_cmd" \
      data/${name} exp/make_mfcc $mfccdir
    utils/fix_data_dir.sh data/${name}
    sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
      data/${name} exp/make_vad $vaddir
    utils/fix_data_dir.sh data/${name}
  done
fi

# In this section, we augment the VoxCeleb2 data with reverberation,
# noise, music, and babble, and combine it with the clean data.
if [ $stage -le 2 ]; then
  frame_shift=0.01
  awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur

  if [ ! -d "RIRS_NOISES" ]; then
    # Download the package that includes the real RIRs, simulated RIRs,
    # isotropic noises and point-source noises.
    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
    unzip rirs_noises.zip
  fi

  # Make a version with reverberated speech.
  rvb_opts=()
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")

  # Make a reverberated version of the VoxCeleb2 list.  Note that we don't
  # add any additive noise here.
  steps/data/reverberate_data_dir.py \
    "${rvb_opts[@]}" \
    --speech-rvb-probability 1 \
    --pointsource-noise-addition-probability 0 \
    --isotropic-noise-addition-probability 0 \
    --num-replications 1 \
    --source-sampling-rate 16000 \
    data/train data/train_reverb
  cp data/train/vad.scp data/train_reverb/
  utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new
  rm -rf data/train_reverb
  mv data/train_reverb.new data/train_reverb

  # Prepare the MUSAN corpus, which consists of music, speech, and noise
  # suitable for augmentation.
  steps/data/make_musan.sh --sampling-rate 16000 $musan_root data

  # Get the duration of the MUSAN recordings.  This will be used by the
  # script augment_data_dir.py.
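  # (A reco2dur file maps each recording to its length: one
  # "<recording-id> <duration-in-seconds>" pair per line, e.g. a hypothetical
  # entry "noise-free-sound-0001 31.4".  augment_data_dir.py reads these
  # durations when choosing noise segments to mix into each utterance.)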
  for name in speech noise music; do
    utils/data/get_utt2dur.sh data/musan_${name}
    mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
  done

  # Augment with musan_noise.
  steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise
  # Augment with musan_music.
  steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music
  # Augment with musan_speech.
  steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble

  # Combine reverb, noise, music, and babble into one directory.
  utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble
fi

if [ $stage -le 3 ]; then
  # Take a random subset of the augmentations.
  utils/subset_data_dir.sh data/train_aug 1000000 data/train_aug_1m
  utils/fix_data_dir.sh data/train_aug_1m

  # Make MFCCs for the augmented data.  Note that we do not compute a new
  # vad.scp file here.  Instead, we use the vad.scp from the clean version
  # of the list.
  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
    data/train_aug_1m exp/make_mfcc $mfccdir

  # Combine the clean and augmented VoxCeleb2 list.  This is now roughly
  # double the size of the original clean list.
  utils/combine_data.sh data/train_combined data/train_aug_1m data/train
fi

# Now we prepare the features to generate examples for xvector training.
if [ $stage -le 4 ]; then
  # This script applies CMVN and removes nonspeech frames.  Note that this is
  # somewhat wasteful, as it roughly doubles the amount of training data on
  # disk.  After creating training examples, this can be removed.
  local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \
    data/train_combined data/train_combined_no_sil exp/train_combined_no_sil
  utils/fix_data_dir.sh data/train_combined_no_sil
fi

if [ $stage -le 5 ]; then
  # Now, we need to remove features that are too short after removing silence
  # frames.  We want at least 4s (400 frames) per utterance.
  min_len=400
  mv data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2num_frames.bak
  awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/train_combined_no_sil/utt2num_frames.bak > data/train_combined_no_sil/utt2num_frames
  utils/filter_scp.pl data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2spk > data/train_combined_no_sil/utt2spk.new
  mv data/train_combined_no_sil/utt2spk.new data/train_combined_no_sil/utt2spk
  utils/fix_data_dir.sh data/train_combined_no_sil

  # We also want several utterances per speaker.  Now we'll throw out speakers
  # with fewer than 8 utterances.
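  # (Each spk2utt line is "<speaker-id> <utt1> <utt2> ...", so in the first
  # awk command below, NF-1, the field count minus the speaker-id field,
  # is that speaker's utterance count.)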
  min_num_utts=8
  awk '{print $1, NF-1}' data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2num
  awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/train_combined_no_sil/spk2num | utils/filter_scp.pl - data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2utt.new
  mv data/train_combined_no_sil/spk2utt.new data/train_combined_no_sil/spk2utt
  utils/spk2utt_to_utt2spk.pl data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/utt2spk
  utils/filter_scp.pl data/train_combined_no_sil/utt2spk data/train_combined_no_sil/utt2num_frames > data/train_combined_no_sil/utt2num_frames.new
  mv data/train_combined_no_sil/utt2num_frames.new data/train_combined_no_sil/utt2num_frames

  # Now we're ready to create training examples.
  utils/fix_data_dir.sh data/train_combined_no_sil
fi

# Stages 6 through 8 are handled in run_xvector.sh.
local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage -1 \
  --data data/train_combined_no_sil --nnet-dir $nnet_dir \
  --egs-dir $nnet_dir/egs

if [ $stage -le 9 ]; then
  # Extract x-vectors for centering, LDA, and PLDA training.
  sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 4G" --nj 80 \
    $nnet_dir data/train \
    $nnet_dir/xvectors_train

  # Extract x-vectors used in the evaluation.
  sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 4G" --nj 40 \
    $nnet_dir data/voxceleb1_test \
    $nnet_dir/xvectors_voxceleb1_test
fi

if [ $stage -le 10 ]; then
  # Compute the mean vector for centering the evaluation xvectors.
  $train_cmd $nnet_dir/xvectors_train/log/compute_mean.log \
    ivector-mean scp:$nnet_dir/xvectors_train/xvector.scp \
    $nnet_dir/xvectors_train/mean.vec || exit 1;

  # This script uses LDA to decrease the dimensionality prior to PLDA.
  lda_dim=200
  $train_cmd $nnet_dir/xvectors_train/log/lda.log \
    ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \
    "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train/xvector.scp ark:- |" \
    ark:data/train/utt2spk $nnet_dir/xvectors_train/transform.mat || exit 1;

  # Train the PLDA model.
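  # (The PLDA model is trained on x-vectors passed through the same
  # preprocessing chain used at scoring time in stage 11: global mean
  # subtraction, the LDA transform estimated above, and length
  # normalization.)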
  $train_cmd $nnet_dir/xvectors_train/log/plda.log \
    ivector-compute-plda ark:data/train/spk2utt \
    "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    $nnet_dir/xvectors_train/plda || exit 1;
fi

if [ $stage -le 11 ]; then
  $train_cmd exp/scores/log/voxceleb1_test_scoring.log \
    ivector-plda-scoring --normalize-length=true \
    "ivector-copy-plda --smoothing=0.0 $nnet_dir/xvectors_train/plda - |" \
    "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train/mean.vec scp:$nnet_dir/xvectors_voxceleb1_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train/mean.vec scp:$nnet_dir/xvectors_voxceleb1_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "cat '$voxceleb1_trials' | cut -d' ' --fields=1,2 |" exp/scores_voxceleb1_test || exit 1;
fi

if [ $stage -le 12 ]; then
  eer=`compute-eer <(local/prepare_for_eer.py $voxceleb1_trials exp/scores_voxceleb1_test) 2> /dev/null`
  mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null`
  mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null`
  echo "EER: $eer%"
  echo "minDCF(p-target=0.01): $mindcf1"
  echo "minDCF(p-target=0.001): $mindcf2"
  # EER: 3.128%
  # minDCF(p-target=0.01): 0.3258
  # minDCF(p-target=0.001): 0.5003
  #
  # For reference, here's the ivector system from ../v1:
  # EER: 5.329%
  # minDCF(p-target=0.01): 0.4933
  # minDCF(p-target=0.001): 0.6168
fi
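# (The scores file written in stage 11 has one "<enroll-id> <test-id> <score>"
# line per trial; local/prepare_for_eer.py joins these scores with the
# target/nontarget labels in the trials file to produce compute-eer's input.)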