  #!/bin/bash
  # Copyright   2017   Johns Hopkins University (Author: Daniel Garcia-Romero)
  #             2017   Johns Hopkins University (Author: Daniel Povey)
  #        2017-2018   David Snyder
  #             2018   Ewald Enzinger
  # Apache 2.0.
  #
  # This is an i-vector-based recipe for Speakers in the Wild (SITW).
  # See ../README.txt for more info on the data required.  The recipe uses
  # VoxCeleb 1 and 2 for training the UBM and T matrix, and an augmented
  # version of those datasets for PLDA training.  The augmentation consists
  # of MUSAN noises, music, and babble, plus reverberation from the Room
  # Impulse Response and Noise Database.  Note that there are 60 speakers
  # in VoxCeleb 1 that overlap with our evaluation dataset, SITW.  The
  # recipe removes those 60 speakers prior to training.  The results are
  # reported in terms of EER and minDCF, and are included inline in the
  # comments below.
  
  . ./cmd.sh
  . ./path.sh
  set -e
  mfccdir=$(pwd)/mfcc
  vaddir=$(pwd)/mfcc
  
  voxceleb1_root=/export/corpora/VoxCeleb1
  voxceleb2_root=/export/corpora/VoxCeleb2
  sitw_root=/export/corpora/SRI/sitw
  musan_root=/export/corpora/JHU/musan
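  # (The corpus paths above are site-specific JHU grid locations; edit them
  # to match where the corpora are stored on your system.)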
  
  sitw_dev_trials_core=data/sitw_dev_test/trials/core-core.lst
  sitw_eval_trials_core=data/sitw_eval_test/trials/core-core.lst
  
  stage=0
  
  if [ $stage -le 0 ]; then
    # Prepare the VoxCeleb1 dataset.  The script also downloads a list from
    # http://www.openslr.org/resources/49/voxceleb1_sitw_overlap.txt that
    # contains the speakers that overlap between VoxCeleb1 and our evaluation
    # set SITW.  The script removes the overlapping speakers from VoxCeleb1.
    local/make_voxceleb1.pl $voxceleb1_root data
  
    # Prepare the dev portion of the VoxCeleb2 dataset.
    local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
  
    # The original version of this recipe included the test portion of VoxCeleb2
    # in the training list.  Unfortunately, it turns out that there's an overlap
    # with our evaluation set, Speakers in the Wild.  Therefore, we've removed
    # this dataset from the training list.
    # local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test
  
    # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1.
    # This should leave 7,185 speakers and 1,236,567 utterances.
    utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb1
  
    # Prepare Speakers in the Wild.  This is our evaluation dataset.
    local/make_sitw.sh $sitw_root data
  fi
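
  # At this point each prepared directory under data/ should contain the
  # standard Kaldi files wav.scp, utt2spk, and spk2utt.  As an illustrative
  # sketch (the IDs below are hypothetical), the lines look like:
  #   wav.scp:  id10001-rec1-0001 /path/to/audio.wav
  #   utt2spk:  id10001-rec1-0001 id10001
  #   spk2utt:  id10001 id10001-rec1-0001 id10001-rec1-0002 ...
  # Each directory can be checked with, e.g.:
  # utils/validate_data_dir.sh --no-feats data/train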
  
  if [ $stage -le 1 ]; then
    # Make MFCCs and compute the energy-based VAD for each dataset
    for name in sitw_eval_enroll sitw_eval_test sitw_dev_enroll sitw_dev_test train; do
      steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 80 --cmd "$train_cmd" \
        data/${name} exp/make_mfcc $mfccdir
      utils/fix_data_dir.sh data/${name}
      sid/compute_vad_decision.sh --nj 80 --cmd "$train_cmd" \
        data/${name} exp/make_vad $vaddir
      utils/fix_data_dir.sh data/${name}
    done
  fi
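
  # The VAD above is a simple per-frame energy threshold configured by
  # conf/vad.conf.  As a hedged example (typical values, not necessarily the
  # exact file shipped with this setup), the config looks like:
  #   --vad-energy-threshold=5.5
  #   --vad-energy-mean-scale=0.5
  # The resulting vad.scp maps each utterance to a 0/1 vector with one entry
  # per frame (1 = speech); later stages use it to select speech frames.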
  
  if [ $stage -le 2 ]; then
    # Train the UBM on VoxCeleb 1 and 2.
    sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \
      --nj 40 --num-threads 8 \
      data/train 2048 \
      exp/diag_ubm
  
    sid/train_full_ubm.sh --cmd "$train_cmd --mem 25G" \
      --nj 40 --remove-low-count-gaussians false \
      data/train \
      exp/diag_ubm exp/full_ubm
  fi
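
  # Optional sanity check (commented out so the recipe is unchanged): print
  # the number of Gaussians and feature dimension of the trained UBM.
  # fgmm-global-info exp/full_ubm/final.ubm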
  
  if [ $stage -le 3 ]; then
    # In this stage, we train the i-vector extractor on a subset of VoxCeleb 1
    # and 2.
    #
    # Note that there are well over 1 million utterances in our training set,
    # and it takes an extremely long time to train the extractor on all of this.
    # Also, most of those utterances are very short.  Short utterances are
    # harmful for training the i-vector extractor.  Therefore, to reduce the
    # training time and improve performance, we will only train on the 100k
    # longest utterances.
    utils/subset_data_dir.sh \
      --utt-list <(sort -n -k 2 data/train/utt2num_frames | tail -n 100000) \
      data/train data/train_100k
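
    # For reference, each line of utt2num_frames is "<utt-id> <num-frames>",
    # e.g. (hypothetical ID) "id10001-rec1-0001 5421", so the
    # "sort -n -k 2 | tail -n 100000" pipeline above keeps the 100k
    # utterances with the most frames, i.e., the longest ones.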
  
    # Train the i-vector extractor.
    sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 16G" \
      --ivector-dim 400 --num-iters 5 \
      exp/full_ubm/final.ubm data/train_100k \
      exp/extractor
  fi
  
  # In this section, we augment the VoxCeleb 1 and 2 data with reverberation,
  # noise, music, and babble, and combine it with the clean data.  The
  # combined list will later be used to train our PLDA model.
  if [ $stage -le 4 ]; then
    frame_shift=0.01
    awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train_100k/utt2num_frames > data/train_100k/reco2dur
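    # (utt2num_frames counts frames at a 10 ms shift, so multiplying by
    # frame_shift=0.01 converts frame counts into durations in seconds.)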
  
    if [ ! -d "RIRS_NOISES" ]; then
      # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
      wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
      unzip rirs_noises.zip
    fi
  
    # Make a version with reverberated speech
    rvb_opts=()
    rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
    rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
  
    # Make a reverberated version of the VoxCeleb2 list.  Note that we don't add any
    # additive noise here.
    steps/data/reverberate_data_dir.py \
      "${rvb_opts[@]}" \
      --speech-rvb-probability 1 \
      --pointsource-noise-addition-probability 0 \
      --isotropic-noise-addition-probability 0 \
      --num-replications 1 \
      --source-sampling-rate 16000 \
      data/train_100k data/train_100k_reverb
    cp data/train_100k/vad.scp data/train_100k_reverb/
    utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_100k_reverb data/train_100k_reverb.new
    rm -rf data/train_100k_reverb
    mv data/train_100k_reverb.new data/train_100k_reverb
  
    # Prepare the MUSAN corpus, which consists of music, speech, and noise
    # suitable for augmentation.
    steps/data/make_musan.sh --sampling-rate 16000 $musan_root data
  
    # Get the duration of the MUSAN recordings.  This will be used by the
    # script augment_data_dir.py.
    for name in speech noise music; do
      utils/data/get_utt2dur.sh data/musan_${name}
      mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
    done
  
    # Augment with musan_noise
    steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train_100k data/train_100k_noise
    # Augment with musan_music
    steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train_100k data/train_100k_music
    # Augment with musan_speech
    steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train_100k data/train_100k_babble
  
    # Combine reverb, noise, music, and babble into one directory.
    utils/combine_data.sh data/train_aug data/train_100k_reverb data/train_100k_noise data/train_100k_music data/train_100k_babble
  fi
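
  # After stage 4 the augmented copies carry suffixed utterance IDs (a clean
  # utterance "utt1" becomes "utt1-reverb", "utt1-noise", "utt1-music", or
  # "utt1-babble"), so the four augmented lists combine without ID
  # collisions.  SNR strings such as "15:10:5:0" are colon-separated lists
  # of candidate SNRs (in dB) that the augmentation scripts sample from.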
  
  if [ $stage -le 5 ]; then
    # Take a 100k subset of the augmentations.
    utils/subset_data_dir.sh data/train_aug 100000 data/train_100k_aug
    utils/fix_data_dir.sh data/train_100k_aug
  
    # Make MFCCs for the augmented data.  Note that we do not compute a new
    # vad.scp file here.  Instead, we use the vad.scp from the clean version
    # of the list: the augmented copies contain the same speech at the same
    # times as the clean audio, so the clean speech/non-speech frame labels
    # still apply.
    steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 80 --cmd "$train_cmd" \
      data/train_100k_aug exp/make_mfcc $mfccdir
  
    # Combine the clean and augmented VoxCeleb list.  This is now roughly
    # double the size of the original clean list.
    utils/combine_data.sh data/train_combined_200k data/train_100k_aug data/train_100k
  fi
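
  # Optional sanity check (commented out): the combined list should contain
  # roughly 200k utterances.
  # wc -l data/train_combined_200k/utt2spk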
  
  if [ $stage -le 6 ]; then
    # These i-vectors will be used for mean-subtraction, LDA, and PLDA training.
    sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 80 \
      exp/extractor data/train_combined_200k \
      exp/ivectors_train_combined_200k
  
    # Extract i-vectors for the SITW dev and eval sets.
    for name in sitw_eval_enroll sitw_eval_test sitw_dev_enroll sitw_dev_test; do
      sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 40 \
        exp/extractor data/$name \
        exp/ivectors_$name
    done
  fi
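
  # Each ivector.scp written above maps utterance IDs to 400-dimensional
  # i-vectors.  To eyeball the first vector in text form (an optional check,
  # commented out):
  # copy-vector "scp:head -n 1 exp/ivectors_sitw_dev_test/ivector.scp |" ark,t:-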
  
  if [ $stage -le 7 ]; then
    # Compute the mean vector for centering the evaluation i-vectors.
    $train_cmd exp/ivectors_train_combined_200k/log/compute_mean.log \
      ivector-mean scp:exp/ivectors_train_combined_200k/ivector.scp \
      exp/ivectors_train_combined_200k/mean.vec || exit 1;
  
    # This script uses LDA to decrease the dimensionality prior to PLDA.
    lda_dim=150
    $train_cmd exp/ivectors_train_combined_200k/log/lda.log \
      ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \
      "ark:ivector-subtract-global-mean scp:exp/ivectors_train_combined_200k/ivector.scp ark:- |" \
      ark:data/train_combined_200k/utt2spk exp/ivectors_train_combined_200k/transform.mat || exit 1;
  
    # Train the PLDA model.
    $train_cmd exp/ivectors_train_combined_200k/log/plda.log \
      ivector-compute-plda ark:data/train_combined_200k/spk2utt \
      "ark:ivector-subtract-global-mean scp:exp/ivectors_train_combined_200k/ivector.scp ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:-  ark:- |" \
      exp/ivectors_train_combined_200k/plda || exit 1;
  fi
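
  # At scoring time each i-vector w is processed as A(w - m): global mean
  # subtraction followed by the LDA transform, then length normalization to
  # a fixed 2-norm.  The PLDA model then scores an (enroll, test) pair of
  # processed vectors with a log-likelihood ratio between the same-speaker
  # and different-speaker hypotheses.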
  
  if [ $stage -le 8 ]; then
    # Compute PLDA scores for SITW dev core-core trials
    $train_cmd exp/scores/log/sitw_dev_core_scoring.log \
      ivector-plda-scoring --normalize-length=true \
      --num-utts=ark:exp/ivectors_sitw_dev_enroll/num_utts.ark \
      "ivector-copy-plda --smoothing=0.0 exp/ivectors_train_combined_200k/plda - |" \
      "ark:ivector-mean ark:data/sitw_dev_enroll/spk2utt scp:exp/ivectors_sitw_dev_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec ark:- ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
      "ark:ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec scp:exp/ivectors_sitw_dev_test/ivector.scp ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
      "cat '$sitw_dev_trials_core' | cut -d\  --fields=1,2 |" exp/scores/sitw_dev_core_scores || exit 1;
  
    # SITW Dev Core:
    # EER: 4.813%
    # minDCF(p-target=0.01): 0.4250
    # minDCF(p-target=0.001): 0.5727
    echo "SITW Dev Core:"
    eer=$(paste $sitw_dev_trials_core exp/scores/sitw_dev_core_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
    mindcf1=$(sid/compute_min_dcf.py --p-target 0.01 exp/scores/sitw_dev_core_scores $sitw_dev_trials_core 2> /dev/null)
    mindcf2=$(sid/compute_min_dcf.py --p-target 0.001 exp/scores/sitw_dev_core_scores $sitw_dev_trials_core 2> /dev/null)
    echo "EER: $eer%"
    echo "minDCF(p-target=0.01): $mindcf1"
    echo "minDCF(p-target=0.001): $mindcf2"
  fi
  
  if [ $stage -le 9 ]; then
    # Compute PLDA scores for SITW eval core-core trials
    $train_cmd exp/scores/log/sitw_eval_core_scoring.log \
      ivector-plda-scoring --normalize-length=true \
      --num-utts=ark:exp/ivectors_sitw_eval_enroll/num_utts.ark \
      "ivector-copy-plda --smoothing=0.0 exp/ivectors_train_combined_200k/plda - |" \
      "ark:ivector-mean ark:data/sitw_eval_enroll/spk2utt scp:exp/ivectors_sitw_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec ark:- ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
      "ark:ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec scp:exp/ivectors_sitw_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
      "cat '$sitw_eval_trials_core' | cut -d\  --fields=1,2 |" exp/scores/sitw_eval_core_scores || exit 1;
  
    # SITW Eval Core:
    # EER: 5.659%
    # minDCF(p-target=0.01): 0.4637
    # minDCF(p-target=0.001): 0.6290
    echo -e "\nSITW Eval Core:"
    eer=$(paste $sitw_eval_trials_core exp/scores/sitw_eval_core_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
    mindcf1=$(sid/compute_min_dcf.py --p-target 0.01 exp/scores/sitw_eval_core_scores $sitw_eval_trials_core 2> /dev/null)
    mindcf2=$(sid/compute_min_dcf.py --p-target 0.001 exp/scores/sitw_eval_core_scores $sitw_eval_trials_core 2> /dev/null)
    echo "EER: $eer%"
    echo "minDCF(p-target=0.01): $mindcf1"
    echo "minDCF(p-target=0.001): $mindcf2"
  fi