run.sh
10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/bin/bash
# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2017 Johns Hopkins University (Author: Daniel Povey)
# 2017-2018 David Snyder
# 2018 Ewald Enzinger
# Apache 2.0.
#
# See ../README.txt for more info on data required.
# Results (mostly equal error-rates) are inline in comments below.
# Source the command definitions and the tool paths; exit-on-error is only
# switched on after both files have been read.
. ./cmd.sh
. ./path.sh
set -e

# MFCC and VAD outputs both point at ./mfcc under the current directory.
mfccdir=$(pwd)/mfcc
vaddir=$(pwd)/mfcc

# Corpus locations.
voxceleb1_root=/export/corpora/VoxCeleb1
voxceleb2_root=/export/corpora/VoxCeleb2
musan_root=/export/corpora/JHU/musan

# The trials file is downloaded by local/make_voxceleb1_v2.pl.
voxceleb1_trials=data/voxceleb1_test/trials

# Directory used for the x-vector network and the outputs derived from it.
nnet_dir=exp/xvector_nnet_1a

# Each "if [ $stage -le N ]" section below runs only when N is >= this value.
stage=0
if [ $stage -le 0 ]; then
  # Data directories for the two VoxCeleb2 partitions.
  local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train
  local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test

  # make_voxceleb1_v2.pl creates data/voxceleb1_train and data/voxceleb1_test
  # for the latest version of VoxCeleb1. The test portion of VoxCeleb1 is our
  # evaluation set.
  local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train
  local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test
  # If you downloaded the dataset soon after it was released, you will want
  # to use the make_voxceleb1.pl script instead:
  # local/make_voxceleb1.pl $voxceleb1_root data

  # The training set is all of VoxCeleb2 plus the training portion of
  # VoxCeleb1. This should give 7,323 speakers and 1,276,888 utterances.
  utils/combine_data.sh data/train \
    data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train
fi
if [ $stage -le 1 ]; then
  # For each dataset: make MFCCs, then compute the energy-based VAD.
  # fix_data_dir.sh is run on the data directory after each of the two steps.
  for dset in train voxceleb1_test; do
    steps/make_mfcc.sh \
      --write-utt2num-frames true \
      --mfcc-config conf/mfcc.conf \
      --nj 40 --cmd "$train_cmd" \
      data/$dset exp/make_mfcc $mfccdir
    utils/fix_data_dir.sh data/$dset

    sid/compute_vad_decision.sh \
      --nj 40 --cmd "$train_cmd" \
      data/$dset exp/make_vad $vaddir
    utils/fix_data_dir.sh data/$dset
  done
fi
# Augmentation stage: data/train is augmented with reverberation, noise,
# music, and babble. The four augmented copies are combined into
# data/train_aug here; they are merged with the clean data in a later stage.
if [ $stage -le 2 ]; then
  # reco2dur = frame count from utt2num_frames multiplied by frame_shift.
  frame_shift=0.01
  awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' \
    data/train/utt2num_frames > data/train/reco2dur

  # Download the package that includes the real RIRs, simulated RIRs,
  # isotropic noises and point-source noises, unless a RIRS_NOISES directory
  # is already present.
  if [ ! -d "RIRS_NOISES" ]; then
    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
    unzip rirs_noises.zip
  fi

  # Options for the reverberated version: two simulated RIR sets, each given
  # the parameter value 0.5.
  rvb_opts=(
    --rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list"
    --rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list"
  )

  # Make a reverberated version of the training list. Note that no additive
  # noise is added here (both noise-addition probabilities are 0).
  steps/data/reverberate_data_dir.py \
    "${rvb_opts[@]}" \
    --speech-rvb-probability 1 \
    --pointsource-noise-addition-probability 0 \
    --isotropic-noise-addition-probability 0 \
    --num-replications 1 \
    --source-sampling-rate 16000 \
    data/train data/train_reverb
  cp data/train/vad.scp data/train_reverb/

  # Copy the directory with the "-reverb" utterance suffix, then swap the
  # copy into place under the original name.
  utils/copy_data_dir.sh --utt-suffix "-reverb" \
    data/train_reverb data/train_reverb.new
  rm -rf data/train_reverb
  mv data/train_reverb.new data/train_reverb

  # Prepare the MUSAN corpus, which consists of music, speech, and noise
  # suitable for augmentation.
  steps/data/make_musan.sh --sampling-rate 16000 $musan_root data

  # Get the duration of the MUSAN recordings and store it as reco2dur.
  # This will be used by the script augment_data_dir.py.
  for part in speech noise music; do
    utils/data/get_utt2dur.sh data/musan_$part
    mv data/musan_$part/utt2dur data/musan_$part/reco2dur
  done

  # Augment with musan_noise
  steps/data/augment_data_dir.py \
    --utt-suffix "noise" \
    --fg-interval 1 \
    --fg-snrs "15:10:5:0" \
    --fg-noise-dir "data/musan_noise" \
    data/train data/train_noise

  # Augment with musan_music
  steps/data/augment_data_dir.py \
    --utt-suffix "music" \
    --bg-snrs "15:10:8:5" \
    --num-bg-noises "1" \
    --bg-noise-dir "data/musan_music" \
    data/train data/train_music

  # Augment with musan_speech
  steps/data/augment_data_dir.py \
    --utt-suffix "babble" \
    --bg-snrs "20:17:15:13" \
    --num-bg-noises "3:4:5:6:7" \
    --bg-noise-dir "data/musan_speech" \
    data/train data/train_babble

  # Combine reverb, noise, music, and babble into one directory.
  utils/combine_data.sh data/train_aug \
    data/train_reverb data/train_noise data/train_music data/train_babble
fi
if [ $stage -le 3 ]; then
  # Keep a 1,000,000-utterance subset of the augmentations.
  utils/subset_data_dir.sh data/train_aug 1000000 data/train_aug_1m
  utils/fix_data_dir.sh data/train_aug_1m

  # Make MFCCs for the augmented data. No new vad.scp file is computed here;
  # the vad.scp from the clean version of the list is used instead.
  steps/make_mfcc.sh \
    --mfcc-config conf/mfcc.conf \
    --nj 40 --cmd "$train_cmd" \
    data/train_aug_1m exp/make_mfcc $mfccdir

  # Combine the clean and augmented lists. The result is roughly double the
  # size of the original clean list.
  utils/combine_data.sh data/train_combined \
    data/train_aug_1m data/train
fi
# Prepare the features used to generate examples for xvector training.
if [ $stage -le 4 ]; then
  # prepare_feats_for_egs.sh applies CMVN and removes nonspeech frames. This
  # is somewhat wasteful, as it roughly doubles the amount of training data
  # on disk; the output can be removed after the training examples exist.
  local/nnet3/xvector/prepare_feats_for_egs.sh \
    --nj 40 --cmd "$train_cmd" \
    data/train_combined \
    data/train_combined_no_sil \
    exp/train_combined_no_sil
  utils/fix_data_dir.sh data/train_combined_no_sil
fi
if [ $stage -le 5 ]; then
  # Remove features that are too short after silence frames were removed:
  # only utterances with more than min_len (400) frames are kept.
  min_len=400
  mv data/train_combined_no_sil/utt2num_frames \
    data/train_combined_no_sil/utt2num_frames.bak
  awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' \
    data/train_combined_no_sil/utt2num_frames.bak \
    > data/train_combined_no_sil/utt2num_frames
  # Restrict utt2spk to the surviving utterances, then let fix_data_dir.sh
  # bring the rest of the directory in line.
  utils/filter_scp.pl \
    data/train_combined_no_sil/utt2num_frames \
    data/train_combined_no_sil/utt2spk \
    > data/train_combined_no_sil/utt2spk.new
  mv data/train_combined_no_sil/utt2spk.new data/train_combined_no_sil/utt2spk
  utils/fix_data_dir.sh data/train_combined_no_sil

  # We also want several utterances per speaker: speakers with fewer than
  # min_num_utts (8) utterances are thrown out.
  min_num_utts=8
  # spk2num: speaker id followed by its utterance count (fields minus one).
  awk '{print $1, NF-1}' data/train_combined_no_sil/spk2utt \
    > data/train_combined_no_sil/spk2num
  awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' \
    data/train_combined_no_sil/spk2num \
    | utils/filter_scp.pl - data/train_combined_no_sil/spk2utt \
    > data/train_combined_no_sil/spk2utt.new
  mv data/train_combined_no_sil/spk2utt.new data/train_combined_no_sil/spk2utt
  # Rebuild utt2spk from the filtered spk2utt and filter utt2num_frames to
  # match it.
  utils/spk2utt_to_utt2spk.pl data/train_combined_no_sil/spk2utt \
    > data/train_combined_no_sil/utt2spk
  utils/filter_scp.pl \
    data/train_combined_no_sil/utt2spk \
    data/train_combined_no_sil/utt2num_frames \
    > data/train_combined_no_sil/utt2num_frames.new
  mv data/train_combined_no_sil/utt2num_frames.new \
    data/train_combined_no_sil/utt2num_frames

  # Now we're ready to create training examples.
  utils/fix_data_dir.sh data/train_combined_no_sil
fi
# Stages 6 through 8 are handled in run_xvector.sh. The script is invoked
# unconditionally and is handed the current value of $stage.
local/nnet3/xvector/run_xvector.sh \
  --stage $stage --train-stage -1 \
  --data data/train_combined_no_sil \
  --nnet-dir $nnet_dir \
  --egs-dir $nnet_dir/egs
if [ $stage -le 9 ]; then
  # X-vectors of the training list, used for centering, LDA, and PLDA
  # training.
  sid/nnet3/xvector/extract_xvectors.sh \
    --cmd "$train_cmd --mem 4G" --nj 80 \
    $nnet_dir data/train $nnet_dir/xvectors_train

  # X-vectors of the VoxCeleb1 test set, used in the evaluation.
  sid/nnet3/xvector/extract_xvectors.sh \
    --cmd "$train_cmd --mem 4G" --nj 40 \
    $nnet_dir data/voxceleb1_test $nnet_dir/xvectors_voxceleb1_test
fi
if [ $stage -le 10 ]; then
  # Mean of the training x-vectors; used later to center the evaluation
  # x-vectors.
  $train_cmd $nnet_dir/xvectors_train/log/compute_mean.log \
    ivector-mean \
    scp:$nnet_dir/xvectors_train/xvector.scp \
    $nnet_dir/xvectors_train/mean.vec || exit 1

  # LDA is used to decrease the dimensionality prior to PLDA. The transform
  # is estimated on the mean-subtracted training x-vectors.
  lda_dim=200
  $train_cmd $nnet_dir/xvectors_train/log/lda.log \
    ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \
    "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train/xvector.scp ark:- |" \
    ark:data/train/utt2spk \
    $nnet_dir/xvectors_train/transform.mat || exit 1

  # Train the PLDA model on training x-vectors that were mean-subtracted,
  # projected with transform.mat, and length-normalized.
  $train_cmd $nnet_dir/xvectors_train/log/plda.log \
    ivector-compute-plda \
    ark:data/train/spk2utt \
    "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    $nnet_dir/xvectors_train/plda || exit 1
fi
if [ $stage -le 11 ]; then
  # Score the VoxCeleb1 test trials with the PLDA model and write the scores
  # to exp/scores_voxceleb1_test (log in exp/scores/log/).
  #
  # Inputs passed to ivector-plda-scoring, in order:
  #   1. the PLDA model, copied through ivector-copy-plda --smoothing=0.0;
  #   2. and 3. the test x-vectors (the same pipeline is given twice): the
  #      training mean (mean.vec) is subtracted, the vectors are projected
  #      with transform.mat, and then length-normalized;
  #   4. the first two space-separated fields of each line of the trials file.
  #
  # The cut delimiter is written as a quoted space (-d ' '). A backslash-
  # escaped space inside this double-quoted string keeps its backslash and
  # only works when a second, separating space follows it; with a single
  # space the delimiter and the field option fuse into one word and cut
  # fails.
  $train_cmd exp/scores/log/voxceleb1_test_scoring.log \
    ivector-plda-scoring --normalize-length=true \
    "ivector-copy-plda --smoothing=0.0 $nnet_dir/xvectors_train/plda - |" \
    "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train/mean.vec scp:$nnet_dir/xvectors_voxceleb1_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train/mean.vec scp:$nnet_dir/xvectors_voxceleb1_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
    "cat '$voxceleb1_trials' | cut -d ' ' -f 1,2 |" exp/scores_voxceleb1_test || exit 1;
fi
if [ $stage -le 12 ]; then
  # Compute the equal error-rate and the minimum DCF at two target priors
  # from the scores and the trials file. stderr of each tool is discarded.
  eer=$(compute-eer <(local/prepare_for_eer.py $voxceleb1_trials exp/scores_voxceleb1_test) 2> /dev/null)
  mindcf1=$(sid/compute_min_dcf.py --p-target 0.01 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null)
  mindcf2=$(sid/compute_min_dcf.py --p-target 0.001 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null)

  echo "EER: $eer%"
  echo "minDCF(p-target=0.01): $mindcf1"
  echo "minDCF(p-target=0.001): $mindcf2"

  # Results recorded for this recipe:
  # EER: 3.128%
  # minDCF(p-target=0.01): 0.3258
  # minDCF(p-target=0.001): 0.5003
  #
  # For reference, here's the ivector system from ../v1:
  # EER: 5.329%
  # minDCF(p-target=0.01): 0.4933
  # minDCF(p-target=0.001): 0.6168
fi