egs/callhome_diarization/v2/run.sh

  #!/bin/bash
  # Copyright 2017-2018  David Snyder
  #           2017-2018  Matthew Maciejewski
  #
  # Apache 2.0.
  #
  # This recipe demonstrates the use of x-vectors for speaker diarization.
  # The scripts are based on the recipe in ../v1/run.sh, but cluster x-vectors
  # instead of i-vectors.  It is similar to the x-vector-based diarization
  # system described in "Diarization is Hard: Some Experiences and Lessons
  # Learned for the JHU Team in the Inaugural DIHARD Challenge" by Sell et
  # al., and also includes a VB resegmentation step (stages 12 and 13).
  
  . ./cmd.sh
  . ./path.sh
  set -e
  mfccdir=`pwd`/mfcc
  vaddir=`pwd`/mfcc
  data_root=/export/corpora5/LDC
  stage=0
  nnet_dir=exp/xvector_nnet_1a/
  num_components=1024 # the number of UBM components (used for VB resegmentation)
  ivector_dim=400 # the dimension of i-vector (used for VB resegmentation)
  
  # Prepare datasets
  if [ $stage -le 0 ]; then
    # Prepare a collection of NIST SRE data.  This will be used to train the
    # x-vector DNN and the PLDA model.
    local/make_sre.sh $data_root data
  
    # Prepare SWB for x-vector DNN training.
    local/make_swbd2_phase1.pl /export/corpora/LDC/LDC98S75 \
      data/swbd2_phase1_train
    local/make_swbd2_phase2.pl $data_root/LDC99S79 \
      data/swbd2_phase2_train
    local/make_swbd2_phase3.pl $data_root/LDC2002S06 \
      data/swbd2_phase3_train
    local/make_swbd_cellular1.pl $data_root/LDC2001S13 \
      data/swbd_cellular1_train
    local/make_swbd_cellular2.pl $data_root/LDC2004S07 \
      data/swbd_cellular2_train
  
    # Prepare the Callhome portion of NIST SRE 2000.
    local/make_callhome.sh /export/corpora/NIST/LDC2001S97/ data/
  
    utils/combine_data.sh data/train \
      data/swbd_cellular1_train data/swbd_cellular2_train \
      data/swbd2_phase1_train \
      data/swbd2_phase2_train data/swbd2_phase3_train data/sre
  fi
  
  # Prepare features
  if [ $stage -le 1 ]; then
    # The script local/make_callhome.sh splits callhome into two parts, called
    # callhome1 and callhome2.  Each partition is treated like a held-out
    # dataset, and used to estimate various quantities needed to perform
    # diarization on the other part (and vice versa).
    for name in train callhome1 callhome2 callhome; do
      steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 \
        --cmd "$train_cmd" --write-utt2num-frames true \
        data/$name exp/make_mfcc $mfccdir
      utils/fix_data_dir.sh data/$name
    done
  
    for name in train callhome1 callhome2; do
      sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
        data/$name exp/make_vad $vaddir
      utils/fix_data_dir.sh data/$name
    done
  
    # The sre dataset is a subset of train
    cp data/train/{feats,vad}.scp data/sre/
    utils/fix_data_dir.sh data/sre
  
    # This writes features to disk after applying the sliding window CMN.
    # Although this is somewhat wasteful in terms of disk space, for diarization
    # it ends up being preferable to performing the CMN in memory.  If the CMN
    # were performed in memory (e.g., we used --apply-cmn true in
    # diarization/nnet3/xvector/extract_xvectors.sh) it would need to be
    # performed after the subsegmentation, which leads to poorer results.
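    # As a rough sketch of what prepare_feats.sh does internally (the exact
    # options live in that script), the normalization is along the lines of:
    #   apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 \
    #     scp:data/$name/feats.scp ark:- | copy-feats ark:- ark,scp:...
    # i.e., each frame is mean-normalized over a sliding window of roughly
    # 3 seconds (300 frames at a 10 ms frame shift), and the result is
    # written out as a new feature archive.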
    for name in sre callhome1 callhome2; do
      local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \
        data/$name data/${name}_cmn exp/${name}_cmn
      cp data/$name/vad.scp data/${name}_cmn/
      if [ -f data/$name/segments ]; then
        cp data/$name/segments data/${name}_cmn/
      fi
      utils/fix_data_dir.sh data/${name}_cmn
    done
  
    echo "0.01" > data/sre_cmn/frame_shift
    # Create segments to extract x-vectors from for PLDA training data.
    # The segments are created using an energy-based speech activity
    # detection (SAD) system, but this is not necessary.  You can replace
    # this with segments computed from your favorite SAD.
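    # For reference, the resulting Kaldi segments file has one line per
    # segment, in the format:
    #   <segment-id> <recording-id> <start-seconds> <end-seconds>
    # e.g. (hypothetical IDs): sre1-rec1-0000050-0000420 sre1-rec1 0.50 4.20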
    diarization/vad_to_segments.sh --nj 40 --cmd "$train_cmd" \
      data/sre_cmn data/sre_cmn_segmented
  fi
  
  # In this section, we augment the training data with reverberation,
  # noise, music, and babble, and combine it with the clean data.
  # The combined list will be used to train the x-vector DNN.  The SRE
  # subset will be used to train the PLDA model.
  if [ $stage -le 2 ]; then
    frame_shift=0.01
    awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur
    if [ ! -d "RIRS_NOISES" ]; then
      # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
      wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
      unzip rirs_noises.zip
    fi
  
    # Make a version with reverberated speech
    rvb_opts=()
    rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
    rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
  
    # Make a reverberated version of the SWBD+SRE list.  Note that we don't add any
    # additive noise here.
    steps/data/reverberate_data_dir.py \
      "${rvb_opts[@]}" \
      --speech-rvb-probability 1 \
      --pointsource-noise-addition-probability 0 \
      --isotropic-noise-addition-probability 0 \
      --num-replications 1 \
      --source-sampling-rate 8000 \
      data/train data/train_reverb
    cp data/train/vad.scp data/train_reverb/
    utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new
    rm -rf data/train_reverb
    mv data/train_reverb.new data/train_reverb
  
    # Prepare the MUSAN corpus, which consists of music, speech, and noise
    # suitable for augmentation.
    steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data
  
    # Get the duration of the MUSAN recordings.  This will be used by the
    # script augment_data_dir.py.
    for name in speech noise music; do
      utils/data/get_utt2dur.sh data/musan_${name}
      mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
    done
  
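    # In the commands below, the colon-separated SNR lists are in dB;
    # augment_data_dir.py picks from them when corrupting each utterance.
    # Foreground noises are inserted one after another, separated by
    # --fg-interval seconds, while background noises span the recording.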
    # Augment with musan_noise
    steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise
    # Augment with musan_music
    steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music
    # Augment with musan_speech
    steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble
  
    # Combine reverb, noise, music, and babble into one directory.
    utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble
  
    # Take a random subset of the augmentations (128k is somewhat larger than twice
    # the size of the SWBD+SRE list)
    utils/subset_data_dir.sh data/train_aug 128000 data/train_aug_128k
    utils/fix_data_dir.sh data/train_aug_128k
  
    # Make MFCCs for the augmented data.  Note that we do not compute a new
    # vad.scp file here.  Instead, we use the vad.scp from the clean version
    # of the list.
    steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
      data/train_aug_128k exp/make_mfcc $mfccdir
  
    # Combine the clean and augmented SWBD+SRE list.  This is now roughly
    # double the size of the original clean list.
    utils/combine_data.sh data/train_combined data/train_aug_128k data/train
  fi
  
  # Now we prepare the features to generate examples for xvector training.
  if [ $stage -le 3 ]; then
    # This script applies CMN and removes nonspeech frames.  Note that this is somewhat
    # wasteful, as it roughly doubles the amount of training data on disk.  After
    # creating training examples, this can be removed.
    local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \
      data/train_combined data/train_combined_cmn_no_sil exp/train_combined_cmn_no_sil
    utils/fix_data_dir.sh data/train_combined_cmn_no_sil
  
    # Now, we need to remove features that are too short after removing
    # silence frames.  We want at least 5 s (500 frames at the 10 ms frame
    # shift) per utterance.
    min_len=500
    mv data/train_combined_cmn_no_sil/utt2num_frames data/train_combined_cmn_no_sil/utt2num_frames.bak
    awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/train_combined_cmn_no_sil/utt2num_frames.bak > data/train_combined_cmn_no_sil/utt2num_frames
    utils/filter_scp.pl data/train_combined_cmn_no_sil/utt2num_frames data/train_combined_cmn_no_sil/utt2spk > data/train_combined_cmn_no_sil/utt2spk.new
    mv data/train_combined_cmn_no_sil/utt2spk.new data/train_combined_cmn_no_sil/utt2spk
    utils/fix_data_dir.sh data/train_combined_cmn_no_sil
  
    # We also want several utterances per speaker. Now we'll throw out speakers
    # with fewer than 8 utterances.
    min_num_utts=8
    awk '{print $1, NF-1}' data/train_combined_cmn_no_sil/spk2utt > data/train_combined_cmn_no_sil/spk2num
    awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' \
      data/train_combined_cmn_no_sil/spk2num | utils/filter_scp.pl - data/train_combined_cmn_no_sil/spk2utt \
      > data/train_combined_cmn_no_sil/spk2utt.new
    mv data/train_combined_cmn_no_sil/spk2utt.new data/train_combined_cmn_no_sil/spk2utt
    utils/spk2utt_to_utt2spk.pl data/train_combined_cmn_no_sil/spk2utt > data/train_combined_cmn_no_sil/utt2spk
  
    utils/filter_scp.pl data/train_combined_cmn_no_sil/utt2spk data/train_combined_cmn_no_sil/utt2num_frames > data/train_combined_cmn_no_sil/utt2num_frames.new
    mv data/train_combined_cmn_no_sil/utt2num_frames.new data/train_combined_cmn_no_sil/utt2num_frames
  
    # Now we're ready to create training examples.
    utils/fix_data_dir.sh data/train_combined_cmn_no_sil
  fi
  
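  # Train the x-vector DNN on the combined clean and augmented data.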
  local/nnet3/xvector/tuning/run_xvector_1a.sh --stage $stage --train-stage -1 \
    --data data/train_combined_cmn_no_sil --nnet-dir $nnet_dir \
    --egs-dir $nnet_dir/egs
  
  # Extract x-vectors
  if [ $stage -le 7 ]; then
    # Extract x-vectors for the two partitions of callhome.
    diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \
      --nj 40 --window 1.5 --period 0.75 --apply-cmn false \
      --min-segment 0.5 $nnet_dir \
      data/callhome1_cmn $nnet_dir/xvectors_callhome1
  
    diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \
      --nj 40 --window 1.5 --period 0.75 --apply-cmn false \
      --min-segment 0.5 $nnet_dir \
      data/callhome2_cmn $nnet_dir/xvectors_callhome2
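    # With --window 1.5 and --period 0.75, x-vectors are extracted from 1.5 s
    # subsegments whose starts are 0.75 s apart (i.e., 50% overlap).  The
    # final subsegment of a speech segment may be shorter than 1.5 s; it is
    # kept only if it is at least --min-segment (0.5 s) long.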
  
    # Reduce the amount of data used for PLDA training.
    utils/subset_data_dir.sh data/sre_cmn_segmented 128000 data/sre_cmn_segmented_128k
    # Extract x-vectors for the SRE, which is our PLDA training
    # data.  A long period is used here so that we don't compute too
    # many x-vectors for each recording.
    diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \
      --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \
      --hard-min true $nnet_dir \
      data/sre_cmn_segmented_128k $nnet_dir/xvectors_sre_segmented_128k
  fi
  
  # Train PLDA models
  if [ $stage -le 8 ]; then
    # Train a PLDA model on SRE, using callhome1 to whiten.
    # We will later use this to score x-vectors in callhome2.
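    # The quoted pipeline below preprocesses the training x-vectors: subtract
    # the global mean, whiten with the transform estimated on callhome1, and
    # length-normalize.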
    "$train_cmd" $nnet_dir/xvectors_callhome1/log/plda.log \
      ivector-compute-plda ark:$nnet_dir/xvectors_sre_segmented_128k/spk2utt \
        "ark:ivector-subtract-global-mean \
        scp:$nnet_dir/xvectors_sre_segmented_128k/xvector.scp ark:- \
        | transform-vec $nnet_dir/xvectors_callhome1/transform.mat ark:- ark:- \
        | ivector-normalize-length ark:- ark:- |" \
      $nnet_dir/xvectors_callhome1/plda || exit 1;
  
    # Train a PLDA model on SRE, using callhome2 to whiten.
    # We will later use this to score x-vectors in callhome1.
    "$train_cmd" $nnet_dir/xvectors_callhome2/log/plda.log \
      ivector-compute-plda ark:$nnet_dir/xvectors_sre_segmented_128k/spk2utt \
        "ark:ivector-subtract-global-mean \
        scp:$nnet_dir/xvectors_sre_segmented_128k/xvector.scp ark:- \
        | transform-vec $nnet_dir/xvectors_callhome2/transform.mat ark:- ark:- \
        | ivector-normalize-length ark:- ark:- |" \
      $nnet_dir/xvectors_callhome2/plda || exit 1;
  fi
  
  # Perform PLDA scoring
  if [ $stage -le 9 ]; then
    # Perform PLDA scoring on all pairs of segments for each recording.
    # The first directory contains the PLDA model that used callhome2
    # to perform whitening (recall that we're treating callhome2 as a
    # held-out dataset).  The second directory contains the x-vectors
    # for callhome1.
    diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \
      --nj 20 $nnet_dir/xvectors_callhome2 $nnet_dir/xvectors_callhome1 \
      $nnet_dir/xvectors_callhome1/plda_scores
  
    # Do the same thing for callhome2.
    diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \
      --nj 20 $nnet_dir/xvectors_callhome1 $nnet_dir/xvectors_callhome2 \
      $nnet_dir/xvectors_callhome2/plda_scores
  fi
  
  # Cluster the PLDA scores using a stopping threshold.
  if [ $stage -le 10 ]; then
    # First, we find the threshold that minimizes the DER on each partition of
    # callhome.
    mkdir -p $nnet_dir/tuning
    for dataset in callhome1 callhome2; do
      echo "Tuning clustering threshold for $dataset"
      best_der=100
      best_threshold=0
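      # Filter the full reference RTTM down to this partition's recordings
      # (field 2 of an RTTM line is the recording ID, which filter_scp.pl
      # matches against the keys of wav.scp).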
      utils/filter_scp.pl -f 2 data/$dataset/wav.scp \
        data/callhome/fullref.rttm > data/$dataset/ref.rttm
  
      # The threshold is in terms of the log likelihood ratio provided by the
      # PLDA scores.  In a perfectly calibrated system, the threshold is 0.
      # In the following loop, we evaluate the clustering on a heldout dataset
      # (callhome1 is heldout for callhome2 and vice-versa) using some reasonable
      # thresholds for a well-calibrated system.
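      # (diarization/cluster.sh performs agglomerative clustering; merging
      # stops once the best remaining pairwise score drops below the
      # threshold, so lower thresholds yield fewer, larger clusters.)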
      for threshold in -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3; do
        diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \
          --threshold $threshold $nnet_dir/xvectors_$dataset/plda_scores \
          $nnet_dir/xvectors_$dataset/plda_scores_t$threshold
  
        md-eval.pl -1 -c 0.25 -r data/$dataset/ref.rttm \
         -s $nnet_dir/xvectors_$dataset/plda_scores_t$threshold/rttm \
         2> $nnet_dir/tuning/${dataset}_t${threshold}.log \
         > $nnet_dir/tuning/${dataset}_t${threshold}
  
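        # md-eval.pl reports a line such as
        #   OVERALL SPEAKER DIARIZATION ERROR = 8.39 percent of scored speaker time
        # from which the regex below extracts the percentage.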
        der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
          $nnet_dir/tuning/${dataset}_t${threshold})
        if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then
          best_der=$der
          best_threshold=$threshold
        fi
      done
      echo "$best_threshold" > $nnet_dir/tuning/${dataset}_best
    done
  
    # Cluster callhome1 using the best threshold found for callhome2.  This way,
    # callhome2 is treated as a held-out dataset to discover a reasonable
    # stopping threshold for callhome1.
    diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \
      --threshold $(cat $nnet_dir/tuning/callhome2_best) \
      $nnet_dir/xvectors_callhome1/plda_scores $nnet_dir/xvectors_callhome1/plda_scores
  
    # Do the same thing for callhome2, treating callhome1 as a held-out dataset
    # to discover a stopping threshold.
    diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \
      --threshold $(cat $nnet_dir/tuning/callhome1_best) \
      $nnet_dir/xvectors_callhome2/plda_scores $nnet_dir/xvectors_callhome2/plda_scores
  
    mkdir -p $nnet_dir/results
    # Now combine the results for callhome1 and callhome2 and evaluate them
    # together.
    cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \
      $nnet_dir/xvectors_callhome2/plda_scores/rttm | md-eval.pl -1 -c 0.25 -r \
      data/callhome/fullref.rttm -s - 2> $nnet_dir/results/threshold.log \
      > $nnet_dir/results/DER_threshold.txt
    der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
      $nnet_dir/results/DER_threshold.txt)
    # Using supervised calibration, DER: 8.39%
    # Compare to 10.36% in ../v1/run.sh
    echo "Using supervised calibration, DER: $der%"
  fi
  
  # Cluster the PLDA scores using the oracle number of speakers
  if [ $stage -le 11 ]; then
    # In this section, we show how to do the clustering if the number of speakers
    # (and therefore, the number of clusters) per recording is known in advance.
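    # Each line of reco2num_spk has the form "<recording-id> <num-speakers>".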
    diarization/cluster.sh --cmd "$train_cmd --mem 4G" \
      --reco2num-spk data/callhome1/reco2num_spk \
      $nnet_dir/xvectors_callhome1/plda_scores $nnet_dir/xvectors_callhome1/plda_scores_num_spk
  
    diarization/cluster.sh --cmd "$train_cmd --mem 4G" \
      --reco2num-spk data/callhome2/reco2num_spk \
      $nnet_dir/xvectors_callhome2/plda_scores $nnet_dir/xvectors_callhome2/plda_scores_num_spk
  
    mkdir -p $nnet_dir/results
    # Now combine the results for callhome1 and callhome2 and evaluate them
    # together.
    cat $nnet_dir/xvectors_callhome1/plda_scores_num_spk/rttm \
      $nnet_dir/xvectors_callhome2/plda_scores_num_spk/rttm \
      | md-eval.pl -1 -c 0.25 -r data/callhome/fullref.rttm -s - 2> $nnet_dir/results/num_spk.log \
      > $nnet_dir/results/DER_num_spk.txt
    der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
      $nnet_dir/results/DER_num_spk.txt)
    # Using the oracle number of speakers, DER: 7.12%
    # Compare to 8.69% in ../v1/run.sh
    echo "Using the oracle number of speakers, DER: $der%"
  fi
  
  # Variational Bayes resegmentation using the code from Brno University of Technology
  # Please see https://speech.fit.vutbr.cz/software/vb-diarization-eigenvoice-and-hmm-priors 
  # for details
  if [ $stage -le 12 ]; then
    utils/subset_data_dir.sh data/train 32000 data/train_32k
    # Train the diagonal UBM.
    sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G" \
      --nj 40 --num-threads 8 --subsample 1 --delta-order 0 --apply-cmn false \
      data/train_32k $num_components exp/diag_ubm_$num_components
  
    # Train the i-vector extractor. The UBM is assumed to be diagonal.
    diarization/train_ivector_extractor_diag.sh \
      --cmd "$train_cmd --mem 35G" \
      --ivector-dim $ivector_dim --num-iters 5 --apply-cmn false \
      --num-threads 1 --num-processes 1 --nj 40 \
      exp/diag_ubm_$num_components/final.dubm data/train \
      exp/extractor_diag_c${num_components}_i${ivector_dim}
  fi
  
  if [ $stage -le 13 ]; then
    output_rttm_dir=exp/VB/rttm
    mkdir -p $output_rttm_dir || exit 1;
    cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \
      $nnet_dir/xvectors_callhome2/plda_scores/rttm > $output_rttm_dir/x_vector_rttm
    init_rttm_file=$output_rttm_dir/x_vector_rttm
  
    # VB resegmentation.  In this script, we use the x-vector result to
    # initialize the VB system.  You can also use the i-vector result, or
    # initialize the VB system randomly.  The following script uses kaldi_io;
    # you can install it with `sh ../../../tools/extras/install_kaldi_io.sh`.
    diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \
      --initialize 1 data/callhome $init_rttm_file exp/VB \
      exp/diag_ubm_$num_components/final.dubm exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie || exit 1; 
  
    # Compute the DER after VB resegmentation
    mkdir -p exp/VB/results || exit 1;
    md-eval.pl -1 -c 0.25 -r data/callhome/fullref.rttm -s $output_rttm_dir/VB_rttm 2> exp/VB/log/VB_DER.log \
      > exp/VB/results/VB_DER.txt
    der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \
      exp/VB/results/VB_DER.txt)
    # After VB resegmentation, DER: 6.48%
    echo "After VB resegmentation, DER: $der%"
  fi