  #!/bin/bash
  # Copyright  2016-2017  Go-Vivace Inc. (Author: Mousmita Sarma)
  #
  # Apache 2.0.
  #
  # This script runs the NIST 2007 General Language Recognition Closed-Set
  # evaluation. It shows how to replace the GMM-UBM with a DNN trained
  # for ASR.
  
  . ./cmd.sh
  . ./path.sh
  set -e
  
  mfccdir=$(pwd)/mfcc
  vaddir=$(pwd)/mfcc
  languages=local/general_lr_closed_set_langs.txt
  nnet=exp/nnet2_online/nnet_ms_a/final.mdl
  
  # Train a DNN on about 1800 hours of the English portion of Fisher.
  local/dnn/train_dnn.sh
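  # (We assume the resulting model lands at the path set in $nnet above;
  # adjust $nnet if your DNN was trained elsewhere.)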
  
  data_root=/export/corpora/LDC
  # Training data sources
  local/make_sre_2008_train.pl $data_root/LDC2011S05 data
  local/make_callfriend.pl $data_root/LDC96S60 vietnamese data
  local/make_callfriend.pl $data_root/LDC96S59 tamil data
  local/make_callfriend.pl $data_root/LDC96S53 japanese data
  local/make_callfriend.pl $data_root/LDC96S52 hindi data
  local/make_callfriend.pl $data_root/LDC96S51 german data
  local/make_callfriend.pl $data_root/LDC96S50 farsi data
  local/make_callfriend.pl $data_root/LDC96S48 french data
  local/make_callfriend.pl $data_root/LDC96S49 arabic.standard data
  local/make_callfriend.pl $data_root/LDC96S54 korean data
  local/make_callfriend.pl $data_root/LDC96S55 chinese.mandarin.mainland data
  local/make_callfriend.pl $data_root/LDC96S56 chinese.mandarin.taiwan data
  local/make_callfriend.pl $data_root/LDC96S57 spanish.caribbean data
  local/make_callfriend.pl $data_root/LDC96S58 spanish.noncaribbean data
  local/make_lre03.pl $data_root/LDC2006S31 data
  local/make_lre05.pl $data_root/LDC2008S05 data
  local/make_lre07_train.pl $data_root/LDC2009S05 data
  local/make_lre09.pl /export/corpora5/NIST/LRE/LRE2009/eval data
  
  # Make the evaluation data set. We're concentrating on the General Language
  # Recognition Closed-Set evaluation, so we remove the dialects and filter
  # out the unknown languages used in the open-set evaluation.
  local/make_lre07.pl $data_root/LDC2009S04 data/lre07_all
  
  cp -r data/lre07_all data/lre07
  utils/filter_scp.pl -f 2 $languages <(lid/remove_dialect.pl data/lre07_all/utt2lang) \
    > data/lre07/utt2lang
  utils/fix_data_dir.sh data/lre07
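  # A sketch of the filtering above, assuming standard Kaldi file formats:
  # utt2lang holds one "<utt-id> <language>[.<dialect>]" pair per line,
  # lid/remove_dialect.pl strips the dialect suffix (e.g.
  # "chinese.mandarin.mainland" becomes "chinese"), and filter_scp.pl -f 2
  # keeps only the lines whose second field is listed in $languages.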
  
  src_list="data/sre08_train_10sec_female \
    data/sre08_train_10sec_male data/sre08_train_3conv_female \
    data/sre08_train_3conv_male data/sre08_train_8conv_female \
    data/sre08_train_8conv_male data/sre08_train_short2_male \
    data/sre08_train_short2_female data/ldc96* data/lid05d1 \
    data/lid05e1 data/lid96d1 data/lid96e1 data/lre03 \
    data/ldc2009* data/lre09"
  # Remove any spk2gender files that we have: not all data sources include
  # this info, and partial coverage causes problems with combine_data.sh.
  for d in $src_list; do rm -f $d/spk2gender 2>/dev/null; done
  
  utils/combine_data.sh data/train_unsplit $src_list
  
  # The original utt2lang will remain in data/train_unsplit/.backup/utt2lang.
  utils/apply_map.pl -f 2 --permissive local/lang_map.txt \
    < data/train_unsplit/utt2lang 2>/dev/null > data/train_unsplit/utt2lang.tmp
  mv data/train_unsplit/utt2lang.tmp data/train_unsplit/utt2lang
  
  local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
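  # (--max-utt-len is in seconds, so utterances longer than two minutes are
  # split into shorter pieces.)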
  
  echo "**Language count in i-vector extractor training data (after splitting long utterances):**"
  awk '{print $2}' data/train/utt2lang | sort | uniq -c | sort -nr
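  # (This prints one "<count> <language>" pair per line, most frequent first.)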
  
  use_vtln=true
  if $use_vtln; then
    for t in train lre07; do
      cp -r data/${t} data/${t}_novtln
      rm -r data/${t}_novtln/{split,.backup,spk2warp} 2>/dev/null || true
      steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 12 --cmd "$train_cmd" \
         data/${t}_novtln exp/make_mfcc $mfccdir
      lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir
    done
  
    # VTLN-related things:
    # We'll use a subset of utterances to train the GMM we'll use for VTLN
    # warping.
    utils/subset_data_dir.sh data/train_novtln 5000 data/train_5k_novtln
  
    # Note: we're using the speaker-id version of the train_diag_ubm.sh script,
    # which uses double-delta instead of SDC features to train a 256-Gaussian UBM.
    sid/train_diag_ubm.sh --nj 12 --cmd "$train_cmd" data/train_5k_novtln 256 \
      exp/diag_ubm_vtln
    lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 12 --cmd "$train_cmd" \
       data/train_5k_novtln exp/diag_ubm_vtln exp/vtln
  
    for t in lre07 train; do
      lid/get_vtln_warps.sh --nj 12 --cmd "$train_cmd" \
         data/${t}_novtln exp/vtln exp/${t}_warps
      cp exp/${t}_warps/utt2warp data/$t/
    done
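    # utt2warp maps each utterance to its estimated VTLN warp factor (one
    # "<utt-id> <warp>" pair per line, per standard Kaldi conventions). The
    # filtering below then keeps only the training utterances for which a
    # warp was actually estimated.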
  
    utils/fix_data_dir.sh data/train
    utils/filter_scp.pl data/train/utt2warp data/train/utt2spk > data/train/utt2spk_tmp
    cp data/train/utt2spk_tmp data/train/utt2spk
    utils/fix_data_dir.sh data/train
  fi
  
  cp -r data/train data/train_dnn
  cp -r data/lre07 data/lre07_dnn
  
  # Extract language recognition features
  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 12 --cmd "$train_cmd" \
    data/train exp/make_mfcc $mfccdir
  
  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 12 --cmd "$train_cmd" \
    data/lre07 exp/make_mfcc $mfccdir
  
  # Extract DNN features
  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 12 --cmd "$train_cmd" \
    data/train_dnn exp/make_mfcc $mfccdir
  
  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 12 --cmd "$train_cmd" \
    data/lre07_dnn exp/make_mfcc $mfccdir
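  # From here on, each dataset exists in two parallel versions: the
  # standard-MFCC copy (data/train, data/lre07) that provides the language
  # recognition features, and the high-resolution copy (data/train_dnn,
  # data/lre07_dnn) whose features are what the ASR DNN consumes.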
  
  for name in lre07_dnn train_dnn lre07 train; do
    utils/fix_data_dir.sh data/${name}
  done
  
  lid/compute_vad_decision.sh --nj 12 --cmd "$train_cmd" data/train \
    exp/make_vad $vaddir
  lid/compute_vad_decision.sh --nj 12 --cmd "$train_cmd" data/lre07 \
    exp/make_vad $vaddir
  
  for name in train lre07; do
    cp data/${name}/vad.scp data/${name}_dnn/vad.scp
    cp data/${name}/utt2spk data/${name}_dnn/utt2spk
    cp data/${name}/spk2utt data/${name}_dnn/spk2utt
    utils/fix_data_dir.sh data/${name}
    utils/fix_data_dir.sh data/${name}_dnn
  done
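  # The VAD decisions and speaker maps are copied rather than recomputed so
  # that the two copies of each dataset stay consistent utterance for
  # utterance; the DNN-based steps below consume both versions together.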
  
  # Subset training data for faster supervised-GMM (sup-GMM) initialization.
  utils/subset_data_dir.sh data/train 32000 data/train_32k
  utils/fix_data_dir.sh data/train_32k
  utils/subset_data_dir.sh data/train_dnn 32000 data/train_dnn_32k
  utils/fix_data_dir.sh data/train_dnn_32k
  
  # Initialize a full GMM from the DNN posteriors and language recognition
  # features. This can be used either on its own, as a UBM, or to initialize
  # the i-vector extractor in a DNN-based system.
  lid/init_full_ubm_from_dnn.sh --nj 8 --cmd "$train_cmd --mem 6G" \
    data/train_32k \
    data/train_dnn_32k $nnet exp/full_ubm
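  # In this recipe the full UBM gets one Gaussian per DNN output (senone):
  # posteriors from $nnet on the high-resolution features provide the
  # component occupancies, while the Gaussian statistics themselves are
  # accumulated over the language recognition features.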
  
  # Train an i-vector extractor based on the DNN-UBM.
  lid/train_ivector_extractor_dnn.sh \
    --cmd "$train_cmd --mem 80G" --nnet-job-opt "--mem 4G" \
    --min-post 0.015 \
    --ivector-dim 600 \
    --num-iters 5 \
    --nj 5 exp/full_ubm/final.ubm $nnet \
    data/train \
    data/train_dnn \
    exp/extractor_dnn
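  # (--min-post prunes very small DNN posteriors before accumulating
  # extractor statistics, which keeps runtime and memory manageable.)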
  
  # Filter out the languages we don't need for the closed-set eval
  cp -r data/train data/train_lr
  utils/filter_scp.pl -f 2 $languages <(lid/remove_dialect.pl data/train/utt2lang) \
    > data/train_lr/utt2lang
  utils/fix_data_dir.sh data/train_lr
  
  echo "**Language count for logistic regression training (after splitting long utterances):**"
  awk '{print $2}' data/train_lr/utt2lang | sort | uniq -c | sort -nr
  
  cp -r data/train_dnn data/train_lr_dnn
  utils/filter_scp.pl -f 2 $languages <(lid/remove_dialect.pl data/train_dnn/utt2lang) \
    > data/train_lr_dnn/utt2lang
  utils/fix_data_dir.sh data/train_lr_dnn
  
  echo "**Language count for logistic regression training with DNN features (after splitting long utterances):**"
  awk '{print $2}' data/train_lr_dnn/utt2lang | sort | uniq -c | sort -nr
  
  # Extract i-vectors using the extractor with the DNN-UBM
  lid/extract_ivectors_dnn.sh --cmd "$train_cmd --mem 30G" \
    --nj 5 exp/extractor_dnn \
    $nnet \
    data/train_lr \
    data/train_lr_dnn \
    exp/ivectors_train
  
  lid/extract_ivectors_dnn.sh --cmd "$train_cmd --mem 30G" \
    --nj 5 exp/extractor_dnn \
    $nnet \
    data/lre07 \
    data/lre07_dnn \
    exp/ivectors_lre07
  
  # Train a logistic regression model on top of the i-vectors.
  lid/run_logistic_regression.sh --prior-scale 0.70 \
    --conf conf/logistic-regression.conf
  
  # General LR 2007 closed-set eval
  local/lre07_eval/lre07_eval.sh exp/ivectors_lre07 \
    local/general_lr_closed_set_langs.txt
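  # The evaluation reports error rate (ER) and average detection cost (C_avg)
  # for each test-segment duration (3, 10 and 30 seconds) and their average,
  # as in the reference results below.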
  
  #Duration (sec):    avg      3     10     30
  #        ER (%):  16.18  31.43  12.38   4.73
  #     C_avg (%):  10.27  19.67   7.84   3.31