  #!/bin/bash
  # Copyright  2014   David Snyder
  #            2014   Daniel Povey
  # Apache 2.0.
  #
  # An incomplete run.sh for this example.
  
  . ./cmd.sh
  . ./path.sh
  set -e
  
  mfccdir=`pwd`/mfcc
  vaddir=`pwd`/mfcc
  
  
  local/make_sre_2008_train.pl /export/corpora5/LDC/LDC2011S05 data
  
  local/make_callfriend.pl /export/corpora5/LDC/LDC96S48 french data
  local/make_callfriend.pl /export/corpora5/LDC/LDC96S49 arabic.standard data
  local/make_callfriend.pl /export/corpora5/LDC/LDC96S54 korean data
  local/make_callfriend.pl /export/corpora5/LDC/LDC96S55 chinese.mandarin.mainland data
  local/make_callfriend.pl /export/corpora5/LDC/LDC96S56 chinese.mandarin.taiwan data
  local/make_callfriend.pl /export/corpora5/LDC/LDC96S57 spanish.caribbean data
  local/make_callfriend.pl /export/corpora5/LDC/LDC96S58 spanish.noncaribbean data
  
  local/make_lre03.pl /export/corpora4/LDC/LDC2006S31 data
  local/make_lre05.pl /export/corpora5/LDC/LDC2008S05 data
  local/make_lre07.pl /export/corpora5/LDC/LDC2009S04 data/lre07
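
  # Each of the local/make_*.pl calls above produces a standard Kaldi data
  # directory under data/ (wav.scp, utt2spk, spk2utt), including an utt2lang
  # file giving the language label of every utterance; those labels are what
  # the rest of the recipe trains on.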
  
  src_list="data/sre08_train_10sec_female \
      data/sre08_train_10sec_male data/sre08_train_3conv_female \
      data/sre08_train_3conv_male data/sre08_train_8conv_female \
      data/sre08_train_8conv_male data/sre08_train_short2_male \
      data/sre08_train_short2_female data/ldc96s* data/lid05d1 \
      data/lid05e1 data/lid96d1 data/lid96e1 data/lre03"
  
  # Remove any spk2gender files that we have: since not all data
  # sources have this info, it will cause problems with combine_data.sh
  for d in $src_list; do rm -f $d/spk2gender 2>/dev/null; done
  
  utils/combine_data.sh data/train_unsplit $src_list
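
  # Optional sanity check (not part of the original recipe): validate the
  # combined directory.  --no-feats/--no-text because no features or
  # transcriptions exist yet at this point.
  # utils/validate_data_dir.sh --no-feats --no-text data/train_unsplit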
  
  # original utt2lang will remain in data/train_unsplit/.backup/utt2lang.
  utils/apply_map.pl -f 2 --permissive local/lang_map.txt  < data/train_unsplit/utt2lang  2>/dev/null > foo
  cp foo data/train_unsplit/utt2lang
  echo "**Language count in training:**"
  awk '{print $2}' foo | sort | uniq -c | sort -nr
  rm foo
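
  # local/lang_map.txt, applied above, is expected to contain one
  # "<original-label> <mapped-label>" pair per line; apply_map.pl rewrites
  # field 2 of utt2lang (the language label) accordingly, e.g. collapsing a
  # dialect label such as "spanish.caribbean" into a broader class
  # (illustrative only; the actual map may differ).  The awk pipeline simply
  # prints how many utterances each mapped language ends up with.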
  
  local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train
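
  # data/train is a copy of data/train_unsplit in which over-long recordings
  # have been cut into pieces no longer than --max-utt-len (120).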
  
  use_vtln=true
  if $use_vtln; then
    for t in train lre07; do
    cp -r data/${t} data/${t}_novtln
      rm -r data/${t}_novtln/{split,.backup,spk2warp} 2>/dev/null || true
      steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 100 --cmd "$train_cmd" \
         data/${t}_novtln exp/make_mfcc $mfccdir
      lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir
    done
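    # The *_novtln copies now carry first-pass MFCCs (conf/mfcc_vtln.conf) and
    # VAD decisions that are used only to estimate VTLN warp factors; the
    # final features are recomputed after this block.
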
    # Vtln-related things:
    # We'll use a subset of utterances to train the GMM we'll use for VTLN
    # warping.
    utils/subset_data_dir.sh data/train_novtln 5000 data/train_5k_novtln
  
    # for the features we use to estimate VTLN warp factors, we use more cepstra
    # (13 instead of just 7); this needs to be tuned.
    steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 50 --cmd "$train_cmd" \
      data/train_5k_novtln exp/make_mfcc $mfccdir
  
    # note, we're using the speaker-id version of the train_diag_ubm.sh script, which
    # uses double-delta instead of SDC features.  We train a 256-Gaussian UBM; this
    # has to be tuned.
    sid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k_novtln 256 \
      exp/diag_ubm_vtln
    lid/train_lvtln_model.sh --mfcc-config conf/mfcc_vtln.conf --nj 30 --cmd "$train_cmd" \
       data/train_5k_novtln exp/diag_ubm_vtln exp/vtln
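
    # exp/vtln now holds the trained linear-VTLN model; it is used below to
    # pick a per-utterance warp factor for both the training and lre07 data.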
  
    for t in lre07 train; do
      lid/get_vtln_warps.sh --nj 100 --cmd "$train_cmd" \
         data/${t}_novtln exp/vtln exp/${t}_warps
    cp exp/${t}_warps/utt2warp data/$t/
    done
  fi
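
  # steps/make_mfcc.sh applies the warps in utt2warp automatically when that
  # file is present in the data directory, so with use_vtln=true the MFCCs
  # computed below are VTLN-normalized.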
  
  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 100 --cmd "$train_cmd" \
    data/train exp/make_mfcc $mfccdir
  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
    data/lre07 exp/make_mfcc $mfccdir
  
  lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/train \
    exp/make_vad $vaddir
  lid/compute_vad_decision.sh --nj 4 --cmd "$train_cmd" data/lre07 \
    exp/make_vad $vaddir
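
  # vad.scp marks which frames are speech; the UBM and i-vector stages below
  # use it to discard non-speech frames.  A quick way to eyeball the decisions
  # (not part of the original recipe):
  # copy-vector scp:data/train/vad.scp ark,t:- | head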
  
  
  utils/subset_data_dir.sh data/train 5000 data/train_5k
  utils/subset_data_dir.sh data/train 10000 data/train_10k
  
  
  lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k 2048 \
    exp/diag_ubm_2048
  lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_10k \
    exp/diag_ubm_2048 exp/full_ubm_2048_10k
  
  lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \
    exp/full_ubm_2048_10k exp/full_ubm_2048
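
  # UBM training is staged: a 2048-component diagonal UBM on the 5k subset, a
  # full-covariance UBM initialized from it on the 10k subset, and a final
  # full-UBM pass over all of data/train.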
  
  # Alternatively, a diagonal UBM can replace the full UBM used above.
  # The preceding calls to train_diag_ubm.sh and train_full_ubm.sh
  # can be commented out and replaced with the following lines.
  #
  # This usually degrades results slightly, but it can improve the error rate
  # when there is less training data than is used in this example.
  #
  #lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train 2048 \
  #  exp/diag_ubm_2048
  #
  #gmm-global-to-fgmm exp/diag_ubm_2048/final.dubm \
  #  exp/full_ubm_2048/final.ubm
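  #
  # (gmm-global-to-fgmm converts the diagonal model to full-covariance format
  # so that the extractor training below works unchanged; exp/full_ubm_2048
  # would have to be created first, e.g. with "mkdir -p exp/full_ubm_2048".)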
  
  lid/train_ivector_extractor.sh --cmd "$train_cmd --mem 2G" \
    --num-iters 5 exp/full_ubm_2048/final.ubm data/train \
    exp/extractor_2048
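
  # This trains the i-vector extractor (total-variability matrix) on top of
  # the full-covariance UBM; the i-vector dimension is left at the script's
  # default here.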
  
  lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \
     exp/extractor_2048 data/train exp/ivectors_train
  
  lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \
     exp/extractor_2048 data/lre07 exp/ivectors_lre07
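
  # The i-vectors in exp/ivectors_train and exp/ivectors_lre07 are the inputs
  # to a language-ID backend (e.g. logistic-regression scoring in the complete
  # version of this recipe); that stage is not part of this incomplete run.sh.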