  #!/bin/bash
  #
  # Copyright 2018  Nagendra Goel, Saikiran Valluri  Apache 2.0
  # Copyright 2014  Gaurav Kumar.   Apache 2.0
  # Recipe for Fisher/Callhome-Spanish
  
  stage=0
  train_stage=-20
  train_sgmm2=false
  
  # Set the following paths to the directories where the Spanish Fisher data is
  # (the values below are just an example).
  sfisher_speech=/export/corpora/LDC/LDC2010S01
  sfisher_transcripts=/export/corpora/LDC/LDC2010T04
  spanish_lexicon=/export/corpora/LDC/LDC96L16
  split=local/splits/split_fisher
  
  callhome_speech=/export/corpora/LDC/LDC96S35
  callhome_transcripts=/export/corpora/LDC/LDC96T17
  split_callhome=local/splits/split_callhome
  
  mfccdir=`pwd`/mfcc
  
  . ./cmd.sh
  if [ -f path.sh ]; then . ./path.sh; fi
  . parse_options.sh || exit 1;
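  # cmd.sh defines $train_cmd/$decode_cmd (e.g. queue.pl or run.pl) and path.sh puts the
  # Kaldi binaries on the PATH.  Because parse_options.sh is sourced, any of the variables
  # defined above can be overridden from the command line, e.g. (illustrative paths):
  #   ./run.sh --stage 2 --sfisher-speech /path/to/LDC2010S01 --sfisher-transcripts /path/to/LDC2010T04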
  
  set -e
  
  if [ $stage -le 1 ]; then
    local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts
  
    local/callhome_data_prep.sh $callhome_speech $callhome_transcripts
  
    # The lexicon is created from the LDC Spanish lexicon plus the words in the
    # Fisher Spanish corpus. Additional (most frequent) words are added from the
    # ES Gigaword corpus to bring the total to 64k words. The ES frequency-sorted
    # wordlist is downloaded if it is not already available.
    local/fsp_prepare_dict.sh $spanish_lexicon
  
    # c, j and v were added manually to the non-silence phones.
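    # prepare_lang builds data/lang (lexicon FST, phone sets and topology) from
    # data/local/dict, with "<unk>" as the OOV word.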
    utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
  
    # Make sure that you do not use your test and dev sets to train the LM.
    # Some form of cross-validation is possible, where you decode your dev set with an
    # LM that is trained on everything except that conversation.
    # When in doubt about what your data partitions should be, use local/fsp_ideal_data_partitions.pl
    # to get the numbers. Depending on your needs, you might have to change the size of
    # the splits within that file. The default partitions are based on the Kaldi + Joshua
    # requirements, which means that the dev and test sets are very large.
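    # fsp_train_lms.sh trains the n-gram LM on the training portion defined by $split, and
    # fsp_create_test_lang.sh should produce data/lang_test (data/lang plus G.fst compiled
    # from that LM) for decoding.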
    local/fsp_train_lms.sh $split
    local/fsp_create_test_lang.sh
  
    utils/fix_data_dir.sh data/local/data/train_all
  
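    # Extract MFCC features; fix_data_dir.sh is run before and after so that utterances with
    # missing or inconsistent entries are dropped, and validate_data_dir.sh checks the result.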
    steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
  
    utils/fix_data_dir.sh data/local/data/train_all
    utils/validate_data_dir.sh data/local/data/train_all
  
    cp -r data/local/data/train_all data/train_all
  
    # For the CALLHOME corpus
    utils/fix_data_dir.sh data/local/data/callhome_train_all
  
    steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1;
  
    utils/fix_data_dir.sh data/local/data/callhome_train_all
    utils/validate_data_dir.sh data/local/data/callhome_train_all
  
    cp -r data/local/data/callhome_train_all data/callhome_train_all
  
    # Create data partitions for the pipeline.
    # We need datasets for both the ASR and the SMT systems.
    # We have 257455 utterances left, so the partitions are roughly as follows:
    # ASR Train : 100k utterances
    # ASR Tune  : 17455 utterances
    # ASR Eval  : 20k utterances
    # MT Train  : 100k utterances
    # MT Tune   : Same as the ASR eval set (use the lattices from there)
    # MT Eval   : 20k utterances
    # The dev and test sets need to be chosen carefully so that there is no conversation/speaker
    # overlap. This has been set up, and local/fsp_ideal_data_partitions.pl provides the numbers needed below.
    # As noted above, the LM has not been trained on the dev and test sets.
    #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test
    #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test
    #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test
    #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev
    #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test
    #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train
    #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test
    #rm -r data/dev_and_test
    #rm -r data/asr_dev_and_test
    #rm -r data/mt_train_and_test
  
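    # These should create data/{train,dev,dev2,test} and data/callhome_{train,dev,test}
    # from the *_train_all directories, using the split lists in $split and $split_callhome.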
    local/create_splits.sh $split
    local/callhome_create_splits.sh $split_callhome
  fi
  
  if [ $stage -le 2 ]; then
    # Now compute CMVN stats for the train, dev and test subsets
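    # CMVN = per-speaker cepstral mean (and variance) normalization statistics, applied on
    # top of the raw MFCCs at both training and decoding time.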
    steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
    steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
    steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir
    #steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir
    #steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir
  
    #n=$[`cat data/train_all/segments | wc -l` - 158126]
    #utils/subset_data_dir.sh --last data/train_all $n data/train
    steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
  
    steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir
    steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir
    steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir
  
    # Again from Dan's recipe: reduced monophone training data.
    # We want to start the monophone training on relatively short utterances
    # (easier to align), but not only the very shortest ones (mostly "uh-huh").
    # So take the 90k shortest utterances, and then take 10k random utterances from those.
  
    utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort
    utils/subset_data_dir.sh  data/train_100kshort 10000 data/train_10k
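    # Keep at most 100 copies of any identical transcript (mostly back-channels like
    # "uh-huh"), so the 10k monophone subset is not dominated by them.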
    utils/data/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup
    utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k
    utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k
  fi
  
  if [ $stage -le 3 ]; then
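    # Stage 3: flat-start monophone training on the small subset, then two passes of
    # delta + delta-delta triphone training (tri1, tri2).  For train_deltas.sh the two
    # numeric arguments are the number of tied states (leaves) and the total Gaussians.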
    steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
      data/train_10k_nodup data/lang exp/mono0a
  
    steps/align_si.sh --nj 30 --cmd "$train_cmd" \
      data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1;
  
    steps/train_deltas.sh --cmd "$train_cmd" \
      2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1;
  
  
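    # Build the decoding graph (HCLG) for tri1 and decode the dev set in a background
    # subshell while the next alignment/training steps continue.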
    (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri1/graph data/dev exp/tri1/decode_dev)&
  
    steps/align_si.sh --nj 30 --cmd "$train_cmd" \
      data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1;
  
    steps/train_deltas.sh --cmd "$train_cmd" \
      2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1;
  
    (
      utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
      steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
        exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1;
     )&
  fi
  
  if [ $stage -le 4 ]; then
    steps/align_si.sh --nj 30 --cmd "$train_cmd" \
      data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1;
  
    # Train tri3a, which is LDA+MLLT, on 100k data.
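    # The splice-opts splice +/-3 frames of MFCCs (7 frames total) before the LDA+MLLT
    # transform; 3000 and 40000 are again the leaves and total Gaussians.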
    steps/train_lda_mllt.sh --cmd "$train_cmd" \
     --splice-opts "--left-context=3 --right-context=3" \
     3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1;
    (
      utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
      steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
       exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1;
    )&
  fi
  
  if [ $stage -le 5 ]; then
    # Next we'll use fMLLR and train with SAT (i.e. on fMLLR features).
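    # SAT estimates a per-speaker fMLLR transform during training, so the models are
    # trained on speaker-normalized features.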
    steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
      data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1;
  
    steps/train_sat.sh  --cmd "$train_cmd" \
      4000 60000 data/train_100k data/lang exp/tri3a_ali  exp/tri4a || exit 1;
  
    (
      utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
      steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
        exp/tri4a/graph data/dev exp/tri4a/decode_dev
    )&
  
  
    steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
      data/train data/lang exp/tri4a exp/tri4a_ali || exit 1;
  
    # Now train tri5a on the full training set (more leaves and Gaussians than tri4a).
    steps/train_sat.sh  --cmd "$train_cmd" \
      5000 120000 data/train data/lang exp/tri4a_ali  exp/tri5a || exit 1;
  
    (
      utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
      steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
        exp/tri5a/graph data/dev exp/tri5a/decode_dev
      steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
        exp/tri5a/graph data/test exp/tri5a/decode_test
  
    # Decode CALLHOME
      steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
        exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test
      steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
        exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev
      steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
        exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train
      ) &
  
  
     steps/align_fmllr.sh \
       --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \
       data/train data/lang exp/tri5a exp/tri5a_ali
  fi
  
  if $train_sgmm2; then
  
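  # Optional SGMM2 system (enable with --train-sgmm2 true): train a UBM and an SGMM2 model
  # on top of the tri5a alignments, then run boosted-MMI discriminative training, decoding
  # the dev set and rescoring the SGMM2 lattices with each MMI iteration.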
  steps/train_ubm.sh \
    --cmd "$train_cmd" 750 \
    data/train data/lang exp/tri5a_ali exp/ubm5
  
  steps/train_sgmm2.sh \
    --cmd "$train_cmd" 5000 18000 \
    data/train data/lang exp/tri5a_ali exp/ubm5/final.ubm exp/sgmm5
  
  utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph
  
  (
    steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \
      --config conf/decode.config  --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \
     exp/sgmm5/graph data/dev exp/sgmm5/decode_dev
  )&
  
  steps/align_sgmm2.sh \
    --nj 32  --cmd "$train_cmd" --transform-dir exp/tri5a_ali \
    --use-graphs true --use-gselect true \
    data/train data/lang exp/sgmm5 exp/sgmm5_ali
  
  steps/make_denlats_sgmm2.sh \
    --nj 32 --sub-split 32 --num-threads 4 \
    --beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri5a_ali \
    data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats
  
  steps/train_mmi_sgmm2.sh \
    --cmd "$train_cmd" --drop-frames true --transform-dir exp/tri5a_ali --boost 0.1 \
    data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
    exp/sgmm5_mmi_b0.1
  
  (
  utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
  steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \
    --config conf/decode.config  --scoring-opts "--min-lmwt 8 --max-lmwt 12"\
   exp/tri5a/graph data/dev exp/tri5a/decode_dev
  utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph
  steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \
    --config conf/decode.config  --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \
   exp/sgmm5/graph data/dev exp/sgmm5/decode_dev
  for iter in 1 2 3 4; do
    decode=exp/sgmm5_mmi_b0.1/decode_dev_it$iter
    mkdir -p $decode
    steps/decode_sgmm2_rescore.sh  \
      --cmd "$decode_cmd" --iter $iter --transform-dir exp/tri5a/decode_dev \
      data/lang_test data/dev/  exp/sgmm5/decode_dev $decode
  done
  ) &
  
  fi
  
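  # Finally, train and decode the nnet3 "chain" TDNN system; --stage and --train-stage are
  # forwarded so the whole pipeline can be resumed.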
  local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1;
  exit 0;