Blame view

egs/hub4_spanish/s5/run.sh 8.2 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
  #!/bin/bash
  # Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal<jtrmal@gmail.com>)
  # License: Apache 2.0
  
  # Begin configuration section.
  # The two variables below are the only ones assigned before
  # utils/parse_options.sh is sourced; presumably that script lets the command
  # line override them (e.g. --stage N) -- confirm against utils/parse_options.sh.
  # train_nj is passed as --nj to the training-set feature extraction and to
  # the full-data alignment steps.
  train_nj=32
  # A stage block runs when $stage is less than or equal to its number, so
  # raising this value skips the earlier stages.
  stage=0
  # End configuration section
  . ./utils/parse_options.sh
  
  # Abort on the first failing command, including a failure in any stage of a
  # pipeline (several steps below pipe into tee).
  set -e -o pipefail
  set -o nounset                              # Treat unset variables as an error
  
  # Corpus locations. These are absolute, site-specific paths.
  # audio_data and transcript_data feed the training data preparation;
  # eval_data feeds the evaluation data preparation.
  audio_data=/export/corpora/LDC/LDC98S74
  transcript_data=/export/corpora/LDC/LDC98T29
  eval_data=/export/corpora/LDC/LDC2001S91
  
  # Passed as --boost-silence to every alignment and training step.
  boost_sil=0.5
  # Model-size pairs handed as the two leading positional arguments to the
  # training scripts (presumably number of tree leaves and total number of
  # Gaussians, going by the names -- confirm against steps/train_*.sh).
  numLeavesTri1=1000
  numGaussTri1=10000
  numLeavesTri2=1000
  numGaussTri2=20000
  numLeavesTri3=6000
  numGaussTri3=75000
  numLeavesMLLT=6000
  numGaussMLLT=75000
  numLeavesSAT=6000
  numGaussSAT=75000
  # Unknown-word symbol given to the text clean-up scripts, the LM training
  # (--oov-symbol) and the lexicon re-estimation (--unk).
  unk="<unk>"
  
  # $train_cmd and $decode_cmd are used below but never assigned in this file;
  # presumably cmd.sh defines them and path.sh sets up the tool paths.
  . ./cmd.sh
  . ./path.sh
  
  if [ "$stage" -le 0 ]; then
    # Stage 0: build the evaluation data directory.
    eval_root=$eval_data/HUB4_1997NE
    eval_dir=data/eval

    # The .sgm document and the .sph audio file are passed where the training
    # stage passes two corpus directories; per the original note,
    # prepare_data.sh does not really care about the order or number of them.
    local/prepare_data.sh \
      "$eval_root/doc/h4ne97sp.sgm" "$eval_root/h4ne_sp/h4ne97sp.sph" "$eval_dir"

    # Filter the transcripts through prepare_test_text.pl and swap the result
    # in, keeping the unfiltered file as text.old.
    local/prepare_test_text.pl "$unk" "$eval_dir/text" > "$eval_dir/text.clean"
    mv "$eval_dir/text" "$eval_dir/text.old"
    mv "$eval_dir/text.clean" "$eval_dir/text"
    utils/fix_data_dir.sh "$eval_dir"
  fi
  
  
  if [ "$stage" -le 1 ]; then
    # Stage 1: build the training data directory from the audio and
    # transcript corpora.
    train_dir=data/train
    local/prepare_data.sh "$audio_data" "$transcript_data" "$train_dir"

    # Filter the transcripts through prepare_training_text.pl and swap the
    # result in, keeping the unfiltered file as text.old.
    local/prepare_training_text.pl "$unk" "$train_dir/text" > "$train_dir/text.clean"
    mv "$train_dir/text" "$train_dir/text.old"
    mv "$train_dir/text.clean" "$train_dir/text"
    utils/fix_data_dir.sh "$train_dir"
  fi
  
  if [ "$stage" -le 2 ]; then
    # Stage 2: graphemic lexicon, derived from the training transcripts and
    # written under data/local.
    dict_dir=data/local
    mkdir -p "$dict_dir"
    local/prepare_lexicon.sh data/train/text "$dict_dir"
  fi
  
  if [ "$stage" -le 3 ]; then
    # Stage 3: language model trained on the training transcripts.
    local/train_lms_srilm.sh --oov-symbol "$unk" \
      --train-text data/train/text data data/srilm

    # Seed data/lang_test with the contents of data/lang.  Copying
    # "data/lang/." into an ensured target keeps this stage re-runnable:
    # "cp -R data/lang data/lang_test" creates a nested data/lang_test/lang
    # whenever data/lang_test already exists from an earlier run.
    mkdir -p data/lang_test
    cp -R data/lang/. data/lang_test/

    utils/format_lm.sh \
      data/lang data/srilm/lm.gz data/local/lexicon.txt data/lang_test
  fi
  
  if [ "$stage" -le 4 ]; then
    # Stage 4 (training set): MFCC features followed by CMVN statistics.
    # Both steps share one directory as their second positional argument
    # (the log directory in Kaldi's usage).  It is named make_mfcc, not
    # make_mfcc_pitch, because plain steps/make_mfcc.sh is what runs here.
    train_feat_log=exp/make_mfcc/train
    steps/make_mfcc.sh --cmd "$train_cmd" --nj "$train_nj" \
      data/train "$train_feat_log" mfcc
    utils/fix_data_dir.sh data/train
    steps/compute_cmvn_stats.sh data/train "$train_feat_log" mfcc
    utils/fix_data_dir.sh data/train
  fi
  
  if [ "$stage" -le 4 ]; then
    # Stage 4 (evaluation set): MFCC features followed by CMVN statistics.
    # Both steps share one directory as their second positional argument
    # (the log directory in Kaldi's usage).  It is named make_mfcc, not
    # make_mfcc_pitch, because plain steps/make_mfcc.sh is what runs here.
    eval_feat_log=exp/make_mfcc/eval
    steps/make_mfcc.sh --cmd "$decode_cmd" --nj 16 \
      data/eval "$eval_feat_log" mfcc
    utils/fix_data_dir.sh data/eval
    steps/compute_cmvn_stats.sh data/eval "$eval_feat_log" mfcc
    utils/fix_data_dir.sh data/eval
  fi
  
  
  if [ "$stage" -le 5 ]; then
    # Stage 5: subset the training data to speed up the early stages of
    # training.  Three subsets are produced: train_sub1 (5000 utterances),
    # train_sub2 (10000) and train_sub3 (20000).
    numutt=$(wc -l < data/train/feats.scp)

    # Each entry is <size>:<directory name under data/>.
    for subset_spec in 5000:train_sub1 10000:train_sub2 20000:train_sub3; do
      subset_size=${subset_spec%%:*}
      subset_name=${subset_spec#*:}
      if [ "$numutt" -gt "$subset_size" ]; then
        utils/subset_data_dir.sh data/train "$subset_size" "data/$subset_name"
      else
        # Not enough utterances for a real subset: alias the full set.
        # All three subsets are guarded the same way (presumably
        # subset_data_dir.sh cannot return more utterances than exist --
        # confirm).  -f and -n make the link replaceable, so a re-run neither
        # fails nor drops a stray "train" link inside data/train by following
        # the existing link.
        (cd data && ln -sfn train "$subset_name")
      fi
    done
  fi
  
  mkdir -p exp
  if [ "$stage" -le 6 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 6: Starting (small) monophone training in exp/mono on" $(date)
    echo "$rule"
    # Monophone model trained on the smallest subset (data/train_sub1).
    steps/train_mono.sh --boost-silence "$boost_sil" --nj 8 --cmd "$train_cmd" \
      data/train_sub1 data/lang exp/mono
  fi
  
  if [ "$stage" -le 6 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 6: Starting (small) triphone training in exp/tri1 on" $(date)
    echo "$rule"
    subset_dir=data/train_sub2
    ali_dir=exp/mono_ali_sub2

    # Align the second subset with the monophone model ...
    steps/align_si.sh --boost-silence "$boost_sil" --nj 12 --cmd "$train_cmd" \
      "$subset_dir" data/lang exp/mono "$ali_dir"

    # ... and train the first triphone model on those alignments.
    steps/train_deltas.sh --boost-silence "$boost_sil" --cmd "$train_cmd" \
      "$numLeavesTri1" "$numGaussTri1" \
      "$subset_dir" data/lang "$ali_dir" exp/tri1
  fi
  
  if [ "$stage" -le 7 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 7: Starting (medium) triphone training in exp/tri2 on" $(date)
    echo "$rule"
    subset_dir=data/train_sub3
    ali_dir=exp/tri1_ali_sub3
    model=tri2

    # Align the third subset with tri1, then train tri2 on those alignments.
    steps/align_si.sh --boost-silence "$boost_sil" --nj 24 --cmd "$train_cmd" \
      "$subset_dir" data/lang exp/tri1 "$ali_dir"

    steps/train_deltas.sh --boost-silence "$boost_sil" --cmd "$train_cmd" \
      "$numLeavesTri2" "$numGaussTri2" \
      "$subset_dir" data/lang "$ali_dir" "exp/$model"

    # data/langp/tri2 (last argument) is the lang directory that the next
    # stage aligns and trains with.
    local/reestimate_langp.sh --cmd "$train_cmd" --unk "$unk" \
      "$subset_dir" data/lang data/local/ \
      "exp/$model" "data/local/dictp/$model" "data/local/langp/$model" \
      "data/langp/$model"
  fi
  
  if [ "$stage" -le 8 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 8: Starting (full) triphone training in exp/tri3 on" $(date)
    echo "$rule"
    lang_dir=data/langp/tri2
    ali_dir=exp/tri2_ali
    model=tri3

    # From here on the full training set (data/train) is used.
    steps/align_si.sh \
      --boost-silence "$boost_sil" --nj "$train_nj" --cmd "$train_cmd" \
      data/train "$lang_dir" exp/tri2 "$ali_dir"

    steps/train_deltas.sh --boost-silence "$boost_sil" --cmd "$train_cmd" \
      "$numLeavesTri3" "$numGaussTri3" \
      data/train "$lang_dir" "$ali_dir" "exp/$model"

    # data/langp/tri3 (last argument) is the lang directory that the next
    # stage aligns and trains with.
    local/reestimate_langp.sh --cmd "$train_cmd" --unk "$unk" \
      data/train data/lang data/local/ \
      "exp/$model" "data/local/dictp/$model" "data/local/langp/$model" \
      "data/langp/$model"
  fi
  
  if [ "$stage" -le 9 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 9: Starting (lda_mllt) triphone training in exp/tri4 on" $(date)
    echo "$rule"
    lang_dir=data/langp/tri3
    ali_dir=exp/tri3_ali
    model=tri4

    # Align the full training set with tri3, then train tri4 with
    # steps/train_lda_mllt.sh on those alignments.
    steps/align_si.sh \
      --boost-silence "$boost_sil" --nj "$train_nj" --cmd "$train_cmd" \
      data/train "$lang_dir" exp/tri3 "$ali_dir"

    steps/train_lda_mllt.sh --boost-silence "$boost_sil" --cmd "$train_cmd" \
      "$numLeavesMLLT" "$numGaussMLLT" \
      data/train "$lang_dir" "$ali_dir" "exp/$model"

    # data/langp/tri4 (last argument) is the lang directory that the next
    # stage aligns and trains with.
    local/reestimate_langp.sh --cmd "$train_cmd" --unk "$unk" \
      data/train data/lang data/local \
      "exp/$model" "data/local/dictp/$model" "data/local/langp/$model" \
      "data/langp/$model"
  fi
  
  if [ "$stage" -le 10 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 10: Starting (SAT) triphone training in exp/tri5 on" $(date)
    echo "$rule"
    lang_dir=data/langp/tri4
    ali_dir=exp/tri4_ali
    model=tri5

    # Align the full training set with tri4, then train tri5 with
    # steps/train_sat.sh on those alignments.
    steps/align_si.sh \
      --boost-silence "$boost_sil" --nj "$train_nj" --cmd "$train_cmd" \
      data/train "$lang_dir" exp/tri4 "$ali_dir"

    steps/train_sat.sh --boost-silence "$boost_sil" --cmd "$train_cmd" \
      "$numLeavesSAT" "$numGaussSAT" \
      data/train "$lang_dir" "$ali_dir" "exp/$model"

    # data/langp/tri5 (last argument) is the lang directory that the next
    # stage aligns with.
    local/reestimate_langp.sh --cmd "$train_cmd" --unk "$unk" \
      data/train data/lang data/local \
      "exp/$model" "data/local/dictp/$model" "data/local/langp/$model" \
      "data/langp/$model"
  fi
  
  
  if [ "$stage" -le 11 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 11: Starting exp/tri5_ali on" $(date)
    echo "$rule"
    ali_name=tri5_ali

    # Align the full training set with tri5 using steps/align_fmllr.sh.
    steps/align_fmllr.sh \
      --boost-silence "$boost_sil" --nj "$train_nj" --cmd "$train_cmd" \
      data/train data/langp/tri5 exp/tri5 "exp/$ali_name"

    # data/langp/tri5_ali (last argument) is the directory the next stage
    # copies to build the decoding lang directory.
    local/reestimate_langp.sh --cmd "$train_cmd" --unk "$unk" \
      data/train data/lang data/local \
      "exp/$ali_name" "data/local/dictp/$ali_name" \
      "data/local/langp/$ali_name" "data/langp/$ali_name"
  fi
  
  if [ "$stage" -le 12 ]; then
    echo ---------------------------------------------------------------------
    echo "Stage 12: Building lang dir" $(date)
    echo ---------------------------------------------------------------------
    # Build the decoding lang directory: the contents of data/langp/tri5_ali
    # plus the grammar FST from data/lang_test.
    #
    # "src/." is copied into an ensured target so the stage is re-runnable and
    # behaves the same with GNU and BSD cp: "cp -R src/ dst" nests dst/tri5_ali
    # under GNU cp once dst exists, while BSD cp copies the contents instead.
    mkdir -p data/langp_test
    cp -R data/langp/tri5_ali/. data/langp_test/
    cp data/lang_test/G.fst data/langp_test/
  fi
  
  if [ "$stage" -le 13 ]; then
    rule=---------------------------------------------------------------------
    echo "$rule"
    echo "Stage 13: Running decoding with SAT models  on" $(date)
    echo "$rule"
    model_dir=exp/tri5
    graph_dir=$model_dir/graph
    decode=$model_dir/decode_test

    # Build the decoding graph; its output is shown and also kept in
    # mkgraph.log.
    utils/mkgraph.sh data/langp_test "$model_dir" "$graph_dir" \
      | tee "$model_dir/mkgraph.log"

    # The decode directory has to exist before tee can open decode.log in it.
    mkdir -p "$decode"
    steps/decode_fmllr_extra.sh --beam 10 --lattice-beam 4 \
      --nj 32 --cmd "$decode_cmd" \
      "$graph_dir" data/eval/ "$decode" \
      | tee "$decode/decode.log"

    # Marker file created only after decoding returned successfully
    # (set -e -o pipefail aborts the script otherwise).
    touch "$decode/.done"
  fi
  
  
  #./local/chain/run_tdnn.sh
  #./local/run_sgmm2.sh