#!/bin/bash
#
# Johns Hopkins University (Authors: Gaurav Kumar, Daniel Povey)
# Recipe for CallHome Egyptian Arabic
# Designed to integrate Kaldi with Joshua for end-to-end ASR and SMT
. ./cmd.sh
. ./path.sh
mfccdir=$(pwd)/mfcc
set -e
# Specify the location of the speech files, the transcripts and the lexicon.
# These are passed to other scripts, including the ones for data and lexicon prep.
eca_speech=/export/corpora/LDC/LDC97S45
eca_transcripts=/export/corpora/LDC/LDC97T19
eca_lexicon=/export/corpora/LDC/LDC99L22
sup_speech=/export/corpora/LDC/LDC2002S37
sup_transcripts=/export/corpora/LDC/LDC2002T38
h5_speech=/export/corpora/LDC/LDC2002S22
h5_transcripts=/export/corpora/LDC/LDC2002T39
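# A small optional sanity check (not in the original recipe): fail early with a
# clear message if any of the corpus directories above is missing on this machine.
for d in "$eca_speech" "$eca_transcripts" "$eca_lexicon" \
         "$sup_speech" "$sup_transcripts" "$h5_speech" "$h5_transcripts"; do
  [ -d "$d" ] || { echo "$0: expected corpus directory $d not found"; exit 1; }
done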
split=local/splits
local/callhome_data_prep.sh $eca_speech $eca_transcripts $sup_speech $sup_transcripts $h5_speech $h5_transcripts
local/callhome_prepare_dict.sh $eca_lexicon
# Added c, j, v to the non-silence phones manually
utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
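# For reference: utils/prepare_lang.sh expects data/local/dict to contain at
# least lexicon.txt (or lexiconp.txt), nonsilence_phones.txt,
# silence_phones.txt and optional_silence.txt; utils/validate_dict_dir.pl
# checks the directory in detail.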
# Make sure that you do not use your test and dev sets to train the LM.
# Some form of cross-validation is possible, where you decode each dev/test
# conversation with an LM trained on everything except that conversation.
local/callhome_train_lms.sh $split
local/callhome_create_test_lang.sh
utils/fix_data_dir.sh data/local/data/train_all
steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
utils/fix_data_dir.sh data/local/data/train_all
utils/validate_data_dir.sh data/local/data/train_all
cp -r data/local/data/train_all data/train_all
# Creating data partitions for the pipeline
local/create_splits.sh $split
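# Note (an assumption about local/create_splits): the split directories are
# subsets of data/train_all, so their feats.scp entries already point at the
# MFCCs extracted above and only the CMVN stats need computing per subset.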
# Now compute CMVN stats for the dev, test, sup, h5 and train subsets
steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
steps/compute_cmvn_stats.sh data/sup exp/make_mfcc/sup $mfccdir
steps/compute_cmvn_stats.sh data/h5 exp/make_mfcc/h5 $mfccdir
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
# Again from Dan's recipe: with very large corpora (his had 1.6 million
# utterances) it pays to start monophone training on relatively short
# utterances (easier to align), but not only the very shortest ones (mostly
# uh-huh), e.g. by taking 10k random utterances out of the 100k shortest.
# This recipe trains the monophone system on the full training set; a sketch
# of that subsetting is given below.
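# An optional sketch of that subsetting (not run here; the directory names are
# illustrative), using Kaldi's standard subset tool:
# utils/subset_data_dir.sh --shortest data/train 100000 data/train_100kshort
# utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k
# Then pass data/train_10k instead of data/train to steps/train_mono.sh below.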
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train data/lang exp/mono0a
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri1/graph data/dev exp/tri1/decode_dev)&
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1400 15000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1;
)&
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
# Train tri3a, which is LDA+MLLT, on the full training set.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
1800 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1;
)&
# Next we'll use fMLLR and train with SAT (i.e. on
# fMLLR features)
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
2200 25000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri4a/graph data/dev exp/tri4a/decode_dev
)&
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri4a exp/tri4a_ali || exit 1;
# Increase the number of leaves and Gaussians
steps/train_sat.sh --cmd "$train_cmd" \
2600 30000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/dev exp/tri5a/decode_dev
)&
(
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/test exp/tri5a/decode_test
# Decode Supplement and H5
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/sup exp/tri5a/decode_sup
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/h5 exp/tri5a/decode_h5
)&
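# The ensemble training below reads alignments from exp/tri5a_ali, which no
# earlier step produces; a sketch of the presumably intended alignment step:
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;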
dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \
--parallel-opts "--num-threads 16" --cmd "queue.pl --mem 1G")
dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \
--parallel-opts "--gpu 1" --cmd "queue.pl --mem 1G")
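# Only the GPU options are used in the call below; to train on CPU instead
# (much slower), substitute "${dnn_cpu_parallel_opts[@]}" for
# "${dnn_gpu_parallel_opts[@]}".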
steps/nnet2/train_pnorm_ensemble.sh \
--mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008 \
--num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200 \
--cmd "$train_cmd" \
"${dnn_gpu_parallel_opts[@]}" \
--ensemble-size 4 --initial-beta 0.1 --final-beta 5 \
data/train data/lang exp/tri5a_ali exp/tri6a_dnn
(
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev
) &
# Decode test sets
(
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_test exp/tri5a/graph data/test exp/tri6a_dnn/decode_test
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_sup exp/tri5a/graph data/sup exp/tri6a_dnn/decode_sup
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_h5 exp/tri5a/graph data/h5 exp/tri6a_dnn/decode_h5
) &
# Wait for all background decode jobs to finish before TDNN training.
wait
# (TDNN + iVectors) training
# Note that the alignments used by run_tdnn.sh come from the pnorm-ensemble model.
# If you choose to skip ensemble training (which is slow), use the best
# fMLLR alignments available (tri4a).
# You can modify this in local/nnet3/run_tdnn.sh.
local/nnet3/run_tdnn.sh
exit 0;