#!/bin/bash
#
# Based mostly on the Switchboard recipe. The training database is TED-LIUM;
# it consists of TED talks with cleaned automatic transcripts:
#
# http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus
# http://www.openslr.org/resources (Mirror).
#
# The data is distributed under the 'Creative Commons BY-NC-ND 3.0' license,
# which allows free non-commercial use and requires only a citation.
#
# Copyright 2014 Nickolay V. Shmyrev
# 2014 Brno University of Technology (Author: Karel Vesely)
# 2016 Vincent Nguyen
# 2016 Johns Hopkins University (Author: Daniel Povey)
#
# Apache 2.0
#
. ./cmd.sh
. ./path.sh
set -e -o pipefail -u
nj=35
decode_nj=30 # note: should not be >38, which is the number of speakers in the dev set
# after applying --seconds-per-spk-max 180. We decode with 4 threads, so
# this will be too many jobs if you're using run.pl.
stage=0
. utils/parse_options.sh # accept options
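# For example, to restart from the feature-extraction stage with fewer decoding
# jobs, one would typically invoke this script as shown below (a usage sketch;
# adjust the values to your setup):
#
#   ./run.sh --stage 6 --decode-nj 20
#
# (parse_options.sh maps the --decode-nj option onto the decode_nj variable above.)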
# Data preparation
if [ $stage -le 0 ]; then
local/download_data.sh
fi
if [ $stage -le 1 ]; then
local/prepare_data.sh
# Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and
# lets us use more jobs for decoding etc.
# [we chose 3 minutes because that gives us 38 speakers for the dev data, which is
# more than our normal 30 jobs.]
for dset in dev test train; do
utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset}
done
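  # Optionally verify the resulting speaker counts (spk2utt has one line per
  # speaker); e.g. dev should now have the 38 "speakers" mentioned above.
  # Uncomment to check:
  # for dset in dev test train; do wc -l data/${dset}/spk2utt; done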
fi
if [ $stage -le 2 ]; then
local/prepare_dict.sh
fi
if [ $stage -le 3 ]; then
utils/prepare_lang.sh data/local/dict_nosp \
"<unk>" data/local/lang_nosp data/lang_nosp
fi
if [ $stage -le 4 ]; then
# Download the pre-built LMs from kaldi-asr.org instead of building them
# locally.
local/ted_download_lm.sh
# Uncomment this script to build the language models instead of
# downloading them from kaldi-asr.org.
# local/ted_train_lm.sh
fi
if [ $stage -le 5 ]; then
local/format_lms.sh
fi
# Feature extraction
if [ $stage -le 6 ]; then
for set in test dev train; do
dir=data/$set
steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir
steps/compute_cmvn_stats.sh $dir
done
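  # Optional checks on the extracted features (a sketch; uncomment to run).
  # With the default MFCC config the features should be 13-dimensional.
  # feat-to-dim scp:data/train/feats.scp -
  # utils/validate_data_dir.sh data/train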
fi
# Now we have 212 hours of training data.
# We'll create a subset with 10k short segments to make flat-start training easier:
if [ $stage -le 7 ]; then
utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort
utils/data/remove_dup_utts.sh 10 data/train_10kshort data/train_10kshort_nodup
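  # Optional: confirm the subset sizes (utt2spk has one line per utterance);
  # uncomment to check:
  # wc -l data/train_10kshort/utt2spk data/train_10kshort_nodup/utt2spk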
fi
# Train
if [ $stage -le 8 ]; then
steps/train_mono.sh --nj 20 --cmd "$train_cmd" \
data/train_10kshort_nodup data/lang_nosp exp/mono
fi
if [ $stage -le 9 ]; then
steps/align_si.sh --nj $nj --cmd "$train_cmd" \
data/train data/lang_nosp exp/mono exp/mono_ali
steps/train_deltas.sh --cmd "$train_cmd" \
2500 30000 data/train data/lang_nosp exp/mono_ali exp/tri1
fi
if [ $stage -le 10 ]; then
utils/mkgraph.sh data/lang_nosp exp/tri1 exp/tri1/graph_nosp
  # The slowest part of this decoding is the scoring, which we can't really
  # control since the bottleneck is the NIST scoring tools.
for dset in dev test; do
steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
exp/tri1/graph_nosp data/${dset} exp/tri1/decode_nosp_${dset}
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \
data/${dset} exp/tri1/decode_nosp_${dset} exp/tri1/decode_nosp_${dset}_rescore
done
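  # To summarize the scoring output of the decodes above, the usual Kaldi idiom
  # is something like the following (the wer_* files appear in each decode
  # directory once scoring has finished); uncomment to use:
  # for d in exp/tri1/decode_nosp_*; do grep WER $d/wer_* | utils/best_wer.sh; done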
fi
if [ $stage -le 11 ]; then
steps/align_si.sh --nj $nj --cmd "$train_cmd" \
data/train data/lang_nosp exp/tri1 exp/tri1_ali
steps/train_lda_mllt.sh --cmd "$train_cmd" \
4000 50000 data/train data/lang_nosp exp/tri1_ali exp/tri2
fi
if [ $stage -le 12 ]; then
utils/mkgraph.sh data/lang_nosp exp/tri2 exp/tri2/graph_nosp
for dset in dev test; do
steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
exp/tri2/graph_nosp data/${dset} exp/tri2/decode_nosp_${dset}
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \
data/${dset} exp/tri2/decode_nosp_${dset} exp/tri2/decode_nosp_${dset}_rescore
done
fi
if [ $stage -le 13 ]; then
steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp exp/tri2
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \
exp/tri2/sil_counts_nowb.txt \
exp/tri2/pron_bigram_counts_nowb.txt data/local/dict
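  # The pronunciation and silence counts written above can be inspected
  # directly, e.g. (assuming the count is in the first column; uncomment to check):
  # sort -nr exp/tri2/pron_counts_nowb.txt | head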
fi
if [ $stage -le 14 ]; then
utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
cp -rT data/lang data/lang_rescore
cp data/lang_nosp/G.fst data/lang/
cp data/lang_nosp_rescore/G.carpa data/lang_rescore/
utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
for dset in dev test; do
steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
exp/tri2/graph data/${dset} exp/tri2/decode_${dset}
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
data/${dset} exp/tri2/decode_${dset} exp/tri2/decode_${dset}_rescore
done
fi
if [ $stage -le 15 ]; then
steps/align_si.sh --nj $nj --cmd "$train_cmd" \
data/train data/lang exp/tri2 exp/tri2_ali
steps/train_sat.sh --cmd "$train_cmd" \
5000 100000 data/train data/lang exp/tri2_ali exp/tri3
utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph
for dset in dev test; do
steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
exp/tri3/graph data/${dset} exp/tri3/decode_${dset}
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
data/${dset} exp/tri3/decode_${dset} exp/tri3/decode_${dset}_rescore
done
fi
# the following shows you how to insert a phone language model in place of <unk>
# and decode with that.
# local/run_unk_model.sh
if [ $stage -le 16 ]; then
  # This does some data cleaning. It actually degrades the GMM-level results
  # slightly, but the cleaned data should be useful when we add the neural net and
  # chain systems. If not, we'll remove this stage.
local/run_cleanup_segmentation.sh
fi
# TODO: xiaohui-zhang will add lexicon cleanup at some point.
if [ $stage -le 17 ]; then
# This will only work if you have GPUs on your system (and note that it requires
# you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html)
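  # A quick way to confirm that CUDA-capable GPUs are visible before launching
  # this stage (just a sketch; assumes the NVIDIA driver tools are installed):
  # nvidia-smi -L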
local/chain/run_tdnn.sh
fi
# The nnet3 TDNN recipe:
# local/nnet3/run_tdnn.sh
# We removed the GMM+MMI stage that used to exist in the release-1 scripts,
# since the neural-net training is of more interest.
echo "$0: success."
exit 0