#!/bin/bash

# Recipe for Mozilla Common Voice corpus v1
#
# Copyright 2017 Ewald Enzinger
# Apache 2.0

data=$HOME/cv_corpus_v1
data_url=https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz

. ./cmd.sh
. ./path.sh

stage=0

. ./utils/parse_options.sh

set -euo pipefail
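
# Typical usage (a sketch; queue and path setup come from cmd.sh and path.sh,
# and SRILM plus a pre-trained Sequitur G2P model are needed for stage 1):
#   ./run.sh              # run everything, starting with the data download
#   ./run.sh --stage 3    # resume at monophone training (the --stage option
#                         # is handled by utils/parse_options.sh above)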

if [ $stage -le 0 ]; then
  mkdir -p $data
  local/download_and_untar.sh $(dirname $data) $data_url
fi
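
# Note: the archive is downloaded to and unpacked under $(dirname $data), so
# the extracted corpus ends up in $data itself; re-running this stage on an
# already-downloaded corpus is expected to be harmless (the helper script
# should detect completed downloads, as other Kaldi download helpers do).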

if [ $stage -le 1 ]; then
  for part in valid-train valid-dev valid-test; do
    # use underscore-separated names in data directories.
    local/data_prep.pl $data cv-$part data/$(echo $part | tr - _)
  done

  # Prepare ARPA LM and vocabulary using SRILM
  local/prepare_lm.sh data/valid_train
  # Prepare the lexicon and various phone lists
  # Pronunciations for OOV words are obtained using a pre-trained Sequitur model
  local/prepare_dict.sh

  # Prepare data/lang and data/local/lang directories
  utils/prepare_lang.sh data/local/dict \
    '<unk>' data/local/lang data/lang || exit 1
  utils/format_lm.sh data/lang data/local/lm.gz data/local/dict/lexicon.txt data/lang_test/
fi
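
# After this stage, data/lang contains the lexicon FST (L.fst) and phone sets,
# and data/lang_test additionally contains the grammar FST (G.fst) built from
# the ARPA LM; utils/mkgraph.sh composes these into the decoding graphs below.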

if [ $stage -le 2 ]; then
  mfccdir=mfcc
  # spread the mfccs over various machines, as this data-set is quite large.
  if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
    mfcc=$(basename $mfccdir) # in case it was an absolute pathname (unlikely), get the basename.
    utils/create_split_dir.pl /export/b{07,14,16,17}/$USER/kaldi-data/mfcc/commonvoice/s5/$mfcc/storage \
      $mfccdir/storage
  fi

  for part in valid_train valid_dev valid_test; do
    steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$part exp/make_mfcc/$part $mfccdir
    steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir
  done

  # Get the shortest 10000 utterances first because those are more likely
  # to have accurate alignments.
  utils/subset_data_dir.sh --shortest data/valid_train 10000 data/train_10kshort || exit 1;
  utils/subset_data_dir.sh data/valid_train 20000 data/train_20k || exit 1;
fi
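
# train_10kshort (the shortest utterances, which are easiest to align) is used
# to bootstrap the monophone system; train_20k is a 20k-utterance subset used
# for the intermediate triphone passes before switching to the full
# valid_train set at stage 7.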
# train a monophone system
if [ $stage -le 3 ]; then
  steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
    data/train_10kshort data/lang exp/mono || exit 1;

  # decode using the monophone model
  (
    utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph
    for testset in valid_dev; do
      steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph \
        data/$testset exp/mono/decode_$testset
    done
  )&

  steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
    data/train_20k data/lang exp/mono exp/mono_ali_train_20k
fi
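
# --boost-silence scales up the likelihood of the silence phones during
# training and alignment (1.25 is a mild boost), which helps prevent silence
# frames from being absorbed into the speech models.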
# train a first delta + delta-delta triphone system
if [ $stage -le 4 ]; then
  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
    2000 10000 data/train_20k data/lang exp/mono_ali_train_20k exp/tri1

  # decode using the tri1 model
  (
    utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
    for testset in valid_dev; do
      steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph \
        data/$testset exp/tri1/decode_$testset
    done
  )&

  steps/align_si.sh --nj 10 --cmd "$train_cmd" \
    data/train_20k data/lang exp/tri1 exp/tri1_ali_train_20k
fi
# train an LDA+MLLT system.
if [ $stage -le 5 ]; then
  steps/train_lda_mllt.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
    data/train_20k data/lang exp/tri1_ali_train_20k exp/tri2b

  # decode using the LDA+MLLT model
  utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph
  (
    for testset in valid_dev; do
      steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph \
        data/$testset exp/tri2b/decode_$testset
    done
  )&

  # Align utts using the tri2b model
  steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
    data/train_20k data/lang exp/tri2b exp/tri2b_ali_train_20k
fi
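
# The splice options above stack 3 frames of left and right context (7 frames
# in total) before the LDA projection; 2500 and 15000 are the target numbers
# of tree leaves and Gaussians passed to steps/train_lda_mllt.sh.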
# Train tri3b, which is LDA+MLLT+SAT
if [ $stage -le 6 ]; then
  steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
    data/train_20k data/lang exp/tri2b_ali_train_20k exp/tri3b

  # decode using the tri3b model
  (
    utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph
    for testset in valid_dev; do
      steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
        exp/tri3b/graph data/$testset exp/tri3b/decode_$testset
    done
  )&
fi
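
# Note: steps/decode_fmllr.sh does speaker-adapted decoding: it first decodes
# with a speaker-independent pass, estimates per-speaker fMLLR transforms from
# that output, and then re-decodes with the adapted features.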
if [ $stage -le 7 ]; then
  # Align utts in the full training set using the tri3b model
  steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
    data/valid_train data/lang \
    exp/tri3b exp/tri3b_ali_valid_train

  # train another LDA+MLLT+SAT system on the entire training set
  steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
    data/valid_train data/lang \
    exp/tri3b_ali_valid_train exp/tri4b

  # decode using the tri4b model
  (
    utils/mkgraph.sh data/lang_test exp/tri4b exp/tri4b/graph
    for testset in valid_dev; do
      steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
        exp/tri4b/graph data/$testset \
        exp/tri4b/decode_$testset
    done
  )&
fi
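
# Only valid_dev is decoded in the loops above; to also score the held-out
# test set, add valid_test to the "for testset in ..." lists (it was prepared
# in stage 1 and has features from stage 2).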
# Train a chain model
if [ $stage -le 8 ]; then
  local/chain/run_tdnn.sh --stage 0
fi
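
# local/chain/run_tdnn.sh builds the final "chain" (lattice-free MMI) TDNN
# acoustic model on top of the GMM systems above; it typically handles its own
# data augmentation, i-vector extraction and lattice/alignment generation; see
# that script for the exact steps, which are recipe-specific.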
# Don't finish until all background decoding jobs are finished.
wait
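
# Once everything (including the background decodes) has finished, a quick way
# to compare systems is the usual Kaldi one-liner, e.g.:
#   for x in exp/*/decode_valid_dev*; do
#     [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh
#   done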