#!/bin/bash
# Copyright 2015 Tokyo Institute of Technology
# (Authors: Takafumi Moriya, Tomohiro Tanaka and Takahiro Shinozaki)
# 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0
# Acknowledgement: This work was supported by JSPS KAKENHI Grant Number 26280055.
# This recipe is based on the Switchboard corpus recipe, by Arnab Ghoshal,
# in the egs/swbd/s5c/ directory.
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
# Caution: some of the graph creation steps use quite a bit of memory, so you
# should run this on a machine that has sufficient memory.
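# For example, if you run jobs on a grid via queue.pl, one way to give the
# memory-hungry steps more headroom is to raise the memory request in cmd.sh
# (hypothetical values; adjust to your cluster):
# export train_cmd="queue.pl --mem 8G"
# export decode_cmd="queue.pl --mem 8G"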
. ./cmd.sh
. ./path.sh
set -e # exit on error
#: << '#SKIP'
use_dev=false # Use the first 4k sentences from training data as dev set. (39 speakers.)
CSJDATATOP=/export/corpora5/CSJ/USB
#CSJDATATOP=/db/laputa1/data/processed/public/CSJ ## CSJ database top directory.
CSJVER=usb ## Set your CSJ format (dvd or usb).
## Usage :
## Case DVD : We assume CSJ DVDs are copied in this directory with the names dvd1, dvd2,...,dvd17.
## Necessary directories are dvd3 - dvd17.
## e.g. $ ls $CSJDATATOP(DVD) => 00README.txt dvd1 dvd2 ... dvd17
##
## Case USB : Necessary directories are MORPH/SDB and WAV
## e.g. $ ls $CSJDATATOP(USB) => 00README.txt DOC MORPH ... WAV fileList.csv
## Case merl : MERL setup. Necessary directories are WAV and sdb
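## For example, for a USB copy of the corpus you might keep the defaults above,
## while for a DVD copy you would set something like (hypothetical path; adjust
## to wherever your copy lives):
# CSJDATATOP=/path/to/your/CSJ/DVD
# CSJVER=dvd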
if [ ! -e data/csj-data/.done_make_all ]; then
  echo "CSJ transcription file does not exist"
  #local/csj_make_trans/csj_autorun.sh <RESOURCE_DIR> <MAKING_PLACE(no change)> || exit 1;
  local/csj_make_trans/csj_autorun.sh $CSJDATATOP data/csj-data $CSJVER
fi
wait
[ ! -e data/csj-data/.done_make_all ] \
  && echo "Not finished processing CSJ data" && exit 1;
# Prepare Corpus of Spontaneous Japanese (CSJ) data.
# Convert the CSJ data to Kaldi format, based on the Switchboard recipe.
# local/csj_data_prep.sh <SPEECH_and_TRANSCRIPTION_DATA_DIRECTORY> [ <mode_number> ]
# mode_number can be 0, 1, 2, or 3:
#   0 = default, using "Academic lecture" and "other" data
#   1 = using "Academic lecture" data only
#   2 = using all data except "dialog" data
#   3 = using all data
local/csj_data_prep.sh data/csj-data
# local/csj_data_prep.sh data/csj-data 1
# local/csj_data_prep.sh data/csj-data 2
# local/csj_data_prep.sh data/csj-data 3
local/csj_prepare_dict.sh
utils/prepare_lang.sh --num-sil-states 4 data/local/dict_nosp "<unk>" data/local/lang_nosp data/lang_nosp
# Now train the language models.
local/csj_train_lms.sh data/local/train/text data/local/dict_nosp/lexicon.txt data/local/lm
# We don't really need all these options for SRILM, since the LM training script
# does some of the same processing (e.g. -subset -tolower)
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
LM=data/local/lm/csj.o3g.kn.gz
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
data/lang_nosp $LM data/local/dict_nosp/lexicon.txt data/lang_nosp_csj_tg
# Data preparation and formatting for the evaluation sets.
# CSJ has 3 evaluation sets (eval1, eval2, eval3).
#local/csj_eval_data_prep.sh <SPEECH_and_TRANSCRIPTION_DATA_DIRECTORY_ABOUT_EVALUATION_DATA> <EVAL_NUM>
for eval_num in eval1 eval2 eval3 ; do
  local/csj_eval_data_prep.sh data/csj-data/eval $eval_num
done
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
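# For example, to keep the features on a larger disk you could instead set
# (hypothetical path; adjust to a disk with enough free space):
# mfccdir=/export/data/$USER/csj_mfcc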
for x in train eval1 eval2 eval3; do
  steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \
    data/$x exp/make_mfcc/$x $mfccdir
  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
  utils/fix_data_dir.sh data/$x
done
echo "Finish creating MFCCs"
#SKIP
##### Training and Decoding steps start from here #####
# Use the first 4k sentences as dev set. Note: when we trained the LM, we used
# the 1st 10k sentences as dev set, so the 1st 4k won't have been used in the
# LM training data. However, they will be in the lexicon, plus speakers
# may overlap, so it's still not quite equivalent to a test set.
if $use_dev ; then
  dev_set=train_dev
  utils/subset_data_dir.sh --first data/train 4000 data/$dev_set # 6hr 31min
  n=$(( $(wc -l < data/train/segments) - 4000 ))
  utils/subset_data_dir.sh --last data/train $n data/train_nodev
else
  cp -r data/train data/train_nodev
fi
# Calculate the total duration of the utterance segmentations.
# perl -ne '@A=split; $s+=($A[3]-$A[2]); END{$h=int($s/3600); $r=($s-$h*3600); $m=int($r/60); $r-=$m*60; printf "%.1f sec -- %d:%d:%.1f\n", $s, $h, $m, $r;}' data/train/segments
# Now-- there are 162k utterances (240hr 8min), and we want to start the
# monophone training on relatively short utterances (easier to align), but want
# to exclude the shortest ones.
# Therefore, we first take the 100k shortest ones, and then take 30k random
# utterances from those (about 8hr 9min).
utils/subset_data_dir.sh --shortest data/train_nodev 100000 data/train_100kshort
utils/subset_data_dir.sh data/train_100kshort 30000 data/train_30kshort
# Take the first 100k utterances (about 60% of the data); we'll use
# this for later stages of training.
utils/subset_data_dir.sh --first data/train_nodev 100000 data/train_100k
utils/data/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup # 147hr 6min
# Finally, the full training set:
utils/data/remove_dup_utts.sh 300 data/train_nodev data/train_nodup # 233hr 36min
## Starting basic training on MFCC features
steps/train_mono.sh --nj 50 --cmd "$train_cmd" \
data/train_30kshort data/lang_nosp exp/mono
steps/align_si.sh --nj 50 --cmd "$train_cmd" \
data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali
steps/train_deltas.sh --cmd "$train_cmd" \
3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1
graph_dir=exp/tri1/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri1 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode_si.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri1/decode_${eval_num}_csj
done
steps/align_si.sh --nj 50 --cmd "$train_cmd" \
data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali
steps/train_deltas.sh --cmd "$train_cmd" \
4000 70000 data/train_100k_nodup data/lang_nosp exp/tri1_ali exp/tri2
# The previous mkgraph might still be writing to this file. If it is no longer
# running, you can remove this wait loop and the mkgraph below will create the
# file itself.
while [ ! -s data/lang_nosp_csj_tg/tmp/CLG_3_1.fst ]; do sleep 60; done
sleep 20; # in case still writing.
graph_dir=exp/tri2/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri2 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri2/decode_${eval_num}_csj
done
# From now on, we build the LDA+MLLT system.
steps/align_si.sh --nj 50 --cmd "$train_cmd" \
data/train_100k_nodup data/lang_nosp exp/tri2 exp/tri2_ali_100k_nodup
# From now, we start using all of the data (except some duplicates of common
# utterances, which don't really contribute much).
steps/align_si.sh --nj 50 --cmd "$train_cmd" \
data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup
# Train the LDA+MLLT system (tri3) on all of this data.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3
graph_dir=exp/tri3/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri3 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri3/decode_${eval_num}_csj_nosp
done
# Now we compute the pronunciation and silence probabilities from training data,
# and re-create the lang directory.
steps/get_prons.sh --cmd "$train_cmd" data/train_nodup data/lang_nosp exp/tri3
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_nosp exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \
exp/tri3/pron_bigram_counts_nowb.txt data/local/dict
utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
LM=data/local/lm/csj.o3g.kn.gz
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
data/lang $LM data/local/dict/lexicon.txt data/lang_csj_tg
graph_dir=exp/tri3/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh data/lang_csj_tg exp/tri3 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri3/decode_${eval_num}_csj
done
# Train tri4, which is LDA+MLLT+SAT, on all the (nodup) data.
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup
steps/train_sat.sh --cmd "$train_cmd" \
11500 200000 data/train_nodup data/lang exp/tri3_ali_nodup exp/tri4
graph_dir=exp/tri4/graph_csj_tg
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh data/lang_csj_tg exp/tri4 $graph_dir
for eval_num in eval1 eval2 eval3 $dev_set ; do
  steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
    $graph_dir data/$eval_num exp/tri4/decode_${eval_num}_csj
done
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri4 exp/tri4_ali_nodup || exit 1
# You can execute a DNN training script [e.g. local/chain/run_dnn.sh] from here.
# MMI training
# local/run_mmi.sh
# This will help find issues with the lexicon.
# steps/cleanup/debug_lexicon.sh --nj 300 --cmd "$train_cmd" data/train_nodev data/lang exp/tri4 data/local/dict/lexicon.txt exp/debug_lexicon
# SGMM system
# local/run_sgmm2.sh
#SKIP
##### Start DNN training #####
# Karel's DNN recipe on top of fMLLR features
# local/nnet/run_dnn.sh
# nnet3 TDNN+Chain
local/chain/run_tdnn.sh
# nnet3 TDNN recipe
# local/nnet3/run_tdnn.sh
##### Start RNN-LM training for rescoring #####
# local/csj_run_rnnlm.sh
# getting results (see RESULTS file)
# for eval_num in eval1 eval2 eval3 $dev_set ; do
#   echo "=== evaluation set $eval_num ===" ;
#   for x in exp/{tri,dnn}*/decode_${eval_num}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done ;
# done