#!/bin/bash
# Copyright 2017  Chun Chieh Chang
#           2017  Ashish Arora
#           2017  Hossein Hadian
set -e
stage=0
nj=20
decode_gmm=false
username=   # username and password for downloading the IAM database,
password=   # if it is not already available locally (see local/prepare_data.sh).
# iam_database points to the database path on the JHU grid. If you have not
# already downloaded the database you can set it to a local directory
# like "data/download" and follow the instructions
# in "local/prepare_data.sh" to download the database:
iam_database=/export/corpora5/handwriting_ocr/IAM
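# For example, a hypothetical invocation for a local download (the
# username/password are the credentials registered on the IAM download page):
#   ./run.sh --iam-database data/download --username myuser --password mypass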
# wellington_database points to the database path on the JHU grid. The Wellington
# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus).
# This corpus is of written NZ English that can be purchased here:
# "https://www.victoria.ac.nz/lals/resources/corpora-default"
wellington_database=/export/corpora5/Wellington/WWC/
train_set=train_aug
process_aachen_split=false
overwrite=false
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
. ./utils/parse_options.sh # e.g. this parses the above options
# if supplied.
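# For example, a hypothetical invocation that skips data preparation and
# uses 30 parallel jobs:
#   ./run.sh --stage 1 --nj 30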
./local/check_tools.sh

if [ $stage -le 0 ]; then
  if [ -f data/train/text ] && ! $overwrite; then
    echo "$0: Not processing since data/train/text already exists; the script"
    echo "was probably started from the wrong stage. Exiting with status 1"
    echo "to avoid data corruption."
    exit 1;
  fi
  echo "$0: Preparing data..."
  local/prepare_data.sh --download-dir "$iam_database" \
    --wellington-dir "$wellington_database" \
    --username "$username" --password "$password" \
    --process-aachen-split $process_aachen_split
fi

mkdir -p data/{train,test,val}/data

if [ $stage -le 1 ]; then
  echo "$0: $(date) stage 1: getting allowed image widths for e2e training..."
  # This computes the number of feature frames for each image; the result is
  # needed by the next command.
  image/get_image2num_frames.py --feat-dim 40 data/train
  # The next command creates an "allowed_lengths.txt" file in data/train,
  # which local/make_features.py uses to force the images to one of the
  # allowed lengths. Consecutive allowed lengths differ by about 10%.
  image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train
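  # As a rough sanity check (assuming allowed_lengths.txt contains one integer
  # length per line), the ratio of consecutive allowed lengths should be close
  # to 1.10:
  #   awk 'NR>1{printf("%.2f\n", $1/prev)} {prev=$1}' data/train/allowed_lengths.txt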
echo "$0: $(date) Extracting features, creating feats.scp file"
local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train
steps/compute_cmvn_stats.sh data/train || exit 1;
for set in val test; do
local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \
--feat-dim 40 data/${set}
steps/compute_cmvn_stats.sh data/${set} || exit 1;
done
utils/fix_data_dir.sh data/train
fi

if [ $stage -le 2 ]; then
  for set in train; do
    echo "$0: $(date) stage 2: Performing augmentation; this will double the training data"
    local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data
    steps/compute_cmvn_stats.sh data/${set}_aug || exit 1;
  done
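  # A quick check that augmentation doubled the data (assuming one utterance
  # per line in utt2spk):
  #   echo "train: $(wc -l < data/train/utt2spk) train_aug: $(wc -l < data/train_aug/utt2spk)"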
fi

if [ $stage -le 3 ]; then
  echo "$0: Estimating a language model for decoding..."
  # We run this stage before dictionary preparation because prepare_dict.sh
  # generates the lexicon from pocolm's word list.
  local/train_lm.sh --vocab-size 50k
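  # This is expected to produce the 3-gram ARPA LM at
  # data/local/local_lm/data/arpa/3gram_big.arpa.gz, which stage 4 below
  # passes to utils/format_lm.sh.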
fi

if [ $stage -le 4 ]; then
  echo "$0: Preparing dictionary and lang..."
  # This lang directory is for training. Use a large vocab size (500k here)
  # so that all of the training words are included:
  local/prepare_dict.sh --vocab-size 500k --dir data/local/dict
  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
    data/local/dict "<unk>" data/lang/temp data/lang
  silphonelist=$(cat data/lang/phones/silence.csl)
  nonsilphonelist=$(cat data/lang/phones/nonsilence.csl)
  local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo

  # This lang directory is for decoding. We use a 50k vocabulary to be
  # consistent with the papers reporting WERs on IAM:
  local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k
  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
    data/local/dict_50k "<unk>" data/lang_test/temp data/lang_test
  utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \
    data/local/dict_50k/lexicon.txt data/lang_test
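  # Optional sanity check (a sketch, assuming the usual Kaldi text/lexicon
  # formats): count the val-set words missing from the 50k lexicon:
  #   cut -d' ' -f2- data/val/text | tr ' ' '\n' | sort -u | \
  #     comm -23 - <(cut -d' ' -f1 data/local/dict_50k/lexicon.txt | sort -u) | wc -l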
echo "$0: Preparing the unk model for open-vocab decoding..."
utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \
data/local/dict_50k exp/unk_lang_model
utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
--unk-fst exp/unk_lang_model/unk_fst.txt \
data/local/dict_50k "<unk>" data/lang_unk/temp data/lang_unk
silphonelist=`cat data/lang/phones/silence.csl`
nonsilphonelist=`cat data/lang/phones/nonsilence.csl`
local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo
cp data/lang_test/G.fst data/lang_unk/G.fst
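  # data/lang_unk reuses the 50k G.fst from data/lang_test; out-of-vocabulary
  # words are then handled at decode time by the <unk> sub-FST built above.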
fi

if [ $stage -le 5 ]; then
  steps/train_mono.sh --nj $nj --cmd "$cmd" --totgauss 10000 data/$train_set \
    data/lang exp/mono
fi

if [ $stage -le 6 ] && $decode_gmm; then
  utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
  steps/decode.sh --nj $nj --cmd "$cmd" exp/mono/graph data/test \
    exp/mono/decode_test
fi

if [ $stage -le 7 ]; then
  steps/align_si.sh --nj $nj --cmd "$cmd" data/$train_set data/lang \
    exp/mono exp/mono_ali
  steps/train_deltas.sh --cmd "$cmd" 500 20000 data/$train_set data/lang \
    exp/mono_ali exp/tri
fi

if [ $stage -le 8 ] && $decode_gmm; then
  utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph
  steps/decode.sh --nj $nj --cmd "$cmd" exp/tri/graph data/test \
    exp/tri/decode_test
fi

if [ $stage -le 9 ]; then
  steps/align_si.sh --nj $nj --cmd "$cmd" data/$train_set data/lang \
    exp/tri exp/tri_ali
  steps/train_lda_mllt.sh --cmd "$cmd" \
    --splice-opts "--left-context=3 --right-context=3" 500 20000 \
    data/$train_set data/lang exp/tri_ali exp/tri2
fi

if [ $stage -le 10 ] && $decode_gmm; then
  utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
  steps/decode.sh --nj $nj --cmd "$cmd" exp/tri2/graph \
    data/test exp/tri2/decode_test
fi

if [ $stage -le 11 ]; then
  steps/align_fmllr.sh --nj $nj --cmd "$cmd" --use-graphs true \
    data/$train_set data/lang exp/tri2 exp/tri2_ali
  steps/train_sat.sh --cmd "$cmd" 500 20000 \
    data/$train_set data/lang exp/tri2_ali exp/tri3
fi

if [ $stage -le 12 ] && $decode_gmm; then
  utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph
  steps/decode_fmllr.sh --nj $nj --cmd "$cmd" exp/tri3/graph \
    data/test exp/tri3/decode_test
fi

if [ $stage -le 13 ]; then
  steps/align_fmllr.sh --nj $nj --cmd "$cmd" --use-graphs true \
    data/$train_set data/lang exp/tri3 exp/tri3_ali
fi

if [ $stage -le 14 ]; then
  local/chain/run_cnn.sh --lang-test lang_unk --train-set $train_set
fi

if [ $stage -le 15 ]; then
  local/chain/run_cnn_chainali.sh --chain-model-dir exp/chain/cnn_1a --stage 2 \
    --train-set $train_set
fi