run.sh
5.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/bin/bash
# Copyright 2017 Chun Chieh Chang
# 2017 Ashish Arora
# 2017 Hossein Hadian
# Top-level recipe: data prep, BPE/dict/lang prep, n-gram LM, GMM-HMM
# bootstrap training (mono -> deltas -> LDA+MLLT -> fMLLR alignment),
# then chain (CNN) model training.  Re-run with --stage N to resume.
set -e
stage=0            # first stage to run; earlier stages are skipped
nj=70              # number of parallel jobs for extraction/training/decoding
decode_gmm=false   # if true, also build graphs and decode with each GMM system
# download_dir{1,2,3} points to the database path on the JHU grid. If you have not
# already downloaded the database you can set it to a local directory
# This corpus can be purchased here:
# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/}
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
data_splits_dir=data/download/data_splits
images_scp_dir=data/local
overwrite=false             # set true to allow stage 0 to clobber existing data/
subset=false                # passed through to extraction/processing scripts
augment=false               # passed through to extraction/processing scripts
use_extra_corpus_text=true
# NOTE: option defaults above must be set BEFORE parse_options.sh is sourced,
# since it overrides them from the command line.
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
. ./utils/parse_options.sh # e.g. this parses the above options
# if supplied.
./local/check_tools.sh
mkdir -p data/{train,test,dev}/data
mkdir -p data/local/{train,test,dev}
if [ $stage -le 0 ]; then
  # Guard: refuse to re-run data preparation over existing data unless
  # --overwrite true was given.
  if [ -f data/train/text ] && ! $overwrite; then
    echo "$0: Not processing; data/train/text exists (script was probably run from the wrong stage)"
    echo "Exiting with status 1 to avoid data corruption"
    exit 1;
  fi
  # Prepare the per-set line-id splits and (optionally) extra corpus text.
  local/prepare_data.sh --data_splits "$data_splits_dir" --download_dir1 "$download_dir1" \
    --download_dir2 "$download_dir2" --download_dir3 "$download_dir3" \
    --use_extra_corpus_text "$use_extra_corpus_text"
  # Extract line images for each split.  $cmd may contain spaces
  # (e.g. "queue.pl --mem 2G"), so it must be quoted.
  for set in test train dev; do
    data_split_file=$data_splits_dir/madcat.$set.raw.lineid
    local/extract_lines.sh --nj "$nj" --cmd "$cmd" --data_split_file "$data_split_file" \
      --download_dir1 "$download_dir1" --download_dir2 "$download_dir2" \
      --download_dir3 "$download_dir3" --writing_condition1 "$writing_condition1" \
      --writing_condition2 "$writing_condition2" --writing_condition3 "$writing_condition3" \
      --data data/local/$set --subset "$subset" --augment "$augment" || exit 1
  done
  # Build Kaldi data dirs (text, utt2spk, images.scp) and fix/sort them.
  echo "$0: Processing data..."
  for set in dev train test; do
    local/process_data.py "$download_dir1" "$download_dir2" "$download_dir3" \
      $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \
      "$writing_condition1" "$writing_condition2" "$writing_condition3" \
      --augment "$augment" --subset "$subset"
    image/fix_data_dir.sh data/${set}
  done
fi
if [ $stage -le 1 ]; then
  # Extract 40-dim image features and per-speaker CMVN stats for the sets
  # used in training/decoding.  Quote "$cmd"/"$nj": $cmd may contain spaces.
  for dataset in test train; do
    local/extract_features.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 data/$dataset
    steps/compute_cmvn_stats.sh data/$dataset || exit 1;
  done
  utils/fix_data_dir.sh data/train
fi
if [ $stage -le 2 ]; then
  # Learn a 700-merge BPE inventory on the training transcripts, then
  # re-encode train/dev/test transcripts with it.  Transcripts are piped
  # through reverse.py and prepend_words.py before BPE is learned/applied
  # (NOTE(review): presumably to handle right-to-left text and mark word
  # boundaries — confirm against utils/lang/bpe docs).
  echo "$0: Preparing BPE..."
  cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \
    utils/lang/bpe/prepend_words.py | \
    utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt
  for set in test train dev; do
    cut -d' ' -f1 data/$set/text > data/$set/ids
    cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \
      utils/lang/bpe/prepend_words.py | \
      utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
      | sed 's/@@//g' > data/$set/bpe_text
    # Re-attach utterance ids to the BPE-encoded transcripts; keep the
    # original text as text.old.
    mv data/$set/text data/$set/text.old
    paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
    rm -f data/$set/bpe_text data/$set/ids
  done
  # (message fixed: was "$0:Preparing", inconsistent with other log lines)
  echo "$0: Preparing dictionary and lang..."
  local/prepare_dict.sh
  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
    data/local/dict "<sil>" data/lang/temp data/lang
  utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
fi
if [ $stage -le 3 ]; then
  # Train the n-gram language model, then build the decoding and
  # rescoring lang directories from it.
  echo "$0: Estimating a language model for decoding..."
  local/train_lm.sh
  # Pruned 6-gram: compiled into data/lang for first-pass decoding.
  utils/format_lm.sh data/lang \
    data/local/local_lm/data/arpa/6gram_small.arpa.gz \
    data/local/dict/lexicon.txt data/lang
  # Unpruned 6-gram: const-arpa form for lattice rescoring.
  utils/build_const_arpa_lm.sh \
    data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
    data/lang data/lang_rescore_6g
fi
if [ $stage -le 4 ]; then
  # Flat-start monophone training (10k Gaussians).  Quote "$cmd"/"$nj":
  # $cmd may contain spaces (e.g. "queue.pl --mem 2G").
  steps/train_mono.sh --nj "$nj" --cmd "$cmd" --totgauss 10000 data/train \
    data/lang exp/mono
fi
if [ $stage -le 5 ] && $decode_gmm; then
  # Optional: build the monophone graph and decode the test set.
  utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph
  steps/decode.sh --nj "$nj" --cmd "$cmd" exp/mono/graph data/test \
    exp/mono/decode_test
fi
if [ $stage -le 6 ]; then
  # Align with the monophone model, then train a delta-feature triphone
  # system (500 leaves / 20k Gaussians).
  steps/align_si.sh --nj "$nj" --cmd "$cmd" data/train data/lang \
    exp/mono exp/mono_ali
  steps/train_deltas.sh --cmd "$cmd" 500 20000 data/train data/lang \
    exp/mono_ali exp/tri
fi
if [ $stage -le 7 ] && $decode_gmm; then
  # Optional: decode the test set with the delta triphone system.
  utils/mkgraph.sh data/lang exp/tri exp/tri/graph
  steps/decode.sh --nj "$nj" --cmd "$cmd" exp/tri/graph data/test \
    exp/tri/decode_test
fi
if [ $stage -le 8 ]; then
  # Align with exp/tri, then train an LDA+MLLT system over +/-3 frames of
  # spliced context (500 leaves / 20k Gaussians).
  steps/align_si.sh --nj "$nj" --cmd "$cmd" data/train data/lang \
    exp/tri exp/tri_ali
  steps/train_lda_mllt.sh --cmd "$cmd" \
    --splice-opts "--left-context=3 --right-context=3" 500 20000 \
    data/train data/lang exp/tri_ali exp/tri3
fi
if [ $stage -le 9 ] && $decode_gmm; then
  # Optional: decode the test set with the LDA+MLLT system.
  utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph
  steps/decode.sh --nj "$nj" --cmd "$cmd" exp/tri3/graph \
    data/test exp/tri3/decode_test
fi
if [ $stage -le 10 ]; then
  # Produce fMLLR alignments from exp/tri3; these feed the chain models in
  # the next stages.
  steps/align_fmllr.sh --nj "$nj" --cmd "$cmd" --use-graphs true \
    data/train data/lang exp/tri3 exp/tri3_ali
fi
if [ $stage -le 11 ]; then
  # Train the first chain CNN model (recipe internals in local/chain/).
local/chain/run_cnn.sh
fi
if [ $stage -le 12 ]; then
  # Train a second chain model using chain-model alignments, resuming that
  # sub-recipe at its own stage 2.
local/chain/run_cnn_chainali.sh --stage 2
fi