#!/bin/bash
# Copyright 2017 Chun Chieh Chang
# 2017 Ashish Arora
# 2017 Hossein Hadian
# Apache 2.0
# This script downloads the IAM handwriting database and prepares the training
# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py.
# It also downloads the LOB and Brown text corpora. It downloads the database files
# only if they do not already exist in download directory.
# Eg. local/prepare_data.sh
# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from
# utt2spk file: 000_a01-000u-00 000
# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png
# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03
# Command-line options (overridable via utils/parse_options.sh,
# e.g. local/prepare_data.sh --download-dir foo --username me --password secret).
stage=0                    # NOTE(review): not referenced below — presumably kept for CLI compatibility with sibling scripts; confirm
download_dir=data/download # cache directory for the downloaded archives
process_aachen_split=false # if true, use the Aachen train/test/val split instead of the standard IAM split
wellington_dir=            # local path to the Wellington corpus (not downloadable); empty disables it
username=
password= # username and password for downloading the IAM database
# if you have not already downloaded the database, please
# register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database
# and provide this script with your username and password.
# Load the cluster/command configuration, the PATH setup, and the standard
# Kaldi option parser (which consumes the --name value flags declared above).
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

# Fail early: without either a cached lines.tgz or IAM credentials the
# downloads below cannot succeed (the IAM site requires registration).
if [[ ! -f "$download_dir/lines.tgz" && -z "$username" ]]; then
  cat <<EOF
$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files
exist in your data/local directory this script will fail because the required files
can't be downloaded automatically (it needs registration).
Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database
... and then call this script again with --username <username> --password <password>

EOF
  exit 1
fi
# Destination directories for the extracted data (all under data/local).
lines=data/local/lines
xml=data/local/xml
ascii=data/local/ascii
bcorpus=data/local/browncorpus
lobcorpus=data/local/lobcorpus
wcorpus=data/local/wellingtoncorpus
data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask
# Source URLs. The IAM ones (lines/xml/ascii/split info) require the
# registered username/password; the text-corpus URLs are public.
lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz
xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz
data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip
ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz
brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
wellington_corpus_loc=/export/corpora5/Wellington/WWC/ # NOTE(review): appears unused below (wellington_dir is used instead) — confirm
aachen_split_url=http://www.openslr.org/resources/56/splits.zip
aachen_splits=data/local/aachensplits
mkdir -p $download_dir data/local
# download and extact images and transcription
# Download and extract the IAM line images, skipping the download when the
# archive is already cached. All expansions are quoted (SC2086): download_dir
# is user-settable, so an unquoted value containing spaces would word-split.
if [ -d "$lines" ]; then
  echo "$0: Not downloading lines images as it is already there."
else
  if [ ! -f "$download_dir/lines.tgz" ]; then
    echo "$0: Trying to download lines images..."
    wget -P "$download_dir" --user "$username" --password "$password" "$lines_url" || exit 1;
  fi
  mkdir -p "$lines"
  tar -xzf "$download_dir/lines.tgz" -C "$lines" || exit 1;
  echo "$0: Done downloading and extracting lines images"
fi
# Download and extract the IAM XML transcriptions (same pattern as the line
# images above). Expansions quoted to survive paths with spaces (SC2086).
if [ -d "$xml" ]; then
  echo "$0: Not downloading transcriptions as it is already there."
else
  if [ ! -f "$download_dir/xml.tgz" ]; then
    echo "$0: Trying to download transcriptions..."
    wget -P "$download_dir" --user "$username" --password "$password" "$xml_url" || exit 1;
  fi
  mkdir -p "$xml"
  tar -xzf "$download_dir/xml.tgz" -C "$xml" || exit 1;
  echo "$0: Done downloading and extracting transcriptions."
fi
# Download and unzip the official writer-independent train/test/val split
# lists. Expansions quoted to survive paths with spaces (SC2086).
if [ -d "$data_split_info" ]; then
  echo "$0: Not downloading data split information as it is already there."
else
  if [ ! -f "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" ]; then
    echo "$0: Trying to download training and testing data split information..."
    wget -P "$download_dir" --user "$username" --password "$password" "$data_split_info_url" || exit 1;
  fi
  mkdir -p "$data_split_info"
  unzip "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" -d "$data_split_info" || exit 1;
  echo "$0: Done downloading and extracting training and testing data split information"
fi
# Download and extract the ASCII ground-truth annotations. Expansions quoted
# to survive paths with spaces (SC2086).
if [ -d "$ascii" ]; then
  echo "$0: Not downloading ascii.tgz as it is already there."
else
  if [ ! -f "$download_dir/ascii.tgz" ]; then
    echo "$0: trying to download ascii.tgz..."
    wget -P "$download_dir" --user "$username" --password "$password" "$ascii_url" || exit 1;
  fi
  mkdir -p "$ascii"
  tar -xzf "$download_dir/ascii.tgz" -C "$ascii" || exit 1;
  echo "$0: Done downloading and extracting ascii.tgz"
fi
# Download and unzip the LOB text corpus (public URL, no credentials needed;
# used later for language modeling). Expansions quoted (SC2086).
if [ -d "$lobcorpus" ]; then
  echo "$0: Not downloading the LOB text corpus as it is already there."
else
  if [ ! -f "$lobcorpus/0167.zip" ]; then
    echo "$0: Downloading the LOB text corpus ..."
    mkdir -p "$lobcorpus"
    wget -P "$lobcorpus/" "$lob_corpus_url" || exit 1;
  fi
  unzip "$lobcorpus/0167.zip" -d "$lobcorpus" || exit 1;
  echo "$0: Done downloading and extracting LOB corpus"
fi
# Download the Brown text corpus (public URL; a single plain-text file, so
# there is nothing to extract). Expansions quoted (SC2086).
if [ -d "$bcorpus" ]; then
  echo "$0: Not downloading the Brown corpus as it is already there."
else
  if [ ! -f "$bcorpus/brown.txt" ]; then
    mkdir -p "$bcorpus"
    echo "$0: Downloading the Brown text corpus..."
    wget -P "$bcorpus" "$brown_corpus_url" || exit 1;
  fi
  echo "$0: Done downloading the Brown text corpus"
fi
# Copy the Wellington corpus from a locally provided directory (it cannot be
# downloaded), then strip its markup so it can be used for language modeling.
# Fixes: quoted expansions; '[ ! -z $var ]' replaced by the robust
# '[ -n "$var" ]'; unchecked 'cp' now aborts on failure like every other step;
# 'cat file | script' replaced by a plain input redirect.
if [ -d "$wcorpus" ]; then
  echo "$0: Not copying Wellington corpus as it is already there."
elif [ -n "$wellington_dir" ]; then
  mkdir -p "$wcorpus"
  cp -r "$wellington_dir/." "$wcorpus" || exit 1;
  # Combine Wellington corpora and replace some of their annotations:
  # drop the two leading id columns and any leading whitespace, then remove
  # the corpus' inline annotations with the helper script.
  cat "$wcorpus"/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \
    cut -d' ' -f3- | sed "s/^[ \t]*//" > "$wcorpus/Wellington_annotated.txt"
  local/remove_wellington_annotations.py < "$wcorpus/Wellington_annotated.txt" \
    > "$wcorpus/Wellington_annotation_removed.txt"
  echo "$0: Done copying Wellington corpus"
else
  echo "$0: Wellington Corpus not included because wellington_dir not provided"
fi
# Download and unzip the Aachen train/test/val split lists (public, from
# OpenSLR). Only used when --process-aachen-split true. Quoted (SC2086).
if [ -d "$aachen_splits" ]; then
  echo "$0: Not downloading the Aachen splits as it is already there."
else
  if [ ! -f "$aachen_splits/splits.zip" ]; then
    echo "$0: Downloading Aachen splits ..."
    mkdir -p "$aachen_splits"
    wget -P "$aachen_splits/" "$aachen_split_url" || exit 1;
  fi
  unzip "$aachen_splits/splits.zip" -d "$aachen_splits" || exit 1;
  echo "$0: Done downloading and extracting Aachen splits"
fi
# Build the per-set utterance lists from the official IAM split files:
# train and test are copied as-is; the two validation sets are merged into
# one. Expansions quoted (SC2086) even though the values are currently
# space-free, to match the rest of the script.
mkdir -p data/{train,test,val}
file_name=largeWriterIndependentTextLineRecognitionTask
train_old="data/local/$file_name/trainset.txt"
test_old="data/local/$file_name/testset.txt"
val1_old="data/local/$file_name/validationset1.txt"
val2_old="data/local/$file_name/validationset2.txt"
train_new="data/local/train.uttlist"
test_new="data/local/test.uttlist"
val_new="data/local/validation.uttlist"
cat "$train_old" > "$train_new"
cat "$test_old" > "$test_new"
cat "$val1_old" "$val2_old" > "$val_new"
# Generate text, utt2spk and images.scp for each set with the chosen split
# (Aachen or standard IAM), then sort/filter the resulting data dirs.
if $process_aachen_split; then
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/train --dataset train || exit 1
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/test --dataset test || exit 1
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/val --dataset validation || exit 1
else
  local/process_data.py data/local data/train --dataset train || exit 1
  local/process_data.py data/local data/val --dataset validation || exit 1
  local/process_data.py data/local data/test --dataset test || exit 1
fi
# Previously these failures were silently ignored; abort instead, consistent
# with the '|| exit 1' style used everywhere else in this script.
image/fix_data_dir.sh data/train || exit 1
image/fix_data_dir.sh data/test || exit 1
image/fix_data_dir.sh data/val || exit 1