process_data.py
9.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env python3
# Copyright 2018 Ashish Arora
""" This script reads MADCAT files and creates the following files (for the
data subset selected via --dataset) :text, utt2spk, images.scp.
Eg. local/process_data.py data/local /export/corpora/LDC/LDC2012T15 /export/corpora/LDC/LDC2013T09
/export/corpora/LDC/LDC2013T15 data/download/data_splits/madcat.train.raw.lineid
data/dev data/local/lines/images.scp
Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 وجه وعقل غارق حتّى النخاع
utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0
data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif
"""
import argparse
import os
import sys
import xml.dom.minidom as minidom
import unicodedata
# Command-line interface. The MADCAT Arabic data is split across three LDC
# releases, so three database roots and three writing-condition files are
# taken; a page is looked up in each root in turn.
parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files",
                                 epilog="E.g. " + sys.argv[0] + " data/LDC2012T15"
                                 " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid "
                                 " data/train data/local/lines ",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('database_path1',
                    help='Path to the downloaded (and extracted) madcat data')
parser.add_argument('database_path2',
                    help='Path to the downloaded (and extracted) madcat data')
parser.add_argument('database_path3',
                    help='Path to the downloaded (and extracted) madcat data')
parser.add_argument('data_splits',
                    help='Path to file that contains the train/test/dev split information')
parser.add_argument('out_dir',
                    help='directory location to write output files.')
parser.add_argument('images_scp_path',
                    help='Path of input images.scp file(maps line image and location)')
parser.add_argument('writing_condition1',
                    help='Path to the downloaded (and extracted) writing conditions file 1')
parser.add_argument('writing_condition2',
                    help='Path to the downloaded (and extracted) writing conditions file 2')
parser.add_argument('writing_condition3',
                    help='Path to the downloaded (and extracted) writing conditions file 3')
# Both flags accept "true"/"false" (case-insensitive); any other string maps
# to False via the lambda.
parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False,
                    help="performs image augmentation")
parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False,
                    help="only processes subset of data based on writing condition")
# Parsed at import/run time; the functions below read this module-level args.
args = parser.parse_args()
def check_file_location():
    """ Locate the page image and its MADCAT xml file across the corpora.

    Searches the three database roots in order for an xml file named after
    the module-level base_name; the first root that contains it wins.
    Returns:
        (madcat xml path, page image path, writing-condition dict) for the
        matching corpus, or (None, None, None) when no corpus has the page.
    """
    corpora = (
        (args.database_path1, wc_dict1),
        (args.database_path2, wc_dict2),
        (args.database_path3, wc_dict3),
    )
    for db_root, wc_dict in corpora:
        xml_path = os.path.join(db_root, 'madcat', base_name + '.madcat.xml')
        if os.path.exists(xml_path):
            image_path = os.path.join(db_root, 'images', base_name + '.tif')
            return xml_path, image_path, wc_dict
    return None, None, None
def parse_writing_conditions(writing_conditions):
    """ Build a mapping from page image name to its writing condition.

    Args:
        writing_conditions (string): complete path of the writing
            conditions file (tab-separated; column 0 is the page name,
            column 3 the condition).
    Returns:
        dict: page image name -> writing condition.
    """
    condition_by_page = dict()
    with open(writing_conditions) as cond_file:
        for raw_line in cond_file:
            fields = raw_line.strip().split("\t")
            condition_by_page[fields[0]] = fields[3]
    return condition_by_page
def check_writing_condition(wc_dict):
    """ Decide whether the current page passes the writing-condition filter.

    Used to restrict the dataset to one writing condition when --subset is
    given; otherwise every page is accepted. Reads the module-level
    base_name to look up the current page.
    Args:
        wc_dict (dict): page image name -> writing condition.
    Returns:
        (bool): True if the page should be processed.
    """
    if not args.subset:
        return True
    return wc_dict[base_name].strip() == 'IUC'
def read_text(madcat_file_path):
    """ Collect, per line, the words appearing on that line of the page.

    Args:
        madcat_file_path (string): complete path and name of the madcat xml
            file corresponding to the page image.
    Returns:
        dict: line (zone) id -> list of words, in document token order.
    """
    doc = minidom.parse(madcat_file_path)

    # First pass: each <token-image> inside a <zone> ties a word id to the
    # zone (line) that contains it.
    line_of_word = dict()
    for zone_node in doc.getElementsByTagName('zone'):
        zone_id = zone_node.getAttribute('id')
        for image_node in zone_node.getElementsByTagName('token-image'):
            line_of_word[image_node.getAttribute('id')] = zone_id

    # Second pass: each <token> in a <segment> carries the word text in its
    # <source> child; group the words by the line their id belongs to.
    words_by_line = dict()
    for segment_node in doc.getElementsByTagName('segment'):
        for token_node in segment_node.getElementsByTagName('token'):
            word = token_node.getElementsByTagName('source')[0].firstChild.nodeValue
            line_id = line_of_word[token_node.getAttribute('ref_id')]
            words_by_line.setdefault(line_id, list()).append(word)
    return words_by_line
def get_line_image_location():
    """ Map each line-image base name to the directory containing it.

    Reads the whole images.scp file from the module-level handle
    input_image_fh (one path per line).
    Returns:
        dict: image base name -> directory portion of its path.
    """
    location_by_image = dict()
    for path in input_image_fh.read().strip().split("\n"):
        # Keep the original '/'-based split (paths in images.scp are
        # written with forward slashes).
        directory = "/".join(path.split('/')[:-1])
        location_by_image[os.path.basename(path)] = directory
    return location_by_image
### main ###
# Output handles are opened once and stay open for the whole run; the
# script relies on interpreter exit to flush and close them.
print("Processing '{}' data...".format(args.out_dir))
text_file = os.path.join(args.out_dir, 'text')
text_fh = open(text_file, 'w', encoding='utf-8')
utt2spk_file = os.path.join(args.out_dir, 'utt2spk')
utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8')
image_file = os.path.join(args.out_dir, 'images.scp')
image_fh = open(image_file, 'w', encoding='utf-8')
input_image_file = args.images_scp_path
input_image_fh = open(input_image_file, 'r', encoding='utf-8')
# One writing-condition dictionary per corpus root; check_file_location()
# returns whichever dictionary matches the root a page was found in.
wc_dict1 = parse_writing_conditions(args.writing_condition1)
wc_dict2 = parse_writing_conditions(args.writing_condition2)
wc_dict3 = parse_writing_conditions(args.writing_condition3)
image_loc_dict = get_line_image_location()  # consumes input_image_fh
image_num = 0  # running utterance counter, zero-padded into utt ids
with open(args.data_splits) as f:
    prev_base_name = ''
    for line in f:
        # Split-file entries name a page with two extensions; strip both
        # to recover the page base name.
        base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0]
        # Consecutive split lines for the same page collapse to one pass:
        # the page is processed in full on its first appearance.
        if prev_base_name != base_name:
            prev_base_name = base_name
            # NOTE: check_file_location() and check_writing_condition()
            # read the module-level base_name assigned just above.
            madcat_xml_path, image_file_path, wc_dict = check_file_location()
            if wc_dict is None or not check_writing_condition(wc_dict):
                continue
            madcat_doc = minidom.parse(madcat_xml_path)
            writer = madcat_doc.getElementsByTagName('writer')
            writer_id = writer[0].getAttribute('id')
            text_line_word_dict = read_text(madcat_xml_path)
            base_name = os.path.basename(image_file_path).split('.tif')[0]
            for line_id in sorted(text_line_word_dict):
                if args.augment:
                    # (line_id + '.')[:-1] yields a string equal to
                    # line_id — presumably a deliberate copy so the dict
                    # key is detached from the loop variable; TODO confirm.
                    key = (line_id + '.')[:-1]
                    # Augmented data has three scaled variants per line
                    # (suffixes _scale0.._scale2), all sharing one text.
                    for i in range(0, 3):
                        location_id = "_{}_scale{}".format(line_id, i)
                        line_image_file_name = base_name + location_id + '.png'
                        location = image_loc_dict[line_image_file_name]
                        image_file_path = os.path.join(location, line_image_file_name)
                        line = text_line_word_dict[key]
                        text = ' '.join(line)
                        base_line_image_file_name = line_image_file_name.split('.png')[0]
                        utt_id = "{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_line_image_file_name)
                        text_fh.write(utt_id + ' ' + text + '\n')
                        utt2spk_fh.write(utt_id + ' ' + writer_id + '\n')
                        image_fh.write(utt_id + ' ' + image_file_path + '\n')
                        image_num += 1
                else:
                    # Non-augmented: one image per line; the line id is
                    # zero-padded to 4 digits in the file name and utt id.
                    updated_base_name = "{}_{}.png".format(base_name, str(line_id).zfill(4))
                    location = image_loc_dict[updated_base_name]
                    image_file_path = os.path.join(location, updated_base_name)
                    line = text_line_word_dict[line_id]
                    text = ' '.join(line)
                    utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(line_id).zfill(4))
                    text_fh.write(utt_id + ' ' + text + '\n')
                    utt2spk_fh.write(utt_id + ' ' + writer_id + '\n')
                    image_fh.write(utt_id + ' ' + image_file_path + '\n')
                    image_num += 1