Blame view
egs/iam/v2/local/process_aachen_splits.py
3.22 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
#!/usr/bin/env python3

# Copyright      2017  Chun Chieh Chang
#                2017  Ashish Arora

""" This script reads the extracted IAM database files and creates
    the following files (for the data subset selected via --dataset):
    text, utt2spk, images.scp.

  Eg. local/process_aachen_splits.py data/local data/train data --dataset train
  Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from
      utt2spk file: 000_a01-000u-00 000
      images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png
"""

import argparse
import os
import sys
import xml.dom.minidom as minidom

parser = argparse.ArgumentParser(description="""Creates text, utt2spk
                                                and images.scp files.""")
parser.add_argument('database_path', type=str,
                    help='Path to the downloaded (and extracted) IAM data')
parser.add_argument('split_path', type=str,
                    help='location of the train/test/val set')
parser.add_argument('out_dir', type=str,
                    help='location to write output files.')
parser.add_argument('--dataset', type=str, default='train',
                    choices=['train', 'test','validation'],
                    help='Subset of data to process.')


def process_text_file_for_word_model(text_file_path):
    """Read the IAM 'lines.txt' file and return {line id: transcription}.

    Lines starting with '#' and blank lines are skipped.  For every other
    line the first space-separated field is used as the line id, and all
    fields from index 8 onward are concatenated to form the transcription,
    in which every '|' is replaced by a single space.

    Args:
        text_file_path: path to '<database_path>/ascii/lines.txt'.

    Returns:
        dict mapping line id (e.g. 'a01-000u-00') to its transcription.
    """
    text_dict = {}
    with open(text_file_path, 'rt') as in_file:
        for line in in_file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split(' ')
            # Fields 1..7 are per-line metadata that is not needed here.
            text = "".join(fields[8:])
            text_dict[fields[0]] = text.replace("|", " ")
    return text_dict


def _read_form_info(xml_path):
    """Return (writer_id, form_id) from the first <form> element of xml_path."""
    doc = minidom.parse(xml_path)
    form_element = doc.getElementsByTagName('form')[0]
    return form_element.getAttribute('writer-id'), form_element.getAttribute('id')


def _write_form(database_path, form_name, text_dict, text_fh, utt2spk_fh, image_fh):
    """Write the text / utt2spk / images.scp records for every line image of one form."""
    xml_path = os.path.join(database_path, 'xml', form_name + '.xml')
    writer_id, form_id = _read_form_info(xml_path)
    # Line images are stored as lines/<first 3 chars of form id>/<form id>/*.png
    lines_path = os.path.join(database_path, 'lines', form_id[0:3], form_id)
    # os.listdir() returns entries in arbitrary order; sort so that the
    # generated files are reproducible from run to run.
    for file_name in sorted(os.listdir(lines_path)):
        if not file_name.endswith(".png"):
            continue
        image_file_path = os.path.join(lines_path, file_name)
        base_name = os.path.splitext(file_name)[0]
        # A line image without a transcription raises KeyError on purpose:
        # silently dropping it would hide a corrupt or incomplete database.
        text = text_dict[base_name]
        utt_id = writer_id + '_' + base_name
        # One record per line, as shown in the examples in the module docstring.
        text_fh.write(utt_id + ' ' + text + '\n')
        utt2spk_fh.write(utt_id + ' ' + writer_id + '\n')
        image_fh.write(utt_id + ' ' + image_file_path + '\n')


def main(argv=None):
    """Create text, utt2spk and images.scp in out_dir for the selected subset.

    Args:
        argv: list of command-line arguments (without the program name);
            when None, argparse falls back to sys.argv[1:].
    """
    args = parser.parse_args(argv)

    dataset_path = os.path.join(args.split_path, args.dataset + '.uttlist')
    text_file_path = os.path.join(args.database_path, 'ascii', 'lines.txt')

    print("Processing '{}' data...".format(args.dataset))
    text_dict = process_text_file_for_word_model(text_file_path)

    # The 'with' statement guarantees all three outputs are flushed and
    # closed, even if processing stops part-way with an exception.
    with open(os.path.join(args.out_dir, 'text'), 'w') as text_fh, \
            open(os.path.join(args.out_dir, 'utt2spk'), 'w') as utt2spk_fh, \
            open(os.path.join(args.out_dir, 'images.scp'), 'w') as image_fh, \
            open(dataset_path) as split_fh:
        for line in split_fh:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing one) in the list file.
                continue
            line_vect = line.split('-')
            # The first two '-'-separated components name the form's XML file.
            form_name = line_vect[0] + '-' + line_vect[1]
            _write_form(args.database_path, form_name, text_dict,
                        text_fh, utt2spk_fh, image_fh)


if __name__ == "__main__":
    main()