combine_line_txt_to_paragraph.py
1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python3
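""" This script creates a paragraph-level text file. It reads the
line-level text file from stdin and combines the lines of each
paragraph, in line-id order, to get the paragraph-level file on stdout.
Eg. local/combine_line_txt_to_paragraph.py < line_level_text > paragraph_level_text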
Eg. Input: writer000000_eval2011-0_000001 Comme indiqué dans
writer000000_eval2011-0_000002 habitation n° DVT 36
writer000000_eval2011-0_000003 de mon domicile
Output: writer000000_eval2011-0 Comme indiqué dans habitation n° DVT 36 de mon domicile
"""
import argparse
import os
import io
import sys
### main ###
def combine_line_txt_to_paragraph(infile, output):
    """Merge line-level transcripts into one transcript per paragraph.

    Every non-blank input line must look like
    ``<prefix>-<paragraph_id>_<line_id> <text>``: the paragraph id is the
    integer between the last ``-`` and the ``_`` that follows it, and the
    line id is the integer after the last ``_`` of the first field.
    Lines are grouped by paragraph id, ordered by line id, joined with
    single spaces and written as ``<utt_id> <paragraph text>``, one
    paragraph per output line, in increasing paragraph-id order.

    Args:
        infile: iterable of text lines (e.g. a text stream).
        output: text stream with ``write`` and ``flush`` methods.

    Raises:
        ValueError: if a paragraph id or line id is not an integer.
    """
    # paragraph id -> {line id -> text of that line}
    paragraph_txt_dict = dict()
    for line in infile:
        line_vect = line.strip().split(' ')
        if not line_vect[0]:
            # Blank or whitespace-only line: there is no id to parse, and
            # int('') would raise, so skip it.
            continue
        line_id = int(line_vect[0].split('_')[-1])
        paragraph_id = line_vect[0].split('-')[-1]
        paragraph_id = int(paragraph_id.split('_')[0])
        line_text = " ".join(line_vect[1:])
        paragraph_txt_dict.setdefault(paragraph_id, dict())[line_id] = line_text

    for para_id in sorted(paragraph_txt_dict):
        lines_by_id = paragraph_txt_dict[para_id]
        # Join in line-id order; joining (instead of prepending " " to each
        # line) avoids a doubled space between the id and the text.
        para_txt = " ".join(lines_by_id[line_id]
                            for line_id in sorted(lines_by_id))
        # NOTE(review): the utterance id is rebuilt from the paragraph number
        # only (writer field = zero-padded paragraph id, set name fixed to
        # 'eval2011'); any other prefix present in the input is not carried
        # over. Confirm this matches the data this script is run on.
        utt_id = 'writer' + str(para_id).zfill(6) + '_' + 'eval2011-' + str(para_id)
        output.write(utt_id + ' ' + para_txt + '\n')
    # Do not rely on interpreter shutdown to push buffered text out.
    output.flush()


if __name__ == '__main__':
    # Wrap the raw byte streams so that text is always handled as UTF-8,
    # independent of the locale.
    infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    combine_line_txt_to_paragraph(infile, output)