Blame view
egs/rimes/v1/local/combine_line_txt_to_paragraph.py
1.49 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
#!/usr/bin/env python3
""" This script creates a paragraph level text file. It reads the line level
    text file from stdin and combines the lines of each paragraph, in
    increasing line-number order, to get the paragraph level file, which is
    written to stdout (UTF-8, one paragraph per output line).
  Eg. local/combine_line_txt_to_paragraph.py < line_level_text > para_level_text
  Eg. Input:  writer000000_eval2011-0_000001 Comme indiqué dans
              writer000000_eval2011-0_000002 habitation n° DVT 36
              writer000000_eval2011-0_000003 de mon domicile
      Output: writer000000_eval2011-0 Comme indiqué dans habitation n° DVT 36 de mon domicile
"""

import argparse
import os
import io
import sys


def combine_lines_to_paragraphs(lines):
    """Merge line-level transcripts into paragraph-level transcripts.

    Args:
        lines: iterable of strings of the form
            "<prefix>-<paragraph_id>_<line_id> <text>", where both ids are
            decimal integers. Blank lines are ignored.

    Returns:
        A list of "<utt_id> <paragraph text>" strings (no trailing newline),
        sorted by numeric paragraph id; within a paragraph the line texts are
        joined with single spaces in increasing line-id order.

    Raises:
        ValueError: if a non-blank line has an id whose paragraph or line
            number is not an integer.
    """
    # paragraph_id -> {line_id -> text}
    paragraphs = {}
    for raw_line in lines:
        stripped = raw_line.strip()
        if not stripped:
            # A blank line carries no utterance id; skip it rather than crash.
            continue
        # Everything after the first space is the transcript text.
        line_utt_id, _, line_text = stripped.partition(' ')
        line_id = int(line_utt_id.split('_')[-1])
        paragraph_id = int(line_utt_id.split('-')[-1].split('_')[0])
        paragraphs.setdefault(paragraph_id, {})[line_id] = line_text

    results = []
    for paragraph_id in sorted(paragraphs):
        line_texts = paragraphs[paragraph_id]
        paragraph_text = ' '.join(line_texts[i] for i in sorted(line_texts))
        # NOTE(review): the output id is rebuilt from the paragraph number
        # alone, which assumes the writer number always equals the paragraph
        # number in the input ids -- confirm against the data preparation.
        utt_id = ('writer' + str(paragraph_id).zfill(6) + '_' + 'eval2011-'
                  + str(paragraph_id))
        results.append(utt_id + ' ' + paragraph_text)
    return results


def main(infile=None, output=None):
    """Read line-level text and write paragraph-level text.

    Args:
        infile: text stream to read from; defaults to stdin decoded as UTF-8.
        output: text stream to write to; defaults to stdout encoded as UTF-8.
    """
    if infile is None:
        infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    if output is None:
        output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    for paragraph in combine_lines_to_paragraphs(infile):
        # One paragraph per line so downstream tools can read it back.
        output.write(paragraph + '\n')
    # Flush explicitly instead of relying on finalisation at interpreter exit.
    output.flush()


if __name__ == '__main__':
    main()