Blame view

egs/rimes/v1/local/combine_line_txt_to_paragraph.py 1.49 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
  #!/usr/bin/env python3
  
  """ This script creates paragraph level text file. It reads 
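      the line-level text file from stdin and concatenates the lines of
      each paragraph, in line order, writing the paragraph-level text to stdout.
    Eg. local/combine_line_txt_to_paragraph.py < line_level_text > paragraph_level_text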
    Eg. Input:  writer000000_eval2011-0_000001  Comme indiqué dans
                writer000000_eval2011-0_000002  habitation n° DVT 36
                writer000000_eval2011-0_000003  de mon domicile
        Output: writer000000_eval2011-0 Comme indiqué dans habitation n° DVT 36 de mon domicile
  """
  
  import argparse
  import os
  import io
  import sys
  ### main ###
  def _parse_line(line):
      """Parse one line-level entry into ``(paragraph_id, line_id, text)``.

      The first space-separated field is the line-level utterance id, e.g.
      ``writer000000_eval2011-0_000001``: the integer after the last ``-``
      (up to the next ``_``) is the paragraph id, and the integer after the
      last ``_`` is the line id. Everything after the first field is the text.

      Raises:
          ValueError: if either id component is not an integer.
      """
      # Split on a single space (not arbitrary whitespace) so the text is
      # reproduced exactly as it appears after the id.
      fields = line.strip().split(' ')
      utt_id = fields[0]
      line_id = int(utt_id.split('_')[-1])
      paragraph_id = int(utt_id.split('-')[-1].split('_')[0])
      return paragraph_id, line_id, " ".join(fields[1:])


  def combine_lines(lines):
      """Combine line-level transcripts into paragraph-level output lines.

      Args:
          lines: iterable of ``"<line_utt_id> <text>"`` strings. Blank
              (whitespace-only) lines are skipped.

      Returns:
          A list of newline-terminated output lines, one per paragraph, ordered
          by numeric paragraph id. Within a paragraph the texts are ordered by
          numeric line id; if a line id repeats, the last occurrence wins.
      """
      # paragraph_id -> {line_id -> text}
      paragraphs = {}
      for line in lines:
          if not line.strip():
              # A blank line has no id to parse; skip it instead of crashing.
              continue
          paragraph_id, line_id, text = _parse_line(line)
          paragraphs.setdefault(paragraph_id, {})[line_id] = text

      combined = []
      for paragraph_id in sorted(paragraphs):
          texts = paragraphs[paragraph_id]
          # Every line text is prefixed with one space, so the paragraph text
          # itself starts with a space (the historical output format).
          paragraph_text = "".join(" " + texts[line_id] for line_id in sorted(texts))
          # NOTE(review): the writer number is regenerated from the paragraph id
          # rather than copied from the input id, so this presumably relies on
          # the two always being equal upstream -- confirm against the caller.
          utt_id = 'writer' + str(paragraph_id).zfill(6) + '_' + 'eval2011-' + str(paragraph_id)
          combined.append(utt_id + ' ' + paragraph_text + '\n')
      return combined


  def main():
      """Read line-level text from stdin and write paragraph-level text to stdout."""
      # Wrap the binary streams so both sides are UTF-8 regardless of locale.
      infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
      output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
      output.writelines(combine_lines(infile))
      # Flush explicitly rather than relying on interpreter shutdown.
      output.flush()
      # Detach so that garbage-collecting these wrappers does not close the
      # process's underlying stdin/stdout buffers.
      infile.detach()
      output.detach()


  if __name__ == '__main__':
      main()