egs/librispeech/s5/local/lm/python/text_pre_process.py
#!/usr/bin/env python

# Copyright 2014 Vassil Panayotov
# Apache 2.0

# [This script was taken verbatim from the alignment scripts]

# Pre-process a book's text before passing it to Festival for normalization
# of the non-standard words.
# Basically it does the following:
# 1) Converts the non-ASCII characters to their closest ASCII equivalents.
# 2) Converts Roman numerals to their decimal representation (do we really need this?)
# 3) Segments the original text into utterances and puts a special token at the
#    end of each sentence, to make it possible to recover them after NSW normalization.

import argparse
import codecs
import re
import unicodedata

import nltk
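# NOTE: segment_sentences() below loads NLTK's "punkt" sentence tokenizer
# models; if they are missing, they can be fetched once with
# nltk.download('punkt').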
  
def parse_args():
    parser = argparse.ArgumentParser(description="Pre-process a book's text")
    parser.add_argument("--in-encoding", default="utf-8",
                        help="Encoding to use when reading the input text")
    parser.add_argument("--out-encoding", default="ascii",
                        help="Encoding to use when writing the output text")
    parser.add_argument("--sent-end-marker", default="DOTDOTDOT",
                        help="Token marking the end of each sentence, so that "
                             "sentence boundaries survive NSW normalization")
    parser.add_argument("in_text", help="Input text")
    parser.add_argument("out_text", help="Output text")
    return parser.parse_args()
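# Example invocation (hypothetical file names):
#   python text_pre_process.py --in-encoding utf-8 book.txt book_preprocessed.txt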
  
# http://rosettacode.org/wiki/Roman_numerals/Decode#Python
_rdecode = dict(zip('XVI', (10, 5, 1)))
def decode(roman):
    """Decodes a Roman numeral built from the symbols X, V and I."""
    result = 0
    # A symbol smaller than its right neighbour is subtractive, e.g. the 'I' in 'IX'
    for r, r1 in zip(roman, roman[1:]):
        rd, rd1 = _rdecode[r], _rdecode[r1]
        result += -rd if rd < rd1 else rd
    return result + _rdecode[roman[-1]]
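# Illustrative values (not from the original script):
#   >>> decode('XIV')
#   14
#   >>> decode('XXXIX')
#   39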
  
def convert_roman(text):
    """
    Uses heuristics to decide whether a string that looks like a Roman numeral
    should be converted to a decimal number.
    """
    lines = re.split(r'\r?\n', text)
    new_lines = list()
    for l in lines:
        # Chapter headings, e.g. "Chapter XII ..." or "CHAPTER xii ..."
        m = re.match(r'^(\s*C((hapter)|(HAPTER))\s+)(([IVX]+)|([ivx]+))(.*)', l)
        if m is not None:
            new_line = "%s%s%s" % (m.group(1), decode(m.group(5).upper()), m.group(8))
            new_lines.append(new_line)
            continue
        # Bare numeral headings, e.g. "XII. THE RETURN"
        m = re.match(r'^(\s*)(([IVX]+)|([ivx]+))([\s\.]+[A-Z].*)', l)
        if m is not None:
            new_line = "%s%s%s" % (m.group(1), decode(m.group(2).upper()), m.group(5))
            new_lines.append(new_line)
            continue
        new_lines.append(l)
    return '\n'.join(new_lines)
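# For example (illustrative), "CHAPTER XIV" becomes "CHAPTER 14" and
# "IV. A DISCOVERY" becomes "4. A DISCOVERY", while the pronoun "I" in
# "I am here" is left alone, because it is not followed by whitespace/dots
# plus an upper-case letter.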
  
def segment_sentences(text, sent_marker):
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = punkt.tokenize(text)
    # Collapse each sentence onto a single line
    line_sents = [re.sub(r'\r?\n', ' ', s) for s in sents]
    line_sep = ' %s \n' % sent_marker
    return line_sep.join(line_sents) + sent_marker
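# Assuming Punkt splits "First one. Second one." into two sentences, the output
# (with the default marker) would be the two lines:
#   First one. DOTDOTDOT
#   Second one.DOTDOTDOT
# Note that the final marker is appended directly, without a leading space.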
  
def pre_segment(text):
    """
    The segmentation at the start of the chapters is not ideal - e.g. the
    chapter number and title are lumped together into one long 'sentence'.
    This routine tries to mitigate this by putting a dot at the end of each
    line that is followed by one or more empty lines.
    """
    lines = text.split('\n')
    out_text = list()
    punkt = set(['?', '!', '.'])
    for i, l in enumerate(lines[:-2]):
        # A non-empty line without final punctuation, followed by a blank line,
        # gets a dot appended
        if len(l.strip()) != 0 and l.strip()[-1] not in punkt and\
           len(lines[i+1].strip()) == 0: #  and len(lines[i+2].strip()) == 0:
            out_text.append(l + '.')
        else:
            out_text.append(l)
    # Keep the last two lines, which the loop above does not visit
    out_text.extend(lines[-2:])
    return '\n'.join(out_text)
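# For instance (illustrative), a heading block such as
#
#   CHAPTER 1
#   <blank line>
#   It was a dark night.
#
# becomes "CHAPTER 1." followed by the blank line, so the Punkt tokenizer will
# not glue the heading onto the chapter's first sentence.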
  
if __name__ == '__main__':
    opts = parse_args()
    with codecs.open(opts.in_text, 'r', opts.in_encoding, errors='ignore') as src:
        text_in = src.read()

    # NFKD-decompose the Unicode text and drop everything that has no
    # equivalent in the output encoding (ASCII by default). Note this relies on
    # Python 2 semantics, where the result of encode() is a plain str.
    text = unicodedata.normalize(
                'NFKD', text_in).encode(opts.out_encoding, 'ignore')
    text = convert_roman(text)
    text = pre_segment(text)
    text = segment_sentences(text, opts.sent_end_marker)

    with open(opts.out_text, 'w') as dst:
        dst.write(text)