text_pre_process.py
#!/usr/bin/env python
# Copyright 2014 Vassil Panayotov
# Apache 2.0
# [This script was taken verbatim from the alignment scripts]

# Pre-processes a book's text before passing it to Festival for normalization
# of the non-standard words (NSW). Basically it does the following:
# 1) Converts the non-ASCII characters to their closest ASCII equivalents.
# 2) Converts Roman numerals to their decimal representation (do we really need this?)
# 3) Segments the original file into utterances and puts a special token at the
#    end of each sentence, to make it possible to recover them after NSW
#    normalization.
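
# Example invocation (illustrative; the file names are placeholders):
#   python text_pre_process.py --in-encoding utf-8 book.txt book_processed.txt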
import argparse
import codecs
import re
import unicodedata

import nltk


def parse_args():
    parser = argparse.ArgumentParser(description="Pre-process a book's text")
    parser.add_argument("--in-encoding", default="utf-8",
                        help="Encoding to use when reading the input text")
    parser.add_argument("--out-encoding", default="ascii",
                        help="Encoding to use when writing the output text")
    parser.add_argument("--sent-end-marker", default="DOTDOTDOT",
                        help="Token marking the end of each sentence")
    parser.add_argument("in_text", help="Input text")
    parser.add_argument("out_text", help="Output text")
    return parser.parse_args()


# http://rosettacode.org/wiki/Roman_numerals/Decode#Python
# Only I, V and X are handled, which covers chapter numbers up to 39 (XXXIX).
_rdecode = dict(zip('XVI', (10, 5, 1)))


def decode(roman):
    result = 0
    # Subtractive rule: a symbol smaller than its right neighbour is negated.
    for r, r1 in zip(roman, roman[1:]):
        rd, rd1 = _rdecode[r], _rdecode[r1]
        result += -rd if rd < rd1 else rd
    return result + _rdecode[roman[-1]]
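
# A quick illustration (not part of the original script) of the decoding rule:
#   decode('XIV') -> 14   (X + (V - I))
#   decode('IX')  -> 9    (X - I)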


def convert_roman(text):
    """
    Uses heuristics to decide whether a string that looks like a Roman
    numeral should be converted to a decimal number.
    """
    lines = re.split(r'\r?\n', text)
    new_lines = list()
    for l in lines:
        # Case 1: a "Chapter <numeral>" heading, e.g. "Chapter XII ...".
        m = re.match(r'^(\s*C((hapter)|(HAPTER))\s+)(([IVX]+)|([ivx]+))(.*)', l)
        if m is not None:
            new_line = "%s%s%s" % (m.group(1), decode(m.group(5).upper()), m.group(8))
            new_lines.append(new_line)
            continue
        # Case 2: a bare numeral followed by a capitalized title, e.g. "IV. The ...".
        m = re.match(r'^(\s*)(([IVX]+)|([ivx]+))([\s\.]+[A-Z].*)', l)
        if m is not None:
            new_line = "%s%s%s" % (m.group(1), decode(m.group(2).upper()), m.group(5))
            new_lines.append(new_line)
            continue
        new_lines.append(l)
    return '\n'.join(new_lines)
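
# Illustrative behaviour (these exact lines are made-up examples):
#   "Chapter XII"     -> "Chapter 12"
#   "  IV. The Storm" -> "  4. The Storm"
#   "I like it"       -> unchanged ("I" is followed by a lowercase word)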


def segment_sentences(text, sent_marker):
    # Requires the NLTK "punkt" sentence tokenizer (nltk.download('punkt')).
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = punkt.tokenize(text)
    # Collapse each sentence onto a single line.
    line_sents = [re.sub(r'\r?\n', ' ', s) for s in sents]
    line_sep = ' %s \n' % sent_marker
    # Keep a space before the final marker too, so it remains a separate token.
    return line_sep.join(line_sents) + ' ' + sent_marker
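
# For example (illustrative input, default marker assumed):
#   "Mr. Smith left.\nHe never returned."
# becomes
#   "Mr. Smith left. DOTDOTDOT \nHe never returned. DOTDOTDOT"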


def pre_segment(text):
    """
    The segmentation at the start of the chapters is not ideal - e.g. the
    chapter number and title are lumped together into one long 'sentence'.
    This routine tries to mitigate that by putting a dot at the end of each
    line that is followed by one or more empty lines.
    """
    lines = text.split('\n')
    out_text = list()
    punkt = {'?', '!', '.'}
    for i, l in enumerate(lines[:-2]):
        # Add a dot to a non-empty line that lacks final punctuation and is
        # followed by a blank line (i.e. it is likely a heading).
        if len(l.strip()) != 0 and l.strip()[-1] not in punkt and \
                len(lines[i+1].strip()) == 0:  # and len(lines[i+2].strip()) == 0:
            out_text.append(l + '.')
        else:
            out_text.append(l)
    # The loop stops two lines early (so lines[i+1] is always valid);
    # append the remaining tail unchanged instead of dropping it.
    out_text.extend(lines[-2:])
    return '\n'.join(out_text)
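
# Illustrative effect (made-up input): the heading line in
#   "THE STORM\n\nIt was a dark night."
# becomes "THE STORM.", so punkt later treats it as a sentence of its own.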


if __name__ == '__main__':
    opts = parse_args()
    with codecs.open(opts.in_text, 'r', opts.in_encoding, errors='ignore') as src:
        text_in = src.read()
    # Round-trip through the output encoding to drop characters it cannot
    # represent (NFKD first, so accented letters decompose into an ASCII
    # base character plus combining marks, which are then discarded).
    text = unicodedata.normalize(
        'NFKD', text_in).encode(opts.out_encoding, 'ignore').decode(opts.out_encoding)
    text = convert_roman(text)
    text = pre_segment(text)
    text = segment_sentences(text, opts.sent_end_marker)
    with codecs.open(opts.out_text, 'w', opts.out_encoding) as dst:
        dst.write(text)