normalize_transcript.py
929 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python
import codecs
import sys
import re
#import writenumbers
## Global vars
normdict = {".": "",
",": "",
":": "",
";": "",
"?": "",
"!": "",
"\\": " ",
"\t": " "
}
#removes all the above signs
from_chars = ''.join(list(normdict.keys()))
to_chars = ''.join(list(normdict.values()))
t_table = str.maketrans(normdict)
## Main
transcript = codecs.open(sys.argv[1], "r", "utf8")
outtext = codecs.open(sys.argv[2], "w", "utf8")
#TODO: Add number normalisation and remove uppercasing
for line in transcript:
line = line.replace(".\Punkt", ".")
line = line.replace(",\Komma", ",")
normtext1 = re.sub(r'[\.,:;\?]', '', line)
normtext2 = re.sub(r'[\t\\]', ' ', normtext1)
normtext3 = re.sub(r' +', ' ', normtext2.strip())
outtext.write(normtext3.upper())
transcript.close()
outtext.close()