Blame view
egs/sprakbanken_swe/s5/local/normalize_transcript.py
929 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
#!/usr/bin/env python
"""Normalise a transcript file: drop punctuation, squeeze spaces, uppercase.

Usage: normalize_transcript.py <input-transcript> <output-transcript>

Both files are handled as UTF-8.  Every input line produces exactly one
newline-terminated output line, so input and output stay line-aligned.
"""
import codecs
import sys
import re

#import writenumbers

## Global vars

# Signs that are deleted outright.
# NOTE(review): '!' is deliberately left untouched, matching the previous
# behaviour of this script; confirm whether it should be stripped as well.
PUNCTUATION_RE = re.compile(r'[\.,:;\?]')
# Tabs and backslashes are turned into a single space.
SEPARATOR_RE = re.compile(r'[\t\\]')
# Runs of spaces are squeezed into one.
SPACES_RE = re.compile(r' +')


def normalize_line(line):
    """Return *line* normalised, without any line terminator.

    Steps, in order:
      1. The markers ``.\\Punkt`` and ``,\\Komma`` are reduced to the bare
         sign, so that the whole marker disappears in step 2 instead of
         leaving the word "Punkt"/"Komma" behind.
      2. The signs ``. , : ; ?`` are removed.
      3. Tabs and backslashes become spaces.
      4. Leading/trailing whitespace (including the newline) is stripped
         and runs of spaces are collapsed to one.
      5. The result is uppercased.
    """
    # Backslashes are written explicitly: "\P" and "\K" are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    line = line.replace(".\\Punkt", ".")
    line = line.replace(",\\Komma", ",")
    text = PUNCTUATION_RE.sub('', line)
    text = SEPARATOR_RE.sub(' ', text)
    text = SPACES_RE.sub(' ', text.strip())
    return text.upper()


def normalize_lines(lines):
    """Yield one newline-terminated normalised line per item of *lines*.

    The terminator is re-added here because normalize_line() strips it;
    without it consecutive transcript lines would be fused together in
    the output.
    """
    for line in lines:
        yield normalize_line(line) + "\n"


def main(argv=None):
    """Normalise the file named by argv[0] into the file named by argv[1].

    *argv* defaults to ``sys.argv[1:]``.  Extra arguments are ignored.
    Exits with a usage message when fewer than two paths are given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("Usage: normalize_transcript.py "
                 "<input-transcript> <output-transcript>")

    #TODO: Add number normalisation and remove uppercasing

    # Context managers close both files even if normalisation fails midway.
    with codecs.open(args[0], "r", "utf8") as transcript, \
            codecs.open(args[1], "w", "utf8") as outtext:
        for normalized in normalize_lines(transcript):
            outtext.write(normalized)


if __name__ == "__main__":
    main()