Blame view
egs/sprakbanken/s5/local/normalize_transcript.py
908 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Normalize a transcript file line by line.

Usage: normalize_transcript.py <numtable> <transcript-in> <transcript-out>

For every input line the script removes the punctuation characters
``. , : ; ?``, turns tabs and backslashes into spaces, trims the line,
collapses runs of spaces into one, and finally passes the result through
``writenumbers.normNumber`` together with the table loaded from <numtable>
(presumably this writes numbers out as words -- confirm in writenumbers.py).
Both transcript files are read and written as UTF-8.
"""

import codecs
import sys
import re
import writenumbers

try:
    # Unused below; guarded so the script still starts on Python 3, where
    # string.maketrans no longer exists.
    from string import maketrans
except ImportError:
    pass

## Global vars

# Compiled once here instead of being looked up again for every line.
PUNCTUATION = re.compile(r'[\.,:;\?]')
TAB_OR_BACKSLASH = re.compile(r'[\t\\]')
SPACE_RUN = re.compile(r' +')

## Main

numtable = writenumbers.loadNumTable(sys.argv[1])

# The with-blocks close both files even if normalization fails part-way.
with codecs.open(sys.argv[2], "r", "utf8") as transcript:
    with codecs.open(sys.argv[3], "w", "utf8") as outtext:
        for line in transcript:
            no_punct = PUNCTUATION.sub('', line)
            spaced = TAB_OR_BACKSLASH.sub(' ', no_punct)
            collapsed = SPACE_RUN.sub(' ', spaced.strip())
            # NOTE(review): strip() drops the line's trailing newline and
            # none is written back here, so line breaks survive only if
            # normNumber re-adds them -- confirm this is intended.
            outtext.write(writenumbers.normNumber(collapsed, numtable))