Blame view

egs/sprakbanken_swe/s5/local/normalize_transcript.py 929 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
  #!/usr/bin/env python
  import codecs
  import sys
  import re
  #import writenumbers
  
  
  ## Global vars
  
  # Replacement table for transcript normalisation: every key is a single
  # character and every value is the text substituted for it.  The six
  # punctuation marks map to the empty string (i.e. they are dropped), while
  # backslash and tab each map to a single space.
  normdict = dict.fromkeys((".", ",", ":", ";", "?", "!"), "")
  normdict.update(dict.fromkeys(("\\", "\t"), " "))

  # All keys, and all replacement strings, concatenated in insertion order.
  from_chars = ''.join(normdict)
  to_chars = ''.join(normdict.values())

  # Translation table built from normdict, suitable for str.translate().
  t_table = str.maketrans(normdict)
  
  ## Main
  
  # Patterns are compiled once here instead of being looked up per line.
  PUNCTUATION_RE = re.compile(r'[\.,:;\?]')
  SEPARATOR_RE = re.compile(r'[\t\\]')
  MULTISPACE_RE = re.compile(r'  +')

  #TODO: Add number normalisation and remove uppercasing


  def normalize_line(line):
      """Return a single transcript line in normalised, upper-cased form.

      Steps, in order:
        1. the markers ``.\\Punkt`` and ``,\\Komma`` are reduced to the plain
           ``.`` / ``,`` character;
        2. the characters ``. , : ; ?`` are deleted;
        3. tabs and backslashes become a space;
        4. leading/trailing whitespace (including the line terminator) is
           stripped and runs of two or more spaces collapse to one;
        5. the result is upper-cased.

      The returned string carries no trailing newline.
      """
      # The backslash is doubled so the literals stay valid on Python 3.12+,
      # where "\P" and "\K" trigger an invalid-escape SyntaxWarning.  The
      # runtime value of each string is unchanged.
      line = line.replace(".\\Punkt", ".")
      line = line.replace(",\\Komma", ",")
      # NOTE(review): "!" is listed in the module-level normdict but is not
      # removed here; the original loop behaved the same way.  Confirm whether
      # it should be stripped as well.
      without_punctuation = PUNCTUATION_RE.sub('', line)
      spaced = SEPARATOR_RE.sub(' ', without_punctuation)
      collapsed = MULTISPACE_RE.sub(' ', spaced.strip())
      return collapsed.upper()


  def normalize_lines(lines):
      """Yield each line of *lines* normalised and terminated by a newline.

      normalize_line() strips the line terminator, so it is re-added here;
      otherwise consecutive transcript lines would be glued together in the
      output.
      """
      for line in lines:
          yield normalize_line(line) + "\n"


  def main(argv=None):
      """Normalise the transcript file argv[0] into the output file argv[1].

      argv is the argument list without the program name and defaults to
      sys.argv[1:].  Both files are read/written as UTF-8.  Arguments beyond
      the first two are ignored.  Exits with a usage message when fewer than
      two arguments are supplied.
      """
      if argv is None:
          argv = sys.argv[1:]
      if len(argv) < 2:
          sys.exit("usage: normalize_transcript.py <transcript> <output>")

      # The with-statement closes both files even if normalisation raises.
      with codecs.open(argv[0], "r", "utf8") as transcript, \
              codecs.open(argv[1], "w", "utf8") as outtext:
          for normalised in normalize_lines(transcript):
              outtext.write(normalised)


  if __name__ == "__main__":
      main()