Blame view

egs/yomdle_korean/v1/local/normalize_data.py 549 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
  #!/usr/bin/env python3
  
  # Copyright      2017  Hossein Hadian
  
  # Apache 2.0
  # This script applies Unicode NFC normalization to the transcript of each
  # input line (an utterance id followed by its text). It is used in scoring
  
  import sys, io
  import string
  import unicodedata
  infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
  output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  for line in infile:
    words = line.strip().split()
    uttid = words[0]
    transcript = ' '.join(words[1:])
    text_normalized = unicodedata.normalize('NFC', transcript)
    output.write(uttid + ' ' + text_normalized + '
  ')