Blame view

egs/wsj/s5/utils/lang/bpe/prepend_words.py 516 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
  #!/usr/bin/env python3
  
  # This script prepends '|' to every word in the transcript to mark
  # the beginning of each word, so that the initial space of every word
  # can be found after decoding.
  
  import sys
  import io
  import re
  
  # Runs of spaces and/or tabs separate the words of a transcript line.
  whitespace = re.compile("[ \t]+")


  def prepend_marker(line):
      """Return *line* with '|' prepended to each whitespace-separated word.

      Leading and trailing spaces, tabs, carriage returns and newlines are
      removed first; the remaining text is split on runs of spaces/tabs and
      the marked words are re-joined with single spaces. The result carries
      no trailing newline.

      An empty (or whitespace-only) line splits into a single empty word,
      so the result for such a line is a lone '|'.
      """
      words = whitespace.split(line.strip(" \t\r\n"))
      return ' '.join("|" + word for word in words)


  if __name__ == "__main__":
      # latin-1 maps every byte to exactly one code point, so arbitrary
      # input bytes pass through the decode/encode round trip unchanged.
      infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
      output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1')
      for line in infile:
          output.write(prepend_marker(line) + '\n')
      # The wrapper buffers independently of sys.stdout; flush explicitly
      # rather than relying on finalizer order at interpreter exit.
      output.flush()