prepend_words.py
516 Bytes
#!/usr/bin/env python3
# This script prepends '|' to every word in the transcript to mark the
# beginning of each word, so that the initial space of every word can be
# found after decoding.
import sys
import io
import re
# Runs of spaces and/or tabs separate words within a line.
whitespace = re.compile("[ \t]+")


def prepend_word_marker(line):
    """Return *line* with '|' prepended to each of its words.

    Leading and trailing spaces, tabs, carriage returns and newlines are
    stripped first; the remainder is split on runs of spaces/tabs and the
    marked words are re-joined with single spaces.  The result carries no
    trailing newline.

    A line with no words yields a lone '|', because splitting the empty
    string produces one empty word.
    """
    words = whitespace.split(line.strip(" \t\r\n"))
    return ' '.join("|" + word for word in words)


def main():
    """Filter standard input to standard output, marking every word."""
    # Wrap the raw byte streams so both sides use latin-1 regardless of the
    # locale; latin-1 maps each byte to exactly one code point.
    infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1')
    for line in infile:
        output.write(prepend_word_marker(line) + '\n')
    # Push buffered text out explicitly instead of relying on the wrapper's
    # finalizer running at interpreter shutdown.
    output.flush()


if __name__ == "__main__":
    main()