prepend_words.py 516 Bytes
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16


#!/usr/bin/env python3

# This script prepends '|' to every word in the transcript to mark
# the beginning of each word, so that the initial space of every word
# can be found after decoding.

import sys
import io
import re

# Words are separated by runs of spaces and/or tabs.
whitespace = re.compile("[ \t]+")


def prepend_words(line):
    """Return *line* with '|' prefixed to every word.

    Leading and trailing spaces, tabs, carriage returns and newlines are
    stripped, the remainder is split on runs of spaces/tabs, and the
    marked words are joined back with single spaces.  The result has no
    trailing newline.

    A blank line yields a lone "|": splitting the empty string produces
    one empty word, which still receives the marker.
    """
    words = whitespace.split(line.strip(" \t\r\n"))
    return ' '.join("|" + word for word in words)


def main():
    """Read transcript lines from stdin and write the marked lines to stdout.

    Both streams are wrapped as latin-1 text on top of the underlying
    binary buffers, so the encoding does not depend on the locale.
    """
    infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1')
    for line in infile:
        output.write(prepend_words(line) + '\n')
    # Flush explicitly rather than relying on the wrapper being finalized
    # at interpreter shutdown to push out the buffered text.
    output.flush()


# Only process stdin when run as a script, so importing this module has
# no side effects.
if __name__ == "__main__":
    main()