Blame view
egs/wsj/s5/utils/lang/bpe/prepend_words.py
516 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
#!/usr/bin/env python3

# This script prepends '|' to every word in the transcript to mark
# the beginning of each word, so that the initial space of every word
# can be found again after decoding.

import io
import re
import sys

# Words are separated by runs of spaces and/or tabs.
whitespace = re.compile("[ \t]+")


def prepend_marker(line):
    """Return *line* with '|' prefixed to every whitespace-separated word.

    Leading and trailing spaces, tabs, carriage returns and newlines are
    removed first; the remaining text is split on runs of spaces/tabs and
    the marked words are re-joined with single spaces.  No line terminator
    is included in the result.

    An empty (or all-whitespace) line yields a single '|', because
    splitting the empty string produces one empty word.

    >>> prepend_marker("hello  world\\n")
    '|hello |world'
    """
    words = whitespace.split(line.strip(" \t\r\n"))
    return ' '.join("|" + word for word in words)


def main():
    """Read transcript lines from stdin and write the marked lines to stdout."""
    # latin-1 maps each byte to exactly one code point and back, so the raw
    # bytes of every word pass through unchanged.
    # NOTE(review): presumably chosen so the script works on transcripts in
    # any 8-bit-safe encoding -- confirm against the callers.
    infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1')
    for line in infile:
        output.write(prepend_marker(line) + '\n')
    # Flush explicitly rather than relying on the wrapper being finalized
    # at interpreter exit.
    output.flush()


if __name__ == "__main__":
    main()