join_suffix.py 753 Bytes
#!/usr/bin/env python
#
# Copyright  2014  Nickolay V. Shmyrev
#            2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0


from __future__ import print_function
import sys
from codecs import open

# This script joins together pairs of split-up words like "you 're" -> "you're".
# The TEDLIUM transcripts are normalized in a way that's not traditional for
# speech recognition.

for line in sys.stdin:
    items = line.split()
    new_items = []
    i = 1
    while i < len(items):
        if i < len(items) - 1 and items[i+1][0] == '\'':
            new_items.append(items[i] + items[i+1])
            i = i + 1
        else:
            new_items.append(items[i])
        i = i + 1
    print(items[0] + ' ' + ' '.join(new_items))