Blame view

egs/tedlium/s5/local/join_suffix.py 753 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
  #!/usr/bin/env python
  #
  # Copyright  2014  Nickolay V. Shmyrev
  #            2016  Johns Hopkins University (author: Daniel Povey)
  # Apache 2.0
  
  
  from __future__ import print_function
  import sys
  from codecs import open
  
  # This script joins together pairs of split-up words like "you 're" -> "you're".
  # The TEDLIUM transcripts are normalized in a way that's not traditional for
  # speech recognition.
  
  def join_suffixes(line):
      """Return `line` with apostrophe-initial tokens glued to the previous word.

      The line is split on whitespace.  The first field is copied through
      unchanged and never takes part in joining (in this recipe it is
      presumably the utterance id -- confirm against the caller).  In the
      remaining fields, a token that starts with an apostrophe is appended to
      the token before it, e.g. "you 're" -> "you're".  Each word absorbs at
      most one suffix, so "a 'b 'c" becomes "a'b 'c".

      The result is the first field, a single space, and the processed words
      joined by single spaces.  A line holding only the first field therefore
      keeps a trailing space, as the script has always produced.

      A blank or whitespace-only line yields an empty string instead of
      raising IndexError.
      """
      items = line.split()
      if not items:
          # Nothing to emit as the leading field; pass the blank line through.
          return ''
      new_items = []
      i = 1  # index 0 is the leading field, which is never joined
      while i < len(items):
          # split() never yields empty tokens, so startswith() is equivalent
          # to inspecting the first character.
          if i + 1 < len(items) and items[i + 1].startswith("'"):
              new_items.append(items[i] + items[i + 1])
              i += 2  # the suffix has been consumed together with its word
          else:
              new_items.append(items[i])
              i += 1
      return items[0] + ' ' + ' '.join(new_items)


  def main():
      """Filter stdin to stdout, writing one processed line per input line."""
      for line in sys.stdin:
          print(join_suffixes(line))


  if __name__ == '__main__':
      main()