Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/lang/bpe/prepend_words.py 516 Bytes

  #!/usr/bin/env python3
  
  # This script prepends '|' to every word in the transcript to mark
  # the beginning of each word, so that the initial space of every word
  # can be found after decoding.
  
  import sys
  import io
  import re
  
  # Runs of spaces/tabs are what separate the words of a transcript line.
  whitespace = re.compile("[ \t]+")


  def prepend_word_markers(line):
      """Return *line* with '|' prepended to every word, newline-terminated.

      Leading and trailing spaces, tabs, carriage returns and newlines are
      stripped, the remainder is split on runs of spaces/tabs, and the marked
      words are re-joined with single spaces.  A blank line yields a lone
      '|', because splitting an empty string produces one empty word.
      """
      words = whitespace.split(line.strip(" \t\r\n"))
      return ' '.join(["|" + word for word in words]) + '\n'


  def main():
      """Filter stdin to stdout, marking the start of every word with '|'."""
      # latin-1 assigns a code point to every byte value, so arbitrary input
      # bytes decode without raising.
      infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
      output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1')
      for line in infile:
          output.write(prepend_word_markers(line))
      # Push buffered text to stdout explicitly rather than relying on
      # the wrapper being finalized at interpreter exit.
      output.flush()


  if __name__ == "__main__":
      main()