Blame view
egs/yomdle_russian/v1/local/process_corpus.py
841 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
#!/usr/bin/env python3
# Copyright 2018 Ashish Arora
# Apache 2.0
"""Drop corpus lines that contain characters outside the valid phone set.

The valid phones are the first whitespace-separated field of each line of
``data/local/text/cleaned/phones.txt`` (path relative to the working
directory); a single space is always accepted.  The corpus is read from
stdin and the surviving lines are written to stdout, both as UTF-8.
"""

import io
import os
import sys

phone_file = os.path.join('data/local/text/cleaned/phones.txt')


def read_phones(lines):
    """Return the set of valid phones described by *lines*.

    Args:
        lines: iterable of text lines; the first whitespace-separated
            field of each non-blank line is taken as a phone.

    Returns:
        A set holding every phone found, plus the space character.
    """
    # A space separates words in the corpus, so it is always accepted.
    phones = {' '}
    for line in lines:
        fields = line.split()
        # Blank lines carry no phone; skipping them avoids an IndexError.
        if fields:
            phones.add(fields[0])
    return phones


def filter_corpus(lines, phones, output):
    """Write to *output* every line of *lines* built only from *phones*.

    Each line is stripped of surrounding whitespace first.  Membership is
    tested character by character, so only single-character entries of
    *phones* can ever match.  An empty line has no invalid character and
    is therefore kept.

    Args:
        lines: iterable of corpus text lines.
        phones: container of accepted characters.
        output: object with a ``write(str)`` method.
    """
    for line in lines:
        text = line.strip()
        if all(char in phones for char in text):
            # Terminate with a newline so each kept corpus line stays on
            # its own output line.
            output.write(text + '\n')


def main():
    """Filter stdin against the phone list and write the result to stdout."""
    # Wrap the binary streams so decoding/encoding is UTF-8 regardless of
    # the locale the script is run under.
    infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    with open(phone_file, 'r', encoding='utf-8') as phone_fh:
        phones = read_phones(phone_fh)
    filter_corpus(infile, phones, output)
    # Do not rely on interpreter shutdown to push buffered text out.
    output.flush()


if __name__ == '__main__':
    main()