process_corpus.py
841 Bytes
#!/usr/bin/env python3
# Copyright 2018 Ashish Arora
# Apache 2.0
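# This script reads the set of valid phones from
# data/local/text/cleaned/phones.txt, then copies the corpus from stdin to
# stdout, dropping every line that contains a character which is neither a
# listed phone nor a space.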
import os
import sys, io
# Location of the phone list, relative to the directory the script is run from.
phone_file = 'data/local/text/cleaned/phones.txt'


def load_phones(path):
    """Return the set of valid phones listed in the file at *path*.

    The first whitespace-separated token of each line is taken as a phone;
    anything after it on the line is ignored.  Blank or whitespace-only lines
    are skipped (they used to raise IndexError).  A single space is always
    part of the returned set so that word separators in the corpus are
    accepted.

    Raises:
        OSError: if *path* cannot be opened.
    """
    phones = {' '}
    with open(path, 'r', encoding='utf-8') as phone_fh:
        for line in phone_fh:
            fields = line.split()
            if fields:
                phones.add(fields[0])
    return phones


def filter_corpus(lines, phones):
    """Yield each line of *lines*, stripped, if it only uses valid phones.

    A line is kept when every character of its stripped text is a member of
    *phones*.  Lines that are empty after stripping contain no invalid
    character and are therefore kept (yielded as '').
    """
    for line in lines:
        text = line.strip()
        # The corpus is checked character by character, so only
        # single-character entries of *phones* can ever match.
        if all(char in phones for char in text):
            yield text


def main():
    """Filter the UTF-8 corpus on stdin to stdout using the phone list."""
    # Wrap the raw byte streams so UTF-8 is used regardless of the locale.
    infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    phones = load_phones(phone_file)
    for text in filter_corpus(infile, phones):
        output.write(text + '\n')
    # Do not rely on finalisation order at interpreter exit to push the
    # buffered text out.
    output.flush()


if __name__ == '__main__':
    main()