Blame view
egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py
1.19 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
from __future__ import print_function
#!/usr/bin/env py

# Converts a romanized ECA word list (symbol table) to
# a version in the arabic script
#
# Usage: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]
# The converted symbol table is written to standard output as UTF-8.

import codecs
import sys

# Path of the lexicon file, appended verbatim to the ECA-LEXICON argument.
LEXICON_RELPATH = "/callhome_arabic_lexicon_991012/ar_lex.v07"

USAGE = (
    "USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]\n"
    "E.g., local/convert_symtable_to_utf.py data/lang/words.txt "
    "/export/corpora/LDC/LDC99L22"
)


def load_lexicon(lines):
    """Build the romanized-word -> script-form lookup table.

    Args:
        lines: iterable of already-decoded text lines. The first
            whitespace-separated field of each line is the romanized word,
            the second is its script form; further fields are ignored.

    Returns:
        dict mapping each romanized word to its script form.

    Raises:
        ValueError: if a non-blank line has fewer than two fields, or if a
            romanized word occurs more than once.
    """
    dict_cache = {}
    for lineno, line in enumerate(lines, 1):
        fields = line.split()
        if not fields:
            # Blank lines carry no entry; skip instead of failing on fields[0].
            continue
        if len(fields) < 2:
            raise ValueError(
                "lexicon line %d has fewer than two fields: %r" % (lineno, line)
            )
        roman, script = fields[0], fields[1]
        # Explicit check rather than `assert`, which is stripped under -O.
        if roman in dict_cache:
            raise ValueError(
                "duplicate lexicon entry %r on line %d" % (roman, lineno)
            )
        dict_cache[roman] = script
    return dict_cache


def convert_symtable(lines, dict_cache):
    """Yield symbol-table entries with known words replaced by script forms.

    Args:
        lines: iterable of already-decoded symbol-table lines; the first
            field is the word and the second field is kept as-is.
        dict_cache: mapping as returned by load_lexicon().

    Yields:
        One "<word> <second-field>" string per non-blank input line, without
        a trailing newline. Words absent from dict_cache pass through
        unchanged.

    Raises:
        ValueError: if a non-blank line has fewer than two fields.
    """
    for lineno, line in enumerate(lines, 1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 2:
            raise ValueError(
                "symbol table line %d has fewer than two fields: %r"
                % (lineno, line)
            )
        word, symbol_id = fields[0], fields[1]
        yield dict_cache.get(word, word) + " " + symbol_id


def main(argv=None):
    """Run the conversion; return the process exit status.

    Args:
        argv: argument vector (program name first); defaults to sys.argv.

    Returns:
        0 on success, 1 when too few arguments were given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        # Usage goes to stderr so it never mixes with the converted table.
        print(USAGE, file=sys.stderr)
        return 1

    # Note that the ECA lexicon's default encoding is ISO-8859-6, not UTF8
    with codecs.open(argv[2] + LEXICON_RELPATH, encoding="iso-8859-6") as lexicon:
        dict_cache = load_lexicon(lexicon)

    # Encode exactly once at the output boundary. On Python 3 the bytes go to
    # sys.stdout.buffer; on Python 2 sys.stdout itself accepts byte strings.
    # (Concatenating encoded bytes with a str literal fails on Python 3.)
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))

    # Now read the symbol table and write off the utf8 versions,
    # one entry per line.
    with codecs.open(argv[1], encoding="utf8") as symtable:
        for entry in convert_symtable(symtable, dict_cache):
            out.write(entry + "\n")
    out.flush()
    return 0


if __name__ == "__main__":
    sys.exit(main())