convert_symtable_to_utf.py 1.19 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39


from __future__ import print_function
#!/usr/bin/env py

# Converts a romanized ECA word list (symbol table) to
# a version in the Arabic script

import sys
import codecs

def load_lexicon(lines):
    """Build the romanized -> Arabic-script lookup from ECA lexicon lines.

    Each non-blank line is split on whitespace; the first field is the
    romanized form and the second field is its Arabic-script form.  Any
    further fields on the line are ignored.  Blank lines are skipped.

    Args:
        lines: iterable of already-decoded (unicode) lexicon lines.

    Returns:
        dict mapping romanized form to Arabic-script form.

    Raises:
        ValueError: if the same romanized form occurs on more than one line.
        IndexError: if a non-blank line has fewer than two fields.
    """
    dict_cache = {}
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        roman, script = fields[0], fields[1]
        # Explicit check rather than `assert`, which is stripped under -O and
        # would then let a duplicate silently overwrite the earlier entry.
        if roman in dict_cache:
            raise ValueError(
                "duplicate lexicon entry for romanized form %r" % roman)
        dict_cache[roman] = script
    return dict_cache


def convert_symtable(lines, dict_cache):
    """Yield symbol-table entries with the word column converted.

    Each non-blank line is split on whitespace into a word (first field) and
    a second field that is passed through unchanged.  Words found in
    ``dict_cache`` are replaced by their mapped form; all other words (for
    example special symbols not present in the lexicon) are kept as they are.

    Args:
        lines: iterable of already-decoded (unicode) symbol-table lines.
        dict_cache: mapping as returned by ``load_lexicon``.

    Yields:
        "<word> <second field>" strings without a trailing newline.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        word, symbol_id = fields[0], fields[1]
        yield dict_cache.get(word, word) + " " + symbol_id


def main(argv=None, out=None):
    """Command-line entry point.

    Args:
        argv: argument vector laid out like ``sys.argv`` (program name, symbol
            table path, ECA lexicon root directory).  Defaults to ``sys.argv``.
        out: binary stream that receives the UTF-8 encoded result.  Defaults
            to the byte-level standard output.

    Returns:
        Process exit status: 0 on success, 1 if too few arguments were given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        # Usage goes to stderr so that it can never end up inside a symbol
        # table that the caller is redirecting from stdout.
        print("USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]",
              file=sys.stderr)
        print("E.g., local/convert_symtable_to_utf.py data/lang/words.txt "
              "/export/corpora/LDC/LDC99L22", file=sys.stderr)
        return 1

    if out is None:
        # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
        # sys.stdout already accepts bytes.
        out = getattr(sys.stdout, "buffer", sys.stdout)
    # Encode once at the I/O boundary.  Concatenating encoded bytes with a
    # "\n" str literal raises TypeError on Python 3.
    writer = codecs.getwriter("utf-8")(out)

    lexicon_path = argv[2] + "/callhome_arabic_lexicon_991012/ar_lex.v07"
    with codecs.open(argv[1], encoding="utf8") as symtable:
        # Note that the ECA lexicon's default encoding is ISO-8859-6, not UTF8
        with codecs.open(lexicon_path, encoding="iso-8859-6") as lexicon:
            # First read off the dictionary and store stuff in a cache
            dict_cache = load_lexicon(lexicon)

        # Now read the symbol table and write off the UTF-8 versions
        for output in convert_symtable(symtable, dict_cache):
            writer.write(output + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())