format_acronyms_dict.py
3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# Copyright 2015 Minhua Wu
# Apache 2.0
# Convert acronyms in the swbd dict to the fisher convention, e.g.:
#   IBM   -> i._b._m.
#   BBC   -> b._b._c.
#   BBCs  -> b._b._c.s
#   BBC's -> b._b._c.'s
import argparse,re
__author__ = 'Minhua Wu'

# A word is only considered if it consists of letters, optionally followed by
# "'s" (the third alternative is kept verbatim from the original pattern).
_CANDIDATE_RE = re.compile(r'^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$')


def _parse_args(argv=None):
    """Parse the command line; ``argv=None`` means ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser(description='format acronyms to a._b._c.')
    parser.add_argument('-i', '--input', help='Input lexicon', required=True)
    parser.add_argument('-o1', '--output1',
                        help='Output acronym lexicon(mapped)', required=True)
    parser.add_argument('-o2', '--output2',
                        help='Output acronym lexicon(original)', required=True)
    parser.add_argument('-L', '--Letter',
                        help='Input single letter pronunciation', required=True)
    parser.add_argument('-M', '--Map', help='Output acronyms mapping',
                        required=True)
    return parser.parse_args(argv)


def _split_entry(line):
    """Split a lexicon line into ``(first_token, rest_of_line)``.

    Returns None for a blank line.  The rest of the line is stripped of
    surrounding whitespace but its internal spacing is left untouched, because
    pronunciations are later compared as plain strings.
    """
    parts = line.split(None, 1)
    if not parts:
        return None
    rest = parts[1].strip() if len(parts) > 1 else ""
    return parts[0], rest


def read_letter_dict(lines):
    """Build ``{letter: pronunciation}`` from single-letter lexicon lines.

    ``lines`` is any iterable of strings (e.g. an open file).  Blank lines are
    skipped; a later entry for the same letter overwrites an earlier one.
    """
    dict_letter = {}
    for line in lines:
        entry = _split_entry(line)
        if entry is None:
            continue
        letter, pronunciation = entry
        dict_letter[letter] = pronunciation
    return dict_letter


def map_acronym(word, lexicon, dict_letter):
    """Return the mapped form of ``word`` if it is an acronym, else None.

    ``word`` is an acronym when ``lexicon`` (its pronunciation string) equals
    the pronunciations of its letters, looked up upper-cased in
    ``dict_letter`` and joined by single spaces.  Three shapes are handled:

    * ``xxx's`` with a pronunciation ending in ``s``/``z`` -> ``x._x._x.'s``
    * ``xxxs``  with a pronunciation ending in ``s``/``z`` -> ``x._x._x.s``
    * ``xxx``   (no apostrophe, not ending in ``s``)       -> ``x._x._x.``

    Raises KeyError if a letter of the word is missing from ``dict_letter``.
    """
    if not _CANDIDATE_RE.match(word):
        return None

    # endswith() is False for an empty pronunciation, where indexing [-1]
    # would raise IndexError.
    ends_in_s_sound = lexicon.endswith(('s', 'z'))

    if word[-2:] == '\'s' and ends_in_s_sound:
        # Drop "'s" from the word and the last two characters of the
        # pronunciation (presumably the separating space plus the s/z phone).
        stem, stem_lexicon, suffix = word[:-2], lexicon[:-2], '.\'s'
    elif word[-1] == 's' and ends_in_s_sound:
        # NOTE(review): a word ending in lower-case "s" whose pronunciation
        # ends in s/z is only ever tested as a plural, never as an acronym
        # whose last letter is "s".  Kept as in the original.
        stem, stem_lexicon, suffix = word[:-1], lexicon[:-2], '.s'
    elif word.find('\'') == -1 and word[-1] != 's':
        stem, stem_lexicon, suffix = word, lexicon, '.'
    else:
        return None

    # The bare word "s" leaves nothing to spell out.
    if not stem:
        return None

    spelled = " ".join(dict_letter[l.upper()] for l in stem).strip()
    if spelled != stem_lexicon:
        return None
    return '._'.join(l.lower() for l in stem) + suffix


def main(argv=None):
    """Read the lexicon and write the acronym map and both acronym lexicons.

    For every acronym found in the input lexicon three lines are written:
    ``word<TAB>mapped`` to the map file, ``mapped pronunciation`` to output1
    and ``word pronunciation`` to output2.  All files are closed on exit,
    including when an error occurs.
    """
    args = _parse_args(argv)

    with open(args.Letter, "r") as fin_letter:
        dict_letter = read_letter_dict(fin_letter)

    with open(args.input, "r") as fin_lex, \
            open(args.output1, "w") as fout_lex, \
            open(args.output2, "w") as fout_lex_ori, \
            open(args.Map, "w") as fout_map:
        for lex in fin_lex:
            entry = _split_entry(lex)
            if entry is None:
                continue
            word, lexicon = entry
            acronym_mapped = map_acronym(word, lexicon, dict_letter)
            if acronym_mapped is None:
                continue
            fout_map.write(word + '\t' + acronym_mapped + '\n')
            fout_lex.write(acronym_mapped + ' ' + lexicon + '\n')
            fout_lex_ori.write(word + ' ' + lexicon + '\n')


if __name__ == '__main__':
    main()