swbd_format_acronyms_dict.py
4.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
# Copyright 2015 Minhua Wu
# Apache 2.0
###########################################################################################
# This script was copied from egs/swbd/s5c/local/format_acronyms_dict.py
# The source commit was c4a73526bb5e5602b5f5c98afb097234f7d891be
# No changes were made
###########################################################################################
# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
# BBCs to b._b._c.s
# BBC's to b._b._c.'s
import argparse,re
__author__ = 'Minhua Wu'
# Words that may be acronyms: letters only, optionally followed by 's or s.
ACRONYM_CANDIDATE_RE = re.compile(r'^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$')


def split_entry(line):
    """Split one lexicon line into (word, pronunciation).

    The word is the first whitespace-delimited token and the pronunciation is
    the rest of the line with surrounding whitespace removed.  Returns None
    for a blank line.  A line holding only a word yields an empty
    pronunciation.
    """
    fields = line.split(None, 1)
    if not fields:
        return None
    pronunciation = fields[1].strip() if len(fields) > 1 else ''
    return fields[0], pronunciation


def read_letter_dict(lines):
    """Build the single-letter pronunciation table from an iterable of lines.

    Each non-blank line is "<LETTER> <pronunciation>"; the returned dict maps
    the first token to the pronunciation string.  Blank lines are skipped.
    """
    dict_letter = {}
    for line in lines:
        entry = split_entry(line)
        if entry is None:
            continue
        letter, pronunciation = entry
        dict_letter[letter] = pronunciation
    return dict_letter


def map_acronym(word, lexicon, dict_letter):
    """Return the fisher-style spelling of `word` if it is an acronym.

    A word is treated as an acronym when its pronunciation `lexicon` is
    exactly the concatenation of the pronunciations of its letters (looked up
    upper-cased in `dict_letter`), optionally followed by a final "s"/"z"
    phone for the forms "XXX's" and "XXXs".

    Returns a tuple (mapped, mapped_back), for example
        IBM   -> ("i._b._m.",   "i b m")
        BBCs  -> ("b._b._c.s",  "b b c's")
        BBC's -> ("b._b._c.'s", "b b c's")
    or None when the word is not an acronym.

    Raises KeyError if a letter of a candidate word has no entry in
    `dict_letter`.
    """
    if not ACRONYM_CANDIDATE_RE.match(word):
        return None

    # str.endswith is False for an empty pronunciation, so a word without
    # phones is simply "not an acronym" instead of an IndexError.
    ends_in_s_phone = lexicon.endswith(('s', 'z'))

    if word.endswith('\'s') and ends_in_s_phone:
        # xxx's: drop the 's from the word and the trailing " s"/" z" phone.
        stem, stem_lexicon = word[:-2], lexicon[:-2]
        suffix, back_suffix = '.\'s', '\'s'
    elif word.endswith('s') and ends_in_s_phone:
        # xxxs: drop the s from the word and the trailing " s"/" z" phone.
        stem, stem_lexicon = word[:-1], lexicon[:-2]
        suffix, back_suffix = '.s', '\'s'
    elif '\'' not in word and not word.endswith('s'):
        # plain xxx
        stem, stem_lexicon = word, lexicon
        suffix, back_suffix = '.', ''
    else:
        # NOTE: a word ending in "s" is only ever tested as a plural above,
        # so acronyms whose last letter is S are left unmapped.
        return None

    if not stem:
        # e.g. the entry "s s": nothing is left to spell out.
        return None

    spelled_out = ' '.join(dict_letter[ch.upper()] for ch in stem).strip()
    if spelled_out != stem_lexicon:
        return None

    letters = [ch.lower() for ch in stem]
    return '._'.join(letters) + suffix, ' '.join(letters) + back_suffix


def convert_line(line, dict_letter):
    """Convert one input lexicon line.

    Returns (lexicon_line, map_line).  For an acronym, lexicon_line is the
    rewritten entry and map_line is "<word>\\t<mapped>\\t<mapped_back>\\n".
    For anything else (including blank lines) lexicon_line is the input line
    unchanged and map_line is None.
    """
    entry = split_entry(line)
    if entry is None:
        return line, None
    word, lexicon = entry
    mapping = map_acronym(word, lexicon, dict_letter)
    if mapping is None:
        return line, None
    mapped, mapped_back = mapping
    map_line = word + '\t' + mapped + '\t' + mapped_back + '\n'
    return mapped + ' ' + lexicon + '\n', map_line


def main(argv=None):
    """Command-line entry point; `argv` defaults to sys.argv[1:]."""
    parser = argparse.ArgumentParser(description='format acronyms to a._b._c.')
    parser.add_argument('-i', '--input', help='Input lexicon', required=True)
    parser.add_argument('-o', '--output', help='Output lexicon', required=True)
    parser.add_argument('-L', '--Letter',
                        help='Input single letter pronunciation', required=True)
    parser.add_argument('-M', '--Map', help='Output acronyms mapping',
                        required=True)
    args = parser.parse_args(argv)

    # Initialise single letter dictionary
    with open(args.Letter, "r") as fin_letter:
        dict_letter = read_letter_dict(fin_letter)

    # The context managers guarantee that both outputs are flushed and closed
    # even if a line raises part-way through.
    with open(args.input, "r") as fin_lex, \
            open(args.output, "w") as fout_lex, \
            open(args.Map, "w") as fout_map:
        for lex in fin_lex:
            lexicon_line, map_line = convert_line(lex, dict_letter)
            if map_line is not None:
                fout_map.write(map_line)
            fout_lex.write(lexicon_line)


if __name__ == '__main__':
    main()