Blame view
egs/fisher_swbd/s5/local/format_acronyms_dict.py
3.99 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
#!/usr/bin/env python

# Copyright 2015  Minhua Wu
# Apache 2.0

# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
# BBCs to b._b._c.s
# BBC's to b._b._c.'s

import argparse
import re

__author__ = 'Minhua Wu'

# Only words made of letters, optionally followed by "'s", are considered
# as possible acronyms.  (The third alternative is subsumed by the first;
# the pattern is kept as it was originally written.)
ACRONYM_CANDIDATE = re.compile(r'^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$')


def split_entry(line):
    """Split one lexicon line into ``(word, pronunciation)``.

    The word is the first whitespace-delimited token; the pronunciation is
    the rest of the line with surrounding whitespace removed (internal
    spacing is left untouched).  Returns ``None`` for a blank line so that
    callers can skip it instead of failing on a missing first token.
    """
    fields = line.split(None, 1)
    if not fields:
        return None
    pronunciation = fields[1].strip() if len(fields) > 1 else ''
    return fields[0], pronunciation


def read_letter_pronunciations(lines):
    """Build the single-letter dictionary from an iterable of lines.

    Each non-blank line is ``<letter> <pronunciation>``.  Keys are stored
    exactly as they appear; lookups upper-case the letter, so the keys are
    expected to be upper-case letters.
    """
    dict_letter = {}
    for line in lines:
        entry = split_entry(line)
        if entry is not None:
            dict_letter[entry[0]] = entry[1]
    return dict_letter


def acronym_mapping(word, lexicon, dict_letter):
    """Return the fisher-style spelling of ``word`` if it is an acronym.

    A word counts as an acronym when its pronunciation ``lexicon`` is
    exactly the concatenation of the pronunciations of its letters
    (optionally followed by a final ``s``/``z`` phone for the ``xxxs`` and
    ``xxx's`` forms).  Returns the mapped form (e.g. ``i._b._m.``) or
    ``None`` when the word is not an acronym.

    Raises ``KeyError`` if a letter of a candidate word has no entry in
    ``dict_letter``.
    """
    if not ACRONYM_CANDIDATE.match(word):
        return None

    # endswith() is safe on an empty pronunciation, unlike lexicon[-1].
    ends_in_s_or_z = lexicon.endswith(('s', 'z'))

    # NOTE(review): a word whose last letter is a lower-case 's' is always
    # treated as "<stem>s", so an acronym that itself ends in the letter S
    # is not detected.  Kept as in the original logic.
    if word[-2:] == '\'s' and ends_in_s_or_z:
        # xxx's: drop the "'s" and the final phone plus its separator.
        letters, suffix, base_lexicon = word[:-2], '\'s', lexicon[:-2]
    elif word[-1] == 's' and ends_in_s_or_z:
        # xxxs: drop the "s" and the final phone plus its separator.
        letters, suffix, base_lexicon = word[:-1], 's', lexicon[:-2]
    elif '\'' not in word and word[-1] != 's':
        # plain xxx
        letters, suffix, base_lexicon = word, '', lexicon
    else:
        return None

    if not letters:
        # The word was just "s": there is no stem left to spell out.
        return None

    spelled_out = ' '.join(dict_letter[l.upper()] for l in letters).strip()
    if spelled_out != base_lexicon:
        return None
    return '._'.join(l.lower() for l in letters) + '.' + suffix


def convert_lexicon(fin_lex, dict_letter, fout_lex, fout_lex_ori, fout_map):
    """Write the acronym entries of ``fin_lex`` to the three outputs.

    For every acronym found, one newline-terminated record is written to:
      * ``fout_map``     -- ``<original word>\\t<mapped word>``
      * ``fout_lex``     -- ``<mapped word> <pronunciation>``
      * ``fout_lex_ori`` -- ``<original word> <pronunciation>``
    Blank lines and non-acronym entries are skipped.
    """
    for lex in fin_lex:
        entry = split_entry(lex)
        if entry is None:
            continue
        word, lexicon = entry
        acronym_mapped = acronym_mapping(word, lexicon, dict_letter)
        if acronym_mapped is None:
            continue
        fout_map.write(word + '\t' + acronym_mapped + '\n')
        fout_lex.write(acronym_mapped + ' ' + lexicon + '\n')
        fout_lex_ori.write(word + ' ' + lexicon + '\n')


def main(argv=None):
    """Command-line entry point; ``argv`` defaults to ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser(description='format acronyms to a._b._c.')
    parser.add_argument('-i', '--input', help='Input lexicon', required=True)
    parser.add_argument('-o1', '--output1',
                        help='Output acronym lexicon(mapped)', required=True)
    parser.add_argument('-o2', '--output2',
                        help='Output acronym lexicon(original)', required=True)
    parser.add_argument('-L', '--Letter',
                        help='Input single letter pronunciation', required=True)
    parser.add_argument('-M', '--Map', help='Output acronyms mapping',
                        required=True)
    args = parser.parse_args(argv)

    # Initialise single letter dictionary
    with open(args.Letter, 'r') as fin_letter:
        dict_letter = read_letter_pronunciations(fin_letter)

    # 'with' guarantees every handle is flushed and closed, even on error.
    with open(args.input, 'r') as fin_lex, \
            open(args.output1, 'w') as fout_lex, \
            open(args.output2, 'w') as fout_lex_ori, \
            open(args.Map, 'w') as fout_map:
        convert_lexicon(fin_lex, dict_letter, fout_lex, fout_lex_ori, fout_map)


if __name__ == '__main__':
    main()