Blame view
egs/fisher_swbd/s5/local/map_acronyms_transcripts.py
1.81 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
#!/usr/bin/env python

# Copyright 2015  Minhua Wu
# Apache 2.0

# Convert acronyms in swbd transcript to fisher convention
# according to first two columns in the input acronyms mapping

"""Rewrite acronyms in a transcript file using a tab-separated mapping.

Each transcript line is treated as ``<utterance-id> <word> <word> ...``.
Words found in the mapping are replaced by the mapping's second column, the
words are lower-cased, and the line is written to the output file.  The
utterance id is copied through unchanged.

The word ``i`` is ambiguous (pronoun vs. spelled letter), so it is mapped
only when it sits directly next to one or more single upper-case letters.
"""

import argparse
import re

__author__ = 'Minhua Wu'

# A token consisting of exactly one upper-case ASCII letter.
_SINGLE_LETTER = re.compile(r'^[A-Z]$')


def _load_acronym_map(path):
    """Read the acronym mapping file at *path*.

    Only the first two tab-separated columns are used: column one is the
    lookup key (taken verbatim), column two is the replacement (stripped of
    surrounding whitespace).  Blank lines are ignored.

    Returns a pair ``(acronyms, acronyms_noi)``; the second dict is a copy of
    the first without the keys ``'i'`` and ``'I'``.

    Raises ValueError if a non-blank line has fewer than two columns.
    """
    acronyms = {}
    with open(path, "r") as fin_map:
        for lineno, pair in enumerate(fin_map, 1):
            if not pair.strip():
                continue
            items = pair.split('\t')
            if len(items) < 2:
                raise ValueError(
                    '%s:%d: expected at least two tab-separated columns'
                    % (path, lineno))
            acronyms[items[0]] = items[1].strip()

    acronyms_noi = dict(acronyms)
    # The map is not required to contain these keys, so do not fail if absent.
    acronyms_noi.pop('i', None)
    acronyms_noi.pop('I', None)
    return acronyms, acronyms_noi


def _map_words(words, acronyms, acronyms_noi):
    """Return a copy of *words* with acronyms replaced.

    Raises KeyError if a token in a spelled-letter run around ``i`` has no
    entry in *acronyms*.
    """
    words = list(words)
    count = len(words)

    # First pass: map 'i' (and its neighbours) when it is part of an acronym.
    # Indexing is deliberate: the list is rewritten while being scanned, and
    # already-mapped tokens must be seen in their mapped form.
    for i in range(count):
        if words[i] != 'i':
            continue
        # Number of consecutive single upper-case letters to the left.
        x = 0
        while i - 1 - x >= 0 and _SINGLE_LETTER.match(words[i - 1 - x]):
            x += 1
        # Number of consecutive single upper-case letters to the right.
        y = 0
        while i + 1 + y < count and _SINGLE_LETTER.match(words[i + 1 + y]):
            y += 1
        if x + y > 0:
            for bias in range(-x, y + 1):
                words[i + bias] = acronyms[words[i + bias]]

    # Second pass: map everything else (never a stand-alone 'i' or 'I').
    return [acronyms_noi.get(word, word) for word in words]


def _convert_line(line, acronyms, acronyms_noi):
    """Convert one transcript line; return None for a blank line."""
    items = line.split()
    if not items:
        return None
    # The first field is the utterance id: it is neither mapped nor
    # lower-cased.
    sentence = ' '.join(_map_words(items[1:], acronyms, acronyms_noi))
    # Terminate with a newline so the output keeps one utterance per line.
    return items[0] + ' ' + sentence.lower() + '\n'


def main(argv=None):
    """Command-line entry point.

    *argv* is the argument list to parse; None means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description='format acronyms to a._b._c.')
    parser.add_argument('-i', '--input', help='Input transcripts',
                        required=True)
    parser.add_argument('-o', '--output', help='Output transcripts',
                        required=True)
    parser.add_argument('-M', '--Map', help='Input acronyms mapping',
                        required=True)
    args = parser.parse_args(argv)

    acronyms, acronyms_noi = _load_acronym_map(args.Map)

    with open(args.input, "r") as fin_trans, \
            open(args.output, "w") as fout_trans:
        for line in fin_trans:
            converted = _convert_line(line, acronyms, acronyms_noi)
            if converted is not None:
                fout_trans.write(converted)


if __name__ == '__main__':
    main()