Blame view
egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py
1.51 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
#!/usr/bin/env python
# Copyright 2015  Minhua Wu
# Apache 2.0

# Convert acronyms in swbd decode result to fisher convention,
# e.g. convert things like
#   en_4156 B 414.26 0.65 u._c._l._a.
# to
#   en_4156 B 414.26 0.16 u
#   en_4156 B 414.42 0.16 c
#   en_4156 B 414.58 0.16 l
#   en_4156 B 414.74 0.17 a
#
# Lines whose word field contains no "." (and lines with fewer than five
# fields, such as blank lines) are copied to the output unchanged.

from __future__ import division

import argparse
import re

__author__ = 'Minhua Wu'


def _expand_acronym(items):
    """Split one acronym ctm entry into one ctm line per letter.

    items is the whitespace-split ctm line; items[2] is read as the start
    time, items[3] as the duration and items[4] as the word (for example
    "u._c._l._a."). The duration is shared between the letters in slots
    rounded to two decimals; the last letter receives whatever is left so
    that the slots add up to the original duration.

    Returns a list of newline-terminated output lines. Only the first two
    fields, the recomputed times and the letter are emitted; any fields
    after the fifth are not carried over.
    """
    prefix = ' '.join(items[:2])
    letters = items[4].split("._")
    acronym_period = round(float(items[3]), 2)
    letter_slot = round(acronym_period / len(letters), 2)
    time_start = round(float(items[2]), 2)

    out_lines = []
    for letter in letters[:-1]:
        timing = " %.2f %.2f " % (time_start, letter_slot)
        out_lines.append(prefix + timing + letter + "\n")
        time_start = time_start + letter_slot

    # The last letter absorbs the rounding remainder of the duration.
    last_slot = acronym_period - letter_slot * (len(letters) - 1)
    timing = " %.2f %.2f " % (time_start, last_slot)

    # Keep a possessive suffix attached to the last letter ("t.'s" and
    # "t.s" both become "t's"), then drop any remaining periods.
    last_letter = re.sub(r"\.'s", "'s", letters[-1])
    last_letter = re.sub(r"\.s", "'s", last_letter)
    out_lines.append(prefix + timing + last_letter.replace('.', '') + "\n")
    return out_lines


def main(argv=None):
    """Read the input ctm file and write the reformatted ctm file.

    argv is the list of command-line arguments (without the program name);
    when it is None, argparse falls back to sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description='format acronyms from a._b._c. to a b c')
    parser.add_argument('-i', '--input', help='Input ctm file ', required=True)
    parser.add_argument('-o', '--output', help='Output ctm file', required=True)
    args = parser.parse_args(argv)

    # The with-block guarantees both files are flushed and closed, even if
    # a malformed line raises part-way through.
    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            items = line.split()
            # Blank/short lines and words without a period pass through.
            if len(items) < 5 or "." not in items[4]:
                fout.write(line)
                continue
            fout.writelines(_expand_acronym(items))


if __name__ == "__main__":
    main()