format_acronyms_ctm_eval2000.py 1.51 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45


#!/usr/bin/env python

# Copyright 2015  Minhua Wu
# Apache 2.0

# convert acronyms in swbd decode result to fisher convention
# e.g. convert things like en_4156 B 414.26 0.65 u._c._l._a. to
# en_4156 B 414.26 0.16 u
# en_4156 B 414.42 0.16 c
# en_4156 B 414.58 0.16 l
# en_4156 B 414.74 0.17 a

from __future__ import division
import argparse,re
__author__ = 'Minhua Wu'
 
def _expand_acronym(items):
    """Split one acronym CTM entry into one CTM line per letter.

    ``items`` is the whitespace-split CTM line; field 2 is the start time,
    field 3 the duration and field 4 the word (e.g. ``u._c._l._a.``).

    Returns a list of newline-terminated output lines.  Only the first two
    fields of ``items`` are carried over in front of the recomputed
    start/duration and the letter.
    """
    letters = items[4].split("._")
    acronym_period = round(float(items[3]), 2)
    # Every letter but the last gets an equal, rounded share of the duration.
    letter_slot = round(acronym_period / len(letters), 2)
    time_start = round(float(items[2]), 2)
    prefix = ' '.join(items[:2])

    lines = []
    for letter in letters[:-1]:
        lines.append("%s %.2f %.2f %s\n"
                     % (prefix, time_start, letter_slot, letter))
        time_start = time_start + letter_slot

    # The last letter absorbs the rounding remainder so that the per-letter
    # durations still add up to the original acronym duration.
    last_slot = acronym_period - letter_slot * (len(letters) - 1)
    # Rewrite a trailing possessive/plural marker (".'s" or ".s") as "'s"
    # before dropping any period that is left.
    last_letter = re.sub(r"\.'s", "'s", letters[-1])
    last_letter = re.sub(r"\.s", "'s", last_letter)
    lines.append("%s %.2f %.2f %s\n"
                 % (prefix, time_start, last_slot,
                    last_letter.replace('.', '')))
    return lines


def main(argv=None):
    """Rewrite the input CTM file, expanding acronyms into single letters.

    ``argv`` is the list of command-line arguments (``-i/--input`` and
    ``-o/--output``, both required); when it is None, argparse falls back
    to ``sys.argv[1:]``.

    Lines whose fifth field contains a period are expanded letter by
    letter; every other line, including blank lines and lines with fewer
    than five fields, is copied to the output unchanged.
    """
    parser = argparse.ArgumentParser(description='format acronyms from a._b._c. to a b c')
    parser.add_argument('-i','--input', help='Input ctm file ',required=True)
    parser.add_argument('-o','--output',help='Output ctm file', required=True)
    args = parser.parse_args(argv)

    # The with-block closes (and therefore flushes) both files even if a
    # malformed line raises part-way through the conversion.
    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            items = line.split()
            # Guard on the field count so a blank or truncated line is
            # passed through instead of raising IndexError on items[4].
            if len(items) >= 5 and "." in items[4]:
                fout.writelines(_expand_acronym(items))
            else:
                fout.write(line)


if __name__ == "__main__":
    main()