convert_ctm_to_tra.py
1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env python
# Copyright 2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0
from __future__ import print_function
import sys, operator
# This script loads a 'ctm' file and converts it into the 'tra' format:
# "utt-key word1 word2 word3 ... wordN"
# The 'utt-key' is the 1st column in the CTM.
# Typically the CTM contains:
# - utterance-relative timing (i.e. prepared without 'utils/convert_ctm.pl')
# - confidences
def _load_ctm(ctm_lines):
    """Group the words of a CTM by utterance key.

    Each non-blank line is expected to hold the whitespace-separated columns
    'utt ch beg dur wrd [conf]'; the trailing confidence column is optional.

    Args:
        ctm_lines: iterable of CTM text lines (e.g. an open file).

    Returns:
        dict mapping 'utt' to a list of (float(beg), wrd) tuples,
        in the order the lines were read.

    Raises:
        ValueError: if a line has an unexpected number of columns,
            or its 'beg' column is not a number.
    """
    tra = dict()
    for line_no, line in enumerate(ctm_lines, 1):
        fields = line.split()
        if not fields:
            continue  # Tolerate blank lines (e.g. a trailing empty line),
        if len(fields) not in (5, 6):
            raise ValueError(
                'Bad CTM line %d (expected 5 or 6 columns, got %d): %r'
                % (line_no, len(fields), line))
        # Only the utterance key, begin time and word are needed,
        utt, beg, wrd = fields[0], fields[2], fields[4]
        tra.setdefault(utt, []).append((float(beg), wrd))
    return tra


def _write_tra(tra, out):
    """Write one 'utt-key word1 ... wordN' line per utterance to 'out'.

    Args:
        tra: dict as returned by _load_ctm().
        out: writable text file object.
    """
    for utt, tuples in tra.items():
        # Sort by 'beg' time; the sort is stable, so words with equal
        # 'beg' keep their input order,
        ordered = sorted(tuples, key=operator.itemgetter(0))
        out.write('%s %s\n' % (utt, ' '.join([wrd for _, wrd in ordered])))


def main(argv):
    """Convert the 'ctm' file argv[1] into the 'tra' file argv[2].

    Args:
        argv: command-line arguments, [program, ctm-in, tra-out];
            '-' stands for stdin (ctm-in) or stdout (tra-out).

    Exits with status 1 after printing the usage when the number of
    arguments is wrong.
    """
    if len(argv) != 3:
        # The usage goes to stderr, stdout is reserved for the 'tra' data,
        print('Usage: %s ctm-in tra-out' % __file__, file=sys.stderr)
        sys.exit(1)
    ctm_in, tra_out = argv[1:]

    if ctm_in == '-':
        ctm_in = '/dev/stdin'
    if tra_out == '-':
        tra_out = '/dev/stdout'

    # Load the 'ctm' into dictionary,
    with open(ctm_in) as f:
        tra = _load_ctm(f)

    # Store it in the 'tra' format,
    with open(tra_out, 'w') as f:
        _write_tra(tra, f)


if __name__ == '__main__':
    main(sys.argv)