Blame view

egs/wsj/s5/steps/conf/convert_ctm_to_tra.py 1.06 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
  #!/usr/bin/env python
  
  # Copyright 2015  Brno University of Technology (author: Karel Vesely)
  # Apache 2.0
  
  from __future__ import print_function
  import sys, operator
  
  # This scripts loads a 'ctm' file and converts it into the 'tra' format:
  # "utt-key word1 word2 word3 ... wordN"
  # The 'utt-key' is the 1st column in the CTM.
  
  # Typically the CTM contains:
  # - utterance-relative timimng (i.e. prepared without 'utils/convert_ctm.pl')
  # - confidences
  
  if len(sys.argv) != 3:
    print('Usage: %s ctm-in tra-out' % __file__)
    sys.exit(1)
  dummy, ctm_in, tra_out = sys.argv
  
  if ctm_in == '-': ctm_in = '/dev/stdin'
  if tra_out == '-': tra_out = '/dev/stdout'
  
  # Load the 'ctm' into dictionary,
  tra = dict()
  with open(ctm_in) as f:
    for l in f:
      utt, ch, beg, dur, wrd, conf = l.split()
      if not utt in tra: tra[utt] = []
      tra[utt].append((float(beg),wrd))
  
  # Store the in 'tra' format,
  with open(tra_out,'w') as f:
    for utt,tuples in tra.items():
      tuples.sort(key = operator.itemgetter(0)) # Sort by 'beg' time,
      f.write('%s %s
  ' % (utt,' '.join([t[1] for t in tuples])))