Blame view

egs/wsj/s5/steps/conf/append_eval_to_ctm.py 2.52 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
  #!/usr/bin/env python
  
  # Copyright 2015  Brno University of Technology (author: Karel Vesely)
  # Apache 2.0
  
  from __future__ import print_function
  import sys,operator
  
  # Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM':
  # (i.e. the output of 'align-text' post-processed by 'wer_per_utt_details.pl')
  
  # The tags in the appended column are:
  #  'C' = correct
  #  'S' = substitution
  #  'I' = insertion
  #  'U' = unknown (not part of scored segment)
  
  # Exactly three positional arguments are expected after the script name;
  # with any other count, print the usage line and exit with status 1,
  cli_args = sys.argv[1:]
  if len(cli_args) != 3:
    print('Usage: %s eval-in ctm-in ctm-eval-out' % __file__)
    sys.exit(1)
  eval_in, ctm_in, ctm_eval_out = cli_args

  # A lone '-' given as the output name selects the standard output,
  if ctm_eval_out == '-':
    ctm_eval_out = '/dev/stdout'
  
  # Read the evaluation,
  # 'eval_vec' maps each utterance to a list of (op, hyp_word) tuples,
  eval_vec = {}
  with open(eval_in, 'r') as f:
    while True:
      # One utterance is encoded by 4 consecutive lines: ref, hyp, op, csid,
      ref_line, hyp_line, op_line, csid_line = [f.readline() for _ in range(4)]
      # An empty 'ref' line means the end of file was reached,
      if ref_line == '':
        break
      # Split off '<utt> <tag>' from the 'hyp' and 'op' lines and check the tags,
      utt, hyp_tag, hyp_words = hyp_line.split(' ', 2)
      assert(hyp_tag == 'hyp')
      utt, op_tag, op_symbols = op_line.split(' ', 2)
      assert(op_tag == 'op')
      # Each utterance may appear only once,
      assert(utt not in eval_vec)
      # Pair the operations with the hypothesis words; the 'D' (deletion)
      # positions are dropped, so only the other symbols (the 'C', 'S', 'I'
      # of the header) remain,
      eval_vec[utt] = [
        (symbol, word)
        for symbol, word in zip(op_symbols.split(), hyp_words.split())
        if symbol != 'D'
      ]
  
  # Load the 'ctm' into dictionary,
  # (it maps each utterance to the list of its records, kept in file order),
  ctm = {}
  with open(ctm_in) as f:
    for line in f:
      # Every line has to hold exactly 6 whitespace-separated fields,
      utt, ch, beg, dur, wrd, conf = line.split()
      record = (utt, ch, float(beg), float(dur), wrd, float(conf))
      ctm.setdefault(utt, []).append(record)
  
  # Build the 'ctm' with 'eval' column added,
  # the output records have format: (utt, ch, beg, dur, ctm_wrd, conf, op)
  ctm_eval = []
  for utt,ctm_part in ctm.items():
    ctm_part.sort(key = operator.itemgetter(2)) # Sort by 'beg' time,
    # Utterances without word-evaluation stats are reported and skipped.
    # The warning goes to stderr, so that it cannot get mixed into the
    # output when 'ctm-eval-out' is '-' (i.e. '/dev/stdout'),
    try:
      evl_part = eval_vec[utt]
    except KeyError:
      print('Missing key', utt, 'in the word-evaluation stats from scoring', file=sys.stderr)
      continue
    # Pair the time-sorted 'ctm' records with the (op, hyp_wrd) tuples in order,
    # ('zip' stops at the shorter of the two sequences),
    merged = []
    for ctm_tup,(evl_op,hyp_wrd) in zip(ctm_part, evl_part):
      ctm_wrd = ctm_tup[4]
      assert hyp_wrd == ctm_wrd, "We failed with words: hyp_wrd %s, ctm_wrd %s" % (hyp_wrd,ctm_wrd) # Check that words in 'ctm' and 'utt_stats' match!
      # Only the 'op' is appended, the 'hyp_wrd' was needed just for the check,
      merged.append(ctm_tup + (evl_op,))
    # append,
    ctm_eval.extend(merged)
  
  # Sort again,
  # (by utterance, channel and begin time, i.e. the first three fields),
  ctm_eval.sort(key = operator.itemgetter(0,1,2))

  # Store,
  # one record per line: utt ch beg dur wrd conf op
  with open(ctm_eval_out,'w') as f:
    for tup in ctm_eval:
      # The line terminator is written as the '\n' escape; a raw line break
      # inside a single-quoted literal is a syntax error,
      f.write('%s %s %f %f %s %f %s\n' % tup)