append_eval_to_ctm.py
2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# Copyright 2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0
from __future__ import print_function
import sys,operator
# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM':
# (i.e. the output of 'align-text' post-processed by 'wer_per_utt_details.pl')
# The tags in the appended column are:
# 'C' = correct
# 'S' = substitution
# 'I' = insertion
# 'U' = unknown (not part of scored segment)
def _parse_tagged_line(line, expected_tag):
    """Split one '<utt> <tag> <word> ...' line of the eval file.

    Returns (utt, words). 'words' is an empty list when nothing follows the
    tag, so an utterance with an empty word sequence does not crash parsing.
    Raises ValueError when the second field is not 'expected_tag'.
    """
    fields = line.split()
    if len(fields) < 2 or fields[1] != expected_tag:
        raise ValueError("Expected a '%s' line in the eval file, got: %r"
                         % (expected_tag, line))
    return fields[0], fields[2:]


def _read_eval(eval_in):
    """Read the per-utterance evaluation file.

    The file is consumed in groups of 4 lines per utterance; the 2nd line
    must carry the tag 'hyp' and the 3rd the tag 'op'. The 1st and 4th lines
    are read but their content is not used.

    Returns a dict: utt -> list of (op, hyp_word) pairs, with the pairs
    whose op is 'D' left out (those have no word in the hypothesis).
    Raises ValueError on a malformed line or a repeated utterance.
    """
    eval_vec = dict()
    with open(eval_in, 'r') as f:
        while True:
            # Reading 4 lines encoding one utterance,
            ref = f.readline()
            hyp = f.readline()
            op = f.readline()
            f.readline()  # 4th line of the record, not needed here,
            if not ref:
                break
            # Parse the input,
            _, hyp_vec = _parse_tagged_line(hyp, 'hyp')
            utt, op_vec = _parse_tagged_line(op, 'op')
            # Explicit check instead of 'assert' (asserts vanish with 'python -O'),
            if utt in eval_vec:
                raise ValueError("Utterance %s is present more than once in %s"
                                 % (utt, eval_in))
            # Keep the (op, word) pairs, skipping the 'D' positions,
            eval_vec[utt] = [(o, w) for o, w in zip(op_vec, hyp_vec) if o != 'D']
    return eval_vec


def _load_ctm(ctm_in):
    """Load the CTM file into a dict: utt -> list of records.

    Each non-blank line must have exactly 6 whitespace-separated fields:
    utt, ch, beg, dur, wrd, conf. The fields beg, dur and conf are converted
    to float. Blank lines are skipped.
    """
    ctm = dict()
    with open(ctm_in) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            utt, ch, beg, dur, wrd, conf = fields
            ctm.setdefault(utt, []).append(
                (utt, ch, float(beg), float(dur), wrd, float(conf)))
    return ctm


def _merge_eval_into_ctm(ctm, eval_vec):
    """Append the evaluation tag to every CTM record.

    The records of each utterance are sorted by 'beg' time (in place) and
    paired position-by-position with the (op, hyp_word) pairs of the same
    utterance. Utterances missing in 'eval_vec' are reported on stderr and
    left out of the result.

    Returns a list of (utt, ch, beg, dur, wrd, conf, op) tuples sorted by
    (utt, ch, beg). Raises ValueError when a CTM word differs from the
    hypothesis word it is paired with.
    """
    ctm_eval = []
    for utt, ctm_part in ctm.items():
        ctm_part.sort(key=operator.itemgetter(2))  # Sort by 'beg' time,
        try:
            evl_part = eval_vec[utt]
        except KeyError:
            # Diagnostics go to stderr, stdout may carry the output CTM,
            print('Missing key', utt, 'in the word-evaluation stats from scoring',
                  file=sys.stderr)
            continue
        if len(ctm_part) != len(evl_part):
            # 'zip' below stops at the shorter list, make the loss visible,
            print('Warning: utterance %s has %d words in ctm, but %d in the '
                  'word-evaluation stats, the extra items are dropped'
                  % (utt, len(ctm_part), len(evl_part)), file=sys.stderr)
        for ctm_tup, (op, hyp_wrd) in zip(ctm_part, evl_part):
            ctm_wrd = ctm_tup[4]
            # Check that words in 'ctm' and 'utt_stats' match!
            if hyp_wrd != ctm_wrd:
                raise ValueError("We failed with words: hyp_wrd %s, ctm_wrd %s"
                                 % (hyp_wrd, ctm_wrd))
            ctm_eval.append(ctm_tup + (op,))
    # Sort again,
    ctm_eval.sort(key=operator.itemgetter(0, 1, 2))
    return ctm_eval


def _write_ctm_eval(out, ctm_eval):
    """Write the records to the open text stream 'out', one per line."""
    for tup in ctm_eval:
        out.write('%s %s %f %f %s %f %s\n' % tup)


def main(argv):
    """Run the tool; 'argv' has the layout of sys.argv.

    argv[1] is the eval file, argv[2] the input CTM and argv[3] the output
    CTM ('-' selects the standard output). Returns the process exit code:
    1 after printing the usage on a wrong number of arguments, 0 otherwise.
    """
    if len(argv) != 4:
        print('Usage: %s eval-in ctm-in ctm-eval-out' % __file__, file=sys.stderr)
        return 1
    dummy, eval_in, ctm_in, ctm_eval_out = argv

    # Read the evaluation,
    eval_vec = _read_eval(eval_in)
    # Load the 'ctm' into dictionary,
    ctm = _load_ctm(ctm_in)
    # Build the 'ctm' with 'eval' column added,
    ctm_eval = _merge_eval_into_ctm(ctm, eval_vec)

    # Store,
    if ctm_eval_out == '-':
        # Use the stream directly: it works without a '/dev/stdout' device
        # and does not truncate a file the stdout is appending to,
        _write_ctm_eval(sys.stdout, ctm_eval)
        sys.stdout.flush()
    else:
        with open(ctm_eval_out, 'w') as f:
            _write_ctm_eval(f, ctm_eval)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))