Blame view
egs/wsj/s5/steps/conf/append_prf_to_ctm.py
2.03 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import print_function
import sys

# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM':
# (parsed from the 'prf' output of 'sclite')

# The tags in appended column are:
#  'C' = correct
#  'S' = substitution
#  'I' = insertion
#  'U' = unknown (not part of scored segment)


def load_prf(lines):
    """Collect (file_id, channel, H_T1 line, Eval line) records from prf lines.

    'File:' and 'Channel:' lines set the current file id / channel (their
    second whitespace-separated field), an 'H_T1:' line is remembered whole,
    and each 'Eval:' line closes one record using the most recent values.

    Args:
        lines: iterable of text lines of the prf file.

    Returns:
        List of (file_id, chan, h_t1_line, eval_line) tuples.

    Raises:
        ValueError: an 'Eval:' line appears before a 'File:', 'Channel:'
            and 'H_T1:' line have all been seen.
    """
    prf = []
    file_id = chan = h_t1 = None
    for line in lines:
        if line.startswith('File:'):
            file_id = line.split()[1]
        elif line.startswith('Channel:'):
            chan = line.split()[1]
        elif line.startswith('H_T1:'):
            h_t1 = line
        elif line.startswith('Eval:'):
            if file_id is None or chan is None or h_t1 is None:
                raise ValueError(
                    "prf: 'Eval:' line seen before 'File:', 'Channel:' "
                    "and 'H_T1:' lines: %r" % line)
            prf.append((file_id, chan, h_t1, line))
    return prf


def build_prf_dict(prf):
    """Turn prf records into {'file,channel': {time: tag}}.

    Every space-separated token of the H_T1 line that parses as a float is
    used as a time key; its tag is the character of the Eval line at the
    column where the token starts. A blank column means 'C' (correct).

    Args:
        prf: list of (file_id, chan, h_t1_line, eval_line) tuples.

    Returns:
        Dict mapping 'file_id,chan' to a dict mapping float time to tag.
    """
    prf_dict = {}
    for file_id, chan, h_t1, evl in prf:
        tags = prf_dict.setdefault('%s,%s' % (file_id, chan), {})
        length = len(h_t1)
        pos = 0  # column of the current token in 'h_t1',
        while pos < length:
            # The token runs up to the next single space (or end of line),
            end = h_t1.find(' ', pos)
            if end < 0:
                end = length
            token = h_t1[pos:end]
            try:
                time = float(token)
            except ValueError:
                time = None  # the 'H_T1:' label, padding, non-numeric text,
            if time is not None:
                # Slicing (not indexing) tolerates an Eval line shorter than
                # the H_T1 line; missing or whitespace column = no error mark,
                mark = evl[pos:pos + 1]
                tags[time] = mark if mark.strip() else 'C'
            pos = end + 1  # advance position for parsing,
    return prf_dict


def append_tags(ctm, prf_dict):
    """Append the sclite tag as a 7th column to each 6-field CTM record.

    Args:
        ctm: list of [file, chan, beg, dur, wrd, conf] string lists.
        prf_dict: mapping produced by build_prf_dict().

    Returns:
        List of 7-element lists; the tag is 'U' when the file/channel or the
        begin time is not present in prf_dict.

    Raises:
        ValueError: a record does not have exactly 6 fields, or its begin
            time is not a number.
    """
    ctm_out = []
    for file_id, chan, beg, dur, wrd, conf in ctm:
        # NOTE(review): the key is lower-cased here while prf-side keys are
        # stored as read, so the prf is presumably already lower-case.
        key = ('%s,%s' % (file_id, chan)).lower()
        try:
            sclite_tag = prf_dict[key][float(beg)]
        except KeyError:
            sclite_tag = 'U'  # not part of a scored segment,
        ctm_out.append([file_id, chan, beg, dur, wrd, conf, sclite_tag])
    return ctm_out


def format_ctm(ctm_out):
    """Return one newline-terminated, space-joined text line per record."""
    return [' '.join(ctm_record) + '\n' for ctm_record in ctm_out]


def main(argv=None):
    """Command-line entry point: prf ctm_in ctm_out ('-' = stdout).

    Args:
        argv: full argument vector including the program name; defaults to
            sys.argv.
    """
    if argv is None:
        argv = sys.argv

    # Parse options,
    if len(argv) != 4:
        print("Usage: %s prf ctm_in ctm_out" % __file__)
        sys.exit(1)
    prf_file, ctm_file, ctm_out_file = argv[1:]

    # Load the prf file and parse the records into dictionary,
    with open(prf_file) as prf_in:
        prf_dict = build_prf_dict(load_prf(prf_in))

    # Load the ctm file (with confidences), ignoring blank lines,
    with open(ctm_file) as ctm_in:
        ctm = [fields for fields in (line.split() for line in ctm_in)
               if fields]

    # Append the sclite alignment tags to ctm,
    out_lines = format_ctm(append_tags(ctm, prf_dict))

    # Save the augmented ctm file,
    if ctm_out_file == '-':
        sys.stdout.writelines(out_lines)
    else:
        with open(ctm_out_file, 'w') as ctm_out_f:
            ctm_out_f.writelines(out_lines)


if __name__ == '__main__':
    main()