Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/conf/append_prf_to_ctm.py 2.03 KB
  #!/usr/bin/env python
  
  # Copyright 2015  Brno University of Technology (author: Karel Vesely)
  # Apache 2.0
  
  from __future__ import print_function
  import sys
  
  # Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM':
  # (parsed from the 'prf' output of 'sclite')
  
  # The tags in appended column are:
  #  'C' = correct
  #  'S' = substitution
  #  'I' = insertion
  #  'U' = unknown (not part of scored segment)
  
  # The work is split into small helpers plus main(): load the 'prf' records,
  # index their per-word tags by 'file,channel' and begin time, tag every CTM
  # row, and write the augmented CTM.


  def _load_prf(prf_file):
    """Read the 'prf' file and collect one record per 'Eval:' line.

    The most recent 'File:', 'Channel:' and 'H_T1:' lines are remembered and
    a record is emitted each time an 'Eval:' line is seen.

    Args:
      prf_file: path of the 'prf' file produced by 'sclite'.

    Returns:
      List of (file_id, channel, h_t1_line, eval_line) tuples. The two lines
      are kept unsplit because they are matched by character position.

    Raises:
      ValueError: if an 'Eval:' line is seen before a 'File:', a 'Channel:'
        and an 'H_T1:' line have all been seen.
    """
    prf = []
    file_id = chan = h_t1 = None
    with open(prf_file) as f:
      for line in f:
        if line.startswith('File:'):
          file_id = line.split()[1]
        elif line.startswith('Channel:'):
          chan = line.split()[1]
        elif line.startswith('H_T1:'):
          h_t1 = line
        elif line.startswith('Eval:'):
          if file_id is None or chan is None or h_t1 is None:
            raise ValueError(
                "%s: 'Eval:' line found before 'File:', 'Channel:' and "
                "'H_T1:' lines" % prf_file)
          prf.append((file_id, chan, h_t1, line))
    return prf


  def _parse_prf(prf):
    """Index the word tags of the prf records by key and begin time.

    Each 'H_T1:' line is walked token by token (tokens are separated by single
    spaces); every token that parses as a float is taken as a word begin time,
    and the character of the 'Eval:' line at the same column is its tag.

    Args:
      prf: list of (file_id, channel, h_t1_line, eval_line) tuples.

    Returns:
      Dict mapping the lower-cased 'file,channel' key to a dict that maps the
      begin time (float) to the tag: the 'Eval:' character, or 'C' when that
      column is blank.
    """
    prf_dict = dict()
    for file_id, chan, h_t1, evl_line in prf:
      # The CTM lookup lower-cases its key, so store the key lower-cased too;
      # otherwise a record with upper-case letters could never be matched.
      key = ('%s,%s' % (file_id, chan)).lower()
      pos = 0  # column in the 'H_T1:' line,
      while pos < len(h_t1):
        token = h_t1[pos:].split(' ', 1)[0]  # 1st token at 'pos',
        try:
          beg = float(token)
        except ValueError:
          beg = None  # not a time (the 'H_T1:' label, padding, ...),
        if beg is not None:
          # Slicing (not indexing) tolerates an 'Eval:' line that ends before
          # this column; that case, like a blank or the trailing newline, is
          # tagged 'C'. Presumably trailing blanks were trimmed from the
          # line -- TODO confirm against the sclite version in use.
          mark = evl_line[pos:pos + 1]
          prf_dict.setdefault(key, dict())[beg] = mark if mark.strip() else 'C'
        pos += len(token) + 1  # advance past the token and one space,
    return prf_dict


  def _load_ctm(ctm_file):
    """Read the CTM file (with confidences) into a list of 6-field rows.

    Args:
      ctm_file: path of the input CTM file.

    Returns:
      List of [file, channel, begin, duration, word, confidence] string lists;
      blank lines are skipped.

    Raises:
      ValueError: if a non-blank line does not have exactly 6 fields.
    """
    ctm = []
    with open(ctm_file) as f:
      for line_no, line in enumerate(f, 1):
        fields = line.split()
        if not fields:
          continue
        if len(fields) != 6:
          raise ValueError(
              '%s:%d: expected 6 fields (file chan beg dur word conf), got %d'
              % (ctm_file, line_no, len(fields)))
        ctm.append(fields)
    return ctm


  def _append_tags(ctm, prf_dict):
    """Append the tag from 'prf_dict' to every CTM row.

    Args:
      ctm: list of 6-field CTM rows (lists of strings).
      prf_dict: dict as returned by _parse_prf().

    Returns:
      List of 7-field rows; the last field is the tag found for the row's
      lower-cased 'file,channel' key and begin time, or 'U' (unknown) when
      there is none.
    """
    ctm_out = []
    for f, chan, beg, dur, wrd, conf in ctm:
      try:
        sclite_tag = prf_dict[('%s,%s' % (f, chan)).lower()][float(beg)]
      except KeyError:
        sclite_tag = 'U'  # not part of any record in the prf,
      ctm_out.append([f, chan, beg, dur, wrd, conf, sclite_tag])
    return ctm_out


  def _save_ctm(ctm_out, ctm_out_file):
    """Write the rows space-separated, one per line.

    Args:
      ctm_out: list of rows (lists of strings).
      ctm_out_file: output path, or '-' for the standard output.
    """
    lines = [' '.join(ctm_record) + '\n' for ctm_record in ctm_out]
    if ctm_out_file == '-':
      # Write to the already-open stream rather than re-opening '/dev/stdout',
      # which does not exist everywhere and cannot be opened for a socket.
      sys.stdout.writelines(lines)
    else:
      with open(ctm_out_file, 'w') as f:
        f.writelines(lines)


  def main(argv=None):
    """Run the script: 'prf ctm_in ctm_out'.

    Args:
      argv: full argument vector including the program name; defaults to
        sys.argv.

    Prints the usage and exits with status 1 unless exactly three arguments
    follow the program name.
    """
    if argv is None:
      argv = sys.argv
    # Parse options,
    if len(argv) != 4:
      print("Usage: %s prf ctm_in ctm_out" % __file__)
      sys.exit(1)
    prf_file, ctm_file, ctm_out_file = argv[1:]

    prf_dict = _parse_prf(_load_prf(prf_file))
    ctm_out = _append_tags(_load_ctm(ctm_file), prf_dict)
    _save_ctm(ctm_out, ctm_out_file)


  if __name__ == '__main__':
    main()