egs/wsj/s5/steps/conf/prepare_calibration_data.py
  #!/usr/bin/env python
  
  # Copyright 2015  Brno University of Technology (author: Karel Vesely)
  # Apache 2.0
  
  from __future__ import division
  import sys, math
  
  from optparse import OptionParser
  desc = """
  Prepare input features and training targets for logistic regression,
  which calibrates the Minimum Bayes Risk posterior confidences.
  
  The logistic-regression input features are:
  - posteriors from the 'ctm' transformed by logit,
  - logarithm of the word-length in letters,
  - base-10 logarithm of the unigram probability of the word from the language model,
  - logarithm of the average lattice-depth at the position of the word (optional),
  
  The logistic-regression targets are:
  - 1 for correct word,
  - 0 for incorrect word (substitution, insertion),
  
  The input 'ctm' is augmented with per-word tags ('U' is added if there are no tags):
  'C' = correct
  'S' = substitution
  'I' = insertion
  'U' = unknown (not part of scored segment)
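  
  An augmented 'ctm' line then looks like this (values are illustrative):
    utt1 A 0.25 0.13 287 0.974 C
  (columns: utterance, channel, begin-time, duration, word-id, confidence, tag)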
  
  The script can be used both to prepare the training data
  and to prepare input features for forwarding through a trained model.
  """
  usage = "%prog [opts] ctm word-filter word-length unigrams depth-per-frame-ascii.ark word-categories"
  parser = OptionParser(usage=usage, description=desc)
  parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='')
  parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='')
  parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='')
  (o, args) = parser.parse_args()
  
  if len(args) != 3:
    parser.print_help()
    sys.exit(1)
  ctm_file, word_feats_file, word_categories_file = args
  
  assert(o.conf_feats != '')
  
  # Load the ctm (optionally add evaluation column with 'U'):
  ctm = [ l.split() for l in open(ctm_file) ]
  if len(ctm[0]) == 6:
    for l in ctm: l.append('U')
  assert(len(ctm[0]) == 7)
  
  # Load the word-features, the format: "wrd wrd_id filter length other_feats"
  # (typically 'other_feats' are unigram log-probabilities),
  word_feats = [ l.split(None,4) for l in open(word_feats_file) ]
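  # (an illustrative line: 'seven 287 1 5 -2.71828',
  #  i.e. word, word-id, keep-flag, length in letters, unigram log-probability),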
  
  # Prepare the filtering dict,
  word_filter = { wrd_id:bool(int(keep)) for (wrd,wrd_id,keep,length,other) in word_feats }
  # Prepare the length dict,
  word_length = { wrd_id:float(length) for (wrd,wrd_id,keep,length,other) in word_feats }
  # Prepare the other_feats dict,
  other_feats = { wrd_id:other.strip() for (wrd,wrd_id,keep,length,other) in word_feats }
  
  # Build the targets,
  if o.conf_targets != '':
    with open(o.conf_targets,'w') as f:
      for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
        # Skip words whose correctness we don't know,
        if score_tag == 'U': continue 
        # Some words are excluded from training (partial words, hesitations, etc.),
        # (Value: 1 == keep word, 0 == exclude word from the targets),
        if not word_filter[wrd_id]: continue 
        # Build the key,
        key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)
        # Build the target,
        tgt = 1 if score_tag == 'C' else 0 # Correct = 1, else 0,
        # Write,
        f.write('%s %d\n' % (key,tgt))
  
  # Load the per-frame lattice-depth,
  # - we assume the 1st column in 'ctm' is the 'utterance-key' in the depth file,
  # - if the 'ctm' and 'ark' keys don't match, we leave this feature out,
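  # - an ascii-ark line looks like: 'utt1 5 5 6 7 7 8 ...' (illustrative values),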
  if o.lattice_depth:
    depths = dict()
    for l in open(o.lattice_depth):
      utt,d = l.split(' ',1)
      depths[utt] = [int(i) for i in d.split()]
  
  # Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt',
  wrd_to_cat = [ l.split() for l in open(word_categories_file) ]
  wrd_to_cat = { wrd_id:int(category) for wrd,wrd_id,category in wrd_to_cat }
  wrd_cat_num = max(wrd_to_cat.values()) + 1
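  # (categories in 'word_categories' are 0-based integers, hence the +1),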
  
  # Build the input features,
  with open(o.conf_feats,'w') as f:
    for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
      # Build the key, same as previously,
      key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)
  
      # Build input features,
      # - logit of MBR posterior,
      damper = 0.001 # avoid -inf,+inf from log,
      logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper)
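      # (e.g. conf=0.974 -> log(0.975) - log(0.027) ~ 3.59),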
      # - log of word-length,
      log_word_length = math.log(word_length[wrd_id]) # i.e. number of letters in the word,
      # - 1-of-K encoding of the word-category (words with frequency higher than min-count),
      wrd_1_of_k = [0]*wrd_cat_num
      wrd_1_of_k[wrd_to_cat[wrd_id]] = 1
  
      # Compose the input feature vector,
      feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k
  
      # Optionally add average-depth of lattice at the word position,
      if o.lattice_depth != '':
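        # ('ctm' times are in seconds; depths are per-frame, assuming 100 frames per second),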
        depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))]
        log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice))
        feats += [ log_avg_depth ]
  
      # Store the input features, 
      f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n')