egs/wsj/s5/steps/conf/prepare_calibration_data.py
#!/usr/bin/env python

# Copyright 2015  Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import division

import sys, math
from optparse import OptionParser

desc = """
Prepare input features and training targets for logistic regression,
which calibrates the Minimum Bayes Risk posterior confidences.

The logistic-regression input features are:
- posteriors from 'ctm' transformed by logit,
- logarithm of word-length in letters,
- base-10 logarithm of unigram probability of a word from language model,
- logarithm of average lattice-depth at position of the word (optional),

The logistic-regression targets are:
- 1 for correct word,
- 0 for incorrect word (substitution, insertion),

The input 'ctm' is augmented by per-word tags (or 'U' is added if no tags),
'C' = correct
'S' = substitution
'I' = insertion
'U' = unknown (not part of scored segment)

The script can be used both to prepare the training data,
and to prepare input features for forwarding through a trained model.
"""

usage = "%prog [opts] ctm word-feats word-categories"
parser = OptionParser(usage=usage, description=desc)
parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='')
parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='')
parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='')
(o, args) = parser.parse_args()

if len(args) != 3:
  parser.print_help()
  sys.exit(1)
ctm_file, word_feats_file, word_categories_file = args

assert(o.conf_feats != '')

# Load the ctm (optionally add eval column with 'U'):
ctm = [ l.split() for l in open(ctm_file) ]
if len(ctm[0]) == 6:
  for l in ctm: l.append('U')
assert(len(ctm[0]) == 7)

# Load the word-features, the format: "wrd wrd_id filter length other_feats"
# (typically 'other_feats' are unigram log-probabilities),
word_feats = [ l.split(None,4) for l in open(word_feats_file) ]
# Prepare the filtering dict,
word_filter = { wrd_id:bool(int(filter)) for (wrd,wrd_id,filter,length,other_feats) in word_feats }
# Prepare the word-length dict,
word_length = { wrd_id:float(length) for (wrd,wrd_id,filter,length,other_feats) in word_feats }
# Prepare the other_feats dict,
other_feats = { wrd_id:other_feats.strip() for (wrd,wrd_id,filter,length,other_feats) in word_feats }
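# An illustration of the expected input lines (the values are invented, but
# the columns follow the parsing code above):
#   'ctm' line (the eval tag is optional): "utt1 1 0.51 0.32 1842 0.973 C"
#   word-feats line:                       "water 1842 1 5.0 -3.217"
#   i.e. word, word-id, keep-flag, word-length, unigram log-probability.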
# Build the targets,
if o.conf_targets != '':
  with open(o.conf_targets,'w') as f:
    for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
      # Skip the words we don't know if being correct,
      if score_tag == 'U': continue
      # Some words are excluded from training (partial words, hesitations, etc.),
      # (Value: 1 == keep word, 0 == exclude word from the targets),
      if not word_filter[wrd_id]: continue
      # Build the key,
      key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)
      # Build the target,
      tgt = 1 if score_tag == 'C' else 0 # Correct = 1, else 0,
      # Write,
      f.write('%s %d\n' % (key,tgt))

# Load the per-frame lattice-depth,
# - we assume the 1st column in 'ctm' is the 'utterance-key' in the depth file,
# - if the 'ctm' and 'ark' keys don't match, we leave this feature out,
if o.lattice_depth:
  depths = dict()
  for l in open(o.lattice_depth):
    utt,d = l.split(' ',1)
    depths[utt] = [ int(i) for i in d.split() ]

# Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt',
wrd_to_cat = [ l.split() for l in open(word_categories_file) ]
wrd_to_cat = { wrd_id:int(category) for (wrd,wrd_id,category) in wrd_to_cat }
wrd_cat_num = max(wrd_to_cat.values()) + 1

# Build the input features,
with open(o.conf_feats,'w') as f:
  for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
    # Build the key, same as previously,
    key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)
    # Build the input features,
    # - logit of the MBR posterior,
    damper = 0.001 # avoid -inf,+inf from log,
    logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper)
    # - log of word-length, taken from the 'word-feats' table,
    log_word_length = math.log(word_length[wrd_id])
    # - 1-of-k encoding of the word category (words with frequency higher than min-count),
    wrd_1_of_k = [0]*wrd_cat_num
    wrd_1_of_k[wrd_to_cat[wrd_id]] = 1
    # Compose the input feature vector,
    feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k
    # Optionally add the average lattice-depth at the word position,
    # (ctm times are in seconds, depths are per 10ms frame, hence the 100x),
    if o.lattice_depth != '':
      depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))]
      log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice))
      feats += [ log_avg_depth ]
    # Store the input features,
    f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n')
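# A sketch of a typical invocation (the paths are hypothetical; the option
# and argument names are those defined by the parser above):
#
#   steps/conf/prepare_calibration_data.py \
#     --conf-targets exp/tri4/conf_targets.txt \
#     --conf-feats exp/tri4/conf_feats.txt \
#     --lattice-depth exp/tri4/lattice_frame_depth.ark \
#     data/score/ctm word_feats.txt word_categories.txt
#
# Each '--conf-feats' record is "key [ f1 f2 ... ]" and each '--conf-targets'
# record is "key 0|1", both sharing the key format built above, e.g. with
# invented values: "utt1^1^0.51^0.32^1842,0.973,C 1"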