prepare_calibration_data.py
#!/usr/bin/env python
# Copyright 2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0

from __future__ import division

import sys, math
from optparse import OptionParser

desc = """
Prepare input features and training targets for logistic regression,
which calibrates the Minimum Bayes Risk posterior confidences.
The logisitc-regression input features are:
- posteriors from 'ctm' transformed by logit,
- logarithm of word-length in letters,
- 10base logarithm of unigram probability of a word from language model,
- logarithm of average lattice-depth at position of the word (optional),
The logistic-regresion targets are:
- 1 for correct word,
- 0 for incorrect word (substitution, insertion),
The iput 'ctm' is augmented by per-word tags (or 'U' is added if no tags),
'C' = correct
'S' = substitution
'I' = insertion
'U' = unknown (not part of scored segment)
The script can be used both to prepare the training data,
or to prepare input features for forwarding through trained model.
"""
usage = "%prog [opts] ctm word-filter word-length unigrams depth-per-frame-ascii.ark word-categories"
parser = OptionParser(usage=usage, description=desc)
parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='')
parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='')
parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='')
(o, args) = parser.parse_args()

if len(args) != 3:
  parser.print_help()
  sys.exit(1)
ctm_file, word_feats_file, word_categories_file = args

assert(o.conf_feats != '')

# Load the ctm (optionally add an eval column with 'U'):
ctm = [ l.split() for l in open(ctm_file) ]
if len(ctm[0]) == 6:
  for l in ctm: l.append('U')
assert(len(ctm[0]) == 7)
# Load the word-features, the format: "wrd wrd_id filter length other_feats"
# (typically 'other_feats' are unigram log-probabilities),
word_feats = [ l.split(None,4) for l in open(word_feats_file) ]
# Prepare filtering dict,
word_filter = { wrd_id:bool(int(filter)) for (wrd,wrd_id,filter,length,other_feats) in word_feats }
# Prepare the length dict,
word_length = { wrd_id:float(length) for (wrd,wrd_id,filter,length,other_feats) in word_feats }
# Prepare other_feats dict,
other_feats = { wrd_id:other_feats.strip() for (wrd,wrd_id,filter,length,other_feats) in word_feats }

# Build the targets,
if o.conf_targets != '':
  with open(o.conf_targets,'w') as f:
    for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
      # Skip words for which we don't know whether they are correct,
      if score_tag == 'U': continue
      # Some words are excluded from training (partial words, hesitations, etc.),
      # (value: 1 == keep the word, 0 == exclude the word from the targets),
      if not word_filter[wrd_id]: continue
      # Build the key,
      key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)
      # Build the target,
      tgt = 1 if score_tag == 'C' else 0 # Correct = 1, else 0,
      # Write,
      f.write('%s %d\n' % (key, tgt))
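
# For illustration, a resulting targets line could look like (hypothetical values):
#   utt1^1^0.10^0.25^1234,0.98,C 1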

# Load the per-frame lattice-depth,
# - we assume the 1st column of 'ctm' is the 'utterance-key' in the depth file,
# - if the 'ctm' and 'ark' keys don't match, we leave this feature out,
if o.lattice_depth:
  depths = dict()
  for l in open(o.lattice_depth):
    utt,d = l.split(' ',1)
    depths[utt] = [int(i) for i in d.split()]
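  # (each parsed line of the ascii-ark has the form '<utt-key> <depth_1> <depth_2> ...',
  #  with one integer lattice-depth per frame),
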
# Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt',
wrd_to_cat = [ l.split() for l in open(word_categories_file) ]
wrd_to_cat = { wrd_id:int(category) for wrd,wrd_id,category in wrd_to_cat }
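# Category ids are assumed to be 0-based integers, so 'max+1' below is the
# dimension of the 1-of-k encoding built later,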
wrd_cat_num = max(wrd_to_cat.values()) + 1
# Build the input features,

# Build the input features,
with open(o.conf_feats,'w') as f:
  for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm:
    # Build the key, same as previously,
    key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag)
    # Build input features,
    # - logit of MBR posterior,
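    # logit(p) = log(p / (1-p)); the small damper keeps the value finite
    # when 'conf' is exactly 0.0 or 1.0,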
    damper = 0.001 # avoid -inf,+inf from log,
    logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper)
    # - log of word-length,
    log_word_length = math.log(word_length[wrd_id]) # word length as given in the word-feats file,
    # - categorical distribution of words (with frequency higher than min-count),
    wrd_1_of_k = [0]*wrd_cat_num
    wrd_1_of_k[wrd_to_cat[wrd_id]] = 1
    # Compose the input feature vector,
    feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k
    # Optionally add average-depth of lattice at the word position,
    if o.lattice_depth != '':
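      # 'beg'/'dur' in the ctm are seconds; the 100.0 factor assumes the usual
      # 10ms frame shift (100 frames per second),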
      depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))]
      log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice))
      feats += [ log_avg_depth ]
    # Store the input features,
    f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n')
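
# For illustration, an output feature line could look like (hypothetical values,
# here with three word-categories and the optional lattice-depth feature):
#   utt1^1^0.10^0.25^1234,0.98,C [ 3.84 1.609 -2.73 0 1 0 2.48 ]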