// ivector/plda.h
// Copyright 2013 Daniel Povey
// 2015 David Snyder
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_IVECTOR_PLDA_H_
#define KALDI_IVECTOR_PLDA_H_

#include <vector>
#include <algorithm>

#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
#include "gmm/model-common.h"
#include "gmm/diag-gmm.h"
#include "gmm/full-gmm.h"
#include "itf/options-itf.h"
#include "util/common-utils.h"

namespace kaldi {

/* This code implements Probabilistic Linear Discriminant Analysis: see
   "Probabilistic Linear Discriminant Analysis" by Sergey Ioffe, ECCV 2006.
   At least, that was the inspiration. The E-M is an efficient method
   that I derived myself (note: it could be made even more efficient, but
   it doesn't seem to be necessary as it's already very fast).

   This implementation of PLDA only supports estimating with a between-class
   dimension equal to the feature dimension. If you want a between-class
   covariance that has a lower dimension, you can just remove the smallest
   elements of the diagonalized between-class covariance matrix. This is not
   100% exact (it wouldn't give you as good a likelihood as E-M estimation
   with that dimension), but it's close enough. */
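
/* Concretely (a sketch in the notation of the Plda class below, not spelled
   out in the original comments): after subtracting mean_ and applying
   transform_, the generative model is approximately
     x = u + e,   u ~ N(0, diag(psi_)),   e ~ N(0, I),
   where u is the per-class (e.g. per-speaker) offset whose diagonal
   covariance is psi_, and e is the within-class noise, whitened to unit
   covariance by transform_. */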

struct PldaConfig {
  // This config is for the application of PLDA as a transform to iVectors,
  // prior to dot-product scoring.
  bool normalize_length;
  bool simple_length_norm;
  PldaConfig(): normalize_length(true), simple_length_norm(false) { }

  void Register(OptionsItf *opts) {
    opts->Register("normalize-length", &normalize_length,
                   "If true, do length normalization as part of PLDA (see "
                   "code for details). This does not set the length unit; "
                   "by default it instead ensures that the inner product "
                   "with the PLDA model's inverse variance (which is a "
                   "function of how many utterances the iVector was averaged "
                   "over) has the expected value, equal to the iVector "
                   "dimension.");
    opts->Register("simple-length-normalization", &simple_length_norm,
                   "If true, replace the default length normalization by an "
                   "alternative that normalizes the length of the iVectors "
                   "to be equal to the square root of the iVector dimension.");
  }
};

class Plda {
 public:
  Plda() { }

  explicit Plda(const Plda &other):
      mean_(other.mean_),
      transform_(other.transform_),
      psi_(other.psi_),
      offset_(other.offset_) {
  }

  /// Transforms an iVector into a space where the within-class variance
  /// is unit and between-class variance is diagonalized. The only
  /// anticipated use of this function is to pre-transform iVectors
  /// before giving them to the function LogLikelihoodRatio (it's
  /// done this way for efficiency because a given iVector may be
  /// used multiple times in LogLikelihoodRatio and we don't want
  /// to repeat the matrix multiplication).
  ///
  /// If config.normalize_length == true, it will also normalize the iVector's
  /// length by multiplying by a scalar that ensures that ivector^T inv_var
  /// ivector = dim. In this case, "num_examples" comes into play because it
  /// affects the expected covariance matrix of the iVector. The normalization
  /// factor is returned even if config.normalize_length == false, in which
  /// case it is computed but not applied.
  /// If config.simple_length_normalization == true, then an alternative
  /// normalization factor is computed that causes the iVector length
  /// to be equal to the square root of the iVector dimension.
  double TransformIvector(const PldaConfig &config,
                          const VectorBase<double> &ivector,
                          int32 num_examples,
                          VectorBase<double> *transformed_ivector) const;

  /// float version of the above (not BaseFloat because we'd be implementing it
  /// twice for the same type if BaseFloat == double).
  float TransformIvector(const PldaConfig &config,
                         const VectorBase<float> &ivector,
                         int32 num_examples,
                         VectorBase<float> *transformed_ivector) const;

  /// Returns the log-likelihood ratio
  /// log (p(test_ivector | same) / p(test_ivector | different)).
  /// transformed_train_ivector is an average over utterances for
  /// that speaker. Both transformed_train_ivector and
  /// transformed_test_ivector are assumed to have been transformed by the
  /// function TransformIvector(). Note: any length normalization will
  /// have been done while computing the transformed iVectors.
  double LogLikelihoodRatio(const VectorBase<double> &transformed_train_ivector,
                            int32 num_train_utts,
                            const VectorBase<double> &transformed_test_ivector)
      const;

  /// This function smooths the within-class covariance by adding to it
  /// smoothing_factor (e.g. 0.1) times the between-class covariance (it's
  /// implemented by modifying transform_). This is to compensate for
  /// situations where there were too few utterances per speaker to get a
  /// good estimate of the within-class covariance, and where the leading
  /// elements of psi_ were consequently very large.
  void SmoothWithinClassCovariance(double smoothing_factor);

  /// Apply a transform to the PLDA model. This is mostly used for
  /// projecting the parameters of the model into a lower-dimensional space,
  /// i.e. in_transform.NumRows() <= in_transform.NumCols(), typically for
  /// speaker diarization with a PCA transform.
  void ApplyTransform(const Matrix<double> &in_transform);

  int32 Dim() const { return mean_.Dim(); }
  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);

 protected:
  void ComputeDerivedVars();  // computes offset_.
  friend class PldaEstimator;
  friend class PldaUnsupervisedAdaptor;

  Vector<double> mean_;       // mean of samples in original space.
  Matrix<double> transform_;  // of dimension Dim() by Dim();
                              // this transform makes within-class covar unit
                              // and diagonalizes the between-class covar.
  Vector<double> psi_;        // of dimension Dim(). The between-class
                              // (diagonal) covariance elements, in decreasing
                              // order.
  Vector<double> offset_;     // derived variable: -1.0 * transform_ * mean_

 private:
  Plda &operator = (const Plda &other);  // disallow assignment

  /// This returns a normalization factor, which is a quantity we
  /// must multiply "transformed_ivector" by so that it has the length
  /// that it "should" have. We assume "transformed_ivector" is an
  /// iVector in the transformed space (i.e., mean-subtracted, and
  /// multiplied by transform_). The covariance it "should" have
  /// in this space is \Psi + I/num_examples.
  double GetNormalizationFactor(const VectorBase<double> &transformed_ivector,
                                int32 num_examples) const;
};
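
// A minimal scoring sketch (illustrative, not part of the original header;
// names like "train_ivector_avg" and "num_train_utts" are hypothetical):
//
//   PldaConfig plda_config;
//   Plda plda;  // assume this was Read() from a trained model.
//   Vector<double> transformed_train(plda.Dim()), transformed_test(plda.Dim());
//   plda.TransformIvector(plda_config, train_ivector_avg, num_train_utts,
//                         &transformed_train);
//   plda.TransformIvector(plda_config, test_ivector, 1, &transformed_test);
//   double score = plda.LogLikelihoodRatio(transformed_train, num_train_utts,
//                                          transformed_test);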

class PldaStats {
 public:
  PldaStats(): dim_(0) { }  /// The dimension is set up the first time you
                            /// add samples.

  /// This function adds training samples corresponding to
  /// one class (e.g. a speaker). Each row is a separate
  /// sample from this group. The "weight" would normally
  /// be 1.0, but you can set it to other values if you want
  /// to weight your training samples.
  void AddSamples(double weight,
                  const Matrix<double> &group);

  int32 Dim() const { return dim_; }

  void Init(int32 dim);

  void Sort() { std::sort(class_info_.begin(), class_info_.end()); }
  bool IsSorted() const;

  ~PldaStats();

 protected:
  friend class PldaEstimator;

  int32 dim_;
  int64 num_classes_;
  int64 num_examples_;     // total number of examples, summed over classes.
  double class_weight_;    // total over classes, of their weight.
  double example_weight_;  // total over classes, of weight times #examples.

  Vector<double> sum_;  // Weighted sum of class means (normalize by
                        // class_weight_ to get the mean).

  SpMatrix<double> offset_scatter_;  // Sum over all examples, of the weight
                                     // times (example - class-mean)
                                     // (example - class-mean)^T.

  // We have one of these objects per class.
  struct ClassInfo {
    double weight;
    Vector<double> *mean;  // owned here, but stored as a pointer so
                           // sorting can be lightweight.
    int32 num_examples;    // the number of examples in the class.

    bool operator < (const ClassInfo &other) const {
      return (num_examples < other.num_examples);
    }
    ClassInfo(double weight, Vector<double> *mean, int32 num_examples):
        weight(weight), mean(mean), num_examples(num_examples) { }
  };

  std::vector<ClassInfo> class_info_;

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(PldaStats);
};
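
// A sketch of stats accumulation for estimation (illustrative, not part of
// the original header; "spk_ivectors" is hypothetical):
//
//   PldaStats stats;
//   for ( /* each speaker */ ) {
//     Matrix<double> spk_ivectors;  // one row per utterance iVector.
//     stats.AddSamples(1.0, spk_ivectors);
//   }
//   stats.Sort();  // sort before estimating; see IsSorted().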

struct PldaEstimationConfig {
  int32 num_em_iters;
  PldaEstimationConfig(): num_em_iters(10) { }
  void Register(OptionsItf *opts) {
    opts->Register("num-em-iters", &num_em_iters,
                   "Number of iterations of E-M used for PLDA estimation");
  }
};

class PldaEstimator {
 public:
  PldaEstimator(const PldaStats &stats);

  void Estimate(const PldaEstimationConfig &config,
                Plda *output);

 private:
  typedef PldaStats::ClassInfo ClassInfo;

  /// Returns the part of the objf relating to
  /// offsets from the class means. (total, not normalized)
  double ComputeObjfPart1() const;

  /// Returns the part of the objf relating to
  /// the class means. (total, not normalized)
  double ComputeObjfPart2() const;

  /// Returns the objective function per sample.
  double ComputeObjf() const;

  int32 Dim() const { return stats_.Dim(); }

  void EstimateOneIter();

  void InitParameters();

  void ResetPerIterStats();

  // gets stats from intra-class variation (stats_.offset_scatter_).
  void GetStatsFromIntraClass();

  // gets part of stats relating to class means.
  void GetStatsFromClassMeans();

  // M-step.
  void EstimateFromStats();

  // Copy to output.
  void GetOutput(Plda *plda);

  const PldaStats &stats_;

  SpMatrix<double> within_var_;
  SpMatrix<double> between_var_;

  // These stats are reset on each iteration.
  SpMatrix<double> within_var_stats_;
  double within_var_count_;   // count corresponding to within_var_stats_.
  SpMatrix<double> between_var_stats_;
  double between_var_count_;  // count corresponding to between_var_stats_.

  KALDI_DISALLOW_COPY_AND_ASSIGN(PldaEstimator);
};
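
// A sketch of the estimation step, continuing from the PldaStats sketch
// above (illustrative, not part of the original header):
//
//   PldaEstimationConfig est_config;  // num_em_iters defaults to 10.
//   PldaEstimator estimator(stats);
//   Plda plda;
//   estimator.Estimate(est_config, &plda);
//   // ... then e.g. plda.Write(os, binary) to save the model.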

struct PldaUnsupervisedAdaptorConfig {
  BaseFloat mean_diff_scale;
  BaseFloat within_covar_scale;
  BaseFloat between_covar_scale;

  PldaUnsupervisedAdaptorConfig():
      mean_diff_scale(1.0),
      within_covar_scale(0.3),
      between_covar_scale(0.7) { }

  void Register(OptionsItf *opts) {
    opts->Register("mean-diff-scale", &mean_diff_scale,
                   "Scale with which to add, to the total data variance, the "
                   "outer product of the difference between the original mean "
                   "and the adaptation-data mean");
    opts->Register("within-covar-scale", &within_covar_scale,
                   "Scale that determines how much of the excess variance in "
                   "a particular direction gets attributed to within-class "
                   "covar.");
    opts->Register("between-covar-scale", &between_covar_scale,
                   "Scale that determines how much of the excess variance in "
                   "a particular direction gets attributed to between-class "
                   "covar.");
  }
};

/**
   This class takes unlabeled iVectors from the domain of interest and uses
   their mean and variance to adapt your PLDA matrices to a new domain. This
   class also stores the stats for this form of adaptation. */
class PldaUnsupervisedAdaptor {
 public:
  PldaUnsupervisedAdaptor(): tot_weight_(0.0) { }

  // Add stats to this class. Normally the weight will be 1.0.
  void AddStats(double weight, const Vector<double> &ivector);
  void AddStats(double weight, const Vector<float> &ivector);

  void UpdatePlda(const PldaUnsupervisedAdaptorConfig &config,
                  Plda *plda) const;

 private:
  double tot_weight_;
  Vector<double> mean_stats_;
  SpMatrix<double> variance_stats_;
};
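
// A sketch of unsupervised domain adaptation (illustrative, not part of the
// original header; assumes "plda" is a previously trained model and the loop
// is over unlabeled in-domain iVectors):
//
//   PldaUnsupervisedAdaptorConfig adapt_config;
//   PldaUnsupervisedAdaptor adaptor;
//   for ( /* each in-domain iVector v */ )
//     adaptor.AddStats(1.0, v);
//   adaptor.UpdatePlda(adapt_config, &plda);  // adapts plda in place.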

}  // namespace kaldi

#endif  // KALDI_IVECTOR_PLDA_H_