// ivector/plda.cc

// Copyright 2013     Daniel Povey
//           2015     David Snyder

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR
// PURPOSE, MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include <vector>

#include "ivector/plda.h"

namespace kaldi {

void Plda::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<Plda>");
  mean_.Write(os, binary);
  transform_.Write(os, binary);
  psi_.Write(os, binary);
  WriteToken(os, binary, "</Plda>");
}

void Plda::Read(std::istream &is, bool binary) {
  ExpectToken(is, binary, "<Plda>");
  mean_.Read(is, binary);
  transform_.Read(is, binary);
  psi_.Read(is, binary);
  ExpectToken(is, binary, "</Plda>");
  ComputeDerivedVars();
}

/// This function computes a projection matrix that, when applied, makes the
/// covariance unit (i.e. all 1).
template<class Real>
static void ComputeNormalizingTransform(const SpMatrix<Real> &covar,
                                        MatrixBase<Real> *proj) {
  int32 dim = covar.NumRows();
  TpMatrix<Real> C(dim);  // Cholesky of covar, covar = C C^T
  C.Cholesky(covar);
  C.Invert();  // The matrix that makes covar unit is C^{-1}, because
               // C^{-1} covar C^{-T} = C^{-1} C C^T C^{-T} = I.
  proj->CopyFromTp(C, kNoTrans);  // set "proj" to C^{-1}.
}

void Plda::ComputeDerivedVars() {
  KALDI_ASSERT(Dim() > 0);
  offset_.Resize(Dim());
  offset_.AddMatVec(-1.0, transform_, kNoTrans, mean_, 0.0);
}

/**
   This comment explains the thinking behind the function LogLikelihoodRatio.
   The reference is "Probabilistic Linear Discriminant Analysis" by Sergey
   Ioffe, ECCV 2006.  I'm looking at the un-numbered equation between eqs. (4)
   and (5), that says
     P(u^p | u^g_{1...n}) = N(u^p | \frac{n \Psi}{n \Psi + I} \bar{u}^g,
                              I + \frac{\Psi}{n \Psi + I})
   Here, the superscript ^p refers to the "probe" example (e.g. the example
   to be classified), and u^g_1 is the first "gallery" example, i.e. the
   first training example of that class.  \Psi is the between-class
   covariance matrix, assumed to be diagonalized, and I can be interpreted as
   the within-class covariance matrix, which we have made unit.

   We want the likelihood ratio P(u^p | u^g_{1..n}) / P(u^p), where the
   numerator is the probability of u^p given that it's in that class, and the
   denominator is the probability of u^p with no class assumption at all
   (e.g. in its own class).

   The expression above even works for n = 0 (e.g. the denominator of the
   likelihood ratio), where it gives us
     P(u^p) = N(u^p | 0, I + \Psi)
   i.e. u^p is distributed with zero mean and covariance (within + between).

   The likelihood ratio we want is:
     N(u^p | \frac{n \Psi}{n \Psi + I} \bar{u}^g, I + \frac{\Psi}{n \Psi + I})
     / N(u^p | 0, I + \Psi)
   where \bar{u}^g is the mean of the "gallery examples"; and we can expand
   the log likelihood ratio as
     - 0.5 [ (u^p - m)^T (I + \Psi/(n \Psi + I))^{-1} (u^p - m)
             + logdet(I + \Psi/(n \Psi + I)) ]
     + 0.5 [ (u^p)^T (I + \Psi)^{-1} u^p + logdet(I + \Psi) ]
   where m = (n \Psi)/(n \Psi + I) \bar{u}^g.
*/
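/*
   (Added illustration, not from the original source: a worked
   one-dimensional example of the ratio above.  Take \Psi = 2.0, n = 1,
   \bar{u}^g = 1.0 and u^p = 0.5.  Then
     m        = (n \Psi)/(n \Psi + 1) \bar{u}^g = 2/3
     var_same = 1 + \Psi/(n \Psi + 1) = 5/3
     loglike_same = -0.5 (log(5/3) + log(2 pi) + (0.5 - 2/3)^2 / (5/3))
                  ~= -0.5 (0.511 + 1.838 + 0.017) ~= -1.183
     var_diff = 1 + \Psi = 3
     loglike_diff = -0.5 (log(3) + log(2 pi) + 0.5^2 / 3)
                  ~= -0.5 (1.099 + 1.838 + 0.083) ~= -1.510
   so the log likelihood ratio is about +0.33, weakly favoring the
   same-class hypothesis.  This is exactly the per-dimension computation in
   LogLikelihoodRatio() later in this file.)
*/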
double Plda::GetNormalizationFactor(
    const VectorBase<double> &transformed_ivector,
    int32 num_examples) const {
  KALDI_ASSERT(num_examples > 0);
  // Work out the normalization factor.  The covariance for an average over
  // "num_examples" training iVectors equals \Psi + I/num_examples.
  Vector<double> transformed_ivector_sq(transformed_ivector);
  transformed_ivector_sq.ApplyPow(2.0);
  // inv_covar will equal 1.0 / (\Psi + I/num_examples).
  Vector<double> inv_covar(psi_);
  inv_covar.Add(1.0 / num_examples);
  inv_covar.InvertElements();
  // "transformed_ivector" should have covariance (\Psi + I/num_examples),
  // i.e. within-class/num_examples plus between-class covariance.  So
  // transformed_ivector_sq . (I/num_examples + \Psi)^{-1} should have
  // expected value equal to the dimension.
  double dot_prod = VecVec(inv_covar, transformed_ivector_sq);
  return sqrt(Dim() / dot_prod);
}

double Plda::TransformIvector(const PldaConfig &config,
                              const VectorBase<double> &ivector,
                              int32 num_examples,
                              VectorBase<double> *transformed_ivector) const {
  KALDI_ASSERT(ivector.Dim() == Dim() && transformed_ivector->Dim() == Dim());
  double normalization_factor;
  transformed_ivector->CopyFromVec(offset_);
  transformed_ivector->AddMatVec(1.0, transform_, kNoTrans, ivector, 1.0);
  if (config.simple_length_norm)
    normalization_factor = sqrt(transformed_ivector->Dim())
        / transformed_ivector->Norm(2.0);
  else
    normalization_factor = GetNormalizationFactor(*transformed_ivector,
                                                  num_examples);
  if (config.normalize_length)
    transformed_ivector->Scale(normalization_factor);
  return normalization_factor;
}

// "float" version of TransformIvector.
float Plda::TransformIvector(const PldaConfig &config,
                             const VectorBase<float> &ivector,
                             int32 num_examples,
                             VectorBase<float> *transformed_ivector) const {
  Vector<double> tmp(ivector), tmp_out(ivector.Dim());
  float ans = TransformIvector(config, tmp, num_examples, &tmp_out);
  transformed_ivector->CopyFromVec(tmp_out);
  return ans;
}

// There is an extended comment within this file, referencing a paper by
// Ioffe, that may clarify what this function is doing.
double Plda::LogLikelihoodRatio(
    const VectorBase<double> &transformed_train_ivector,
    int32 n,  // number of training utterances.
    const VectorBase<double> &transformed_test_ivector) const {
  int32 dim = Dim();
  double loglike_given_class, loglike_without_class;
  {  // work out loglike_given_class.
    // "mean" will be the mean of the distribution if it comes from the
    // training example: \frac{n \Psi}{n \Psi + I} \bar{u}^g.  "variance"
    // will be the variance of that distribution, equal to
    // I + \frac{\Psi}{n \Psi + I}.
    Vector<double> mean(dim, kUndefined);
    Vector<double> variance(dim, kUndefined);
    for (int32 i = 0; i < dim; i++) {
      mean(i) = n * psi_(i) / (n * psi_(i) + 1.0)
          * transformed_train_ivector(i);
      variance(i) = 1.0 + psi_(i) / (n * psi_(i) + 1.0);
    }
    double logdet = variance.SumLog();
    Vector<double> sqdiff(transformed_test_ivector);
    sqdiff.AddVec(-1.0, mean);
    sqdiff.ApplyPow(2.0);
    variance.InvertElements();
    loglike_given_class = -0.5 * (logdet + M_LOG_2PI * dim +
                                  VecVec(sqdiff, variance));
  }
  {  // work out loglike_without_class.  Here the mean is zero and the
     // variance is I + \Psi.
    Vector<double> sqdiff(transformed_test_ivector);  // there is no offset.
    sqdiff.ApplyPow(2.0);
    Vector<double> variance(psi_);
    variance.Add(1.0);  // I + \Psi.
    double logdet = variance.SumLog();
    variance.InvertElements();
    loglike_without_class = -0.5 * (logdet + M_LOG_2PI * dim +
                                    VecVec(sqdiff, variance));
  }
  double loglike_ratio = loglike_given_class - loglike_without_class;
  return loglike_ratio;
}
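/*
   (Added usage sketch, not part of the original file: a typical scoring
   flow using the functions above.  ReadKaldiObject is the generic I/O
   helper from util/kaldi-io.h; the iVector values here are placeholders.

     Plda plda;
     ReadKaldiObject("plda", &plda);
     PldaConfig plda_config;
     int32 dim = plda.Dim(), n = 10;          // n: #utterances of the speaker
     Vector<double> train(dim), test(dim);    // raw iVectors, to be filled in
     Vector<double> train_t(dim), test_t(dim);
     plda.TransformIvector(plda_config, train, n, &train_t);
     plda.TransformIvector(plda_config, test, 1, &test_t);
     double score = plda.LogLikelihoodRatio(train_t, n, test_t);

   Here "train" would normally be the average of the speaker's n enrollment
   iVectors.)
*/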
KALDI_LOG << "Smoothing within-class covariance by " << smoothing_factor << ", Psi is initially: " << psi_; Vector within_class_covar(Dim()); within_class_covar.Set(1.0); // It's now the current within-class covariance // (a diagonal matrix) in the space transformed // by transform_. within_class_covar.AddVec(smoothing_factor, psi_); /// We now revise our estimate of the within-class covariance to this /// larger value. This means that the transform has to change to as /// to make this new, larger covariance unit. And our between-class /// covariance in this space is now less. psi_.DivElements(within_class_covar); KALDI_LOG << "New value of Psi is " << psi_; within_class_covar.ApplyPow(-0.5); transform_.MulRowsVec(within_class_covar); ComputeDerivedVars(); } void Plda::ApplyTransform(const Matrix &in_transform) { KALDI_ASSERT(in_transform.NumRows() <= Dim() && in_transform.NumCols() == Dim()); // Apply in_transform to mean_. Vector mean_new(in_transform.NumRows()); mean_new.AddMatVec(1.0, in_transform, kNoTrans, mean_, 0.0); mean_.Resize(in_transform.NumRows()); mean_.CopyFromVec(mean_new); SpMatrix between_var(in_transform.NumCols()), within_var(in_transform.NumCols()), psi_mat(in_transform.NumCols()), between_var_new(Dim()), within_var_new(Dim()); Matrix transform_invert(transform_); // Next, compute the between_var and within_var that existed // prior to diagonalization. psi_mat.AddDiagVec(1.0, psi_); transform_invert.Invert(); within_var.AddMat2(1.0, transform_invert, kNoTrans, 0.0); between_var.AddMat2Sp(1.0, transform_invert, kNoTrans, psi_mat, 0.0); // Next, transform the variances using the input transformation. between_var_new.AddMat2Sp(1.0, in_transform, kNoTrans, between_var, 0.0); within_var_new.AddMat2Sp(1.0, in_transform, kNoTrans, within_var, 0.0); // Finally, we need to recompute psi_ and transform_. The remainder of // the code in this function is a lightly modified copy of // PldaEstimator::GetOutput(). Matrix transform1(Dim(), Dim()); ComputeNormalizingTransform(within_var_new, &transform1); // Now transform is a matrix that if we project with it, // within_var becomes unit. // between_var_proj is between_var after projecting with transform1. SpMatrix between_var_proj(Dim()); between_var_proj.AddMat2Sp(1.0, transform1, kNoTrans, between_var_new, 0.0); Matrix U(Dim(), Dim()); Vector s(Dim()); // Do symmetric eigenvalue decomposition between_var_proj = U diag(s) U^T, // where U is orthogonal. between_var_proj.Eig(&s, &U); KALDI_ASSERT(s.Min() >= 0.0); int32 n; s.ApplyFloor(0.0, &n); if (n > 0) { KALDI_WARN << "Floored " << n << " eigenvalues of between-class " << "variance to zero."; } // Sort from greatest to smallest eigenvalue. SortSvd(&s, &U); // The transform U^T will make between_var_proj diagonal with value s // (i.e. U^T U diag(s) U U^T = diag(s)). The final transform that // makes within_var unit and between_var diagonal is U^T transform1, // i.e. first transform1 and then U^T. 
void Plda::ApplyTransform(const Matrix<double> &in_transform) {
  KALDI_ASSERT(in_transform.NumRows() <= Dim()
               && in_transform.NumCols() == Dim());

  // Apply in_transform to mean_.
  Vector<double> mean_new(in_transform.NumRows());
  mean_new.AddMatVec(1.0, in_transform, kNoTrans, mean_, 0.0);
  mean_.Resize(in_transform.NumRows());
  mean_.CopyFromVec(mean_new);

  SpMatrix<double> between_var(in_transform.NumCols()),
      within_var(in_transform.NumCols()),
      psi_mat(in_transform.NumCols()),
      between_var_new(Dim()),
      within_var_new(Dim());
  Matrix<double> transform_invert(transform_);

  // Next, compute the between_var and within_var that existed
  // prior to diagonalization.
  psi_mat.AddDiagVec(1.0, psi_);
  transform_invert.Invert();
  within_var.AddMat2(1.0, transform_invert, kNoTrans, 0.0);
  between_var.AddMat2Sp(1.0, transform_invert, kNoTrans, psi_mat, 0.0);

  // Next, transform the variances using the input transformation.
  between_var_new.AddMat2Sp(1.0, in_transform, kNoTrans, between_var, 0.0);
  within_var_new.AddMat2Sp(1.0, in_transform, kNoTrans, within_var, 0.0);

  // Finally, we need to recompute psi_ and transform_.  The remainder of
  // the code in this function is a lightly modified copy of
  // PldaEstimator::GetOutput().
  Matrix<double> transform1(Dim(), Dim());
  ComputeNormalizingTransform(within_var_new, &transform1);
  // Now transform1 is a matrix such that if we project with it,
  // within_var_new becomes unit.
  // between_var_proj is between_var_new after projecting with transform1.
  SpMatrix<double> between_var_proj(Dim());
  between_var_proj.AddMat2Sp(1.0, transform1, kNoTrans, between_var_new, 0.0);

  Matrix<double> U(Dim(), Dim());
  Vector<double> s(Dim());
  // Do symmetric eigenvalue decomposition between_var_proj = U diag(s) U^T,
  // where U is orthogonal.
  between_var_proj.Eig(&s, &U);

  KALDI_ASSERT(s.Min() >= 0.0);
  int32 n;
  s.ApplyFloor(0.0, &n);
  if (n > 0) {
    KALDI_WARN << "Floored " << n << " eigenvalues of between-class "
               << "variance to zero.";
  }
  // Sort from greatest to smallest eigenvalue.
  SortSvd(&s, &U);

  // The transform U^T will make between_var_proj diagonal with value s
  // (i.e. U^T U diag(s) U^T U = diag(s)).  The final transform that
  // makes within_var_new unit and between_var_new diagonal is
  // U^T transform1, i.e. first transform1 and then U^T.
  transform_.Resize(Dim(), Dim());
  transform_.AddMatMat(1.0, U, kTrans, transform1, kNoTrans, 0.0);
  psi_.Resize(Dim());
  psi_.CopyFromVec(s);
  ComputeDerivedVars();
}

void PldaStats::AddSamples(double weight,
                           const Matrix<double> &group) {
  if (dim_ == 0) {
    Init(group.NumCols());
  } else {
    KALDI_ASSERT(dim_ == group.NumCols());
  }
  int32 n = group.NumRows();  // number of examples for this class
  Vector<double> *mean = new Vector<double>(dim_);
  mean->AddRowSumMat(1.0 / n, group);

  offset_scatter_.AddMat2(weight, group, kTrans, 1.0);
  // the following statement has the same effect as if we had first
  // subtracted the mean from each row of the group before the statement
  // above.
  offset_scatter_.AddVec2(-n * weight, *mean);

  class_info_.push_back(ClassInfo(weight, mean, n));

  num_classes_++;
  num_examples_ += n;
  class_weight_ += weight;
  example_weight_ += weight * n;

  sum_.AddVec(weight, *mean);
}
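/*
   (Added note: in AddSamples() above, the pair of calls
      offset_scatter_.AddMat2(weight, group, kTrans, 1.0);
      offset_scatter_.AddVec2(-n * weight, *mean);
   implements the identity
      \sum_i (x_i - m)(x_i - m)^T = \sum_i x_i x_i^T - n m m^T
   with m = (1/n) \sum_i x_i, so offset_scatter_ accumulates the weighted
   scatter of each class around its own mean.)
*/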
PldaStats::~PldaStats() {
  for (size_t i = 0; i < class_info_.size(); i++)
    delete class_info_[i].mean;
}

bool PldaStats::IsSorted() const {
  for (size_t i = 0; i + 1 < class_info_.size(); i++)
    if (class_info_[i + 1] < class_info_[i])
      return false;
  return true;
}

void PldaStats::Init(int32 dim) {
  KALDI_ASSERT(dim_ == 0);
  dim_ = dim;
  num_classes_ = 0;
  num_examples_ = 0;
  class_weight_ = 0.0;
  example_weight_ = 0.0;
  sum_.Resize(dim);
  offset_scatter_.Resize(dim);
  KALDI_ASSERT(class_info_.empty());
}

PldaEstimator::PldaEstimator(const PldaStats &stats):
    stats_(stats) {
  KALDI_ASSERT(stats.IsSorted());
  InitParameters();
}

double PldaEstimator::ComputeObjfPart1() const {
  // Returns the part of the objf relating to offsets from the class means.
  // within_class_count equals the sum over the classes, of the weight of
  // that class (normally 1) times (#examples - 1) of that class, which
  // equals the rank of the covariance we're modeling.  We imagine that we're
  // modeling (#examples - 1) separate samples per class, each with the
  // within-class covariance... the argument is a little complicated and
  // involves an orthogonal complement of a matrix whose first row computes
  // the mean.
  double within_class_count = stats_.example_weight_ - stats_.class_weight_,
      within_logdet, det_sign;
  SpMatrix<double> inv_within_var(within_var_);
  inv_within_var.Invert(&within_logdet, &det_sign);
  KALDI_ASSERT(det_sign == 1 && "Within-class covariance is singular");

  double objf = -0.5 * (within_class_count *
                        (within_logdet + M_LOG_2PI * Dim())
                        + TraceSpSp(inv_within_var, stats_.offset_scatter_));
  return objf;
}

double PldaEstimator::ComputeObjfPart2() const {
  double tot_objf = 0.0;

  int32 n = -1;  // the number of examples for the current class
  SpMatrix<double> combined_inv_var(Dim());
  // combined_inv_var = (between_var_ + within_var_ / n)^{-1}
  double combined_var_logdet;

  for (size_t i = 0; i < stats_.class_info_.size(); i++) {
    const ClassInfo &info = stats_.class_info_[i];
    if (info.num_examples != n) {
      n = info.num_examples;
      // variance of the mean of n examples is between-class plus 1/n times
      // within-class.
      combined_inv_var.CopyFromSp(between_var_);
      combined_inv_var.AddSp(1.0 / n, within_var_);
      combined_inv_var.Invert(&combined_var_logdet);
    }
    Vector<double> mean(*(info.mean));
    mean.AddVec(-1.0 / stats_.class_weight_, stats_.sum_);
    tot_objf += info.weight * -0.5 *
        (combined_var_logdet + M_LOG_2PI * Dim() +
         VecSpVec(mean, combined_inv_var, mean));
  }
  return tot_objf;
}

double PldaEstimator::ComputeObjf() const {
  double ans1 = ComputeObjfPart1(),
      ans2 = ComputeObjfPart2(),
      ans = ans1 + ans2,
      example_weights = stats_.example_weight_,
      normalized_ans = ans / example_weights;
  KALDI_LOG << "Within-class objf per sample is " << (ans1 / example_weights)
            << ", between-class is " << (ans2 / example_weights)
            << ", total is " << normalized_ans;
  return normalized_ans;
}

void PldaEstimator::InitParameters() {
  within_var_.Resize(Dim());
  within_var_.SetUnit();
  between_var_.Resize(Dim());
  between_var_.SetUnit();
}

void PldaEstimator::ResetPerIterStats() {
  within_var_stats_.Resize(Dim());
  within_var_count_ = 0.0;
  between_var_stats_.Resize(Dim());
  between_var_count_ = 0.0;
}

void PldaEstimator::GetStatsFromIntraClass() {
  within_var_stats_.AddSp(1.0, stats_.offset_scatter_);
  // Note: in the normal case, the expression below will be equal to the sum
  // over the classes, of (n - 1), where n is the #examples for that class.
  // That is the rank of the scatter matrix that "offset_scatter_" has for
  // that class.  [if weights other than 1.0 are used, it will be different.]
  within_var_count_ += (stats_.example_weight_ - stats_.class_weight_);
}
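/*
   (Added example: for a class with n examples the offsets x_i - m sum to
   zero, so the class's scatter around its own mean has rank at most n - 1;
   e.g. with n = 2 we have x_1 - m = -(x_2 - m), i.e. one independent
   offset.  That is why within_var_count_ above is incremented by
   example_weight_ minus class_weight_, which is weight * (n - 1) summed
   over the classes.)
*/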
/**
   GetStatsFromClassMeans() is the more complicated part of PLDA estimation.
   Let's suppose the mean of a particular class is m, and that class had
   n examples.  We suppose that
     m ~ N(0, between_var_ + 1/n within_var_)
   i.e. m is Gaussian-distributed with zero mean and variance equal to the
   between-class variance plus 1/n times the within-class variance.  Now, m
   is observed (as stats_.class_info_[something].mean).  We're doing an E-M
   procedure where we treat m as the sum of two variables:
     m = x + y
   where
     x ~ N(0, between_var_)
     y ~ N(0, 1/n * within_var_)
   The distribution of x will contribute to the stats of between_var_, and
   y to within_var_.  Now, y = m - x, so we can focus on working out the
   distribution of x and then we can very simply get the distribution of y.
   The following expression also includes the likelihood of y as a function
   of x.  Note: the C is different from line to line.

   log p(x) = C - 0.5 ( x^T between_var^{-1} x
                        + (m-x)^T (1/n within_var)^{-1} (m-x) )
            = C - 0.5 x^T (between_var^{-1} + n within_var^{-1}) x + x^T z

   where z = n within_var^{-1} m, and we can write this as:

   log p(x) = C - 0.5 (x-w)^T (between_var^{-1} + n within_var^{-1}) (x-w)

   where x^T (between_var^{-1} + n within_var^{-1}) w = x^T z, i.e.
     (between_var^{-1} + n within_var^{-1}) w = z = n within_var^{-1} m,
   so
     w = (between_var^{-1} + n within_var^{-1})^{-1} * n within_var^{-1} m

   We can see that the distribution over x is Gaussian, with mean w and
   variance (between_var^{-1} + n within_var^{-1})^{-1}.  The distribution
   over y is Gaussian with the same variance, and mean m - w.

   So the update to the between-var stats will be:
     between-var-stats += w w^T + (between_var^{-1} + n within_var^{-1})^{-1}
   and the update to the within-var stats will be:
     within-var-stats += n ( (m-w)(m-w)^T
                             + (between_var^{-1} + n within_var^{-1})^{-1} )

   The drawback of this formulation is that each time we encounter a
   different value of n (the number of examples) we have to do a different
   matrix inversion.  We'll try to improve on this later using a suitable
   transform.
*/
void PldaEstimator::GetStatsFromClassMeans() {
  SpMatrix<double> between_var_inv(between_var_);
  between_var_inv.Invert();
  SpMatrix<double> within_var_inv(within_var_);
  within_var_inv.Invert();
  // mixed_var will equal (between_var^{-1} + n within_var^{-1})^{-1}.
  SpMatrix<double> mixed_var(Dim());
  int32 n = -1;  // the current number of examples for the class.

  for (size_t i = 0; i < stats_.class_info_.size(); i++) {
    const ClassInfo &info = stats_.class_info_[i];
    double weight = info.weight;
    if (info.num_examples != n) {
      n = info.num_examples;
      mixed_var.CopyFromSp(between_var_inv);
      mixed_var.AddSp(n, within_var_inv);
      mixed_var.Invert();
    }
    Vector<double> m = *(info.mean);  // the mean for this class.
    m.AddVec(-1.0 / stats_.class_weight_, stats_.sum_);  // remove global mean
    Vector<double> temp(Dim());  // n within_var^{-1} m
    temp.AddSpVec(n, within_var_inv, m, 0.0);
    Vector<double> w(Dim());  // w, as defined in the comment.
    w.AddSpVec(1.0, mixed_var, temp, 0.0);
    Vector<double> m_w(m);  // m - w
    m_w.AddVec(-1.0, w);
    between_var_stats_.AddSp(weight, mixed_var);
    between_var_stats_.AddVec2(weight, w);
    between_var_count_ += weight;
    within_var_stats_.AddSp(weight * n, mixed_var);
    within_var_stats_.AddVec2(weight * n, m_w);
    within_var_count_ += weight;
  }
}
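/*
   (Added worked example, in one dimension, not from the original source:
   take between_var = 1, within_var = 1, n = 4, and class mean m = 1.0 after
   removing the global mean.  Then
     mixed_var = (between_var^{-1} + n within_var^{-1})^{-1} = 1/5 = 0.2
     w         = mixed_var * n * within_var^{-1} * m = 0.8,   m - w = 0.2
   so the loop above adds w^2 + mixed_var = 0.84 to the between-var stats
   and n ((m - w)^2 + mixed_var) = 0.96 to the within-var stats, each with
   a count of 1.)
*/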
void PldaEstimator::EstimateFromStats() {
  within_var_.CopyFromSp(within_var_stats_);
  within_var_.Scale(1.0 / within_var_count_);
  between_var_.CopyFromSp(between_var_stats_);
  between_var_.Scale(1.0 / between_var_count_);

  KALDI_LOG << "Trace of within-class variance is " << within_var_.Trace();
  KALDI_LOG << "Trace of between-class variance is " << between_var_.Trace();
}

void PldaEstimator::EstimateOneIter() {
  ResetPerIterStats();
  GetStatsFromIntraClass();
  GetStatsFromClassMeans();
  EstimateFromStats();
  KALDI_VLOG(2) << "Objective function is " << ComputeObjf();
}

void PldaEstimator::Estimate(const PldaEstimationConfig &config,
                             Plda *plda) {
  KALDI_ASSERT(stats_.example_weight_ > 0 && "Cannot estimate with no stats");
  for (int32 i = 0; i < config.num_em_iters; i++) {
    KALDI_LOG << "Plda estimation iteration " << i
              << " of " << config.num_em_iters;
    EstimateOneIter();
  }
  GetOutput(plda);
}
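/*
   (Added usage sketch, not part of the original file: a typical estimation
   flow.  PldaStats::Sort() and WriteKaldiObject are assumed from
   ivector/plda.h and util/kaldi-io.h respectively.

     PldaStats stats;
     for (...) {                        // over training speakers
       Matrix<double> ivectors;         // one row per utterance iVector
       stats.AddSamples(1.0, ivectors);
     }
     stats.Sort();                      // PldaEstimator requires sorted stats
     PldaEstimationConfig estimation_config;  // e.g. num_em_iters = 10
     PldaEstimator estimator(stats);
     Plda plda;
     estimator.Estimate(estimation_config, &plda);
     WriteKaldiObject(plda, "plda", true);  // true: binary mode
   )
*/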
void PldaEstimator::GetOutput(Plda *plda) {
  plda->mean_ = stats_.sum_;
  plda->mean_.Scale(1.0 / stats_.class_weight_);
  KALDI_LOG << "Norm of mean of iVector distribution is "
            << plda->mean_.Norm(2.0);

  Matrix<double> transform1(Dim(), Dim());
  ComputeNormalizingTransform(within_var_, &transform1);
  // now transform1 is a matrix such that if we project with it,
  // within_var_ becomes unit.
  // between_var_proj is between_var_ after projecting with transform1.
  SpMatrix<double> between_var_proj(Dim());
  between_var_proj.AddMat2Sp(1.0, transform1, kNoTrans, between_var_, 0.0);

  Matrix<double> U(Dim(), Dim());
  Vector<double> s(Dim());
  // Do symmetric eigenvalue decomposition between_var_proj = U diag(s) U^T,
  // where U is orthogonal.
  between_var_proj.Eig(&s, &U);

  KALDI_ASSERT(s.Min() >= 0.0);
  int32 n;
  s.ApplyFloor(0.0, &n);
  if (n > 0) {
    KALDI_WARN << "Floored " << n << " eigenvalues of between-class "
               << "variance to zero.";
  }
  // Sort from greatest to smallest eigenvalue.
  SortSvd(&s, &U);

  // The transform U^T will make between_var_proj diagonal with value s
  // (i.e. U^T U diag(s) U^T U = diag(s)).  The final transform that
  // makes within_var_ unit and between_var_ diagonal is U^T transform1,
  // i.e. first transform1 and then U^T.
  plda->transform_.Resize(Dim(), Dim());
  plda->transform_.AddMatMat(1.0, U, kTrans, transform1, kNoTrans, 0.0);
  plda->psi_ = s;

  KALDI_LOG << "Diagonal of between-class variance in normalized space is "
            << s;

  if (GetVerboseLevel() >= 2) {  // at higher verbose levels, do a self-test
                                 // (just tests that this function does what
                                 // it should).
    SpMatrix<double> tmp_within(Dim());
    tmp_within.AddMat2Sp(1.0, plda->transform_, kNoTrans, within_var_, 0.0);
    KALDI_ASSERT(tmp_within.IsUnit(0.0001));
    SpMatrix<double> tmp_between(Dim());
    tmp_between.AddMat2Sp(1.0, plda->transform_, kNoTrans, between_var_, 0.0);
    KALDI_ASSERT(tmp_between.IsDiagonal(0.0001));
    Vector<double> psi(Dim());
    psi.CopyDiagFromSp(tmp_between);
    AssertEqual(psi, plda->psi_);
  }
  plda->ComputeDerivedVars();
}

void PldaUnsupervisedAdaptor::AddStats(double weight,
                                       const Vector<double> &ivector) {
  if (mean_stats_.Dim() == 0) {
    mean_stats_.Resize(ivector.Dim());
    variance_stats_.Resize(ivector.Dim());
  }
  KALDI_ASSERT(weight >= 0.0);
  tot_weight_ += weight;
  mean_stats_.AddVec(weight, ivector);
  variance_stats_.AddVec2(weight, ivector);
}

void PldaUnsupervisedAdaptor::AddStats(double weight,
                                       const Vector<float> &ivector) {
  Vector<double> ivector_dbl(ivector);
  this->AddStats(weight, ivector_dbl);
}

void PldaUnsupervisedAdaptor::UpdatePlda(
    const PldaUnsupervisedAdaptorConfig &config,
    Plda *plda) const {
  KALDI_ASSERT(tot_weight_ > 0.0);
  int32 dim = mean_stats_.Dim();
  KALDI_ASSERT(dim == plda->Dim());
  Vector<double> mean(mean_stats_);
  mean.Scale(1.0 / tot_weight_);
  SpMatrix<double> variance(variance_stats_);
  variance.Scale(1.0 / tot_weight_);
  variance.AddVec2(-1.0, mean);  // Make it the centered covariance.

  // mean_diff is the difference between the adaptation-data mean and the
  // training-data mean.  We optionally add its outer product to our total
  // covariance matrix.
  Vector<double> mean_diff(mean);
  mean_diff.AddVec(-1.0, plda->mean_);
  KALDI_ASSERT(config.mean_diff_scale >= 0.0);
  variance.AddVec2(config.mean_diff_scale, mean_diff);

  // update the plda's mean data-member with our adaptation-data mean.
  plda->mean_.CopyFromVec(mean);

  // transform_mod is a row-scaled version of plda->transform_ that
  // transforms into the space where the total covariance is 1.0.  Because
  // plda->transform_ transforms into a space where the within-class covar
  // is 1.0 and the between-class covar is diag(plda->psi_), we need to
  // scale each dimension i by 1.0 / sqrt(1.0 + plda->psi_(i)).
  Matrix<double> transform_mod(plda->transform_);
  for (int32 i = 0; i < dim; i++)
    transform_mod.Row(i).Scale(1.0 / sqrt(1.0 + plda->psi_(i)));

  // project the variance of the adaptation set into this space where
  // the total covariance is unit.
  SpMatrix<double> variance_proj(dim);
  variance_proj.AddMat2Sp(1.0, transform_mod, kNoTrans, variance, 0.0);

  // Do eigenvalue decomposition of variance_proj; this will tell us the
  // directions in which the adaptation-data covariance exceeds the
  // training-data covariance.
  Matrix<double> P(dim, dim);
  Vector<double> s(dim);
  variance_proj.Eig(&s, &P);
  SortSvd(&s, &P);
  KALDI_LOG << "Eigenvalues of adaptation-data total covariance in the space "
            << "where the training-data total covariance is unit: " << s;

  // W, B are the (within, between)-class covars in the space transformed by
  // transform_mod.
  SpMatrix<double> W(dim), B(dim);
  for (int32 i = 0; i < dim; i++) {
    W(i, i) = 1.0 / (1.0 + plda->psi_(i));
    B(i, i) = plda->psi_(i) / (1.0 + plda->psi_(i));
  }

  // OK, so variance_proj (projected by transform_mod) is P diag(s) P^T.
  // Suppose that after transform_mod we project by P^T.  Then the
  // adaptation-data's variance would be P^T P diag(s) P^T P = diag(s), and
  // the PLDA model's within-class variance would be P^T W P and its
  // between-class variance would be P^T B P.  We'd still have W + B = I in
  // this space.  First let's compute these projected variances... we call
  // them "proj2" because the data has been projected twice (actually,
  // transformed, as there is no dimension loss), by transform_mod and then
  // P^T.
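  // (Added numeric illustration: suppose in some direction the adaptation
  // data has eigenvalue s(i) = 1.5, while the training-data total covariance
  // in this space is 1.0.  The excess of 0.5 is then split between the two
  // covariances according to the config: with, say, within_covar_scale = 0.3
  // and between_covar_scale = 0.7, the loop below adds 0.15 to the
  // within-class and 0.35 to the between-class variance in that direction.)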
  SpMatrix<double> Wproj2(dim), Bproj2(dim);
  Wproj2.AddMat2Sp(1.0, P, kTrans, W, 0.0);
  Bproj2.AddMat2Sp(1.0, P, kTrans, B, 0.0);

  Matrix<double> Ptrans(P, kTrans);

  SpMatrix<double> Wproj2mod(Wproj2), Bproj2mod(Bproj2);

  for (int32 i = 0; i < dim; i++) {
    // For this eigenvalue, compute the within-class covar projected in this
    // direction, and the same for between-class.
    BaseFloat within = Wproj2(i, i),
        between = Bproj2(i, i);
    KALDI_LOG << "For " << i << "'th eigenvalue, value is " << s(i)
              << ", within-class covar in this direction is " << within
              << ", between-class is " << between;
    if (s(i) > 1.0) {
      double excess_eig = s(i) - 1.0;
      double excess_within_covar = excess_eig * config.within_covar_scale,
          excess_between_covar = excess_eig * config.between_covar_scale;
      Wproj2mod(i, i) += excess_within_covar;
      Bproj2mod(i, i) += excess_between_covar;
    }
    /*
      Below I was considering a method like the following, to try to scale
      up the dimensions that had less variance than expected in our
      sample... this didn't help, and actually when I set that power to
      +0.2 instead of -0.5 it gave me an improvement on sre08.  But I'm not
      sure about this... it just doesn't seem right.
        else {
          BaseFloat scale = pow(std::max(1.0e-10, s(i)), -0.5);
          BaseFloat max_scale = 10.0;  // I'll make this configurable later.
          scale = std::min(scale, max_scale);
          Ptrans.Row(i).Scale(scale);
        }
    */
  }

  // combined_trans is the combined transform: first transform_mod and then
  // P^T; it takes us to the space where {W,B}proj2{,mod} live.
  Matrix<double> combined_trans(dim, dim);
  combined_trans.AddMatMat(1.0, Ptrans, kNoTrans,
                           transform_mod, kNoTrans, 0.0);
  Matrix<double> combined_trans_inv(combined_trans);  // ... and its inverse.
  combined_trans_inv.Invert();

  // Wmod and Bmod are as Wproj2mod and Bproj2mod but taken back into the
  // original iVector space.
  SpMatrix<double> Wmod(dim), Bmod(dim);
  Wmod.AddMat2Sp(1.0, combined_trans_inv, kNoTrans, Wproj2mod, 0.0);
  Bmod.AddMat2Sp(1.0, combined_trans_inv, kNoTrans, Bproj2mod, 0.0);

  TpMatrix<double> C(dim);
  // Do the Cholesky decomposition Wmod = C C^T.  Now if we use C^{-1} as a
  // transform, we have C^{-1} Wmod C^{-T} = I, so it makes the within-class
  // covar unit.
  C.Cholesky(Wmod);
  TpMatrix<double> Cinv(C);
  Cinv.Invert();

  // Bmod_proj is Bmod projected by Cinv.
  SpMatrix<double> Bmod_proj(dim);
  Bmod_proj.AddTp2Sp(1.0, Cinv, kNoTrans, Bmod, 0.0);
  Vector<double> psi_new(dim);
  Matrix<double> Q(dim, dim);
  // Do symmetric eigenvalue decomposition of Bmod_proj, so
  // Bmod_proj = Q diag(psi_new) Q^T.
  Bmod_proj.Eig(&psi_new, &Q);
  SortSvd(&psi_new, &Q);
  // This means that if we use Q^T as a transform, then
  // Q^T Bmod_proj Q = diag(psi_new), hence Q^T diagonalizes Bmod_proj
  // (while leaving the within-class covar unit).
  // The final transform we want, the one that projects from our original
  // space to our newly normalized space, is first Cinv and then Q^T, i.e.
  // the matrix Q^T Cinv.
  Matrix<double> final_transform(dim, dim);
  final_transform.AddMatTp(1.0, Q, kTrans, Cinv, kNoTrans, 0.0);

  KALDI_LOG << "Old diagonal of between-class covar was: "
            << plda->psi_ << ", new diagonal is "
            << psi_new;
  plda->transform_.CopyFromMat(final_transform);
  plda->psi_.CopyFromVec(psi_new);
}

}  // namespace kaldi
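/*
   (Added usage sketch, not part of the original file: adapting an existing
   PLDA model to unlabeled in-domain iVectors, using the functions defined
   above; ReadKaldiObject/WriteKaldiObject are the generic I/O helpers from
   util/kaldi-io.h.

     kaldi::Plda plda;
     kaldi::ReadKaldiObject("plda", &plda);
     kaldi::PldaUnsupervisedAdaptorConfig adaptor_config;
     kaldi::PldaUnsupervisedAdaptor adaptor;
     for (...)                        // over unlabeled in-domain iVectors
       adaptor.AddStats(1.0, ivector);
     adaptor.UpdatePlda(adaptor_config, &plda);
     kaldi::WriteKaldiObject(plda, "plda_adapted", true);
   )
*/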