// gmmbin/gmm-train-lvtln-special.cc

// Copyright 2009-2011  Microsoft Corporation
// Copyright      2014  Vimal Manohar

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "transform/lvtln.h"
#include "hmm/posterior.h"

int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    using kaldi::int32;
    const char *usage =
        "Set one of the transforms in lvtln to the minimum-squared-error solution\n"
        "to mapping feats-untransformed to feats-transformed; posteriors may\n"
        "optionally be used to downweight/remove silence.\n"
        "Usage: gmm-train-lvtln-special [options] class-index <lvtln-in> <lvtln-out> "
        "<feats-untransformed-rspecifier> <feats-transformed-rspecifier> [<posteriors-rspecifier>]\n"
        "e.g.:\n"
        " gmm-train-lvtln-special 5 5.lvtln 6.lvtln scp:train.scp scp:train_warp095.scp ark:nosil.post\n";

    BaseFloat warp = -1.0;
    bool binary = true;
    bool normalize_var = false;
    bool normalize_covar = false;
    std::string weights_rspecifier;
    ParseOptions po(usage);
    po.Register("binary", &binary, "Write output in binary mode");
    po.Register("warp", &warp, "If supplied, can be used to set the warp factor "
                "for this transform");
    po.Register("normalize-var", &normalize_var, "Normalize diagonal of variance "
                "to be the same before and after transform.");
    po.Register("normalize-covar", &normalize_covar, "Normalize (matrix-valued) "
                "covariance to be the same before and after transform.");
    po.Register("weights-in", &weights_rspecifier,
                "Can be used to take posteriors as an scp or ark file of weights "
                "instead of giving <posteriors-rspecifier>");

    po.Read(argc, argv);

    if (po.NumArgs() < 5 || po.NumArgs() > 6) {
      po.PrintUsage();
      exit(1);
    }

    std::string class_idx_str = po.GetArg(1);
    int32 class_idx;
    if (!ConvertStringToInteger(class_idx_str, &class_idx))
      KALDI_ERR << "Expected integer first argument: got " << class_idx_str;

    std::string lvtln_rxfilename = po.GetArg(2),
        lvtln_wxfilename = po.GetArg(3),
        feats_orig_rspecifier = po.GetArg(4),
        feats_transformed_rspecifier = po.GetArg(5),
        posteriors_rspecifier = po.GetOptArg(6);

    // Get lvtln object.
    LinearVtln lvtln;
    ReadKaldiObject(lvtln_rxfilename, &lvtln);
    int32 dim = lvtln.Dim();  // feature dimension [we hope!].

    if (!normalize_covar) {
      // Below is the computation if we are not normalizing the full covariance.
      // Ignoring weighting (which is a straightforward extension), the problem is
      // this: we have original features x(t) and transformed features y(t) [both
      // x(t) and y(t) are vectors of size D].  We are training an affine transform
      // to minimize the sum-squared error between A x(t) + b and y(t).  Let x(t)^+
      // be x(t) with a 1 appended, and let w_i be the i'th row of the matrix
      // [ A; b ], as in CMLLR.  We are minimizing
      //   \sum_{t = 1}^T \sum_{i = 1}^D (w_i^T x(t)^+ - y_i(t))^2.
      // We can express this in terms of sufficient statistics as:
      //   \sum_{i = 1}^D w_i^T Q w_i - 2 w_i^T l_i + c_i,
      // where
      //   Q = \sum_{t = 1}^T x(t)^+ (x(t)^+)^T
      //   l_i = \sum_{t = 1}^T x(t)^+ y_i(t)
      //   c_i = \sum_{t = 1}^T y_i(t)^2.
      // The solution for row i is w_i = Q^{-1} l_i,
      // and the sum-squared error for index i is:
      //   w_i^T Q w_i - 2 w_i^T l_i + c_i.
      // Note that for LVTLN purposes we throw away the "offset" element (i.e. the
      // last element of each row w_i).
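      // The weighted version implemented below is the same computation with
      // per-frame weights; writing gamma(t) for the weight of frame t (a
      // notation introduced here, taken from the posteriors or the weights
      // file), the statistics simply become weighted sums:
      //   Q = \sum_{t = 1}^T gamma(t) x(t)^+ (x(t)^+)^T
      //   l_i = \sum_{t = 1}^T gamma(t) x(t)^+ y_i(t)
      //   c_i = \sum_{t = 1}^T gamma(t) y_i(t)^2
      //   beta = \sum_{t = 1}^T gamma(t)  [total weight, used to normalize
      //                                    the diagnostics],
      // and the per-row solution w_i = Q^{-1} l_i is unchanged.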

      // Declare statistics we use to estimate transform.
      SpMatrix<double> Q(dim+1);  // quadratic stats == weighted outer product of x^+.
      Matrix<double> l(dim, dim+1);  // i'th row of l is l_i.
      Vector<double> c(dim);  // i'th element is c_i.
      double beta = 0.0;  // total weight.
      Vector<double> sum_xplus(dim+1);  // weighted sum of x^+.
      Vector<double> sumsq_x(dim);  // weighted sum-square of elements of x.
      Vector<double> sumsq_diff(dim);  // weighted sum-square of elements of x - y.

      SequentialBaseFloatMatrixReader x_reader(feats_orig_rspecifier);
      RandomAccessBaseFloatMatrixReader y_reader(feats_transformed_rspecifier);
      RandomAccessPosteriorReader post_reader(posteriors_rspecifier);
      RandomAccessBaseFloatVectorReader weights_reader(weights_rspecifier);

      for (; !x_reader.Done(); x_reader.Next()) {
        std::string utt = x_reader.Key();
        if (!y_reader.HasKey(utt)) {
          KALDI_WARN << "No transformed features for key " << utt;
          continue;
        }
        const Matrix<BaseFloat> &x_feats = x_reader.Value();
        const Matrix<BaseFloat> &y_feats = y_reader.Value(utt);
        if (x_feats.NumRows() != y_feats.NumRows() ||
            x_feats.NumCols() != y_feats.NumCols() ||
            x_feats.NumCols() != dim) {
          KALDI_ERR << "Number of rows and/or columns differs in features, "
                    << "or features have different dim from lvtln object";
        }

        Vector<BaseFloat> weights(x_feats.NumRows());
        if (weights_rspecifier == "" && posteriors_rspecifier != "") {
          if (!post_reader.HasKey(utt)) {
            KALDI_WARN << "No posteriors for utterance " << utt;
            continue;
          }
          const Posterior &post = post_reader.Value(utt);
          if (static_cast<int32>(post.size()) != x_feats.NumRows())
            KALDI_ERR << "Mismatch in size of posterior";
          for (size_t i = 0; i < post.size(); i++)
            for (size_t j = 0; j < post[i].size(); j++)
              weights(i) += post[i][j].second;
        } else if (weights_rspecifier != "") {
          if (!weights_reader.HasKey(utt)) {
            KALDI_WARN << "No weights for utterance " << utt;
            continue;
          }
          weights.CopyFromVec(weights_reader.Value(utt));
        } else {
          weights.Add(1.0);  // no posteriors or weights given: all frames weighted equally.
        }

        // Now get stats.
        for (int32 i = 0; i < x_feats.NumRows(); i++) {
          BaseFloat weight = weights(i);
          SubVector<BaseFloat> x_row(x_feats, i), y_row(y_feats, i);
          Vector<double> xplus_row_dbl(dim+1);  // x(t) with a 1 appended.
          for (int32 j = 0; j < dim; j++)
            xplus_row_dbl(j) = x_row(j);
          xplus_row_dbl(dim) = 1.0;
          Vector<double> y_row_dbl(y_row);
          Q.AddVec2(weight, xplus_row_dbl);
          l.AddVecVec(weight, y_row_dbl, xplus_row_dbl);
          beta += weight;
          sum_xplus(dim) += weight;
          for (int32 j = 0; j < dim; j++) {
            sum_xplus(j) += weight * x_row(j);
            sumsq_x(j) += weight * x_row(j) * x_row(j);
            sumsq_diff(j) += weight * (x_row(j) - y_row(j)) * (x_row(j) - y_row(j));
            c(j) += weight * y_row(j) * y_row(j);
          }
        }
      }
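
      // With the statistics accumulated, we solve row by row: w_i = Q^{-1} l_i,
      // keeping only the first dim elements of each w_i (the offset is
      // discarded).  The diagnostics below are normalized by the total weight
      // beta; in particular, since the transformed value of dimension i is
      // w_i^T x^+, its second moment is w_i^T Q w_i / beta and its mean is
      // w_i^T sum_xplus / beta, which is how the --normalize-var branch gets
      // the transformed variance without a second pass over the data.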
      Matrix<BaseFloat> A(dim, dim);  // will give this to the LVTLN object
      // as the transform matrix.
      SpMatrix<double> Qinv(Q);
      Qinv.Invert();
      for (int32 i = 0; i < dim; i++) {
        Vector<double> w_i(dim+1);
        SubVector<double> l_i(l, i);
        w_i.AddSpVec(1.0, Qinv, l_i, 0.0);  // w_i := Q^{-1} l_i
        SubVector<double> a_i(w_i, 0, dim);
        A.Row(i).CopyFromVec(a_i);

        BaseFloat error = (VecSpVec(w_i, Q, w_i) - 2.0 * VecVec(w_i, l_i) + c(i)) / beta,
            sqdiff = sumsq_diff(i) / beta,
            scatter = sumsq_x(i) / beta;
        KALDI_LOG << "For dimension " << i
                  << ", sum-squared error in linear approximation is " << error
                  << ", versus feature-difference " << sqdiff
                  << ", orig-sumsq is " << scatter;
        if (normalize_var) {  // add a scaling to normalize the variance.
          double x_var = scatter - pow(sum_xplus(i) / beta, 2.0);
          double y_var = VecSpVec(w_i, Q, w_i) / beta
              - pow(VecVec(w_i, sum_xplus) / beta, 2.0);
          double scale = sqrt(x_var / y_var);
          KALDI_LOG << "For dimension " << i
                    << ", variance of original and transformed data is " << x_var
                    << " and " << y_var << " respectively; scaling matrix row by "
                    << scale << " to make them equal.";
          A.Row(i).Scale(scale);
        }
      }
      lvtln.SetTransform(class_idx, A);
    } else {
      // Here is the computation if we normalize the full covariance:
      // see the document "Notes for affine-transform-based VTLN" for an
      // explanation, here:
      // http://www.danielpovey.com/files/2010_vtln_notes.pdf

      double T = 0.0;  // total weight.
      SpMatrix<double> XX(dim);  // weighted sum of x x^T.
      Vector<double> x(dim);  // weighted sum of x.
      Vector<double> y(dim);  // weighted sum of y.
      Matrix<double> XY(dim, dim);  // weighted sum of x y^T.

      SequentialBaseFloatMatrixReader x_reader(feats_orig_rspecifier);
      RandomAccessBaseFloatMatrixReader y_reader(feats_transformed_rspecifier);
      RandomAccessPosteriorReader post_reader(posteriors_rspecifier);

      for (; !x_reader.Done(); x_reader.Next()) {
        std::string utt = x_reader.Key();
        if (!y_reader.HasKey(utt)) {
          KALDI_WARN << "No transformed features for key " << utt;
          continue;
        }
        const Matrix<BaseFloat> &x_feats = x_reader.Value();
        const Matrix<BaseFloat> &y_feats = y_reader.Value(utt);
        if (x_feats.NumRows() != y_feats.NumRows() ||
            x_feats.NumCols() != y_feats.NumCols() ||
            x_feats.NumCols() != dim) {
          KALDI_ERR << "Number of rows and/or columns differs in features, "
                    << "or features have different dim from lvtln object";
        }

        Vector<BaseFloat> weights(x_feats.NumRows());
        if (posteriors_rspecifier != "") {
          if (!post_reader.HasKey(utt)) {
            KALDI_WARN << "No posteriors for utterance " << utt;
            continue;
          }
          const Posterior &post = post_reader.Value(utt);
          if (static_cast<int32>(post.size()) != x_feats.NumRows())
            KALDI_ERR << "Mismatch in size of posterior";
          for (size_t i = 0; i < post.size(); i++)
            for (size_t j = 0; j < post[i].size(); j++)
              weights(i) += post[i][j].second;
        } else {
          weights.Add(1.0);
        }

        // Now get stats.
        for (int32 i = 0; i < x_feats.NumRows(); i++) {
          BaseFloat weight = weights(i);
          SubVector<BaseFloat> x_row(x_feats, i), y_row(y_feats, i);
          Vector<double> x_dbl(x_row), y_dbl(y_row);
          T += weight;
          XX.AddVec2(weight, x_dbl);
          x.AddVec(weight, x_row);
          y.AddVec(weight, y_row);
          XY.AddVecVec(weight, x_dbl, y_dbl);
        }
      }
      KALDI_ASSERT(T > 0.0);
      Vector<double> xbar(x);
      xbar.Scale(1.0 / T);  // xbar is now the weighted mean of x.

      SpMatrix<double> S(XX);
      S.Scale(1.0 / T);
      S.AddVec2(-1.0, xbar);  // S is now the covariance of x.
      TpMatrix<double> C_tp(dim);
      C_tp.Cholesky(S);  // get Cholesky factor.
      TpMatrix<double> Cinv_tp(C_tp);
      Cinv_tp.Invert();
      Matrix<double> C(C_tp);  // use regular matrix as more stuff is
      // implemented for this case.
      Matrix<double> Cinv(Cinv_tp);
      Matrix<double> P0(XY);
      P0.AddVecVec(-1.0, xbar, y);
      Matrix<double> P(dim, dim), tmp(dim, dim);
      tmp.AddMatMat(1.0, P0, kNoTrans, Cinv, kTrans, 0.0);  // tmp := P0 * C^{-T}
      P.AddMatMat(1.0, Cinv, kNoTrans, tmp, kNoTrans, 0.0);  // P := C^{-1} * P0 * C^{-T}
      Vector<double> l(dim);
      Matrix<double> U(dim, dim), Vt(dim, dim);
      P.Svd(&l, &U, &Vt);
      l.Scale(1.0 / T);  // normalize for diagnostic purposes.
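
      // The orthogonal matrix N := V * U^T formed below (from the SVD
      // P = U diag(l) V^T) is conjugated by the Cholesky factor C of S to
      // give M := C * N * C^{-1}.  Any M of this form exactly preserves the
      // input covariance: since S = C C^T and N N^T = I,
      //   M S M^T = C N C^{-1} (C C^T) C^{-T} N^T C^T = C N N^T C^T = S.
      // Among orthogonal choices of N, the notes linked above show this one
      // minimizes the squared error: it maximizes the cross-term tr(N P),
      // which for N = V U^T equals the sum of the singular values of P.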
KALDI_LOG << "Singular values of P are: " << l; Matrix N(dim, dim); N.AddMatMat(1.0, Vt, kTrans, U, kTrans, 0.0); // N := V * U^T. Matrix M(dim, dim); tmp.AddMatMat(1.0, N, kNoTrans, Cinv, kNoTrans, 0.0); // tmp := N * C^{-1} M.AddMatMat(1.0, C, kNoTrans, tmp, kNoTrans, 0.0); // M := C * tmp = C * N * C^{-1} Matrix Mf(M); lvtln.SetTransform(class_idx, Mf); // in this setup we don't // need the offset, v. } if (warp >= 0.0) lvtln.SetWarp(class_idx, warp); { // Write lvtln object. Output ko(lvtln_wxfilename, binary); lvtln.Write(ko.Stream(), binary); } return 0; } catch(const std::exception &e) { std::cerr << e.what(); return -1; } }