// nnet3/natural-gradient-online-test.cc // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "nnet3/natural-gradient-online.h" #include "util/common-utils.h" namespace kaldi { namespace nnet3 { // Simple version of OnlineNaturalGradient that we use to make // sure it is behaving as advertised. class OnlineNaturalGradientSimple { public: OnlineNaturalGradientSimple(): rank_(40), num_samples_history_(2000.0), alpha_(4.0), epsilon_(1.0e-10), delta_(5.0e-04) { } void SetRank(int32 rank) { rank_ = rank; } void PreconditionDirections( CuMatrixBase *R, CuVectorBase *row_prod, BaseFloat *scale); private: BaseFloat Eta(int32 N) const; void PreconditionDirectionsCpu( MatrixBase *R, VectorBase *row_prod, BaseFloat *scale); void Init(const MatrixBase &R0); void InitDefault(int32 D); int32 rank_; double num_samples_history_; double alpha_; double epsilon_; double delta_; // Fisher matrix defined as F_t = R_t^T diag(d_t) R_t + rho_t I. Vector d_t_; Matrix R_t_; double rho_t_; }; void OnlineNaturalGradientSimple::PreconditionDirections( CuMatrixBase *R, CuVectorBase *row_prod, BaseFloat *scale) { Matrix R_cpu(*R); Vector row_prod_cpu(*row_prod); Matrix R_cpu_dbl(R_cpu); Vector row_prod_cpu_dbl(row_prod_cpu); PreconditionDirectionsCpu(&R_cpu_dbl, &row_prod_cpu_dbl, scale); row_prod_cpu.CopyFromVec(row_prod_cpu_dbl); R_cpu.CopyFromMat(R_cpu_dbl); R->CopyFromMat(R_cpu); row_prod->CopyFromVec(row_prod_cpu); } void OnlineNaturalGradientSimple::InitDefault(int32 D) { if (rank_ >= D) { KALDI_WARN << "Rank " << rank_ << " of online preconditioner is >= dim " << D << ", setting it to " << (D - 1) << " (but this is probably still too high)"; rank_ = D - 1; } int32 R = rank_; R_t_.Resize(R, D); for (int32 r = 0; r < R; r++) { std::vector cols; for (int32 c = r; c < D; c += R) cols.push_back(c); for (int32 i = 0; i < cols.size(); i++) { int32 c = cols[i]; R_t_(r, c) = (i == 0 ? 1.1 : 1.0) / sqrt(1.1 * 1.1 + cols.size() - 1); } } d_t_.Resize(R); d_t_.Set(epsilon_); rho_t_ = epsilon_; } void OnlineNaturalGradientSimple::Init(const MatrixBase &R0) { int32 D = R0.NumCols(), N = R0.NumRows(); InitDefault(D); int32 num_init_iters = 3; for (int32 i = 0; i < num_init_iters; i++) { CuMatrix R0_copy(R0); CuVector row_products(N); BaseFloat scale; PreconditionDirections(&R0_copy, &row_products, &scale); } } BaseFloat OnlineNaturalGradientSimple::Eta(int32 N) const { KALDI_ASSERT(num_samples_history_ > 0.0); BaseFloat ans = 1.0 - exp(-N / num_samples_history_); if (ans > 0.9) ans = 0.9; return ans; } void OnlineNaturalGradientSimple::PreconditionDirectionsCpu( MatrixBase *X_t, VectorBase *row_prod, BaseFloat *scale) { if (R_t_.NumRows() == 0) Init(*X_t); int32 R = R_t_.NumRows(), D = R_t_.NumCols(), N = X_t->NumRows(); BaseFloat eta = Eta(N); SpMatrix F_t(D); // F_t =(def) R_t^T D_t R_t + \rho_t I F_t.AddToDiag(rho_t_); F_t.AddMat2Vec(1.0, R_t_, kTrans, d_t_, 1.0); // Make sure F_t is +ve definite. { KALDI_ASSERT(d_t_.Min() > 0); Vector eigs(D); F_t.Eig(&eigs, NULL); KALDI_ASSERT(eigs.Min() > 0); } // S_t =(def) 1/N X_t^T X_t. SpMatrix S_t(D); S_t.AddMat2(1.0 / N, *X_t, kTrans, 0.0); // T_t =(def) \eta S_t + (1-\eta) F_t SpMatrix T_t(D); T_t.AddSp(eta, S_t); T_t.AddSp(1.0 - eta, F_t); // Y_t =(def) R_t T_t Matrix Y_t(R, D); Y_t.AddMatSp(1.0, R_t_, kNoTrans, T_t, 0.0); // Z_t =(def) Y_t Y_t^T SpMatrix Z_t(R); Z_t.AddMat2(1.0, Y_t, kNoTrans, 0.0); Matrix U_t(R, R); Vector c_t(R); // decompose Z_t = U_t C_t U_t^T Z_t.Eig(&c_t, &U_t); SortSvd(&c_t, &U_t); double c_t_floor = pow(rho_t_ * (1.0 - eta), 2); int32 nf; c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) { KALDI_WARN << "Floored " << nf << " elements of c_t."; } // KALDI_LOG << "c_t is " << c_t; // KALDI_LOG << "U_t is " << U_t; // KALDI_LOG << "Z_t is " << Z_t; Vector sqrt_c_t(c_t); sqrt_c_t.ApplyPow(0.5); Vector inv_sqrt_c_t(sqrt_c_t); inv_sqrt_c_t.InvertElements(); Matrix R_t1(R, D); // R_{t+1} = C_t^{-0.5} U_t^T Y_t R_t1.AddMatMat(1.0, U_t, kTrans, Y_t, kNoTrans, 0.0); R_t1.MulRowsVec(inv_sqrt_c_t); double rho_t1 = (1.0 / (D - R)) * (eta * S_t.Trace() + (1.0 - eta) * (D * rho_t_ + d_t_.Sum()) - sqrt_c_t.Sum()); Vector d_t1(sqrt_c_t); d_t1.Add(-rho_t1); double floor_val = std::max(epsilon_, delta_ * sqrt_c_t.Max()); if (rho_t1 < floor_val) { KALDI_WARN << "flooring rho_{t+1} to " << floor_val << ", was " << rho_t1; rho_t1 = floor_val; } d_t1.ApplyFloor(floor_val, &nf); if (nf > 0) { KALDI_VLOG(3) << "d_t1 was " << d_t1; KALDI_WARN << "Floored " << nf << " elements of d_{t+1}."; } // a check. if (nf == 0 && rho_t1 > floor_val) { double tr_F_t1 = D * rho_t1 + d_t1.Sum(), tr_T_t = T_t.Trace(); AssertEqual(tr_F_t1, tr_T_t); } // G_t = F_t + alpha/D tr(F_t) SpMatrix G_t(F_t); G_t.AddToDiag(alpha_ / D * F_t.Trace()); SpMatrix G_t_inv(G_t); G_t_inv.Invert(); double beta_t = rho_t_ + alpha_/D * F_t.Trace(); // X_hat_t = beta_t X_t G_t^{-1}. Matrix X_hat_t(N, D); X_hat_t.AddMatSp(beta_t, *X_t, kNoTrans, G_t_inv, 0.0); double tr_x_x = TraceMatMat(*X_t, *X_t, kTrans), tr_Xhat_Xhat = TraceMatMat(X_hat_t, X_hat_t, kTrans); double gamma = (tr_Xhat_Xhat == 0 ? 1.0 : sqrt(tr_x_x / tr_Xhat_Xhat)); X_t->CopyFromMat(X_hat_t); row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0); *scale = gamma; // Update the parameters rho_t_ = rho_t1; d_t_.CopyFromVec(d_t1); R_t_.CopyFromMat(R_t1); KALDI_VLOG(3) << "rho_t_ = " << rho_t_; KALDI_VLOG(3) << "d_t_ = " << d_t_; KALDI_VLOG(3) << "R_t_ = " << R_t_; { // check that R_t_ R_t_^T = I. SpMatrix unit(R); unit.AddMat2(1.0, R_t_, kNoTrans, 0.0); if (!unit.IsUnit(1.0e-03)) { KALDI_WARN << "R is not orthogonal, reorthogonalizing."; for (int32 i = 0; i < R; i++) { SubVector row(R_t_, i); for (int32 j = 0; j < i; j++) { SubVector row_j(R_t_, j); row.AddVec(-VecVec(row_j, row), row_j); } row.Scale(1.0 / row.Norm(2.0)); } } unit.AddMat2(1.0, R_t_, kNoTrans, 0.0); KALDI_ASSERT(unit.IsUnit(1.0e-03)); } } void UnitTestPreconditionDirectionsOnline() { MatrixIndexT R = 1 + Rand() % 30, // rank of correction N = (2 * R) + Rand() % 30, // batch size D = R + 1 + Rand() % 20; // problem dimension. Must be > R. // Test sometimes with features that are all-zero or all-one; this will // help to make sure low-rank or zero input doesn't crash the code. bool zero = false; bool one = false; if (Rand() % 3 == 0) zero = true; //else if (Rand() % 2 == 0) one = true; CuVector row_prod1(N); BaseFloat gamma1, gamma2; BaseFloat big_eig_factor = RandInt(1, 20); big_eig_factor = big_eig_factor * big_eig_factor; Vector big_eig_vector(D); big_eig_vector.SetRandn(); big_eig_vector.Scale(big_eig_factor); OnlineNaturalGradientSimple preconditioner1; OnlineNaturalGradient preconditioner2; preconditioner1.SetRank(R); preconditioner2.SetRank(R); preconditioner2.TurnOnDebug(); int32 num_iters = 100; for (int32 iter = 0; iter < num_iters; iter++) { Matrix M_cpu(N, D); if (one) M_cpu.Set(1.0); else if (!zero) { M_cpu.SetRandn(); Vector rand_vec(N); rand_vec.SetRandn(); M_cpu.AddVecVec(1.0, rand_vec, big_eig_vector); } CuMatrix M(M_cpu); CuMatrix Mcopy1(M), Mcopy2(M); preconditioner1.PreconditionDirections(&Mcopy1, &row_prod1, &gamma1); preconditioner2.PreconditionDirections(&Mcopy2, &gamma2); BaseFloat trace1 = TraceMatMat(M, M, kTrans), trace2 = TraceMatMat(Mcopy1, Mcopy1, kTrans); AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02); AssertEqual(Mcopy1, Mcopy2); AssertEqual(gamma1, gamma2, 1.0e-02); // make sure positive definite CuVector inner_prods(M.NumRows()); inner_prods.AddDiagMatMat(1.0, M, kNoTrans, Mcopy1, kTrans, 0.0); KALDI_ASSERT(inner_prods.Min() >= 0.0); } return; } } // namespace nnet3 } // namespace kaldi int main() { using namespace kaldi; using namespace kaldi::nnet3; for (int32 loop = 0; loop < 2; loop++) { #if HAVE_CUDA == 1 CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU else CuDevice::Instantiate().SelectGpuId("optional"); // -2 .. automatic selection #endif for (int32 i = 0; i < 5; i++) { UnitTestPreconditionDirectionsOnline(); } } }