// cudamatrix/cu-matrix-speed-test.cc // Copyright 2013 Johns Hopkins University (author: Daniel Povey) // 2015 Guoguo Chen // 2017 Shiyin Kang // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include #include #include #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" #include "cudamatrix/cu-math.h" #include "cudamatrix/cu-tp-matrix.h" #include "cudamatrix/cu-sp-matrix.h" #include "cudamatrix/cu-sparse-matrix.h" using namespace kaldi; namespace kaldi { template std::string NameOf() { return (sizeof(Real) == 8 ? "" : ""); } template void TestCuMatrixSum(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; Real result = 0; for (; tim.Elapsed() < time_in_secs; iter++) { result = M.Sum(); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG<< "For CuMatrix::TestCuMatrixSum" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops, result = " << result; } template void TestCuMatrixMax(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; Real result = 0; for (; tim.Elapsed() < time_in_secs; iter++) { result = M.Max(); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG<< "For CuMatrix::TestCuMatrixMax" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops, result = " << result; } template void TestCuMatrixMin(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; Real result = 0; for (; tim.Elapsed() < time_in_secs; iter++) { result = M.Min(); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG<< "For CuMatrix::TestCuMatrixMin" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops, result = " << result; } template void TestCuMatrixDivRowsVec(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); CuVector V(dim); M.SetRandn(); V.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.DivRowsVec(V); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG<< "For CuMatrix::DivRowsVec" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixTransposeNS(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim / 2); M.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.Transpose(); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter / 2) / (tim.Elapsed() * 1.0e+09); KALDI_LOG<< "For CuMatrix::TransposeNS" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixTransposeS(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.Transpose(); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG<< "For CuMatrix::TransposeS" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixTransposeCross(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix Mf(dim / 2, dim), ref(dim, dim / 2); CuMatrix Md(dim, dim / 2); Mf.SetRandn(); ref = Mf; Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { Md.CopyFromMat(Mf, kTrans); Mf.CopyFromMat(Md, kTrans); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG<< "For CuMatrix::TransposeCross" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; AssertEqual(ref, Mf); } template void TestCuMatrixAddMat(int32 dim, int32 num_row_blocks, int32 num_col_blocks) { BaseFloat time_in_secs = 0.025; CuMatrix A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks); A.SetRandn(); B.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { for (int32 i = 0; i < num_row_blocks; i++) { for (int32 j = 0; j < num_col_blocks; j++) { A.AddMat(0.0, CuSubMatrix(B, i * dim, dim, j * dim, dim)); } } } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddMat" << NameOf() << ", for dim = " << dim << "numRowBlocks = "<< num_row_blocks << "numColBlocks = " << num_col_blocks << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixAddMatBlocks(int32 dim, int32 num_row_blocks, int32 num_col_blocks) { BaseFloat time_in_secs = 0.025; CuMatrix A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks); A.SetRandn(); B.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { A.AddMatBlocks(0.0, B); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddMatBlocks" << NameOf() << ", for dim = " << dim << ", numRowBlocks = "<< num_row_blocks << ", numColBlocks = " << num_col_blocks << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixMatMat(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim), O(dim, dim); M.SetRandn(); N.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { O.AddMatMat(1.0, M, kNoTrans, N, kNoTrans, 0.0); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddMatMat" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixMatMatBatched(int32 dim, int32 batchCount) { std::vector* > a(batchCount), b(batchCount), c(batchCount); std::vector* > A, B, C; for (int32 i = 0; i < batchCount; i++) { // first create a Matrix intance and then creat a SubMatrix instance from that a[i] = new CuMatrix(dim, dim); b[i] = new CuMatrix(dim, dim); c[i] = new CuMatrix(dim, dim); a[i]->SetRandn(); b[i]->SetRandn(); A.push_back(new CuSubMatrix(*(a[i]), 0, a[i]->NumRows(), 0, a[i]->NumCols())); B.push_back(new CuSubMatrix(*(b[i]), 0, b[i]->NumRows(), 0, b[i]->NumCols())); C.push_back(new CuSubMatrix(*(c[i]), 0, c[i]->NumRows(), 0, c[i]->NumCols())); } BaseFloat time_in_secs = 0.025; Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { AddMatMatBatched(static_cast(1.0), C, A, kNoTrans, B, kNoTrans, static_cast(0.0)); } for (int32 i = 0; i< batchCount; i++) { delete a[i]; delete b[i]; delete c[i]; delete A[i]; delete B[i]; delete C[i]; } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter * batchCount) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddMatMatBatched" << NameOf() << ", for dim = " << dim << ", batchSize = " << batchCount << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans) { BaseFloat time_in_secs = 0.015; CuMatrix M(dim, dim), N(dim, dim); CuVector v(dim); M.SetRandn(); v.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) N.AddDiagVecMat(1.0, v, M, trans, 0.0); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddDiagVecMat" << NameOf() << (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestSymInvertPosDef(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim * 2), N(dim, dim); M.SetRandn(); N.SymAddMat2(1.0, M, kNoTrans, 0.0); CuMatrix Ncopy(N); int iter = 0; Timer tim; for (;tim.Elapsed() < time_in_secs; iter++) { Ncopy.CopyFromMat(N); Ncopy.SymInvertPosDef(); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::TestCuInvertPosDef" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template static void TestCuMatrixCompObjfAndDeriv(int32 dim) { BaseFloat time_in_secs = 0.025; // Previously tested for larger dims, but test was slow. int32 n_r = dim, n_c = dim + Rand() % 5; CuMatrix A(n_r, n_c), B(n_r, n_c); B.SetRandn(); B.Add(1.0); B.ApplyFloor(1.0e-10); std::vector > labels; for(int i = 0; i < n_r; i++) { for(int j = 0; j < n_c; j++) { // have approximately one weight per row of the matrix. if (Rand() % n_c == 0) { A(i, j) = RandUniform(); MatrixElement t = {i, j, A(i, j)}; labels.push_back(t); } } } CuMatrix C(n_r, n_c); int iter = 0; Timer tim; Real a = 0.0, b = 0.0; for (;tim.Elapsed() < time_in_secs; iter++) C.CompObjfAndDeriv(labels, B, &a, &b); BaseFloat gflops = (n_r * n_c * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CompObjfAndDeriv" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; // do it one more time for correctness test. C.SetZero(); C.CompObjfAndDeriv(labels, B, &a, &b); KALDI_ASSERT(ApproxEqual(b, A.Sum())); // repeat the real test. Real sum2; // sum(i, j) A(i, j) log(B(i, j)); { CuMatrix Bcopy(B); Bcopy.ApplyLog(); sum2 = TraceMatMat(Bcopy, A, kTrans); } KALDI_ASSERT(ApproxEqual(a, sum2)); B.InvertElements(); A.MulElements(B); // each element of A is now A(i, j) / B(i, j); KALDI_ASSERT(ApproxEqual(A, C)); } template static void TestCuFindRowMaxId(int32 dim) { int32 dimM = dim, dimN = dimM + Rand() % 5; Matrix Hi(dimM, dimN); Hi.SetRandn(); CuMatrix Di(dimM, dimN); Di.CopyFromMat(Hi); std::vector Hmax(dimM); CuArray Dmax(dimN); BaseFloat time_in_secs = 0.025; int iter = 0; Timer tim; for (;tim.Elapsed() < time_in_secs; iter++) Di.FindRowMaxId(&Dmax); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::FindRowMaxId" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; // on cpu for(MatrixIndexT r=0; r max) { idx=c; max=Hi(r,c); } } Hmax[r] = idx; } std::vector Hmax2(dimM); Dmax.CopyToVec(&Hmax2); KALDI_ASSERT(Hmax == Hmax2); } template void TestCuMatrixSigmoid(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { N.Sigmoid(M); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::Sigmoid" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixHeaviside(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { N.ApplyHeaviside(); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::Heaviside" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixMulRowsGroupMat(int32 dim) { BaseFloat time_in_secs = 0.025; int32 group_size = 5; CuMatrix M(dim, dim * group_size), N(dim, dim); M.SetRandn(); N.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { M.MulRowsGroupMat(N); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * group_size * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::MulRowsGroupMat" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixDiffSoftmax(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim), L(dim, dim); M.SetRandn(); N.SetRandn(); L.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { N.DiffSoftmaxPerRow(M, L); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::DiffSoftmaxPerRow" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixDiffLogSoftmax(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim), L(dim, dim); M.SetRandn(); N.SetRandn(); L.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { N.DiffLogSoftmaxPerRow(M, L); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::DiffLogSoftmaxPerRow" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixSoftmax(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { N.SoftMaxPerRow(M); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::Softmax" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixLogSoftmax(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { N.LogSoftMaxPerRow(M); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::LogSoftmax" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixGroupPnorm(int32 dim) { BaseFloat time_in_secs = 0.025; int32 group_size = 4; CuMatrix M(dim, dim), N(dim, dim / group_size); M.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) N.GroupPnorm(M, 2.0); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::GroupPnorm" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixDiffGroupPnorm(int32 dim) { BaseFloat time_in_secs = 0.025; int32 group_size = 8; CuMatrix iv(dim, dim), ov(dim, dim / group_size); CuMatrix id(dim, dim), od(dim, dim / group_size); iv.SetRandn(); od.SetRandn(); ov.GroupPnorm(iv, 2.0); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) id.DiffGroupPnorm(iv, ov, od, 2.0); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::DiffGroupPnorm" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixGroupMax(int32 dim) { BaseFloat time_in_secs = 0.025; int32 group_size = 4; CuMatrix M(dim, dim), N(dim, dim / group_size); M.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) N.GroupMax(M); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::GroupMax" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixGroupMaxAllGroupSizes(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs;) { for (int group_size = 1; group_size <= dim; group_size++) { if (dim % group_size == 0) { CuMatrix N(dim, dim / group_size, kUndefined); N.GroupMax(M); iter++; } } } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::GroupMax (all group sizes)" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixGroupMaxDeriv(int32 dim) { BaseFloat time_in_secs = 0.025; int32 group_size = 4; CuMatrix M(dim, dim), N(dim, dim / group_size), O(dim, dim); M.SetRandn(); N.GroupMax(M); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) O.GroupMaxDeriv(M, N); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::GroupMaxDeriv" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixTraceMatMat(int32 dim) { for (int32 n = 0; n < 2; n++) { MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans); BaseFloat time_in_secs = 0.02; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { TraceMatMat(M, N, trans); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf() << (trans == kTrans ? " [transposed]" : "") << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } } template void TestCuMatrixCholesky(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.AddToDiag(100.0); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) M.Cholesky(); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::Cholesky" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyLowerToUpper(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.CopyLowerToUpper(); } CuMatrix M2(M, kTrans); AssertEqual(M, M2); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyLowerToUpper" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyFromTp(int32 dim, MatrixTransposeType trans) { BaseFloat time_in_secs = 0.025; CuTpMatrix T(dim); T.SetRandn(); CuMatrix M(dim, dim); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.CopyFromTp(T, trans); } TpMatrix T_cpu(T); Matrix M_cpu(T_cpu, trans); Matrix M2_cpu(M); AssertEqual(M_cpu, M2_cpu); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyFromTp" << (trans == kNoTrans ? "[NoTrans]":"[Trans]") << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyFromSp(int32 dim) { BaseFloat time_in_secs = 0.025; CuSpMatrix S(dim); S.SetRandn(); CuMatrix M(dim, dim); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.CopyFromSp(S); } SpMatrix S_cpu(S); Matrix M_cpu(S_cpu); Matrix M2_cpu(M); AssertEqual(M_cpu, M2_cpu); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyFromSp" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyUpperToLower(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.CopyUpperToLower(); } CuMatrix M2(M, kTrans); AssertEqual(M, M2); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyUpperToLower" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixResize(int32 dim) { BaseFloat time_in_secs = 0.025; Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { CuMatrixM(dim, dim, kUndefined); // we are testing the allocation and deallocation time. } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::TestCuMatrixResize" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixSetZeroAboveDiag(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); M.SetRandn(); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) M.SetZeroAboveDiag(); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::SetZeroAboveDiag" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixLookup(int32 dim) { BaseFloat time_in_secs = 0.025; int32 dimM = dim, dimN = dim; CuMatrix H(dimM, dimN); H.SetRandn(); std::vector indices; std::vector reference; std::vector output; // Generates the indices and the reference. int32 num_index = dim * dim; output.resize(num_index); for (int32 j = 0; j < num_index; j++) { MatrixIndexT r = Rand() % dimM; MatrixIndexT c = Rand() % dimN; Int32Pair tmp_pair; tmp_pair.first = r; tmp_pair.second = c; indices.push_back(tmp_pair); reference.push_back(H(r, c)); } Timer tim; int32 iter = 0; for (; tim.Elapsed()< time_in_secs; iter++) H.Lookup(indices, &(output[0])); BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::Lookup" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyRows1(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); std::vector reorder(dim); for (int32 i = 0; i < dim; i++) { reorder[i] = i; } CuArray reorder_cuda(reorder); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.CopyRows(N, reorder_cuda); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyRows" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyRows2(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); std::vector reorder_src(dim, NULL); for (int32 i = 0; i < dim; i++) { reorder_src[i] = N.RowData(i); } CuArray reorder_src_cuda(reorder_src); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.CopyRows(reorder_src_cuda); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyRows" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyToRows(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); std::vector reorder_dst(dim, NULL); for (int32 i = 0; i < dim; i++) { reorder_dst[i] = N.RowData(i); } CuArray reorder_dst_cuda(reorder_dst); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.CopyToRows(reorder_dst_cuda); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyToRows" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixAddRows1(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); std::vector reorder(dim); for (int32 i = 0; i < dim; i++) { reorder[i] = i; } CuArray reorder_cuda(reorder); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.AddRows(0.5, N, reorder_cuda); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddRows" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixAddRows2(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); std::vector reorder_src(dim, NULL); for (int32 i = 0; i < dim; i++) { reorder_src[i] = N.RowData(i); } CuArray reorder_src_cuda(reorder_src); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.AddRows(0.5, reorder_src_cuda); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddRows" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixAddToRows(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); std::vector reorder_dst(dim, NULL); for (int32 i = 0; i < dim; i++) { reorder_dst[i] = N.RowData(i); } CuArray reorder_dst_cuda(reorder_dst); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.AddToRows(0.5, reorder_dst_cuda); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddToRows" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixAddRowRanges(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); std::vector indexes(dim); for (int32 i = 0; i < dim; i++) { indexes[i].first = i; indexes[i].second = i + 1; } CuArray indexes_cuda(indexes); Timer tim; int32 iter = 0; for (; tim.Elapsed() < time_in_secs; iter++) { M.AddRowRanges(N, indexes_cuda); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::AddRowRanges" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuSparseMatrixTraceMatSmat(int32 dim) { for (int32 n = 0; n < 2; n++) { MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans); BaseFloat time_in_secs = 0.02; CuMatrix M(dim, dim); M.SetRandn(); std::vector > > pairs(dim); for (auto && row : pairs) { row.push_back( { MatrixIndexT(Rand() % dim), Real(Rand() % dim) }); } SparseMatrix Ncpu(dim, pairs); CuSparseMatrix N(Ncpu); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) { TraceMatSmat(M, N, trans); } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuSparseMatrix::TraceMatSmat" << NameOf() << (trans == kTrans ? " [transposed]" : "") << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } } template void CudaMatrixSpeedTest() { std::vector sizes; sizes.push_back(16); sizes.push_back(32); sizes.push_back(64); sizes.push_back(128); sizes.push_back(256); sizes.push_back(512); sizes.push_back(1024); int32 ns = sizes.size(); for (int32 s = 0; s < ns; s++) TestCuMatrixDivRowsVec(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixResize(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixAddMat(sizes[s], 3, 3); for (int32 s = 0; s < ns; s++) TestCuMatrixAddMatBlocks(sizes[s], 3, 3); for (int32 s = 0; s < ns; s++) TestCuMatrixMatMat(sizes[s]); for (int32 s = 0; s + 1 < ns; s++) TestCuMatrixMatMatBatched(sizes[s], 10); for (int32 s = 0; s < ns; s++) { TestCuMatrixAddDiagVecMat(sizes[s], kNoTrans); TestCuMatrixAddDiagVecMat(sizes[s], kTrans); } for (int32 s = 0; s < ns; s++) TestSymInvertPosDef(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCholesky(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSigmoid(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixHeaviside(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuFindRowMaxId(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCompObjfAndDeriv(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixMulRowsGroupMat(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSoftmax(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixDiffSoftmax(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixDiffLogSoftmax(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixLogSoftmax(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixGroupPnorm(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixDiffGroupPnorm(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixGroupMax(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixGroupMaxAllGroupSizes(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixGroupMaxDeriv(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixTraceMatMat(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuSparseMatrixTraceMatSmat(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyLowerToUpper(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyFromTp(sizes[s], kNoTrans); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyFromTp(sizes[s], kTrans); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyFromSp(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyUpperToLower(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSetZeroAboveDiag(sizes[s]); for (int32 s = 0; s + 2 < ns; s++) TestCuMatrixLookup(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyRows1(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyRows2(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixCopyToRows(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixAddRows1(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixAddRows2(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixAddToRows(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixAddRowRanges(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixTransposeCross(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixTransposeS(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixTransposeNS(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSum(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixMax(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixMin(sizes[s]); } } // namespace kaldi int main() { SetVerboseLevel(1); #if HAVE_CUDA == 1 int32 loop = 0; for (loop = 0; loop < 2; loop++) { if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else CuDevice::Instantiate().SelectGpuId("yes"); #endif kaldi::CudaMatrixSpeedTest(); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { kaldi::CudaMatrixSpeedTest(); } else { KALDI_WARN << "Double precision not supported"; } #else kaldi::CudaMatrixSpeedTest(); #endif #if HAVE_CUDA == 1 } // No for loop if 'HAVE_CUDA != 1', CuDevice::Instantiate().PrintProfile(); #endif KALDI_LOG << "Tests succeeded."; }