// cudamatrix/cu-sp-matrix.cc // Copyright 2013 Karel Vesely // 2014-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #if HAVE_CUDA == 1 #include #include #endif #include "base/timer.h" #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-vector.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" #include "cudamatrix/cu-math.h" #include "cudamatrix/cu-sp-matrix.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cublas-wrappers.h" namespace kaldi { template void CuSpMatrix::CopyFromMat(const CuMatrixBase &M, SpCopyType copy_type) { KALDI_ASSERT(this->num_rows_ == M.NumRows() && this->num_rows_ == M.NumCols()); if (this->num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; MatrixIndexT D = this->NumRows(); if (D == 0) return; switch (copy_type) { case kTakeMeanAndCheck: KALDI_ERR << "kTakeMeanAndCheck not supported!"; // The grid/block dimensions have been very roughly tuned for the // individual cases. case kTakeMean: { dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK)); cuda_take_mean(dimGrid, dimBlock, M.Data(), this->data_, M.Dim()); CU_SAFE_CALL(cudaGetLastError()); } break; case kTakeLower: { int32 block_size = std::min(CU1DBLOCK, this->num_rows_); dim3 dimBlock(1, block_size); dim3 dimGrid(D, n_blocks(D, block_size)); cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim()); CU_SAFE_CALL(cudaGetLastError()); } break; case kTakeUpper: { dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK)); cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim()); CU_SAFE_CALL(cudaGetLastError()); } break; default: KALDI_ASSERT("Invalid argument to CuSpMatrix::CopyFromMat"); } CuDevice::Instantiate().AccuProfile("CuSpMatrix::CopyFromMat(from CuMatrixBase)", tim); } else #endif { Mat().CopyFromMat(M.Mat(), copy_type); } } template void CuSpMatrix::Invert() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuMatrix mat(this->num_rows_, this->num_rows_); mat.CopyFromSp(*this); mat.SymInvertPosDef(); this->CopyFromMat(mat); } else #endif { // Use inversion of CPU-based SpMatrix. Mat().Invert(); } } template void CuSpMatrix::AddVec2(const Real alpha, const CuVectorBase &v) { KALDI_ASSERT(v.Dim() == this->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->num_rows_ == 0) return; CuTimer tim; size_t nr = this->num_rows_; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(nr, CU2DBLOCK), n_blocks(nr, CU2DBLOCK)); CUBLAS_SAFE_CALL(cublas_spr(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, this->num_rows_, alpha, v.Data(), 1, this->Data())); CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim); } else #endif { Mat().AddVec2(alpha, v.Vec()); } } template void CuSpMatrix::AddMat2(const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, const Real beta) { KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows()) || (transM == kTrans && this->NumRows() == M.NumCols())); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->num_rows_ == 0) return; CuTimer tim; MatrixIndexT this_dim = this->NumRows(), m_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows()); if (this_dim == 0) return; if (alpha == 0.0) { if (beta != 1.0) this->Scale(beta); return; } cublasOperation_t trans = (transM == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T); CuMatrix tmp_mat(*this); cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, this_dim, m_other_dim, alpha, M.Data(), M.Stride(), beta, tmp_mat.Data(), tmp_mat.Stride()); this->CopyFromMat(tmp_mat, kTakeLower); CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddMat2", tim); } else #endif { Mat().AddMat2(alpha, M.Mat(), transM, beta); } } /** * C++ templatd wrapper of ANSI-C CUBLAS function GEMM (matrix multiply) */ template Real TraceSpSp(const CuSpMatrix &A, const CuSpMatrix &B) { KALDI_ASSERT(A.NumRows() == B.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (A.NumRows() == 0) return 0.0; MatrixIndexT nr = A.NumRows(), size = nr * (nr+1) / 2; CuVector Adiag(nr, kUndefined); CuVector Bdiag(nr, kUndefined); Adiag.CopyDiagFromPacked(A); Bdiag.CopyDiagFromPacked(B); CuSubVector Aall(A.Data(), size); CuSubVector Ball(B.Data(), size); // Below, we subtrace VecVec(Adiag, Bdiag) to remove double-counting // on the diagonal. return 2.0 * VecVec(Aall, Ball) - VecVec(Adiag, Bdiag); } else #endif { return TraceSpSp(A.Mat(), B.Mat()); } } template float TraceSpSp(const CuSpMatrix &A, const CuSpMatrix &B); template float TraceSpSp(const CuSpMatrix &A, const CuSpMatrix &B); template double TraceSpSp(const CuSpMatrix &A, const CuSpMatrix &B); template double TraceSpSp(const CuSpMatrix &A, const CuSpMatrix &B); template bool CuSpMatrix::ApproxEqual(const CuSpMatrix &B, Real tol) const { KALDI_ASSERT(this->NumRows() == B.NumRows()); CuSpMatrix diff(*this); diff.AddSp(-1.0, B); Real a = this->FrobeniusNorm(), b = B.FrobeniusNorm(), d = diff.FrobeniusNorm(); return (d <= tol * std::max(a, b)); } template bool CuSpMatrix::IsUnit(Real tol) const { // want to return: //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.: //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows() // trace((*this - I)(*this - I)) <= tol * NumRows() // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows() // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows() // Note: we could do this more efficiently still, by slightly changing the // definition of IsUnit and getting rid of the extra stuff inside TraceSpSp // that corrects for the diagonal being counted twice. return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <= tol * this->NumRows()); } template CuSpMatrix& CuSpMatrix::operator = (const CuSpMatrix &in) { this->Resize(in.NumRows(), kUndefined); this->CopyFromPacked(in); return *this; } template class CuSpMatrix; template class CuSpMatrix; } // namespace