// cudamatrix/cu-sparse-matrix.cc // Copyright 2015 Guoguo Chen // 2015 Johns Hopkins University (author: Daniel Povey) // 2017 Shiyin Kang // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #if HAVE_CUDA == 1 #include #include #endif #include #include #include "base/timer.h" #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-vector.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" #include "cudamatrix/cu-array.h" #include "cudamatrix/cu-math.h" #include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cublas-wrappers.h" namespace kaldi { template MatrixIndexT CuSparseMatrix::NumRows() const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { return num_rows_; } else #endif { return Smat().NumRows(); } } template MatrixIndexT CuSparseMatrix::NumCols() const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { return num_cols_; } else #endif { return Smat().NumCols(); } } template MatrixIndexT CuSparseMatrix::NumElements() const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { return nnz_; } else #endif { return Smat().NumElements(); } } template Real CuSparseMatrix::Sum() const { if (NumElements() == 0) return 0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuSubVector sum_vec(CsrVal(), NumElements()); return sum_vec.Sum(); } else #endif { return Smat().Sum(); } } template Real CuSparseMatrix::FrobeniusNorm() const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuSubVector element_vec(CsrVal(), NumElements()); return element_vec.Norm(2); } else #endif { return Smat().FrobeniusNorm(); } } template void CuSparseMatrix::SelectRows(const CuArray &row_indexes, const CuSparseMatrix &smat_other) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; // Calculate nnz and row_ptr before copying selected col_idx and val. // We do this on CPU for now. We will move this part to GPU is mem copy // becomes a bottle-neck here. std::vector row_indexes_cpu(row_indexes.Dim()); row_indexes.CopyToVec(&row_indexes_cpu); CuSubArray other_row_ptr(smat_other.CsrRowPtr(), smat_other.NumRows() + 1); std::vector other_row_ptr_cpu(smat_other.NumRows() + 1); other_row_ptr.CopyToVec(&other_row_ptr_cpu); int nnz = 0; std::vector row_ptr_cpu(row_indexes_cpu.size() + 1); for (int i = 0; i < row_indexes_cpu.size(); ++i) { row_ptr_cpu[i] = nnz; nnz += other_row_ptr_cpu[row_indexes_cpu[i] + 1] - other_row_ptr_cpu[row_indexes_cpu[i]]; } row_ptr_cpu[row_indexes_cpu.size()] = nnz; Resize(row_indexes.Dim(), smat_other.NumCols(), nnz, kUndefined); CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.CopyFromVec(row_ptr_cpu); // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all selected rows. const int warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y)); cuda_select_rows(dimGrid, dimBlock, CsrRowPtr(), CsrColIdx(), CsrVal(), row_indexes.Data(), row_indexes.Dim(), smat_other.CsrRowPtr(), smat_other.CsrColIdx(), smat_other.CsrVal()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { std::vector row_indexes_cpu(row_indexes.Dim()); row_indexes.CopyToVec(&row_indexes_cpu); Smat().SelectRows(row_indexes_cpu, smat_other.Smat()); } } template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, int32 dim, MatrixTransposeType trans) : num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined); if (NumElements() == 0) { return; } CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Sequence(0); CuSubArray col_idx(CsrColIdx(), NumElements()); col_idx.CopyFromArray(indexes); CuSubVector val(CsrVal(), NumElements()); val.Set(1); if (trans == kTrans) { CuSparseMatrix tmp(*this, kTrans); this->Swap(&tmp); } } else #endif { std::vector idx(indexes.Dim()); indexes.CopyToVec(&idx); SparseMatrix tmp(idx, dim, trans); Smat().Swap(&tmp); } } template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, const CuVectorBase &weights, int32 dim, MatrixTransposeType trans) : num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined); if (NumElements() == 0) { return; } CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Sequence(0); CuSubArray col_idx(CsrColIdx(), NumElements()); col_idx.CopyFromArray(indexes); CuSubVector val(CsrVal(), NumElements()); val.CopyFromVec(weights); if (trans == kTrans) { CuSparseMatrix tmp(*this, kTrans); this->Swap(&tmp); } } else #endif { std::vector idx(indexes.Dim()); indexes.CopyToVec(&idx); SparseMatrix tmp(idx, weights.Vec(), dim, trans); Smat().Swap(&tmp); } } template CuSparseMatrix& CuSparseMatrix::operator = ( const SparseMatrix &smat) { this->CopyFromSmat(smat); return *this; } template CuSparseMatrix& CuSparseMatrix::operator = ( const CuSparseMatrix &smat) { this->CopyFromSmat(smat, kNoTrans); return *this; } template void CuSparseMatrix::Resize(const MatrixIndexT num_rows, const MatrixIndexT num_cols, const MatrixIndexT nnz, MatrixResizeType resize_type) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); if (num_rows == NumRows() && num_cols == NumCols() && nnz == NumElements()) { if (resize_type == kSetZero) { CuSubVector val(CsrVal(), NumElements()); val.Set(0); } return; } Destroy(); CuTimer tim; if (num_rows * num_cols == 0) { KALDI_ASSERT(num_rows == 0); KALDI_ASSERT(num_cols == 0); KALDI_ASSERT(nnz == 0); num_rows_ = 0; num_cols_ = 0; nnz_ = 0; csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( 1 * sizeof(int))); csr_val_ = NULL; } else { KALDI_ASSERT(num_rows > 0); KALDI_ASSERT(num_cols > 0); KALDI_ASSERT(nnz >= 0 && nnz <= num_rows * static_cast(num_cols)); num_rows_ = num_rows; num_cols_ = num_cols; nnz_ = nnz; csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( (num_rows + 1 + nnz) * sizeof(int))); csr_val_ = static_cast(CuDevice::Instantiate().Malloc( nnz * sizeof(Real))); CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Set(nnz); if (resize_type == kSetZero) { CuSubVector val(CsrVal(), NumElements()); val.Set(0); } } CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Smat().Resize(num_rows, num_cols, resize_type); } } template void CuSparseMatrix::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; if (csr_row_ptr_col_idx_) { CuDevice::Instantiate().Free(csr_row_ptr_col_idx_); } if (csr_val_) { CuDevice::Instantiate().Free(csr_val_); } num_rows_ = 0; num_cols_ = 0; nnz_ = 0; csr_row_ptr_col_idx_ = NULL; csr_val_ = NULL; CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Smat().Resize(0, 0); } } template template void CuSparseMatrix::CopyFromSmat(const SparseMatrix &smat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(smat.NumRows(), smat.NumCols(), smat.NumElements(), kUndefined); if (NumElements() == 0) { return; } std::vector row_ptr(NumRows() + 1); std::vector col_idx(NumElements()); Vector val(NumElements(), kUndefined); int n = 0; for (int32 i = 0; i < smat.NumRows(); ++i) { row_ptr[i] = n; for (int32 j = 0; j < (smat.Data() + i)->NumElements(); ++j, ++n) { col_idx[n] = ((smat.Data() + i)->Data() + j)->first; val(n) = static_cast(((smat.Data() + i)->Data() + j)->second); } } row_ptr[NumRows()] = n; KALDI_ASSERT(n == NumElements()); CuSubArray cu_row_ptr(CsrRowPtr(), NumRows() + 1); cu_row_ptr.CopyFromVec(row_ptr); CuSubArray cu_col_idx(CsrColIdx(), NumElements()); cu_col_idx.CopyFromVec(col_idx); CuSubVector cu_val(CsrVal(), NumElements()); cu_val.CopyFromVec(val); } else #endif { this->Smat().CopyFromSmat(smat); } } template void CuSparseMatrix::CopyFromSmat(const SparseMatrix &smat); template void CuSparseMatrix::CopyFromSmat(const SparseMatrix &smat); template void CuSparseMatrix::CopyFromSmat(const SparseMatrix &smat); template void CuSparseMatrix::CopyFromSmat(const SparseMatrix &smat); template void CuSparseMatrix::CopyFromSmat(const CuSparseMatrix& smat, MatrixTransposeType trans) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (trans == kNoTrans) { Resize(smat.NumRows(), smat.NumCols(), smat.NumElements(), kUndefined); CuSubVector val_to(CsrVal(), NumElements()); CuSubVector val_from(smat.CsrVal(), smat.NumElements()); val_to.CopyFromVec(val_from); CuSubArray idx_to(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements()); CuSubArray idx_from(smat.csr_row_ptr_col_idx_, smat.NumRows() + 1 + smat.NumElements()); idx_to.CopyFromArray(idx_from); } else { Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), kUndefined); CuTimer tim; CUSPARSE_SAFE_CALL( cusparse_csr2csc(GetCusparseHandle(), smat.NumRows(), smat.NumCols(), smat.NumElements(), smat.CsrVal(), smat.CsrRowPtr(), smat.CsrColIdx(), CsrVal(), CsrColIdx(), CsrRowPtr(), CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); CuDevice::Instantiate().AccuProfile(__func__, tim); } } else #endif { Smat().CopyFromSmat(smat.Smat(), trans); } } template template void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { KALDI_ASSERT(smat != NULL); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (NumRows() == 0) { smat->Resize(0, 0); return; } CuSubArray idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements()); std::vector idx_cpu; idx.CopyToVec(&idx_cpu); CuSubVector val(CsrVal(), NumElements()); Vector val_cpu(NumElements(), kUndefined); val.CopyToVec(&val_cpu); std::vector > > pairs( NumRows()); int n = 0; for (int i = 0; i < NumRows(); ++i) { for (; n < idx_cpu[i + 1]; ++n) { const MatrixIndexT j = idx_cpu[NumRows() + 1 + n]; pairs[i].push_back( { j, val_cpu(n) }); } } KALDI_ASSERT(n == NumElements()); SparseMatrix tmp(num_cols_, pairs); smat->Swap(&tmp); } else #endif { smat->CopyFromSmat(this->Smat()); } } template void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; template void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; template void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; template void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; template void CuSparseMatrix::CopyElementsToVec(CuVectorBase *vec) const { KALDI_ASSERT(vec != NULL); KALDI_ASSERT(this->NumElements() == vec->Dim()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuSubVector val(CsrVal(), NumElements()); vec->CopyFromVec(val); } else #endif { Smat().CopyElementsToVec(&(vec->Vec())); } } template void CuSparseMatrix::Swap(SparseMatrix *smat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuSparseMatrix tmp(*smat); Swap(&tmp); tmp.CopyToSmat(smat); } else #endif { Smat().Swap(smat); } } template void CuSparseMatrix::Swap(CuSparseMatrix *smat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { std::swap(num_rows_, smat->num_rows_); std::swap(num_cols_, smat->num_cols_); std::swap(nnz_, smat->nnz_); std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_); std::swap(csr_val_, smat->csr_val_); } else #endif { Smat().Swap(&(smat->Smat())); } } template void CuSparseMatrix::SetRandn(BaseFloat zero_prob) { if (num_rows_ == 0) return; // Use the CPU function for the moment, not efficient... SparseMatrix tmp(num_rows_, num_cols_); tmp.SetRandn(zero_prob); Swap(&tmp); } template void CuSparseMatrix::Write(std::ostream &os, bool binary) const { SparseMatrix tmp; CopyToSmat(&tmp); tmp.Write(os, binary); } template void CuSparseMatrix::Read(std::istream &is, bool binary) { SparseMatrix tmp; tmp.Read(is, binary); this->Swap(&tmp); } template class CuSparseMatrix; template class CuSparseMatrix; template Real TraceMatSmat(const CuMatrixBase &A, const CuSparseMatrix &B, MatrixTransposeType trans) { if (A.NumCols() == 0) { KALDI_ASSERT(B.NumCols() == 0); return 0.0; } if (B.NumElements() == 0) { return 0.0; } Real result = 0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (trans == kTrans) { KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols()); } else { KALDI_ASSERT(A.NumCols() == B.NumRows() && A.NumRows() == B.NumCols()); } // The Sum() method in CuVector handles a bunch of logic, we use that to // comptue the trace. CuVector sum_vec(B.NumElements()); CuTimer tim; // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of B. const int warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y)); if (trans == kNoTrans) { cuda_trace_mat_smat(dimGrid, dimBlock, A.Data(), A.Dim(), B.CsrRowPtr(), B.CsrColIdx(), B.CsrVal(), sum_vec.Data()); } else { cuda_trace_mat_smat_trans(dimGrid, dimBlock, A.Data(), A.Dim(), B.CsrRowPtr(), B.CsrColIdx(), B.CsrVal(), sum_vec.Data()); } result = sum_vec.Sum(); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { result = TraceMatSmat(A.Mat(), B.Smat(), trans); } return result; } template float TraceMatSmat(const CuMatrixBase &A, const CuSparseMatrix &B, MatrixTransposeType trans); template double TraceMatSmat(const CuMatrixBase &A, const CuSparseMatrix &B, MatrixTransposeType trans); void GeneralMatrix::CopyToMat(CuMatrixBase *cu_mat, MatrixTransposeType trans) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { switch (Type()) { case kFullMatrix: { cu_mat->CopyFromMat(mat_); break; } case kSparseMatrix: { CuSparseMatrix smat(smat_); smat.CopyToMat(cu_mat, trans); break; } case kCompressedMatrix: { Matrix mat(cmat_); if (trans == kNoTrans) { cu_mat->CopyFromMat(mat); break; } else { CuMatrix temp_cu; temp_cu.Swap(&mat); cu_mat->CopyFromMat(temp_cu, kTrans); break; } } default: KALDI_ERR << "Invalid GeneralMatrix type."; } return; } else #endif { CopyToMat(&(cu_mat->Mat()), trans); } } template template void CuSparseMatrix::CopyToMat(CuMatrixBase *M, MatrixTransposeType trans) const { if (trans == kNoTrans) { KALDI_ASSERT(M->NumRows() == NumRows() && M->NumCols() == NumCols()); } else { KALDI_ASSERT(M->NumRows() == NumCols() && M->NumCols() == NumRows()); } M->SetZero(); if (NumElements() == 0) { return; } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows. const int warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(NumRows(), dimBlock.y)); if (trans == kNoTrans) { cuda_copy_from_smat(dimGrid, dimBlock, M->Data(), M->Dim(), CsrRowPtr(), CsrColIdx(), CsrVal()); } else { cuda_copy_from_smat_trans(dimGrid, dimBlock, M->Data(), M->Dim(), CsrRowPtr(), CsrColIdx(), CsrVal()); } CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Smat().CopyToMat(&(M->Mat()), trans); } } // Instantiate the template above. template void CuSparseMatrix::CopyToMat(CuMatrixBase *M, MatrixTransposeType trans) const; template void CuSparseMatrix::CopyToMat(CuMatrixBase *M, MatrixTransposeType trans) const; template void CuSparseMatrix::CopyToMat(CuMatrixBase *M, MatrixTransposeType trans) const; template void CuSparseMatrix::CopyToMat(CuMatrixBase *M, MatrixTransposeType trans) const; void GeneralMatrix::AddToMat(BaseFloat alpha, CuMatrixBase *cu_mat, MatrixTransposeType trans) const { switch (Type()) { case kFullMatrix: { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuMatrix cu_copy(mat_); cu_mat->AddMat(alpha, cu_copy); break; } #endif cu_mat->Mat().AddMat(alpha, mat_); break; } case kSparseMatrix: { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuSparseMatrix cu_smat(smat_); cu_mat->AddSmat(alpha, cu_smat, trans); break; } #endif cu_mat->Mat().AddSmat(alpha, smat_, trans); break; } case kCompressedMatrix: { Matrix mat(cmat_); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuMatrix cu_mat_copy(mat); cu_mat->AddMat(alpha, cu_mat_copy, trans); break; } #endif cu_mat->Mat().AddMat(alpha, mat, trans); break; } default: KALDI_ERR << "Invalid GeneralMatrix type."; } } } // namespace kaldi