// cudamatrix/cu-vector.cc // Copyright 2012-2013 Karel Vesely // 2012-2014 Johns Hopkins University (author: Daniel Povey) // 2017 Daniel Galvez // 2016-2018 Shiyin Kang // 2019 Yiwen Shao // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #if HAVE_CUDA == 1 #include #include #endif #include "base/timer.h" #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-vector.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" #include "cudamatrix/cu-math.h" #include "cudamatrix/cu-vector.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-rand.h" #include "cudamatrix/cu-tp-matrix.h" #include "cudamatrix/cu-sp-matrix.h" #include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cublas-wrappers.h" namespace kaldi { template Real VecVec(const CuVectorBase &a, const CuVectorBase &b) { //MatrixIndexT a_dim = a.Dim(); KALDI_ASSERT(a.Dim() == b.Dim()); Real result = 0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CUBLAS_SAFE_CALL(cublas_dot(GetCublasHandle(), a.Dim(), a.Data(), 1, b.Data(), 1, &result)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { result = VecVec(a.Vec(), b.Vec()); } return result; } // instantiate the template above template float VecVec(const CuVectorBase &a, const CuVectorBase &b); template double VecVec(const CuVectorBase &a, const CuVectorBase &b); // The version of VecVec that can do type conversion. For now we give this a // stupid implementation that converts one of the vectors. If it ever becomes // an efficiency bottleneck, we can revisit this. template Real VecVec(const CuVectorBase &A, const CuVectorBase &B) { CuVector B2(B); return VecVec(A, B2); // This will call the single-parameter template. } // instantiate the template above template float VecVec(const CuVectorBase &A, const CuVectorBase &B); template double VecVec(const CuVectorBase &A, const CuVectorBase &B); template Real VecMatVec(const CuVectorBase &v1, const CuMatrixBase &M, const CuVectorBase &v2) { KALDI_ASSERT(v1.Dim() == M.NumRows() && M.NumCols() == v2.Dim()); if (v1.Dim() > v2.Dim()) { // do v2*M first CuVector v2M(v1.Dim()); v2M.AddMatVec(1.0, M, kNoTrans, v2, 0.0); return VecVec(v2M, v1); } else { // do v1*M first CuVector v1M(v2.Dim()); v1M.AddMatVec(1.0, M, kTrans, v1, 0.0); return VecVec(v1M, v2); } } // instantiate the template above template float VecMatVec(const CuVectorBase &v1, const CuMatrixBase &M, const CuVectorBase &v2); template double VecMatVec(const CuVectorBase &v1, const CuMatrixBase &M, const CuVectorBase &v2); template void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, MatrixIndexT col) { KALDI_ASSERT(col < mat.NumCols()); KALDI_ASSERT(dim_ == mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; cublas_copy(GetCublasHandle(), this->dim_, mat.Data() + col, mat.Stride(), this->data_, 1); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim); } else #endif { Vec().CopyColFromMat(mat.Mat(),col); } } template<> template<> void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, MatrixIndexT col) { KALDI_ASSERT(col < mat.NumCols()); KALDI_ASSERT(dim_ == mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_copy_col_from_mat_df(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim); } else #endif { Vec().CopyColFromMat(mat.Mat(), col); } } template<> template<> void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, MatrixIndexT col) { KALDI_ASSERT(col < mat.NumCols()); KALDI_ASSERT(dim_ == mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_copy_col_from_mat_fd(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim); } else #endif { Vec().CopyColFromMat(mat.Mat(), col); } } template void CuVectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; if (mat.Stride() == mat.NumCols() && mat.NumRows() != 0) { CU_SAFE_CALL( cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); vec_data += mat.NumCols(); } } CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim); } else #endif { Vec().CopyRowsFromMat(mat.Mat()); } } template Real CuVectorBase::Norm(Real p) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; Real ans; KALDI_ASSERT(p == 1.0 || p == 2.0); if (dim_ == 0) return 0.0; if (p == 1.0) { cublas_asum(GetCublasHandle(), dim_, data_, 1, &ans); } else { cublas_nrm2(GetCublasHandle(), dim_, data_, 1, &ans); } CuDevice::Instantiate().AccuProfile(__func__, tim); if (ans != ans) { KALDI_ERR << "NaN in norm " << *this; } return ans; } else #endif { return Vec().Norm(p); } } template void CuVectorBase::CopyRowsFromMat(const MatrixBase &mat) { KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; if (mat.Stride() == mat.NumCols()) { CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyHostToDevice, cudaStreamPerThread)); } else { Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), cudaMemcpyHostToDevice, cudaStreamPerThread)); vec_data += mat.NumCols(); } } CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().CopyRowsFromMat(mat); } } template void MatrixBase::CopyRowsFromVec(const CuVectorBase &v) { KALDI_ASSERT(v.Dim() == NumCols() * NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; CuTimer tim; if (Stride() == NumCols()) { CU_SAFE_CALL(cudaMemcpyAsync(data_, v.Data(), sizeof(Real)*v.Dim(), cudaMemcpyDeviceToHost, cudaStreamPerThread)); } else { const Real* vec_data = v.Data(); for (MatrixIndexT r = 0; r < NumRows(); r++) { CU_SAFE_CALL(cudaMemcpyAsync(RowData(r), vec_data, sizeof(Real) * NumCols(), cudaMemcpyDeviceToHost, cudaStreamPerThread)); vec_data += NumCols(); } } CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { CopyRowsFromVec(v.Vec()); } } // instantiate the template above. template void MatrixBase::CopyRowsFromVec(const CuVectorBase &v); template void MatrixBase::CopyRowsFromVec(const CuVectorBase &v); template void CuVectorBase::SetRandn() { if (dim_ == 0) return; CuRand tmp; tmp.RandGaussian(this); } template void CuVectorBase::SetRandUniform() { if (dim_ == 0) return; CuRand tmp; tmp.RandUniform(this); } template Real CuVectorBase::Sum() const { if (dim_ == 0) return 0.0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Real result; CuTimer tim; // Small vectors are copied to RAM and reduced on CPU. // The length is chosen by cu-vector-speed-test if (dim_ < 4096) { Vector ans_cpu(*this); result = ans_cpu.Sum(); } else { // Use no more than 256 blocks (still too many?) int dimBlock = CU1DBLOCK; int dimGrid = n_blocks(dim_, dimBlock); if (dimGrid > 256) { dimGrid = 256; } CuVector ans(dimGrid, kUndefined); cuda_vec_sum(dimGrid, dimBlock, data_, ans.Data(), dim_, 1); CU_SAFE_CALL(cudaGetLastError()); Vector ans_cpu(ans); result = ans_cpu.Sum(); } CuDevice::Instantiate().AccuProfile(__func__, tim); return result; } else #endif { return Vec().Sum(); } } template void CuVectorBase::ApplySoftMax() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; size_t dimBlock = CU1DBLOCK; size_t dimGrid = 1; // dimGrid value represent the number of rows ::MatrixDim dim = { 1, this->dim_, this->dim_}; cuda_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);//actually dim is not stride... CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().ApplySoftMax(); } } template void CuVectorBase::Floor(const CuVectorBase &src, Real floor_val, MatrixIndexT *floored_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); if (floored_count == nullptr) { if (dim_ == 0) return; CuTimer tim; // We are calling a function meant for matrices, by viewing the // vector as a matrix with a single row. ::MatrixDim dim = {1, Dim(), 1}; cuda_floor(dimGrid, dimBlock, this->data_, src.Data(), floor_val, dim, 1); CuDevice::Instantiate().AccuProfile("CuVectorBase::FloorNoCount", tim); } else { if (dim_ == 0) { *floored_count = 0; return; } CuTimer tim; CuVector count_vec(dim_, kUndefined); cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); CU_SAFE_CALL(cudaGetLastError()); *floored_count = count_vec.Sum(); CuDevice::Instantiate().AccuProfile("CuVectorBase::Floor", tim); } } else #endif { Vec().Floor(src.Vec(), floor_val, floored_count); } } template void CuVectorBase::Ceiling(const CuVectorBase &src, Real ceiling_val, MatrixIndexT *ceiled_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); if (ceiled_count == nullptr) { if (dim_ == 0) return; CuTimer tim; // We are calling a function meant for matrices, by viewing the // vector as a matrix with a single row. ::MatrixDim dim = {1, Dim(), 1}; cuda_ceiling(dimGrid, dimBlock, this->data_, src.Data(), ceiling_val, dim, 1); CuDevice::Instantiate().AccuProfile("CuVectorBase::CeilingNoCount", tim); } else { if (dim_ == 0) { *ceiled_count = 0; return; } CuTimer tim; CuVector count_vec(dim_, kUndefined); cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); CU_SAFE_CALL(cudaGetLastError()); *ceiled_count = count_vec.Sum(); CuDevice::Instantiate().AccuProfile("CuVectorBase::Ceiling", tim); } } else #endif { Vec().Ceiling(src.Vec(), ceiling_val, ceiled_count); } } template void CuVectorBase::Pow(const CuVectorBase &src, Real power) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; // for this particular kernel, x is #rows, y is #cols. so // fake matrix with 1 row, Dim() cols. dim3 dimBlock(CU1DBLOCK, 1); dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1); ::MatrixDim fake_matrix_dim = { 1, Dim(), 1 }; // num_cols is Dim(), num_rows is 1, stride is 1 (it's a don't-care). cuda_pow(dimGrid, dimBlock, this->data_, src.Data(), power, fake_matrix_dim, 1); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyPow", tim); } else #endif { Vec().Pow(src.Vec(), power); } } template void CuVectorBase::ApplyExp() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_vec_apply_exp(dimGrid, dimBlock, data_, dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyExp", tim); } else #endif { Vec().ApplyExp(); } } template void CuVectorBase::ApplyLog() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); CuVector flag(1); cuda_vec_apply_log(dimGrid, dimBlock, data_, flag.Data(), dim_); CU_SAFE_CALL(cudaGetLastError()); if (flag(0) > 0) KALDI_ERR << "Trying to take log of a negative number."; CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyLog", tim); } else #endif { Vec().ApplyLog(); } } template void CuVectorBase::ApplyLogSoftMax() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; size_t dimBlock = CU1DBLOCK; size_t dimGrid = 1; // dimGrid value represent the number of rows ::MatrixDim dim = { 1, this->dim_, this->dim_}; cuda_log_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().ApplyLogSoftMax(); } } template void CuVectorBase::AddMatVec(const Real alpha, const CuMatrixBase &M, MatrixTransposeType trans, const CuVectorBase &v, const Real beta) { KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_) || (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_)); KALDI_ASSERT(&v != this); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; // Everything is backwards in CuBlas. We need to reverse rows, columns, // transpose-ness. CUBLAS_SAFE_CALL(cublas_gemv(GetCublasHandle(), (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T), M.NumCols(), M.NumRows(), alpha, M.Data(), M.Stride(), v.Data(), 1, beta, data_, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().AddMatVec(alpha,M.Mat(),trans,v.Vec(),beta); } } template void CuVectorBase::AddSpVec(const Real alpha, const CuSpMatrix &M, const CuVectorBase &v, const Real beta) { KALDI_ASSERT(M.NumCols() == v.dim_ && M.NumRows() == dim_); KALDI_ASSERT(&v != this); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; // Note: in our opinion the CuSpMatrix represents a lower-triangular matrix, but // in CUBLAS, for some stupid reason, everything is reversed. CUBLAS_SAFE_CALL(cublas_spmv(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, Dim(), alpha, M.Data(), v.Data(), 1, beta, data_, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().AddSpVec(alpha,M.Mat(),v.Vec(),beta); } } template void CuVectorBase::AddVecVec(Real alpha, const CuVectorBase &v, const CuVectorBase &r, Real beta) { KALDI_ASSERT((dim_ == v.dim_ && dim_ == r.dim_)); KALDI_ASSERT(this != &v && this != &r); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_add_vec_vec(dimGrid, dimBlock, alpha, data_, v.Data(), r.Data(), beta, dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::AddVecVec", tim); } else #endif { Vec().AddVecVec(alpha, v.Vec(), r.Vec(), beta); } } template bool CuVectorBase::ApproxEqual(const CuVectorBase &other, float tol) const { if (dim_ != other.dim_) KALDI_ERR << "ApproxEqual: size mismatch " << dim_ << " vs. " << other.dim_; KALDI_ASSERT(tol >= 0.0); CuVector tmp(*this); tmp.AddVec(-1.0, other); BaseFloat tmp_norm = sqrt(VecVec(tmp, tmp)), this_norm = sqrt(VecVec(*this, *this)); return tmp_norm <= static_cast(tol) * this_norm; } template void CuVectorBase::AddDiagMat2(Real alpha, const CuMatrixBase &M, MatrixTransposeType trans, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; MatrixTransposeType other_trans = (trans == kTrans ? kNoTrans : kTrans); KALDI_ASSERT(dim_ == (trans == kNoTrans ? M.NumRows() : M.NumCols())); this->AddDiagMatMat(alpha, M, trans, M, other_trans, beta); } else #endif { Vec().AddDiagMat2(alpha, M.Mat(), trans, beta); } } template void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, const CuMatrixBase &N, MatrixTransposeType transN, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; if (transM != transN) { KALDI_ASSERT(M.NumCols() == N.NumCols()); KALDI_ASSERT(M.NumRows() == N.NumRows()); if (transM == kNoTrans) { // Case 1: diag(M*N') == sum(M.*N, 2) // 1D grid and 1D block. One block per row of N. // 1D grid expands along the column of N. int dimBlock(CU1DBLOCK); int dimGrid(M.NumRows()); cuda_add_diag_mat_mat_MNT(dimGrid, dimBlock, alpha, M.Data(), M.Dim(), N.Data(), N.Stride(), beta, data_); } else { // Case 2: diag(M'*N) == sum(M.*N, 1) // 16x16 or 8x32 2D block for coalesced memory access. // Grid shape is designed as follows, // 1. for small matrices, use 1D grid with only 1 row of 16x16 block, // to avoid multiple kernel launch; // 2. for large enough matrices (no matter thin or fat), // use 1- or 2-D grid so that the grid contains // at least and not much larger than 'kOptNumBlocks' blocks // to fully utilize the GPU; const int32 warpSize = 32; const int32 kOptNumBlocks = 512; const int32 tile_dim = (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? 16 : 32; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); dimGrid.y = std::min(dimGrid.y, (kOptNumBlocks - 1) / dimGrid.x + 1); dimGrid.y = tile_dim == 16 ? 1 : dimGrid.y; if (dimGrid.y > 1) { CuMatrix buf(dimGrid.y, N.NumCols()); cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, Real(1), M.Data(), M.Stride(), N.Data(), N.Dim(), Real(0), buf.Data(), buf.Stride()); this->AddRowSumMat(alpha, buf, beta); } else { cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), N.Data(), N.Dim(), beta, data_, dim_); } } } else { KALDI_ASSERT(M.NumCols() == N.NumRows()); KALDI_ASSERT(N.NumCols() == M.NumRows()); if (transM == kNoTrans) { // Case 3: diag(M*N) == sum(M'.*N, 1) // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), N.Data(), N.Dim(), beta, data_); } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), M.Data(), M.Dim(), beta, data_); } } CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().AddDiagMatMat(alpha, M.Mat(), transM, N.Mat(), transN, beta); } } template void CuVectorBase::AddTpVec(const Real alpha, const CuTpMatrix &M, const MatrixTransposeType trans, const CuVectorBase &v, const Real beta) { KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; if (beta == 0.0) { if (&v != this) CopyFromVec(v); MulTp(M, trans); if (alpha != 1.0) Scale(alpha); } else { CuVector tmp(v); tmp.MulTp(M, trans); if (beta != 1.0) Scale(beta); // *this <-- beta * *this AddVec(alpha, tmp, 1.0); // *this += alpha * M * v } CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().AddTpVec(alpha, M.Mat(), trans, v.Vec(), beta); } } template void CuVectorBase::MulTp(const CuTpMatrix &M, const MatrixTransposeType trans) { KALDI_ASSERT(M.NumRows() == dim_); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; cublas_tpmv(GetCublasHandle(), (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T), M.NumRows(), M.Data(), data_, 1); CuDevice::Instantiate().AccuProfile("CuVectorBase::MulTp", tim); } else #endif { Vec().MulTp(M.Mat(), trans); } } template Real CuVectorBase::Min() const { Real result = 0.0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) { // min of an empty set is infinity. return std::numeric_limits::infinity(); } CuTimer tim; // Small vectors are copied to RAM and reduced on CPU. // The length is chosen by cu-vector-speed-test if (dim_ < 4096) { Vector ans_cpu(*this); result = ans_cpu.Min(); } else { // Use no more than 256 blocks (still too many?) int dimBlock = CU1DBLOCK; int dimGrid = n_blocks(dim_, dimBlock); if (dimGrid > 256) { dimGrid = 256; } CuVector ans(dimGrid, kUndefined); cuda_vec_min(dimGrid, dimBlock, data_, ans.Data(), dim_, 1); CU_SAFE_CALL(cudaGetLastError()); Vector ans_cpu(ans); result = ans_cpu.Min(); } CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { result = (this->Vec()).Min(); } return result; } template Real CuVectorBase::Max() const { Real result = 0.0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) { // max of an empty set is -infinity. return -std::numeric_limits::infinity(); } CuTimer tim; // Small vectors are copied to RAM and reduced on CPU. // The length is chosen by cu-vector-speed-test if (dim_ < 4096) { Vector ans_cpu(*this); result = ans_cpu.Max(); } else { // Use no more than 256 blocks (still too many?) int dimBlock = CU1DBLOCK; int dimGrid = n_blocks(dim_, dimBlock); if (dimGrid > 256) { dimGrid = 256; } CuVector ans(dimGrid, kUndefined); cuda_vec_max(dimGrid, dimBlock, data_, ans.Data(), dim_, 1); CU_SAFE_CALL(cudaGetLastError()); Vector ans_cpu(ans); result = ans_cpu.Max(); } CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { result = (this->Vec()).Max(); } return result; } template void CuVectorBase::ReplaceValue(Real orig, Real changed) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_, CU1DBLOCK)); cuda_replace_value(dimGrid, dimBlock, data_, dim_, orig, changed); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().ReplaceValue(orig, changed); } } template void CuVectorBase::MulElements(const CuVectorBase &v) { KALDI_ASSERT(dim_ == v.dim_); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_, CU1DBLOCK)); cuda_vec_mul_elements(dimGrid, dimBlock, data_, v.Data(), dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::MulElements", tim); } else #endif { Vec().MulElements(v.Vec()); } } template void CuVectorBase::DivElements(const CuVectorBase &v) { // this just creates a matrix and calls the matrix version. KALDI_ASSERT(dim_ == v.dim_); CuSubMatrix this_mat(this->Data(), 1, dim_, dim_), v_mat(v.Data(), 1, dim_, dim_); this_mat.DivElements(v_mat); } template<> template<> void CuVectorBase::CopyFromVec(const CuVectorBase &src) { KALDI_ASSERT(src.Dim() == dim_); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.Data(), 1, data_, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().CopyFromVec(src.Vec()); } } template<> template<> void CuVectorBase::CopyFromVec(const CuVectorBase &src) { KALDI_ASSERT(src.Dim() == dim_); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, src.Data(), 1, data_, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().CopyFromVec(src.Vec()); } } template template void CuVectorBase::CopyFromVec(const VectorBase &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (sizeof(Real) != sizeof(OtherReal)) { CuVector temp(dim_, kUndefined); temp.CopyFromVec(src); this->CopyFromVec(temp); } else { KALDI_ASSERT(src.Dim() == dim_); if (dim_ == 0) return; CuTimer tim; CU_SAFE_CALL(cudaMemcpyAsync(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyHostToDevice, cudaStreamPerThread)); CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D", tim); } } else #endif { Vec().CopyFromVec(src); } } // Instantiate the template above. template void CuVectorBase::CopyFromVec(const VectorBase &src); template void CuVectorBase::CopyFromVec(const VectorBase &src); template void CuVectorBase::CopyFromVec(const VectorBase &src); template void CuVectorBase::CopyFromVec(const VectorBase &src); template template void CuVectorBase::CopyToVec(VectorBase *dst) const { KALDI_ASSERT(dim_ == dst->Dim()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (sizeof(Real) != sizeof(OtherReal)) { CuVector temp(*this); temp.CopyToVec(dst); } else { if (dim_ == 0) return; CuTimer tim; CU_SAFE_CALL(cudaMemcpyAsync(dst->Data(), this->data_, sizeof(Real) * dim_, cudaMemcpyDeviceToHost, cudaStreamPerThread)); CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } } else #endif { dst->CopyFromVec(this->Vec()); } } template void CuVector::Read(std::istream &is, bool binary) { Vector temp; temp.Read(is, binary); Destroy(); Swap(&temp); } template void CuVector::Write(std::ostream &os, bool binary) const { Vector temp(this->dim_, kUndefined); this->CopyToVec(&temp); temp.Write(os, binary); } template CuVector::CuVector(const CuVectorBase &v) { this->Resize(v.Dim()); this->CopyFromVec(v); } template CuVector::CuVector(const VectorBase &v) { this->Resize(v.dim_); this->CopyFromVec(v); } template void CuVector::Resize(MatrixIndexT dim, MatrixResizeType t) { KALDI_ASSERT(t == kSetZero || t == kUndefined); // Others not implemented // yet. if (this->dim_ == dim) { this->SetZero(); return; } if (this->dim_ != 0) this->Destroy(); if (dim == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; this->data_ = static_cast(CuDevice::Instantiate().Malloc(dim * sizeof(Real))); this->dim_ = dim; if (t == kSetZero) this->SetZero(); CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim); } else #endif { Vector vec(dim); this->Swap(&vec); } } template void CuVector::Swap(CuVector *vec) { std::swap(this->data_, vec->data_); std::swap(this->dim_, vec->dim_); } template void CuVector::Swap(Vector *vec) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->dim_ == 0) { if (vec->dim_ != 0) { // *this is empty, but vec is nonempty. Resize(vec->dim_, kUndefined); this->CopyFromVec(*vec); vec->Resize(0); } // else both are empty. } else { // *this is nonempty. if (vec->dim_ != 0) { // Both *this and *vec are nonempty. Recurse to simpler cases. // this could be done more efficiently in the case where // the size does not change. Vector temp; this->Swap(&temp); // now temp is full, *this is empty. vec->Swap(&temp); // now vec has data from *this, temp has // data from vec. Swap(&temp); // copy data in vec to *this, which is now empty. } else { // *this is full but *vec is empty. vec->Resize(this->dim_, kUndefined); this->CopyToVec(vec); this->Destroy(); } } } else #endif { std::swap(vec->data_, this->data_); std::swap(vec->dim_, this->dim_); } } template void CuVector::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->data_ != NULL) CuDevice::Instantiate().Free(this->data_); } else #endif { if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_); } this->data_ = NULL; this->dim_ = 0; } template void CuVectorBase::CopyFromVec(const CuVectorBase &src) { KALDI_ASSERT(src.Dim() == dim_); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; CU_SAFE_CALL( cudaMemcpyAsync(data_, src.data_, src.dim_ * sizeof(Real), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { memcpy(static_cast(data_), static_cast(src.data_), dim_ * sizeof(Real)); } } template void CuVectorBase::SetZero() { if (dim_==0 || data_==NULL) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(dim_>=0); KALDI_ASSERT(data_!=NULL); CuTimer tim; CU_SAFE_CALL(cudaMemsetAsync(data_, 0, dim_*sizeof(Real), cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuVector::SetZero", tim); } else #endif { Vec().SetZero(); } } /// Print the vector to stream template std::ostream &operator << (std::ostream &out, const CuVectorBase &vec) { Vector temp(vec.Dim()); vec.CopyToVec(&temp); out << temp; return out; } // Instantiate the above. template std::ostream &operator << (std::ostream &out, const CuVectorBase &vec); template std::ostream &operator << (std::ostream &out, const CuVectorBase &vec); /* * Methods wrapping the ANSI-C CUDA kernels */ template void CuVectorBase::Set(Real value) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimBlock(CU1DBLOCK); dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK)); ::MatrixDim d = { 1, Dim(), Dim() }; cuda_set_const(dimGrid, dimBlock, data_, value, d); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().Set(value); } } template void CuVectorBase::Add(Real value) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimBlock(CU1DBLOCK); dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK)); ::MatrixDim d = { 1, Dim(), Dim() }; cuda_add(dimGrid, dimBlock, data_, value, d); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().Add(value); } } template void CuVectorBase::CopyDiagFromPacked(const CuPackedMatrix &M) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(dim_ == M.NumRows()); if (dim_ == 0) return; CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(Dim(), CU1DBLOCK)); cuda_vec_copy_diag_from_packed(dimGrid, dimBlock, data_, M.Data(), dim_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().CopyDiagFromPacked(M.Mat()); } } template void CuVectorBase::CopyDiagFromMat(const CuMatrix &M) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); CuTimer tim; CUBLAS_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, M.Data(), M.Stride() + 1, data_, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().CopyDiagFromMat(M.Mat()); } } template void CuVectorBase::Scale(Real value) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (Dim() == 0 ) return; CuTimer tim; dim3 dimBlock(CU1DBLOCK); dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK)); ::MatrixDim d = { 1, Dim(), Dim() }; cuda_scale(dimGrid, dimBlock, data_, value, d); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().Scale(value); } } template void CuVectorBase::AddVec(Real alpha, const CuVectorBase &vec, Real beta) { KALDI_ASSERT(vec.Dim() == Dim()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; int32 dim = this->dim_; Real *data = this->data_; const Real *vec_data = vec.data_; if (beta != 1.0) CU_SAFE_CALL(cuda_scal(GetCublasHandle(), dim, beta, data, 1)); if (alpha != 0.0) CU_SAFE_CALL(cuda_axpy(GetCublasHandle(), dim, alpha, vec_data, 1, data, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { if (beta != 1.0) Vec().Scale(beta); Vec().AddVec(alpha, vec.Vec()); } } template template void CuVectorBase::AddVec(Real alpha, const CuVectorBase &vec, Real beta) { // We could implement this directly, without using a temporary-- this can // be done later, when we have time. CuVector temp(vec); this->AddVec(alpha, temp, beta); } // instantiate the template above. template void CuVectorBase::AddVec(float alpha, const CuVectorBase &vec, float beta); template void CuVectorBase::AddVec(double alpha, const CuVectorBase &vec, double beta); template void CuVectorBase::AddRowSumMat(Real alpha, const CuMatrixBase &mat, Real beta) { KALDI_ASSERT(mat.NumCols() == Dim()); if (Dim() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; cuda_add_row_sum_mat(mat.NumCols(), CU1DBLOCK, Data(), mat.Data(), mat.Dim(), alpha, beta); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().AddRowSumMat(alpha, mat.Mat(), beta); } } template void CuVectorBase::AddColSumMat(Real alpha, const CuMatrixBase &mat, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; KALDI_ASSERT(mat.NumRows() == Dim()); cuda_add_col_sum_mat(mat.NumRows(), CU1DBLOCK, Data(), mat.Data(), mat.Dim(), alpha, beta); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().AddColSumMat(alpha, mat.Mat(), beta); } } template void CuVectorBase::InvertElements() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimBlock(CU1DBLOCK, 1); dim3 dimGrid(n_blocks(dim_, CU1DBLOCK)); MatrixDim d = {1, dim_, dim_}; cuda_invert_elements(dimGrid, dimBlock, data_, d); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { Vec().InvertElements(); } } template void CuVectorBase::CopyElements(const CuMatrixBase &mat, const MatrixTransposeType trans, const CuArrayBase &elements) { KALDI_ASSERT(elements.Dim() == Dim()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimBlock(CU1DBLOCK); dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK)); cuda_vector_copy_elements(dimGrid, dimBlock, this->data_, Dim(), mat.Data(), mat.Stride(), trans == kTrans, elements.Data()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { VectorBase &this_vec = this->Vec(); const MatrixBase &src_mat = mat.Mat(); const int32* index_map = elements.Data(); KALDI_ASSERT((Dim() == mat.NumRows() && trans == kNoTrans) || (Dim() == mat.NumCols() && trans == kTrans)); for (int32 i = 0; i < Dim(); i++) { int32 j = index_map[i]; KALDI_ASSERT(j >= 0); if (trans == kNoTrans) { KALDI_ASSERT(j < mat.NumCols()); this_vec(i) = src_mat(i, j); } else { KALDI_ASSERT(j < mat.NumRows()); this_vec(i) = src_mat(j, i); } } } } template void CuVectorBase::CopyToVec(VectorBase *dst) const; template void CuVectorBase::CopyToVec(VectorBase *dst) const; template void CuVectorBase::CopyToVec(VectorBase *dst) const; template void CuVectorBase::CopyToVec(VectorBase *dst) const; template class CuVectorBase; template class CuVectorBase; template class CuVector; template class CuVector; } // namespace