// cudamatrix/cu-block-matrix.cc // Copyright 2013 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #if HAVE_CUDA == 1 #include #include #endif #include #include "base/timer.h" #include "cudamatrix/cu-block-matrix.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-device.h" namespace kaldi { template CuBlockMatrix::CuBlockMatrix() { #if HAVE_CUDA == 1 cu_data_ = NULL; #endif } template CuBlockMatrix::CuBlockMatrix(const std::vector >&data) { #if HAVE_CUDA == 1 cu_data_ = NULL; #endif block_data_.resize(data.size()); MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0; for (size_t b = 0; b < data.size(); b++) { MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols(); KALDI_ASSERT(num_rows > 0 && num_cols > 0); BlockMatrixData block_data; block_data.num_rows = num_rows; block_data.num_cols = num_cols; block_data.row_offset = row_offset; block_data.col_offset = col_offset; row_offset += num_rows; col_offset += num_cols; max_num_rows = std::max(max_num_rows, num_rows); block_data_[b] = block_data; } num_rows_ = row_offset; data_.Resize(max_num_rows, col_offset); for (int32 b = 0; b < NumBlocks(); b++) Block(b).CopyFromMat(data[b]); SetCudaData(); } template const CuSubMatrix CuBlockMatrix::Block(int32 b) const { KALDI_ASSERT(static_cast(b) < block_data_.size()); const BlockMatrixData &block_data = block_data_[b]; return CuSubMatrix(data_, 0, block_data.num_rows, block_data.col_offset, block_data.num_cols); } template CuSubMatrix CuBlockMatrix::Block(int32 b) { KALDI_ASSERT(static_cast(b) < block_data_.size()); BlockMatrixData &block_data = block_data_[b]; return CuSubMatrix(data_, 0, block_data.num_rows, block_data.col_offset, block_data.num_cols); } template CuBlockMatrix::CuBlockMatrix(const CuBlockMatrix &other): data_(other.data_), block_data_(other.block_data_), num_rows_(other.num_rows_) { #if HAVE_CUDA == 1 cu_data_ = NULL; #endif SetCudaData(); } template CuBlockMatrix &CuBlockMatrix::operator =(const CuBlockMatrix &other) { FreeCudaData(); data_ = other.data_; block_data_ = other.block_data_; num_rows_ = other.num_rows_; SetCudaData(); return *this; } template void CuBlockMatrix::FreeCudaData() { #if HAVE_CUDA == 1 if (cu_data_ != NULL) { if (CuDevice::Instantiate().Enabled()) { CuDevice::Instantiate().Free(cu_data_); cu_data_ = NULL; } else { KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but " << "no GPU is enabled: likely code error."; } } #endif } template void CuBlockMatrix::SetCudaData() { #if HAVE_CUDA == 1 KALDI_ASSERT(cu_data_ == NULL); if (block_data_.size() == 0) return; // Nothing to do. if (CuDevice::Instantiate().Enabled()) { CuTimer tim; std::vector tmp_cu_data(NumBlocks()); int32 row_offset = 0, col_offset = 0; for (size_t b = 0; b < NumBlocks(); b++) { CuSubMatrix this_mat = Block(b); CuBlockMatrixData &this_cu_data = tmp_cu_data[b]; this_cu_data.row_offset = row_offset; this_cu_data.col_offset = col_offset; this_cu_data.matrix_dim = this_mat.Dim(); this_cu_data.matrix_data = static_cast(this_mat.Data()); row_offset += this_mat.NumRows(); col_offset += this_mat.NumCols(); } size_t size = NumBlocks() * sizeof(CuBlockMatrixData); cu_data_ = static_cast( CuDevice::Instantiate().Malloc(size)); CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size, cudaMemcpyHostToDevice, cudaStreamPerThread)); CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } #endif } template void CuBlockMatrix::Swap(CuBlockMatrix *other) { data_.Swap(&other->data_); block_data_.swap(other->block_data_); std::swap(num_rows_, other->num_rows_); #if HAVE_CUDA == 1 std::swap(cu_data_, other->cu_data_); #endif } template void CuBlockMatrix::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); int32 num_blocks = NumBlocks(); WriteBasicType(os, binary, num_blocks); for (int32 b = 0; b < num_blocks; b++) this->Block(b).Write(os, binary); WriteToken(os, binary, ""); } template void CuBlockMatrix::Read(std::istream &is, bool binary) { Destroy(); int i = Peek(is, binary); std::vector > data; if (i != static_cast('<')) { // back-compatibility code so we can read the older format of // MixtureProbComponent. This code should be deleted eventually. int32 size; ReadBasicType(is, binary, &size); KALDI_ASSERT(size >= 0); data.resize(size); for (int32 i = 0; i < size; i++) data[i].Read(is, binary); } else { ExpectToken(is, binary, ""); int32 size; ReadBasicType(is, binary, &size); KALDI_ASSERT(size >= 0); data.resize(size); for (int32 i = 0; i < size; i++) data[i].Read(is, binary); ExpectToken(is, binary, ""); } CuBlockMatrix block_mat(data); // initializer from std::vector > does // the main job of initialization. this->Swap(&block_mat); } template void CuBlockMatrix::Destroy() { data_.Resize(0, 0); block_data_.clear(); num_rows_ = 0; FreeCudaData(); } // Does *this = alpha A B + beta * *this, discarding elements outside // the block structure of the *this matrix. template void CuBlockMatrix::AddMatMat( BaseFloat alpha, const CuMatrix &A, MatrixTransposeType transA, const CuMatrix &B, MatrixTransposeType transB, BaseFloat beta) { MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(), A_row_stride = A.Stride(), A_col_stride = 1, B_num_rows = B.NumRows(), B_num_cols = B.NumCols(), B_row_stride = B.Stride(), B_col_stride = 1; if (transA == kTrans) { std::swap(A_num_rows, A_num_cols); std::swap(A_row_stride, A_col_stride); } if (transB == kTrans) { std::swap(B_num_rows, B_num_cols); std::swap(B_row_stride, B_col_stride); } KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols() && A_num_cols == B_num_rows); if (NumBlocks() == 0) return; // empty matrix. #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; // (x,y,z) dimensions are (block-id, row-of-block, col-of-block) // First some logic to choose block dims... // we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block. int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx. int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols(); int32 y_blocksize = max_block_rows; while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK) y_blocksize--; int32 z_blocksize = max_block_cols; while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK) z_blocksize--; dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize); dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize), n_blocks(max_block_rows, y_blocksize), n_blocks(max_block_cols, z_blocksize)); cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(), A.Data(), A_num_cols, A_row_stride, A_col_stride, B.Data(), B_row_stride, B_col_stride, alpha, beta); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { int32 row_offset = 0, col_offset = 0; for (MatrixIndexT b = 0; b < NumBlocks(); b++) { CuSubMatrix this_block = Block(b); MatrixIndexT this_num_rows = this_block.NumRows(), this_num_cols = this_block.NumCols(); CuSubMatrix A_part = (transA == kNoTrans ? A.Range(row_offset, this_num_rows, 0, A.NumCols()) : A.Range(0, A.NumRows(), row_offset, this_num_rows)), B_part = (transB == kNoTrans ? B.Range(0, B.NumRows(), col_offset, this_num_cols) : B.Range(col_offset, this_num_cols, 0, B.NumCols())); this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta); row_offset += this_num_rows; col_offset += this_num_cols; } KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols()); } } template MatrixIndexT CuBlockMatrix::MaxBlockCols() const { MatrixIndexT max_cols = 0; for (size_t i = 0; i < block_data_.size(); i++) max_cols = std::max(max_cols, block_data_[i].num_cols); return max_cols; } template MatrixIndexT CuBlockMatrix::MaxBlockRows() const { return data_.NumRows(); } template void CuBlockMatrix::CopyFromMat(const CuMatrix &M) { KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols()); MatrixIndexT row_offset = 0, col_offset = 0; for (MatrixIndexT b = 0; b < NumBlocks(); b++) { CuSubMatrix this_block = Block(b); MatrixIndexT this_num_rows = this_block.NumRows(), this_num_cols = this_block.NumCols(); const CuSubMatrix src(M, row_offset, this_num_rows, col_offset, this_num_cols); this_block.CopyFromMat(src); row_offset += this_num_rows; col_offset += this_num_cols; } KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols()); } /** * Print the matrix to stream */ template std::ostream &operator << (std::ostream &out, const CuBlockMatrix &mat) { bool binary = false; mat.Write(out, binary); return out; } // instantiate the template template std::ostream &operator << (std::ostream &out, const CuBlockMatrix &mat); template std::ostream &operator << (std::ostream &out, const CuBlockMatrix &mat); // Instantiate the class for float and double. template class CuBlockMatrix; template class CuBlockMatrix; } // namespace kaldi