// cudamatrix/cu-block-matrix.h

// Copyright 2013  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_

#include <iostream>
#include <vector>

#include "cudamatrix/cu-common.h"

namespace kaldi {


/**
   The class CuBlockMatrix holds a vector of objects of type CuMatrix,
   say, M_1, M_2, ..., M_N, and it represents the block-diagonal matrix
   diag(M_1, M_2, ..., M_N).  Note: the individual matrices do not have to
   be square.  The main reason the class is needed is so that we can
   efficiently multiply by this block-diagonal structure in a parallel way.

   If we have a GPU available, CuBlockMatrix will store a copy of the
   individual CuMatrix quantities M_1 .. M_N on the GPU, but their "primary"
   home remains on the CPU; that is, while the data itself lives on the GPU,
   the primary version of the Matrix object that holds the pointers remains
   on the CPU, and we copy it over to the GPU whenever it is changed.
 */
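// A minimal usage sketch (illustrative only; the BaseFloat instantiation and
// the concrete dimensions below are arbitrary choices for this example, not
// requirements of the class):
//
//   std::vector<CuMatrix<BaseFloat> > blocks(2);
//   blocks[0].Resize(3, 4);                      // M_1 is 3 x 4.
//   blocks[1].Resize(2, 5);                      // M_2 is 2 x 5.
//   CuBlockMatrix<BaseFloat> block_mat(blocks);  // Represents diag(M_1, M_2),
//                                                // of size (3+2) x (4+5) = 5 x 9.
//   CuMatrix<BaseFloat> A(5, 7), B(7, 9);
//   block_mat.AddMatMat(1.0, A, kNoTrans, B, kNoTrans, 0.0);
//   // block_mat now holds those elements of the product A * B that fall
//   // inside its block structure; elements outside the blocks are discarded.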
template<typename Real>
class CuBlockMatrix {
 public:
  friend class CuMatrixBase<Real>;

  CuBlockMatrix();

  CuBlockMatrix(const std::vector<CuMatrix<Real> > &data);

  ~CuBlockMatrix() { Destroy(); }

  /// Copy constructor
  CuBlockMatrix(const CuBlockMatrix &other);

  /// Assignment operator
  CuBlockMatrix &operator= (const CuBlockMatrix &other);

  void Write(std::ostream &os, bool binary) const;

  void Read(std::istream &is, bool binary);

  MatrixIndexT NumRows() const { return num_rows_; }

  MatrixIndexT NumCols() const { return data_.num_cols_; }

  MatrixIndexT NumBlocks() const { return block_data_.size(); }

  /// Returns the maximum number of columns of any block.
  MatrixIndexT MaxBlockCols() const;

  /// Returns the maximum number of rows of any block.
  MatrixIndexT MaxBlockRows() const;

  const CuSubMatrix<Real> Block(MatrixIndexT b) const;

  CuSubMatrix<Real> Block(MatrixIndexT b); // Returned as a sub-matrix
                                           // (a CuMatrixBase) to disallow resizes.

  /// Does *this = alpha * A * B + beta * *this, discarding elements of the
  /// product that lie outside the block structure of the *this matrix.  The
  /// transA and transB parameters can be used to substitute A^T for A and
  /// B^T for B, respectively.
  void AddMatMat(BaseFloat alpha,
                 const CuMatrix<Real> &A, MatrixTransposeType transA,
                 const CuMatrix<Real> &B, MatrixTransposeType transB,
                 BaseFloat beta);

  /// Copies elements within the block structure from matrix M, discarding the
  /// others.  Note: this has not been implemented in a very efficient way, and
  /// it is used only for testing.
  void CopyFromMat(const CuMatrix<Real> &M);

  /// Normalizes the columns of *this so that each one sums to one.
  /// On error (e.g. infinities), sets the column to a constant value that
  /// sums to one.
  void NormalizeColumns();

  void Swap(CuBlockMatrix *other);

 protected:
  CuMatrix<Real> data_; // A single matrix into which we pack all the blocks
                        // (possibly with space left over).

  struct BlockMatrixData {
    MatrixIndexT num_rows;
    MatrixIndexT num_cols;
    MatrixIndexT row_offset;
    MatrixIndexT col_offset;
  };

#if HAVE_CUDA == 1
  const CuBlockMatrixData* CuData() const { return cu_data_; }
#endif

 private:
  /// If using a GPU and cu_data_ != NULL, frees cu_data_ and sets it to NULL.
  void FreeCudaData();

  /// If using a GPU, allocates and sets cu_data_ on the GPU to reflect "data_".
  void SetCudaData();

  /// Frees and deinitializes everything.
  void Destroy();

  std::vector<BlockMatrixData> block_data_;

  MatrixIndexT num_rows_; // Sum of num_rows over the elements of block_data_.

#if HAVE_CUDA == 1
  CuBlockMatrixData *cu_data_; // We store the pointers and some additional
                               // info on the GPU card, in a form more suited
                               // to use by CUDA kernels.
#endif
}; // class CuBlockMatrix

template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat);

} // namespace kaldi


#endif  // KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_