src/cudamatrix/cu-block-matrix.h
// cudamatrix/cu-block-matrix.h

// Copyright 2013  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_

#include <sstream>
#include <vector>

#include "cudamatrix/cu-common.h"

namespace kaldi {


/**
   The class CuBlockMatrix holds a vector of objects of type CuMatrix,
   say, M_1, M_2, .. M_N, and it represents the matrix
   diag(M_1, M_2, ... M_N).  Note: the individual matrices do not have to be
   square.  The reason the class is needed is mostly so that we can
   efficiently multiply by this block-diagonal structure in a parallel way.

   If we have a GPU available, CuBlockMatrix will store a copy of the
   individual CuMatrix quantities M_1 .. M_N on the GPU, but their 'primary'
   home remains on the CPU.  What we mean by this is that while the data
   itself stays on the GPU, the "primary" version of the Matrix object that
   holds the pointers remains on the CPU; we just copy it over to the GPU
   whenever it is changed.
 */

template<typename Real>
class CuBlockMatrix {
 public:
  friend class CuMatrixBase<Real>;

  CuBlockMatrix();

  CuBlockMatrix(const std::vector<CuMatrix<Real> > &data);

  ~CuBlockMatrix() { Destroy(); }

  /// Copy constructor
  CuBlockMatrix(const CuBlockMatrix &other);

  /// Assignment operator
  CuBlockMatrix &operator= (const CuBlockMatrix &other);

  void Write(std::ostream &os, bool binary) const;

  void Read(std::istream &is, bool binary);

  MatrixIndexT NumRows() const { return num_rows_; }

  MatrixIndexT NumCols() const { return data_.num_cols_; }

  MatrixIndexT NumBlocks() const { return block_data_.size(); }

  /// Returns max num-columns of any block
  MatrixIndexT MaxBlockCols() const;

  /// Returns max num-rows of any block
  MatrixIndexT MaxBlockRows() const;

  const CuSubMatrix<Real> Block(MatrixIndexT b) const;

  CuSubMatrix<Real> Block(MatrixIndexT b);  // returns a CuSubMatrix view
                                            // (not a CuMatrix reference), so
                                            // the block cannot be resized.

  /// Does *this = alpha A B + beta * *this, discarding elements of the product
  /// outside the block structure of the *this matrix.  The transA and transB
  /// parameters can be used to substitute A^T for A and B^T for B,
  /// respectively.
  void AddMatMat(BaseFloat alpha,
                 const CuMatrix<Real> &A, MatrixTransposeType transA,
                 const CuMatrix<Real> &B, MatrixTransposeType transB,
                 BaseFloat beta);

  /// Copies elements within the block structure from matrix M, discarding
  /// others.  Note: this has not been implemented in a very efficient way,
  /// it's used only for testing.
  void CopyFromMat(const CuMatrix<Real> &M);

  /// Normalizes the columns of *this so that each one sums to one.
  /// On error (e.g. inf's), will set the column to a constant value that
  /// sums to one.
  void NormalizeColumns();

  void Swap(CuBlockMatrix *other);

 protected:
  CuMatrix<Real> data_;  // This is a single matrix into which we pack all the
                         // blocks (possibly with spaces left over).

  struct BlockMatrixData {
    MatrixIndexT num_rows;
    MatrixIndexT num_cols;
    MatrixIndexT row_offset;
    MatrixIndexT col_offset;
  };

#if HAVE_CUDA == 1
  const CuBlockMatrixData* CuData() const { return cu_data_; }
#endif

 private:
  /// If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL
  void FreeCudaData();

  /// If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
  void SetCudaData();

  /// Frees and deinitializes everything.
  void Destroy();

  std::vector<BlockMatrixData> block_data_;

  MatrixIndexT num_rows_;  // sum of num_rows of elements of block_data_.

#if HAVE_CUDA == 1
  CuBlockMatrixData *cu_data_;  // We store the pointers and some additional
                                // info on the GPU card in a form more suited
                                // to use by CUDA kernels.
#endif
};  // class CuBlockMatrix

template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat);

}  // namespace kaldi

#endif
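
As a brief illustration of the interface declared above (this sketch is not part of the header), the snippet below builds a block-diagonal matrix from two rectangular blocks and accumulates a product into it with AddMatMat(), which keeps only the elements of A*C that fall inside the block structure. It assumes the usual Kaldi types and headers (CuMatrix<BaseFloat>, kNoTrans from the matrix library) and that a CUDA device has been selected if one is available; the dimensions and the helper function name are made up for the example.

// Hypothetical usage sketch, not part of cu-block-matrix.h.
#include <vector>
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-block-matrix.h"

namespace kaldi {

void ExampleCuBlockMatrixUsage() {  // illustrative helper, not in Kaldi
  // Two rectangular blocks: B represents diag(M_1, M_2), of size (3+4) x (5+2).
  std::vector<CuMatrix<BaseFloat> > blocks(2);
  blocks[0].Resize(3, 5);
  blocks[1].Resize(4, 2);
  blocks[0].SetRandn();
  blocks[1].SetRandn();
  CuBlockMatrix<BaseFloat> B(blocks);  // 7 rows, 7 cols, 2 blocks.

  // A is 7 x 10 and C is 10 x 7, so A * C is 7 x 7.  AddMatMat() discards the
  // elements of the product outside B's block structure:
  //   B = 1.0 * A * C + 0.0 * B.
  CuMatrix<BaseFloat> A(7, 10), C(10, 7);
  A.SetRandn();
  C.SetRandn();
  B.AddMatMat(1.0, A, kNoTrans, C, kNoTrans, 0.0);

  // Individual blocks can be read back as sub-matrix views:
  CuSubMatrix<BaseFloat> first = B.Block(0);  // 3 x 5 view of the first block.
}

}  // namespace kaldi

Only the public interface declared in the header is exercised here; internally the blocks are packed into the single data_ matrix so that, as the class comment describes, operations on the block-diagonal structure can be carried out in a parallel way on the GPU.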