cu-block-matrix.h
4.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
// cudamatrix/cu-block-matrix.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_BLOCK_MATRIX_H_
#include <sstream>
#include <vector>
#include "cudamatrix/cu-common.h"
namespace kaldi {
/**
The class CuBlockMatrix holds a vector of objects of type CuMatrix,
say, M_1, M_2, .. M_N
and it represents the matrix diag(M_1, M_2, ... M_N). Note:
the individual matrices do not have to be square. The reason the
class is needed is mostly so that we can efficiently multiply by this
block-diagonal structure in a parallel way.
If we have a GPU available, CuBlockMatrix will store a copy of the
individual CuMatrix quantities M_1 .. M_N on the GPU, but their
'primary' home remains on the CPU. What we mean by this is that
while the data remains on the GPU, the "primary" version of the
Matrix object that holds the pointers will remain on the CPU.
We just copy it over to the GPU whenever it is changed.
*/
// Block-diagonal matrix type. The blocks are packed into the single matrix
// data_, and block_data_ holds one BlockMatrixData record per block. Most
// member functions are only declared here; their definitions are not part of
// this header.
template<typename Real>
class CuBlockMatrix {
public:
// CuMatrixBase<Real> is given access to the protected and private members of
// this class.
friend class CuMatrixBase<Real>;
/// Default constructor (defined out of line).
CuBlockMatrix();
/// Constructs from a vector of matrices; presumably each element of "data"
/// becomes one diagonal block -- confirm against the definition.
/// NOTE(review): this single-argument constructor is not 'explicit', so it
/// also acts as an implicit conversion from std::vector<CuMatrix<Real> >.
CuBlockMatrix(const std::vector<CuMatrix<Real> > &data);
/// Destructor; all cleanup is delegated to Destroy().
~CuBlockMatrix() { Destroy(); }
/// Copy constructor
CuBlockMatrix(const CuBlockMatrix &other);
/// Assignment operator
CuBlockMatrix &operator= (const CuBlockMatrix &other);
/// Writes this object to the stream "os". The "binary" flag presumably
/// selects binary versus text format -- confirm against the definition.
void Write(std::ostream &os, bool binary) const;
/// Reads this object from the stream "is"; "binary" is presumably the
/// counterpart of the flag given to Write() -- confirm against the definition.
void Read(std::istream &is, bool binary);
/// Returns num_rows_ (documented below as the sum of the blocks' row counts).
MatrixIndexT NumRows() const { return num_rows_; }
/// Returns the num_cols_ member of the packed matrix data_.
MatrixIndexT NumCols() const { return data_.num_cols_; }
/// Returns the number of entries in block_data_, i.e. the number of blocks.
/// NOTE(review): the size() result is implicitly converted to MatrixIndexT.
MatrixIndexT NumBlocks() const { return block_data_.size(); }
// Returns max num-columns of any block
MatrixIndexT MaxBlockCols() const ;
// Returns max num-rows of any block
MatrixIndexT MaxBlockRows() const;
/// Returns block number "b" as a CuSubMatrix (const overload).
const CuSubMatrix<Real> Block(MatrixIndexT b) const;
/// Returns block number "b" as a CuSubMatrix, by value. Handing out a
/// sub-matrix rather than the underlying CuMatrix is presumably what stops
/// callers from resizing a block.
CuSubMatrix<Real> Block(MatrixIndexT b);
/// Does *this = alpha A B + beta * *this, discarding elements of the product outside
/// the block structure of the *this matrix. The transA and transB parameters
/// can be used to substitute A^T for A and B^T for B, respectively.
/// NOTE(review): alpha and beta are declared as BaseFloat rather than Real;
/// confirm that this is intended for the instantiations in use.
void AddMatMat(BaseFloat alpha,
const CuMatrix<Real> &A, MatrixTransposeType transA,
const CuMatrix<Real> &B, MatrixTransposeType transB,
BaseFloat beta);
/// Copies elements within the block structure from matrix M, discarding others.
/// Note: this has not been implemented in a very efficient way, it's used only
/// for testing.
void CopyFromMat(const CuMatrix<Real> &M);
/// Normalizes the columns of *this so that each one sums to one.
/// On error (e.g. inf's), will set the column to a constant value that
/// sums to one.
void NormalizeColumns();
/// Presumably exchanges the contents of *this with those of *other --
/// confirm against the definition.
void Swap(CuBlockMatrix *other);
protected:
CuMatrix<Real> data_; // This is a single matrix into which
// we pack all the blocks (possibly with spaces left over)
// Per-block record stored in block_data_.
struct BlockMatrixData{
// Number of rows and columns of the block.
MatrixIndexT num_rows;
MatrixIndexT num_cols;
// Presumably the position of the block's first row and column within
// data_ -- confirm against the code that fills in this struct.
MatrixIndexT row_offset;
MatrixIndexT col_offset;
};
// The accessor below exists only when compiled with HAVE_CUDA == 1.
#if HAVE_CUDA == 1
/// Returns the cu_data_ pointer as a pointer-to-const.
const CuBlockMatrixData* CuData() const { return cu_data_; }
#endif
private:
/// If using GPU and cu_data_ != NULL, free cu_data_ and set it to NULL
void FreeCudaData();
/// If using GPU, allocate and set cu_data_ on the GPU to reflect "data_".
void SetCudaData();
/// Frees and deinitializes everything.
void Destroy();
// One record per block; its size is what NumBlocks() reports.
std::vector<BlockMatrixData> block_data_;
MatrixIndexT num_rows_; // sum of num_rows of elements of block_data_.
#if HAVE_CUDA == 1
CuBlockMatrixData *cu_data_; // We store the pointers and some additional info
// on the GPU card in a form more suited to
// use by CUDA kernels.
#endif
}; // class CuBlockMatrix
/// Stream-insertion operator for CuBlockMatrix. Only the declaration appears
/// in this header; the output format is determined by the out-of-line
/// definition.
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat);
} // namespace kaldi
#endif