cu-packed-matrix.h
5.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// cudamatrix/cu-packed-matrix.h
// Copyright 2009-2013 Johns Hopkins University (author: Daniel Povey)
// Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
#define KALDI_CUDAMATRIX_CU_PACKED_MATRIX_H_
#include <sstream>
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-value.h"
#include "matrix/matrix-common.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/packed-matrix.h"
#include "matrix/sp-matrix.h"
#include "cudamatrix/cu-array.h"
#include "cudamatrix/cu-math.h"
#include "cudamatrix/cu-matrix.h"
namespace kaldi {
/**
* Matrix for CUDA computing. This is a base class for packed
* triangular and symmetric matrices.
* Does the computation on the CUDA card when CUDA is compiled in and
* we have a suitable GPU (CuDevice::Instantiate().Enabled() == true);
* otherwise, does it on the CPU.
*/
/// @brief Packed CUDA matrix: base class for triangular and symmetric matrices on
/// a GPU card.
template<typename Real>
class CuPackedMatrix {
 public:
  friend class CuMatrixBase<Real>;
  friend class CuVectorBase<Real>;
  friend class CuSubMatrix<Real>;
  friend class CuRand<Real>;

  /// Constructs an empty matrix: NULL data pointer and zero rows.
  CuPackedMatrix() : data_(NULL), num_rows_(0) {}

  /// Constructs a matrix with r rows by calling Resize(r, resize_type).
  explicit CuPackedMatrix(MatrixIndexT r,
                          MatrixResizeType resize_type = kSetZero):
      data_(NULL), num_rows_(0) { Resize(r, resize_type); }

  /// Constructs from a PackedMatrix: resizes to the same number of rows
  /// (leaving the contents undefined) and then copies the data from orig.
  explicit CuPackedMatrix(const PackedMatrix<Real> &orig)
      : data_(NULL), num_rows_(0) {
    Resize(orig.num_rows_, kUndefined);
    CopyFromPacked(orig);
  }

  /// Copy constructor: resizes to orig.NumRows() (leaving the contents
  /// undefined) and then copies the data from orig.
  explicit CuPackedMatrix(const CuPackedMatrix<Real> &orig)
      : data_(NULL), num_rows_(0) {
    Resize(orig.NumRows(), kUndefined);
    CopyFromPacked(orig);
  }

  void SetZero();  ///< Set to zero
  void SetUnit();  ///< Set to unit matrix.
  void SetRandn();  ///< Set to random values of a normal distribution
  void SetDiag(Real alpha);  ///< Set the diagonal value to alpha
  void AddToDiag(Real r);  ///< Add this quantity to the diagonal of the matrix.

  // NOTE(review): the next three are only declared here; judging by their
  // names they scale all elements, scale the diagonal, and return the sum of
  // the diagonal respectively -- confirm against the definitions.
  void Scale(Real alpha);
  void ScaleDiag(Real alpha);
  Real Trace() const;

  /// The destructor calls Destroy().
  ~CuPackedMatrix() { Destroy(); }

  /// Set packed matrix to a specified size (can be zero).
  /// The value of the new data depends on resize_type:
  ///   -if kSetZero, the new data will be zero
  ///   -if kUndefined, the new data will be undefined
  ///   -if kCopyData, the new data will be the same as the old data in any
  ///      shared positions, and zero elsewhere.
  /// This function takes time proportional to the number of data elements.
  void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);

  // Copy functions (do not resize).
  void CopyFromPacked(const CuPackedMatrix<Real> &src);
  void CopyFromPacked(const PackedMatrix<Real> &src);
  void CopyToPacked(PackedMatrix<Real> *dst) const;

  // Stream I/O.  NOTE(review): 'binary' presumably selects binary vs. text
  // format -- confirm against the definitions.
  void Read(std::istream &in, bool binary);
  void Write(std::ostream &out, bool binary) const;

  /// Called by the destructor.  NOTE(review): presumably releases the storage
  /// behind data_ -- the definition is not in this header.
  void Destroy();

  /// Swaps the contents of *this and *other.  Shallow swap.
  void Swap(CuPackedMatrix<Real> *other);

  /// Swaps the contents of *this and *other.
  void Swap(PackedMatrix<Real> *other);

  /// Returns the raw data pointer.
  Real* Data() { return data_; }
  /// Returns the raw data pointer (const version).
  const Real* Data() const { return data_; }

  /// Returns element (r, c) by value.  The indices are interchangeable: if
  /// c > r they are swapped, so the element is always read from packed
  /// position r*(r+1)/2 + c with c <= r.
  /// When compiled with CUDA and the device is enabled, this performs a
  /// single-element device-to-host copy and synchronizes the per-thread
  /// stream before returning.
  inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
    // Comparing as unsigned makes a negative index look like a huge value, so
    // after the swap the single assertion on r rejects negative and
    // too-large indices alike.
    if (static_cast<UnsignedMatrixIndexT>(c) >
        static_cast<UnsignedMatrixIndexT>(r))
      std::swap(c, r);
    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
    // Compute the packed offset in size_t (as SizeInBytes() does) so that
    // r * (r+1) cannot overflow MatrixIndexT for large row indices.
    const size_t row = static_cast<size_t>(r);
    const size_t offset = (row * (row + 1)) / 2 + static_cast<size_t>(c);
#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      Real value;
      CU_SAFE_CALL(cudaMemcpyAsync(&value, this->data_ + offset,
                                   sizeof(Real), cudaMemcpyDeviceToHost,
                                   cudaStreamPerThread));
      // The copy was issued asynchronously; wait for it before reading value.
      CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
      return value;
    } else
#endif
    return this->data_[offset];
  }

  /// Number of rows.
  inline MatrixIndexT NumRows() const { return num_rows_; }
  /// Number of columns; always equal to the number of rows.
  inline MatrixIndexT NumCols() const { return num_rows_; }

  /// Returns size in bytes of the data held by the matrix.
  size_t SizeInBytes() const {
    size_t nr = static_cast<size_t>(num_rows_),
        num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real);
    return num_bytes;
  }

 protected:
  // The following two functions should only be called if we did not compile
  // with CUDA or could not get a CUDA card; in that case the contents are
  // interpreted the same as a regular matrix.
  // NOTE(review): the reinterpret_cast assumes PackedMatrix<Real> has a
  // member layout compatible with this class (data_ followed by num_rows_);
  // do not reorder or add data members without checking PackedMatrix.
  inline const PackedMatrix<Real> &Mat() const {
    return *(reinterpret_cast<const PackedMatrix<Real>* >(this));
  }
  inline PackedMatrix<Real> &Mat() {
    return *(reinterpret_cast<PackedMatrix<Real>* >(this));
  }

  // Will only be called from this class or derived classes.
  Real *data_;
  MatrixIndexT num_rows_;

  void AddPacked(const Real alpha, const CuPackedMatrix<Real> &M);

 private:
  // Disallow assignment.
  // The CuPackedMatrix overload must be declared explicitly: otherwise the
  // compiler generates a public memberwise copy assignment that copies the
  // raw data_ pointer, leaving two objects whose destructors both call
  // Destroy() for the same pointer.  It is declared private and not defined
  // in this header.
  CuPackedMatrix<Real> & operator=(const CuPackedMatrix<Real> &other);
  PackedMatrix<Real> & operator=(const PackedMatrix<Real> &other);
};  // class CuPackedMatrix
/// I/O: stream-insertion operator for CuPackedMatrix<Real>.
/// Only the declaration appears in this header; the definition is expected to
/// be provided elsewhere.
/// @param out  Output stream to write to.
/// @param mat  Matrix to write.
/// @return A reference to an output stream (presumably `out`, so that
///         insertions can be chained -- confirm against the definition).
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuPackedMatrix<Real> &mat);
} // namespace
#endif