// cudamatrix/cu-device.h

// Copyright 2009-2012  Karel Vesely
//           2012-2015  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
#define KALDI_CUDAMATRIX_CU_DEVICE_H_
#if HAVE_CUDA == 1
#include <cublas_v2.h>
#include <cusparse.h>
#include <curand.h>
#include <map>
#include <string>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "base/kaldi-common.h"
#include "base/timer.h"
#include "cudamatrix/cu-allocator.h"
#include "cudamatrix/cu-common.h"
#if CUDA_VERSION >= 9010
#include <cusolverDn.h>
#else
// cusolver is not supported before CUDA 9.1.
// We define a few placeholder types here to minimize compiler guards.
// If a user tries to use cusolver it will throw an error.
typedef void* cusolverDnHandle_t;
typedef int cusolverStatus_t;
#endif
namespace kaldi {
class CuTimer;
/**
   This class contains code for selecting the CUDA device, initializing the
   cuBLAS and cuSparse handles, and providing an interface for memory
   allocation (which supports caching, to avoid the slowness of the CUDA
   memory allocator).  There is a separate instance of the CuDevice object for
   each thread of the program, but many of its variables are static (hence,
   shared between all instances).

   We only (currently) support using a single GPU device; however, we support
   multiple CUDA streams.  The expected programming model is that you will
   have multiple CPU threads, and each CPU thread automatically gets its own
   CUDA stream because we compile with -DCUDA_API_PER_THREAD_DEFAULT_STREAM.

   In terms of synchronizing the activities of multiple threads: the CuDevice
   object (with help from the underlying CuAllocator object) ensures that the
   memory caching code won't itself be a cause of synchronization problems,
   i.e. you don't have to worry that memory you allocate with
   CuDevice::Malloc() will still be in use by another thread on the GPU.
   However, it may sometimes still be necessary to synchronize the activities
   of multiple streams by calling SynchronizeGpu(), probably right before a
   thread increments a semaphore, right after it waits on a semaphore, or
   right after it acquires a mutex.
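
   A minimal usage sketch (illustrative only; the surrounding program is
   hypothetical, but the calls are the ones documented in this class):

     int main(int argc, char *argv[]) {
       // ... parse command-line options ...
       kaldi::CuDevice::Instantiate().SelectGpuId("yes");  // or "optional"/"no"
       kaldi::CuDevice::Instantiate().AllowMultithreading();  // only if several
                                                               // threads use the GPU
       // ... GPU computation using CuMatrix / CuVector ...
       kaldi::CuDevice::Instantiate().PrintProfile();  // optional timing summary
       return 0;
     }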
*/
class CuDevice {
 public:
  // You obtain the CuDevice for the current thread by calling
  //   CuDevice::Instantiate().
  // At the beginning of the program, if you want to use a GPU, you
  // should call CuDevice::Instantiate().SelectGpuId(..).
  static inline CuDevice& Instantiate() {
    CuDevice &ans = this_thread_device_;
    if (!ans.initialized_)
      ans.Initialize();
    return ans;
  }
  inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
  inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
  inline curandGenerator_t GetCurandHandle() { return curand_handle_; }

  inline cusolverDnHandle_t GetCusolverDnHandle() {
#if CUDA_VERSION < 9010
    KALDI_ERR << "CUDA VERSION '" << CUDA_VERSION << "' not new enough to "
              << "support cusolver.  Upgrade to at least 9.1.";
#endif
    return cusolverdn_handle_;
  }
  inline void SeedGpu() {
    if (CuDevice::Instantiate().Enabled()) {
      // To get the same random sequence on each run, call srand() before this
      // method is invoked.
      CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(
          curand_handle_, RandInt(128, RAND_MAX)));
      CURAND_SAFE_CALL(curandSetGeneratorOffset(curand_handle_, 0));
    }
  }
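
  // Illustrative (hypothetical) way to get a reproducible GPU random sequence,
  // following the comment above; 'seed' is made up for the example:
  //   srand(seed);                        // fix the sequence used by RandInt()
  //   CuDevice::Instantiate().SeedGpu();  // seed the cuRAND generator from it
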
  // We provide functions Malloc(), MallocPitch() and Free() which replace
  // cudaMalloc(), cudaMallocPitch() and cudaFree().  Their function is to
  // cache the results of previous allocations to avoid the very large
  // overhead that CUDA's allocation seems to give for some setups.
  inline void* Malloc(size_t size) {
    return multi_threaded_ ? g_cuda_allocator.MallocLocking(size) :
                             g_cuda_allocator.Malloc(size);
  }

  inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
    if (multi_threaded_) {
      return g_cuda_allocator.MallocPitchLocking(row_bytes, num_rows, pitch);
    } else if (debug_stride_mode_) {
      // The pitch bucket size is hardware dependent.
      // It is 512 on K40c with CUDA 7.5.
      // Adding 512 * RandInt(0, 4) ensures that adjacent allocations get
      // different pitches even if their original pitches would be the same
      // in the normal mode.
      return g_cuda_allocator.MallocPitch(
          row_bytes + 512 * RandInt(0, 4), num_rows,
          pitch);
    } else {
      return g_cuda_allocator.MallocPitch(row_bytes, num_rows, pitch);
    }
  }

  inline void Free(void *ptr) {
    if (multi_threaded_) g_cuda_allocator.FreeLocking(ptr);
    else g_cuda_allocator.Free(ptr);
  }
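
  // Illustrative (hypothetical) use of the caching allocator interface; the
  // size and the pointer name are made up for the example:
  //   void *ptr = CuDevice::Instantiate().Malloc(1024 * sizeof(BaseFloat));
  //   // ... launch kernels that read/write 'ptr' ...
  //   CuDevice::Instantiate().Free(ptr);
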
  /// Select a GPU for computation.  You are supposed to call this function
  /// just once, at the beginning of the program (from the main thread), or
  /// not at all.
  /// The 'use_gpu' modes are:
  ///  "yes"      -- Select a GPU automatically and die if this fails.  If the
  ///                GPUs are in compute-exclusive mode it will select one
  ///                pseudo-randomly; otherwise it will choose whichever one
  ///                has the most free memory (but we recommend setting the
  ///                GPUs to exclusive mode, or controlling which GPU is used
  ///                by setting the environment variable CUDA_VISIBLE_DEVICES
  ///                to the id of the GPU you want the program to use).
  ///  "optional" -- As above, but if it fails, back off to the CPU.
  ///  "no"       -- Run on the CPU.
  void SelectGpuId(std::string use_gpu);
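
  // Typical call site (a sketch; in real binaries the mode string usually
  // comes from a --use-gpu command-line option):
  //   CuDevice::Instantiate().SelectGpuId("yes");
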
  /// Check if the CUDA GPU is selected for use.
  bool Enabled() const {
    return (device_id_ > -1);
  }

  /// Returns true if either we have no GPU, or we have a GPU
  /// and it supports double precision.
  bool DoublePrecisionSupported();
  /// This function accumulates stats on timing that
  /// are printed out when you call PrintProfile().  However,
  /// it only does something if VerboseLevel() >= 1.
  void AccuProfile(const char *function_name, const CuTimer &timer);

  /// Print some profiling information using KALDI_LOG.
  void PrintProfile();

  /// Print some memory-usage information using KALDI_LOG.
  void PrintMemoryUsage() const;
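
  // Sketch of how AccuProfile() is used together with class CuTimer (declared
  // near the end of this header); the surrounding function is hypothetical:
  //   void SomeCudaWrapper() {
  //     CuTimer tim;
  //     // ... launch CUDA work ...
  //     CuDevice::Instantiate().AccuProfile(__func__, tim);
  //   }
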
  /// The user should call this if the program plans to access the GPU (e.g.
  /// via using class CuMatrix) from more than one thread.  If you fail to
  /// call this for a multi-threaded program, it may occasionally segfault
  /// (and the code will also detect that you failed to call it, and will
  /// print a warning).
  inline void AllowMultithreading() { multi_threaded_ = true; }

  /// Get the name of the GPU.
  void DeviceGetName(char* name, int32 len, int32 dev);

  /// Check that the GPU is in good condition by multiplying small matrices on
  /// both GPU and CPU and comparing the results.  Overheated GPUs may give
  /// inaccurate results, which we want to detect.
  void CheckGpuHealth();

  /// If Enabled(), returns the number n of bytes such that the matrix stride
  /// will always be a multiple of n (from properties_.textureAlignment).
  /// Otherwise, returns 16, which is the stride used for CPU matrices.
  int32 GetMatrixAlignment() const;
  /// Call SetDebugStrideMode(true) to activate a mode where calls
  /// to MallocPitch will purposely allocate arrays with different pitch
  /// (inconsistent between calls).  This is only useful for testing code.
  /// This function returns the previous mode, where true means inconsistent
  /// pitch.  Note that you cannot ever rely on the strides from MallocPitch()
  /// being consistent for the same request, but in practice they tend to be
  /// consistent unless you are close to running out of memory.
  bool SetDebugStrideMode(bool mode) {
    bool old_mode = debug_stride_mode_;
    debug_stride_mode_ = mode;
    return old_mode;
  }
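
  // Illustrative (hypothetical) use in test code:
  //   bool old_mode = CuDevice::Instantiate().SetDebugStrideMode(true);
  //   // ... run code whose results should not depend on the stride ...
  //   CuDevice::Instantiate().SetDebugStrideMode(old_mode);
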
  /// Check if the GPU is set to compute-exclusive mode (you can set this mode,
  /// if you are root, by doing: `nvidia-smi -c 3`).  Returns true if we have a
  /// GPU and it is running in compute-exclusive mode; returns false otherwise.
  /// WILL CRASH if we are not using a GPU at all.  If calling this as a user
  /// (i.e. from outside the class), call this only if Enabled() returns true.
  bool IsComputeExclusive();
  // Register command-line options for the CUDA device.
  // This must be done before calling CuDevice::Initialize().
  // Example:
  //   CuDevice::RegisterDeviceOptions(&po);
  //   po.Read(argc, argv);
  //   CuDevice::Initialize();
  static void RegisterDeviceOptions(OptionsItf *po) {
    CuDevice::device_options_.Register(po);
  }
  ~CuDevice();

 private:
  struct CuDeviceOptions {
    bool use_tensor_cores;  // Enable tensor cores.
    CuDeviceOptions(): use_tensor_cores(false) { }
    void Register(OptionsItf *po) {
      po->Register("cuda-use-tensor-cores", &use_tensor_cores,
                   "Enable FP16 tensor math.  "
                   "This gives higher performance but lower accuracy.  "
                   "This is only recommended for inference.");
    }
  };

  static CuDeviceOptions device_options_;
  // Default constructor, used to initialize this_thread_device_.
  CuDevice();
  CuDevice(CuDevice&);  // Disallow.
  CuDevice &operator=(CuDevice&);  // Disallow.

  /// The Initialize() function exists to do the following, in threads other
  /// than the main thread, and only if we are using a GPU: call
  /// cudaSetDevice(), and set up cublas_handle_ and cusparse_handle_.  It does
  /// get called in the main thread (see documentation by its definition), but
  /// does nothing interesting there.
  void Initialize();
  /// Automatically select a GPU and get a CUDA context (this is only called,
  /// from SelectGpuId(), if the GPUs are in non-exclusive mode).  Returns true
  /// on success.
  bool SelectGpuIdAuto();

  /// This function, called from SelectGpuId(), is to be called when a
  /// GPU context corresponding to the GPU we want to use exists; it
  /// works out the device-id, creates the cuBLAS and cuSparse handles,
  /// and prints out some information that's useful for debugging.
  /// It also sets initialized_ to true, to suppress Initialize() from
  /// being called on this, the main thread, in future, since
  /// that would try to create the handles again.
  void FinalizeActiveGpu();

  /// Should only be called if Enabled() == true.
  int32 MajorDeviceVersion();

  /// Should only be called if Enabled() == true.
  int32 MinorDeviceVersion();
  // Each thread has its own CuDevice object, which contains the cuBLAS and
  // cuSparse handles.  These are unique to the thread (which is what is
  // recommended by NVIDIA).
  static thread_local CuDevice this_thread_device_;

  // The GPU device-id that we are using.  This will be initialized to -1, and
  // will be set when the user calls
  //   CuDevice::Instantiate().SelectGpuId(...)
  // from the main thread.  Background threads will, when spawned and when
  // CuDevice::Instantiate() is called from them for the first time, call
  // cudaSetDevice(device_id_).
  static int32 device_id_;
  // This will automatically be set to true if the application has multiple
  // threads that access the GPU device.  It is used to know whether to
  // use locks when accessing the allocator and the profiling-related code.
  static bool multi_threaded_;

  // The variable profile_map_ will only be used if the verbose level is >= 1;
  // it will accumulate some function-level timing information that is printed
  // out at program end.  This makes things a bit slower as we have to call
  // cudaDeviceSynchronize() to make the timing information meaningful.
  static unordered_map<std::string, double, StringHasher> profile_map_;

  // profile_mutex_ guards profile_map_ in case multi_threaded_ is true.
  static std::mutex profile_mutex_;

  // free_memory_at_startup_ is just used in printing the memory used according
  // to the device.
  static int64 free_memory_at_startup_;

  static cudaDeviceProp properties_;

  // If set to true by SetDebugStrideMode(), code will be activated to use
  // pseudo-random stride values when allocating data (to detect errors which
  // otherwise would be rare).
  static bool debug_stride_mode_;

  // The following member variable is initialized to false; if the user calls
  // Instantiate() in a thread where it is still false, Initialize() will be
  // called, in order to (if a GPU is being used) call cudaSetDevice() and
  // set up the cublas and cusparse handles.
  bool initialized_;

  // This variable is just a copy of the static variable device_id_.  It's used
  // to detect when this code is called in the wrong way.
  int32 device_id_copy_;

  cublasHandle_t cublas_handle_;
  cusparseHandle_t cusparse_handle_;
  curandGenerator_t curand_handle_;
  cusolverDnHandle_t cusolverdn_handle_;
};  // class CuDevice


// Class CuTimer is a convenience wrapper for class Timer which only
// sets the time if the verbose level is >= 1.  This helps avoid
// an unnecessary system call if the verbose level is 0 and you
// won't be accumulating the timing stats.
class CuTimer: public Timer {
 public:
  CuTimer(): Timer(GetVerboseLevel() >= 1) { }
};

// This function is declared as a more convenient way to get the cuBLAS handle
// for use in the cuBLAS v2 API, since we need to access it so frequently.
inline cublasHandle_t GetCublasHandle() {
  return CuDevice::Instantiate().GetCublasHandle();
}

inline cusolverDnHandle_t GetCusolverDnHandle() {
  return CuDevice::Instantiate().GetCusolverDnHandle();
}

// A more convenient way to get the handle for use with the cuSPARSE API.
inline cusparseHandle_t GetCusparseHandle() {
  return CuDevice::Instantiate().GetCusparseHandle();
}

inline curandGenerator_t GetCurandHandle() {
  return CuDevice::Instantiate().GetCurandHandle();
}
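
// Illustrative (hypothetical) use of these accessors when calling a cuBLAS
// routine directly; 'n', 'alpha' and 'data' are made up for the example:
//   CUBLAS_SAFE_CALL(cublasSscal(GetCublasHandle(), n, &alpha, data, 1));
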
} // namespace kaldi
#endif // HAVE_CUDA
namespace kaldi {
/**
   The function SynchronizeGpu(), which for convenience is defined whether or
   not we have compiled for CUDA, is intended to be called in places where
   threads need to be synchronized.

   It just launches a no-op kernel into the legacy default stream.  This has
   the effect that it will run after any kernels previously launched from any
   stream(*), and before any kernels launched later from any stream(*).
   (*) This does not apply to non-blocking streams.

   Note: at the time of writing we never call SynchronizeGpu() from
   binary-level code because it hasn't become necessary yet; the only program
   that might have multiple threads actually using the GPU is rnnlm-train (if
   the user were to invoke it with the ,bg option for loading training
   examples); but the only CUDA invocation the RnnlmExample::Read() function
   uses (via CuMatrix::Read()) is cudaMemcpy, which is synchronous already.
*/
void SynchronizeGpu();
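
// Illustrative sketch of where SynchronizeGpu() might be called when one
// thread produces GPU data that another thread will consume (the semaphore
// and the surrounding code are hypothetical):
//   ... fill a CuMatrix in this thread's CUDA stream ...
//   SynchronizeGpu();         // order the work w.r.t. other streams
//   example_ready.Signal();   // e.g. a kaldi Semaphore
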
} // namespace kaldi
#endif // KALDI_CUDAMATRIX_CU_DEVICE_H_