// cudamatrix/cu-device.h

// Copyright 2009-2012  Karel Vesely
//           2012-2015  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
#define KALDI_CUDAMATRIX_CU_DEVICE_H_
#if HAVE_CUDA == 1
#include <cublas_v2.h>
#include <cusparse.h>
#include <curand.h>
#include <map>
#include <string>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "base/kaldi-common.h"
#include "base/timer.h"
#include "cudamatrix/cu-allocator.h"
#include "cudamatrix/cu-common.h"
#if CUDA_VERSION >= 9010
#include <cusolverDn.h>
#else
// cusolver is not supported before CUDA 9.1.
// We define a few placeholder types here to minimize compiler guards.
// If a user tries to use cusolver it will throw an error.
typedef void* cusolverDnHandle_t;
typedef int cusolverStatus_t;
#endif
namespace kaldi {
class CuTimer;
/**
   This class contains code for selecting the CUDA device, initializing the
   cuBLAS and cuSparse handles, and providing an interface for memory
   allocation (which supports caching, to avoid the slowness of the CUDA
   memory allocator).  There is a separate instance of the CuDevice object for
   each thread of the program, but many of its variables are static (hence,
   shared between all instances).

   We only (currently) support using a single GPU device; however, we support
   multiple CUDA streams.  The expected programming model is that you will
   have multiple CPU threads, and each CPU thread automatically gets its own
   CUDA stream because we compile with -DCUDA_API_PER_THREAD_DEFAULT_STREAM.

   In terms of synchronizing the activities of multiple threads: the CuDevice
   object (with help from the underlying CuAllocator object) ensures that the
   memory caching code won't itself be a cause of synchronization problems,
   i.e. you don't have to worry that memory you allocate with
   CuDevice::Malloc() will still be in use by another thread on the GPU.
   However, it may sometimes still be necessary to synchronize the activities
   of multiple streams by calling SynchronizeGpu(), probably right before a
   thread increments a semaphore, right after it waits on a semaphore, or
   right after it acquires a mutex.
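
   A minimal usage sketch (illustrative only; the surrounding program is
   hypothetical, but the calls are the ones documented in this class):

     int main(int argc, char *argv[]) {
       // ... parse command-line options ...
       kaldi::CuDevice::Instantiate().SelectGpuId("yes");  // or "optional"/"no"
       kaldi::CuDevice::Instantiate().AllowMultithreading();  // only if several
                                                               // threads use the GPU
       // ... GPU computation using CuMatrix / CuVector ...
       kaldi::CuDevice::Instantiate().PrintProfile();  // optional timing summary
       return 0;
     }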
*/
class CuDevice {
 public:
  // You obtain the CuDevice for the current thread by calling
  //   CuDevice::Instantiate().
  // At the beginning of the program, if you want to use a GPU, you
  // should call CuDevice::Instantiate().SelectGpuId(..).
  static inline CuDevice& Instantiate() {
    CuDevice &ans = this_thread_device_;
    if (!ans.initialized_)
      ans.Initialize();
    return ans;
  }
  inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
  inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
  inline curandGenerator_t GetCurandHandle() { return curand_handle_; }

  inline cusolverDnHandle_t GetCusolverDnHandle() {
#if CUDA_VERSION < 9010
    KALDI_ERR << "CUDA VERSION '" << CUDA_VERSION << "' not new enough to "
              << "support cusolver.  Upgrade to at least 9.1.";
#endif
    return cusolverdn_handle_;
  }
  inline void SeedGpu() {
    if (CuDevice::Instantiate().Enabled()) {
      // To get the same random sequence on each run, call srand() before this
      // method is invoked.
      CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(
          curand_handle_, RandInt(128, RAND_MAX)));
      CURAND_SAFE_CALL(curandSetGeneratorOffset(curand_handle_, 0));
    }
  }
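
  // Illustrative (hypothetical) way to get a reproducible GPU random sequence,
  // following the comment above; 'seed' is made up for the example:
  //   srand(seed);                        // fix the sequence used by RandInt()
  //   CuDevice::Instantiate().SeedGpu();  // seed the cuRAND generator from it
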
  // We provide functions Malloc(), MallocPitch() and Free() which replace
  // cudaMalloc(), cudaMallocPitch() and cudaFree().  Their function is to
  // cache the results of previous allocations to avoid the very large
  // overhead that CUDA's allocation seems to give for some setups.
  inline void* Malloc(size_t size) {
    return multi_threaded_ ? g_cuda_allocator.MallocLocking(size) :
                             g_cuda_allocator.Malloc(size);
  }

  inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
    if (multi_threaded_) {
      return g_cuda_allocator.MallocPitchLocking(row_bytes, num_rows, pitch);
    } else if (debug_stride_mode_) {
      // The pitch bucket size is hardware dependent.
      // It is 512 on K40c with CUDA 7.5.
      // Adding 512 * RandInt(0, 4) ensures that adjacent allocations get
      // different pitches even if their original pitches would be the same
      // in the normal mode.
      return g_cuda_allocator.MallocPitch(
          row_bytes + 512 * RandInt(0, 4), num_rows,
          pitch);
    } else {
      return g_cuda_allocator.MallocPitch(row_bytes, num_rows, pitch);
    }
  }

  inline void Free(void *ptr) {
    if (multi_threaded_) g_cuda_allocator.FreeLocking(ptr);
    else g_cuda_allocator.Free(ptr);
  }
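
  // Illustrative (hypothetical) use of the caching allocator interface; the
  // size and the pointer name are made up for the example:
  //   void *ptr = CuDevice::Instantiate().Malloc(1024 * sizeof(BaseFloat));
  //   // ... launch kernels that read/write 'ptr' ...
  //   CuDevice::Instantiate().Free(ptr);
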
  /// Select a GPU for computation.  You are supposed to call this function
  /// just once, at the beginning of the program (from the main thread), or
  /// not at all.
  /// The 'use_gpu' modes are:
  ///  "yes"      -- Select a GPU automatically and die if this fails.  If the
  ///                GPUs are in compute-exclusive mode it will select one
  ///                pseudo-randomly; otherwise it will choose whichever one
  ///                has the most free memory (but we recommend setting the
  ///                GPUs to exclusive mode, or controlling which GPU is used
  ///                by setting the environment variable CUDA_VISIBLE_DEVICES
  ///                to the id of the GPU you want the program to use).
  ///  "optional" -- As above, but if it fails, back off to the CPU.
  ///  "no"       -- Run on the CPU.
  void SelectGpuId(std::string use_gpu);
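
  // Typical call site (a sketch; in real binaries the mode string usually
  // comes from a --use-gpu command-line option):
  //   CuDevice::Instantiate().SelectGpuId("yes");
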
  /// Check if the CUDA GPU is selected for use.
  bool Enabled() const {
    return (device_id_ > -1);
  }

  /// Returns true if either we have no GPU, or we have a GPU
  /// and it supports double precision.
  bool DoublePrecisionSupported();
  /// This function accumulates stats on timing that
  /// are printed out when you call PrintProfile().  However,
  /// it only does something if VerboseLevel() >= 1.
  void AccuProfile(const char *function_name, const CuTimer &timer);

  /// Print some profiling information using KALDI_LOG.
  void PrintProfile();

  /// Print some memory-usage information using KALDI_LOG.
  void PrintMemoryUsage() const;
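
  // Sketch of how AccuProfile() is used together with class CuTimer (declared
  // near the end of this header); the surrounding function is hypothetical:
  //   void SomeCudaWrapper() {
  //     CuTimer tim;
  //     // ... launch CUDA work ...
  //     CuDevice::Instantiate().AccuProfile(__func__, tim);
  //   }
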
  /// The user should call this if the program plans to access the GPU (e.g.
  /// via using class CuMatrix) from more than one thread.  If you fail to
  /// call this for a multi-threaded program, it may occasionally segfault
  /// (and the code will also detect that you failed to call it, and will
  /// print a warning).
  inline void AllowMultithreading() { multi_threaded_ = true; }

  /// Get the name of the GPU.
  void DeviceGetName(char* name, int32 len, int32 dev);

  /// Check that the GPU is in good condition by multiplying small matrices on
  /// both GPU and CPU and comparing the results.  Overheated GPUs may give
  /// inaccurate results, which we want to detect.
  void CheckGpuHealth();

  /// If Enabled(), returns the number n of bytes such that the matrix stride
  /// will always be a multiple of n (from properties_.textureAlignment).
  /// Otherwise, returns 16, which is the stride used for CPU matrices.
  int32 GetMatrixAlignment() const;
  /// Call SetDebugStrideMode(true) to activate a mode where calls
  /// to MallocPitch will purposely allocate arrays with different pitch
  /// (inconsistent between calls).  This is only useful for testing code.
  /// This function returns the previous mode, where true means inconsistent
  /// pitch.  Note that you cannot ever rely on the strides from MallocPitch()
  /// being consistent for the same request, but in practice they tend to be
  /// consistent unless you are close to running out of memory.
  bool SetDebugStrideMode(bool mode) {
    bool old_mode = debug_stride_mode_;
    debug_stride_mode_ = mode;
    return old_mode;
  }
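
  // Illustrative (hypothetical) use in test code:
  //   bool old_mode = CuDevice::Instantiate().SetDebugStrideMode(true);
  //   // ... run code whose results should not depend on the stride ...
  //   CuDevice::Instantiate().SetDebugStrideMode(old_mode);
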
  /// Check if the GPU is set to compute-exclusive mode (you can set this mode,
  /// if you are root, by doing: `nvidia-smi -c 3`).  Returns true if we have a
  /// GPU and it is running in compute-exclusive mode; returns false otherwise.
  /// WILL CRASH if we are not using a GPU at all.  If calling this as a user
  /// (i.e. from outside the class), call this only if Enabled() returns true.
  bool IsComputeExclusive();
  // Register command-line options for the CUDA device.
  // This must be done before calling CuDevice::Initialize().
  // Example:
  //   CuDevice::RegisterDeviceOptions(&po);
  //   po.Read(argc, argv);
  //   CuDevice::Initialize();
  static void RegisterDeviceOptions(OptionsItf *po) {
    CuDevice::device_options_.Register(po);
  }
  ~CuDevice();

 private:
  struct CuDeviceOptions {
    bool use_tensor_cores;  // Enable tensor cores.
    CuDeviceOptions(): use_tensor_cores(false) { }
    void Register(OptionsItf *po) {
      po->Register("cuda-use-tensor-cores", &use_tensor_cores,
                   "Enable FP16 tensor math.  "
                   "This gives higher performance but lower accuracy.  "
                   "This is only recommended for inference.");
    }
  };

  static CuDeviceOptions device_options_;
  // Default constructor, used to initialize this_thread_device_.
  CuDevice();
  CuDevice(CuDevice&);  // Disallow.
  CuDevice &operator=(CuDevice&);  // Disallow.

  /// The Initialize() function exists to do the following, in threads other
  /// than the main thread, and only if we are using a GPU: call
  /// cudaSetDevice(), and set up cublas_handle_ and cusparse_handle_.  It does
  /// get called in the main thread (see documentation by its definition), but
  /// does nothing interesting there.
  void Initialize();
  /// Automatically select a GPU and get a CUDA context (this is only called,
  /// from SelectGpuId(), if the GPUs are in non-exclusive mode).  Returns true
  /// on success.
  bool SelectGpuIdAuto();

  /// This function, called from SelectGpuId(), is to be called when a
  /// GPU context corresponding to the GPU we want to use exists; it
  /// works out the device-id, creates the cuBLAS and cuSparse handles,
  /// and prints out some information that's useful for debugging.
  /// It also sets initialized_ to true, to suppress Initialize() from
  /// being called on this, the main thread, in future, since
  /// that would try to create the handles again.
  void FinalizeActiveGpu();

  /// Should only be called if Enabled() == true.
  int32 MajorDeviceVersion();

  /// Should only be called if Enabled() == true.
  int32 MinorDeviceVersion();
  // Each thread has its own CuDevice object, which contains the cuBLAS and
  // cuSparse handles.  These are unique to the thread (which is what is
  // recommended by NVIDIA).
  static thread_local CuDevice this_thread_device_;

  // The GPU device-id that we are using.  This will be initialized to -1, and
  // will be set when the user calls
  //   CuDevice::Instantiate().SelectGpuId(...)
  // from the main thread.  Background threads will, when spawned and when
  // CuDevice::Instantiate() is called from them for the first time, call
  // cudaSetDevice(device_id_).
  static int32 device_id_;
  // This will automatically be set to true if the application has multiple
  // threads that access the GPU device.  It is used to know whether to
  // use locks when accessing the allocator and the profiling-related code.
  static bool multi_threaded_;

  // The variable profile_map_ will only be used if the verbose level is >= 1;
  // it will accumulate some function-level timing information that is printed
  // out at program end.  This makes things a bit slower as we have to call
  // cudaDeviceSynchronize() to make the timing information meaningful.
  static unordered_map<std::string, double, StringHasher> profile_map_;

  // profile_mutex_ guards profile_map_ in case multi_threaded_ is true.
  static std::mutex profile_mutex_;

  // free_memory_at_startup_ is just used in printing the memory used according
  // to the device.
  static int64 free_memory_at_startup_;

  static cudaDeviceProp properties_;

  // If set to true by SetDebugStrideMode(), code will be activated to use
  // pseudo-random stride values when allocating data (to detect errors which
  // otherwise would be rare).
  static bool debug_stride_mode_;

  // The following member variable is initialized to false; if the user calls
  // Instantiate() in a thread where it is still false, Initialize() will be
  // called, in order to (if a GPU is being used) call cudaSetDevice() and
  // set up the cublas and cusparse handles.
  bool initialized_;

  // This variable is just a copy of the static variable device_id_.  It's used
  // to detect when this code is called in the wrong way.
  int32 device_id_copy_;

  cublasHandle_t cublas_handle_;
  cusparseHandle_t cusparse_handle_;
  curandGenerator_t curand_handle_;
  cusolverDnHandle_t cusolverdn_handle_;
};  // class CuDevice


// Class CuTimer is a convenience wrapper for class Timer which only
// sets the time if the verbose level is >= 1.  This helps avoid
// an unnecessary system call if the verbose level is 0 and you
// won't be accumulating the timing stats.
class CuTimer: public Timer {
 public:
  CuTimer(): Timer(GetVerboseLevel() >= 1) { }
};

// This function is declared as a more convenient way to get the cuBLAS handle
// for use in the cuBLAS v2 API, since we need to access it so frequently.
inline cublasHandle_t GetCublasHandle() {
  return CuDevice::Instantiate().GetCublasHandle();
}

inline cusolverDnHandle_t GetCusolverDnHandle() {
  return CuDevice::Instantiate().GetCusolverDnHandle();
}

// A more convenient way to get the handle for use with the cuSPARSE API.
inline cusparseHandle_t GetCusparseHandle() {
  return CuDevice::Instantiate().GetCusparseHandle();
}

inline curandGenerator_t GetCurandHandle() {
  return CuDevice::Instantiate().GetCurandHandle();
}
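
// Illustrative (hypothetical) use of these accessors when calling a cuBLAS
// routine directly; 'n', 'alpha' and 'data' are made up for the example:
//   CUBLAS_SAFE_CALL(cublasSscal(GetCublasHandle(), n, &alpha, data, 1));
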
} // namespace kaldi
#endif // HAVE_CUDA
namespace kaldi {
/**
   The function SynchronizeGpu(), which for convenience is defined whether or
   not we have compiled for CUDA, is intended to be called in places where
   threads need to be synchronized.

   It just launches a no-op kernel into the legacy default stream.  This has
   the effect that it will run after any kernels previously launched from any
   stream(*), and before any kernels launched later from any stream(*).
   (*) This does not apply to non-blocking streams.

   Note: at the time of writing we never call SynchronizeGpu() from
   binary-level code because it hasn't become necessary yet; the only program
   that might have multiple threads actually using the GPU is rnnlm-train (if
   the user were to invoke it with the ,bg option for loading training
   examples); but the only CUDA invocation the RnnlmExample::Read() function
   uses (via CuMatrix::Read()) is cudaMemcpy, which is synchronous already.
*/
void SynchronizeGpu();
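
// Illustrative sketch of where SynchronizeGpu() might be called when one
// thread produces GPU data that another thread will consume (the semaphore
// and the surrounding code are hypothetical):
//   ... fill a CuMatrix in this thread's CUDA stream ...
//   SynchronizeGpu();         // order the work w.r.t. other streams
//   example_ready.Signal();   // e.g. a kaldi Semaphore
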
} // namespace kaldi
#endif // KALDI_CUDAMATRIX_CU_DEVICE_H_