cuda-gpu-available.cc
3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// nnetbin/cuda-gpu-available.cc
// Copyright 2015 Brno University of Technology (author: Karel Vesely)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef _MSC_VER
#include <unistd.h>
#include <errno.h>
#endif
#include "base/kaldi-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-matrix.h"
using namespace kaldi;
#if HAVE_CUDA == 1
/**
* With incorrect CUDA setup, this will trigger "invalid device function" error.
*/
void TestGpuComputation() {
CuMatrix<BaseFloat> m(100,100);
m.SetRandn();
m.SoftMaxPerRow(m);
}
#endif
/**
 * Entry point of 'cuda-gpu-available'.
 *
 * Logs the host name, then (when compiled with HAVE_CUDA == 1) selects a GPU
 * through CuDevice and runs TestGpuComputation() as a sanity check of the
 * whole setup.
 *
 * Return values:
 *    0  GPU selected and the test computation finished without an exception,
 *    1  binary was compiled without CUDA support,
 *   -1  an exception was caught (either while selecting the GPU or while
 *       running the test computation).
 *
 * The command-line arguments are not inspected.
 */
int main(int argc, char *argv[]) try {
  /* only for Doxygen documentation, never shown in command line */
  const char *usage =
      "Test if there is a GPU available, and if the GPU setup is correct.\n"
      "A GPU is acquired and a small computation is done\n"
      "(generating a random matrix and computing softmax for its rows).\n"
      "\n"
      "exit-code: 0 = success, 1 = compiled without GPU support, -1 = error\n"
      "\n"
      "Usage: cuda-gpu-available\n";

  // Fallback name, kept when gethostname() is unavailable or fails.
  char hostname[100] = "UNKNOWN-HOSTNAME";
#if !defined(_MSC_VER) && !defined(__CYGWIN__)
  if (gethostname(hostname, sizeof(hostname)) != 0) {
    KALDI_WARN << "Cannot get hostname, " << strerror(errno);
  }
  // gethostname() is not guaranteed to NUL-terminate the buffer when the
  // name had to be truncated, so terminate it explicitly before the buffer
  // is streamed as a C string below.
  hostname[sizeof(hostname) - 1] = '\0';
#endif
  KALDI_LOG << "\n\n### IS CUDA GPU AVAILABLE? '" << hostname << "' ###";

#if HAVE_CUDA == 1
  // An exception escaping from here is handled by the function-level catch
  // at the bottom, which reports that no GPU was obtained.
  // NOTE(review): "yes" presumably makes the GPU mandatory -- confirm in
  // CuDevice::SelectGpuId.
  CuDevice::Instantiate().SelectGpuId("yes");
  fprintf(stderr, "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ##\n\n");
  fprintf(stderr, "### Testing CUDA setup with a small computation "
          "(setup = cuda-toolkit + gpu-driver + kaldi):\n");
  // the test of setup by computation,
  // failures here get their own, setup-related, diagnostic message.
  try {
    TestGpuComputation();
  } catch (const std::exception &e) {
    fprintf(stderr, "%s\n", e.what());
    KALDI_LOG << "...\n"
      << "### The CUDA setup is wrong! "
      << "(\"invalid device function\" == problem with 'compute capability' "
      << "in compiled kaldi)\n"
      << "### Before posting the error to forum, please try following:\n"
      << "### 1) update kaldi & cuda-toolkit (& GPU driver),\n"
      << "### 2) re-run 'src/configure',\n"
      << "### 3) re-compile kaldi by 'make clean; make -j depend; make -j'\n"
      << "###\n"
      << "### If the problem persists, please send us your:\n"
      << "### - GPU model name, cuda-toolkit version, driver version "
      << "(run nvidia-smi), variable $(CUDA_ARCH) from src/kaldi.mk";
    return -1;
  }
  fprintf(stderr, "### Test OK!\n");
  return 0;
#else
  std::cerr
    << "### CUDA WAS NOT COMPILED IN! ###\n"
    << "To support CUDA, you must run 'configure' on a machine "
    << "that has the CUDA compiler 'nvcc' available.\n";
  return 1;
#endif
} catch (const std::exception &e) {
  // Reached for any std::exception thrown outside the inner try-block above.
  fprintf(stderr, "%s\n", e.what());
  KALDI_LOG << "...\n"
    << "### WE DID NOT GET A CUDA GPU!!! ###\n"
    << "### If your system has a 'free' CUDA GPU, try re-installing "
    << "latest 'CUDA toolkit' from NVidia (this updates GPU drivers too).\n"
    << "### Otherwise 'nvidia-smi' shows the status of GPUs:\n"
    << "### - The versions should match ('NVIDIA-SMI' and 'Driver Version'), "
    << "otherwise reboot or reload kernel module,\n"
    << "### - The GPU should be unused "
    << "(no 'process' in list, low 'memory-usage' (<100MB), low 'gpu-fan' (<30%)),\n"
    << "### - You should see your GPU (burnt GPUs may disappear from the list until reboot),";
  return -1;
}