cuda-gpu-available.cc 3.96 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107


// nnetbin/cuda-gpu-available.cc

// Copyright 2015 Brno University of Technology (author: Karel Vesely)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef _MSC_VER
  #include <unistd.h>
  #include <errno.h>
#endif

#include "base/kaldi-common.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-matrix.h"

using namespace kaldi;

#if HAVE_CUDA == 1
/**
 * With incorrect CUDA setup, this will trigger "invalid device function" error.
 */
void TestGpuComputation() {
  CuMatrix<BaseFloat> m(100,100);
  m.SetRandn();
  m.SoftMaxPerRow(m);
}
#endif

/**
 * Tests whether a CUDA GPU can be acquired and whether the CUDA setup works.
 *
 * The command-line arguments are not inspected. The program logs the host
 * name, then (when compiled with HAVE_CUDA == 1) selects a GPU and runs
 * TestGpuComputation() on it.
 *
 * Returns 0 on success, 1 when compiled without GPU support, and -1 on error
 * (an exception escaping GPU selection, or the test computation failing).
 */
int main(int argc, char *argv[]) try {

  /* only for Doxygen documentation, never shown in command line */
  const char *usage =
        "Test if there is a GPU available, and if the GPU setup is correct.\n"
        "A GPU is acquired and a small computation is done\n"
        "(generating a random matrix and computing softmax for its rows).\n"
        "\n"
        "exit-code: 0 = success, 1 = compiled without GPU support, -1 = error\n"
        "\n"
        "Usage:  cuda-gpu-available\n";
  (void)usage;  // intentionally unused at run time; avoids a compiler warning.

  // Fallback value, used as-is on platforms where gethostname() is skipped.
  char hostname[100] = "UNKNOWN-HOSTNAME";
#if !defined(_MSC_VER) && !defined(__CYGWIN__)
  if (gethostname(hostname, sizeof(hostname))) {
    KALDI_WARN << "Cannot get hostname, " << strerror(errno);
  }
  // POSIX leaves it unspecified whether a truncated host name is
  // NUL-terminated, so force a terminator before the buffer is printed.
  hostname[sizeof(hostname) - 1] = '\0';
#endif
  KALDI_LOG << "\n\n### IS CUDA GPU AVAILABLE? '" << hostname << "' ###";
#if HAVE_CUDA == 1
  // NOTE(review): a failure to obtain a GPU is presumably reported by an
  // exception, which is handled by the function-level catch block below.
  CuDevice::Instantiate().SelectGpuId("yes");
  fprintf(stderr, "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ##\n\n");
  fprintf(stderr, "### Testing CUDA setup with a small computation "
                  "(setup = cuda-toolkit + gpu-driver + kaldi):\n");
  // the test of setup by computation; errors here get their own diagnostics,
  // distinct from the "no GPU" message of the outer handler.
  try {
    TestGpuComputation();
  } catch (const std::exception &e) {
    fprintf(stderr, "%s\n", e.what());
    KALDI_LOG << "...\n"
      << "### The CUDA setup is wrong! "
      << "(\"invalid device function\" == problem with 'compute capability' "
      << "in compiled kaldi)\n"
      << "### Before posting the error to forum, please try following:\n"
      << "### 1) update kaldi & cuda-toolkit (& GPU driver),\n"
      << "### 2) re-run 'src/configure',\n"
      << "### 3) re-compile kaldi by 'make clean; make -j depend; make -j'\n"
      << "###\n"
      << "### If the problem persists, please send us your:\n"
      << "### - GPU model name, cuda-toolkit version, driver version "
      << "(run nvidia-smi), variable $(CUDA_ARCH) from src/kaldi.mk";
    return -1;
  }
  fprintf(stderr, "### Test OK!\n");
  return 0;
#else
  // Built without CUDA: there is nothing to test, report it with exit code 1.
  std::cerr
    << "### CUDA WAS NOT COMPILED IN! ###\n"
    << "To support CUDA, you must run 'configure' on a machine "
    << "that has the CUDA compiler 'nvcc' available.\n";
  return 1;
#endif
} catch (const std::exception &e) {
  // Reached by any std::exception that escapes the body of main(), i.e.
  // everything outside the inner try block around TestGpuComputation().
  fprintf(stderr, "%s\n", e.what());
  KALDI_LOG << "...\n"
    << "### WE DID NOT GET A CUDA GPU!!! ###\n"
    << "### If your system has a 'free' CUDA GPU, try re-installing "
    << "latest 'CUDA toolkit' from NVidia (this updates GPU drivers too).\n"
    << "### Otherwise 'nvidia-smi' shows the status of GPUs:\n"
    << "### - The versions should match ('NVIDIA-SMI' and 'Driver Version'), "
    << "otherwise reboot or reload kernel module,\n"
    << "### - The GPU should be unused "
    << "(no 'process' in list, low 'memory-usage' (<100MB), low 'gpu-fan' (<30%)),\n"
    << "### - You should see your GPU (burnt GPUs may disappear from the list until reboot),";
  return -1;
}