File: common_rtc.h

#ifndef CAFFE2_CUDA_RTC_COMMON_RTC_H_
#define CAFFE2_CUDA_RTC_COMMON_RTC_H_

#include <sstream>
#include <string>
#include <vector>

#include <cuda.h>
#include <nvrtc.h>
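
// NOTE: this header also depends on caffe2/c10 facilities that are expected
// to be provided by the including translation unit: the LOG/VLOG/CAFFE_ENFORCE
// logging macros, CUDA_DRIVERAPI_ENFORCE, c10::irange, and the unqualified
// `string`/`vector` aliases for the std:: types.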

#define NVRTC_CHECK(condition)                                          \
  do {                                                                  \
    nvrtcResult result = condition;                                     \
    if (result != NVRTC_SUCCESS) {                                      \
      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
                 << nvrtcGetErrorString(result);                        \
    }                                                                   \
  } while (0)
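
// Example (hypothetical): NVRTC_CHECK wraps any NVRTC call that returns an
// nvrtcResult, e.g.
//   int major, minor;
//   NVRTC_CHECK(nvrtcVersion(&major, &minor));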

namespace caffe2 {

template <typename Derived>
class CudaRTCFunction {
 public:
  CudaRTCFunction() : module_loaded_(false) {}
  ~CudaRTCFunction() {
    if (module_loaded_) {
      CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
    }
  }

  // TODO: this function is nontrivial and since CudaRTCFunction uses CRTP, it
  // may potentially increase the binary size. In that case, move common parts
  // into a separate function.
  template <typename... Args>
  void Compile(Args... args) {
    string src = static_cast<Derived*>(this)->GetSource(args...);
    string name = static_cast<Derived*>(this)->KernelName(args...);
    VLOG(1) << "function name: " << name;
    VLOG(1) << "function src:\n" << src;
    // Actually do the compiling.
    nvrtcProgram prog;
    NVRTC_CHECK(
        nvrtcCreateProgram(&prog, src.c_str(), nullptr, 0, nullptr, nullptr));
    // Compile the program.
    // TODO(Yangqing): how to find the current gpu architecture instead of hard
    // coding it?
    const char* nvrtc_opts[] = {
        "--gpu-architecture=compute_35", "--use_fast_math"};
    nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts);
    if (compile_result != NVRTC_SUCCESS) {
      size_t log_size;
      NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
      vector<char> nvrtc_log(log_size);
      NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log.data()));
      LOG(FATAL) << "Compilation failure for nvrtc("
                 << nvrtcGetErrorString(compile_result) << "): \n"
                 << nvrtc_log.data();
    }
    size_t ptx_size;
    NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
    vector<char> nvrtc_ptx(ptx_size);
    NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx.data()));
    NVRTC_CHECK(nvrtcDestroyProgram(&prog));
    // After compilation, load the module.
    if (module_loaded_) {
      CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
    }
    CUDA_DRIVERAPI_ENFORCE(
        cuModuleLoadDataEx(&module_, nvrtc_ptx.data(), 0, nullptr, nullptr));
    module_loaded_ = true;
    CUDA_DRIVERAPI_ENFORCE(
        cuModuleGetFunction(&kernel_, module_, name.c_str()));
  }

  template <typename... Args>
  void Launch(
      unsigned int gx,
      unsigned int gy,
      unsigned int gz,
      unsigned int bx,
      unsigned int by,
      unsigned int bz,
      unsigned int shared_mem,
      cudaStream_t stream,
      Args... args) {
    CAFFE_ENFORCE(
        module_loaded_, "Cannot call Launch before a module is loaded.");
    void* args_voidp[] = {&args...};
    CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
        kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, args_voidp, 0));
  }

  void LaunchEx(
      unsigned int gx,
      unsigned int gy,
      unsigned int gz,
      unsigned int bx,
      unsigned int by,
      unsigned int bz,
      unsigned int shared_mem,
      cudaStream_t stream,
      void** extra) {
    CAFFE_ENFORCE(
        module_loaded_, "Cannot call LaunchEx before a module is loaded.");
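    // Pass arguments through cuLaunchKernel's `extra` buffer instead of a
    // kernelParams array.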
    CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
        kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, nullptr, extra));
  }

 private:
  bool module_loaded_;
  CUmodule module_;
  CUfunction kernel_;
};
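
// A minimal usage sketch (hypothetical; SaxpyRTC and its kernel are not part
// of this header). A Derived class supplies GetSource() and KernelName(), and
// their parameter lists must match the arguments later passed to Compile():
//
//   class SaxpyRTC : public CudaRTCFunction<SaxpyRTC> {
//    public:
//     string GetSource() {
//       // extern "C" keeps the symbol unmangled so KernelName() matches it.
//       return R"(
//         extern "C" __global__
//         void saxpy(int n, float a, const float* x, float* y) {
//           int i = blockIdx.x * blockDim.x + threadIdx.x;
//           if (i < n) y[i] = a * x[i] + y[i];
//         })";
//     }
//     string KernelName() { return "saxpy"; }
//   };
//
//   SaxpyRTC saxpy;
//   saxpy.Compile();
//   // grid/block dims, shared memory bytes, stream, then the kernel arguments.
//   saxpy.Launch(grid_x, 1, 1, block_x, 1, 1, 0, stream, n, a, dev_x, dev_y);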

// TODO: this is in no way unique and is just a hack right now.
inline std::string GetUniqueName() {
  static constexpr int len = 20;
  static const char alpha[] =
      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

  std::stringstream ss;
  ss << "_cuda_kernel_";
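  // sizeof(alpha) - 1 excludes the string literal's trailing NUL.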
  for (const auto i : c10::irange(len)) {
    ss << alpha[rand() % (sizeof(alpha) - 1)];
  }
  return ss.str();
}

} // namespace caffe2

#endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_