#ifndef CAFFE2_CUDA_RTC_COMMON_RTC_H_
#define CAFFE2_CUDA_RTC_COMMON_RTC_H_
#include <cstdlib>  // rand()
#include <sstream>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

#include <c10/util/irange.h>

#include "caffe2/core/common_gpu.h"  // CUDA_DRIVERAPI_ENFORCE
#include "caffe2/core/logging.h"     // LOG, VLOG, CAFFE_ENFORCE

#define NVRTC_CHECK(condition)                                        \
  do {                                                                \
    nvrtcResult result = condition;                                   \
    if (result != NVRTC_SUCCESS) {                                    \
      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__       \
                 << ": " << nvrtcGetErrorString(result);              \
    }                                                                 \
  } while (0)

namespace caffe2 {
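// CudaRTCFunction wraps NVRTC runtime compilation of a single CUDA kernel.
// A Derived class (CRTP) supplies GetSource(args...), returning the CUDA C
// source, and KernelName(args...), returning the kernel's name. Compile()
// builds the PTX and loads it as a driver-API module; Launch()/LaunchEx()
// then invoke the resulting CUfunction.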
template <typename Derived>
class CudaRTCFunction {
 public:
  CudaRTCFunction() : module_loaded_(false) {}
  ~CudaRTCFunction() {
    if (module_loaded_) {
      CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
    }
  }
  // TODO: this function is nontrivial and since CudaRTCFunction uses CRTP, it
  // may potentially increase the binary size. In that case, move common parts
  // into a separate function.
  template <typename... Args>
  void Compile(Args... args) {
    std::string src = static_cast<Derived*>(this)->GetSource(args...);
    std::string name = static_cast<Derived*>(this)->KernelName(args...);
    VLOG(1) << "function name: " << name;
    VLOG(1) << "function src:\n" << src;
    // Actually do the compiling.
    nvrtcProgram prog;
    NVRTC_CHECK(
        nvrtcCreateProgram(&prog, src.c_str(), nullptr, 0, nullptr, nullptr));
    // Compile the program.
    // TODO(Yangqing): how to find the current gpu architecture instead of
    // hard-coding it?
    const char* nvrtc_opts[] = {
        "--gpu-architecture=compute_35", "--use_fast_math"};
    nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts);
    if (compile_result != NVRTC_SUCCESS) {
      size_t log_size;
      NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
      std::vector<char> nvrtc_log(log_size);
      NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log.data()));
      LOG(FATAL) << "Compilation failure for nvrtc("
                 << nvrtcGetErrorString(compile_result) << "): \n"
                 << nvrtc_log.data();
    }
    size_t ptx_size;
    NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
    std::vector<char> nvrtc_ptx(ptx_size);
    NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx.data()));
    NVRTC_CHECK(nvrtcDestroyProgram(&prog));
    // After compilation, load the module.
    if (module_loaded_) {
      CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
    }
    CUDA_DRIVERAPI_ENFORCE(
        cuModuleLoadDataEx(&module_, nvrtc_ptx.data(), 0, 0, 0));
    module_loaded_ = true;
    CUDA_DRIVERAPI_ENFORCE(
        cuModuleGetFunction(&kernel_, module_, name.c_str()));
  }
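  // Launches the compiled kernel. Arguments are taken by value and their
  // addresses are passed to cuLaunchKernel as the kernelParams array, so the
  // argument types must match the kernel signature exactly.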
  template <typename... Args>
  void Launch(
      unsigned int gx,
      unsigned int gy,
      unsigned int gz,
      unsigned int bx,
      unsigned int by,
      unsigned int bz,
      unsigned int shared_mem,
      cudaStream_t stream,
      Args... args) {
    CAFFE_ENFORCE(
        module_loaded_, "Cannot call Launch before a module is loaded.");
    void* args_voidp[] = {&args...};
    CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
        kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, args_voidp, 0));
  }
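  // Variant of Launch that forwards a caller-built `extra` array (e.g.
  // CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_END) to cuLaunchKernel
  // instead of per-argument pointers.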
  void LaunchEx(
      unsigned int gx,
      unsigned int gy,
      unsigned int gz,
      unsigned int bx,
      unsigned int by,
      unsigned int bz,
      unsigned int shared_mem,
      cudaStream_t stream,
      void** extra) {
    CAFFE_ENFORCE(
        module_loaded_, "Cannot call LaunchEx before a module is loaded.");
    CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
        kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, nullptr, extra));
  }
 private:
  bool module_loaded_;
  CUmodule module_;
  CUfunction kernel_;
};
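// Example usage (a sketch; `ScaleRTCFunction` and its kernel source are
// hypothetical and not part of this header):
//
//   class ScaleRTCFunction : public CudaRTCFunction<ScaleRTCFunction> {
//    public:
//     // extern "C" below keeps the kernel name unmangled so that
//     // cuModuleGetFunction can find it by this exact name.
//     std::string KernelName(float /*alpha*/) { return "scale_kernel"; }
//     std::string GetSource(float alpha) {
//       std::stringstream ss;
//       ss << "extern \"C\" __global__ void scale_kernel(float* x, int n) {\n"
//          << "  int i = blockIdx.x * blockDim.x + threadIdx.x;\n"
//          << "  if (i < n) { x[i] *= " << alpha << "; }\n"
//          << "}\n";
//       return ss.str();
//     }
//   };
//
//   ScaleRTCFunction scale;
//   scale.Compile(2.0f);
//   // x is a device float pointer, n the element count, stream a cudaStream_t.
//   scale.Launch((n + 127) / 128, 1, 1, 128, 1, 1, 0, stream, x, n);
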
// TODO: this is in no way unique and is just a hack right now.
inline std::string GetUniqueName() {
  static constexpr int len = 20;
  static const char alpha[] =
      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  std::stringstream ss;
  ss << "_cuda_kernel_";
  for (const auto i : c10::irange(len)) {
    ss << alpha[rand() % (sizeof(alpha) - 1)];
  }
  return ss.str();
}
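// A sturdier alternative (a sketch, not used above) would derive the suffix
// from a monotonically increasing counter instead of rand(), e.g.
//
//   #include <atomic>
//   inline std::string GetUniqueNameCounter() {  // hypothetical helper
//     static std::atomic<int> counter{0};
//     return "_cuda_kernel_" + std::to_string(counter++);
//   }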
} // namespace caffe2
#endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_