File: elementwise_rtc_gpu.cc

Package: pytorch 1.13.1+dfsg-4
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/cuda_rtc/common_rtc.h"

namespace caffe2 {
namespace {
class ElementwiseRTCFunction : public CudaRTCFunction<ElementwiseRTCFunction> {
 public:
  ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... /*args*/) {
    return name_;
  }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};

template <>
string ElementwiseRTCFunction::GetSource(
    int input_size,
    int output_size,
    const string command_string) {
  std::stringstream ss;
  ss << "extern \"C\" __global__ void " << name_
     << "(const size_t nthreads, \n";
  // Insert the parameter list; every parameter except the last one is
  // followed by a comma.
  int remain_params = input_size + output_size;
  for (int i = 0; i < input_size; ++i) {
    ss << "const float* in" << i << ((--remain_params) ? ", \n" : "");
  }
  for (int i = 0; i < output_size; ++i) {
    ss << "float* out" << i << ((--remain_params) ? ", \n" : "");
  }
  ss << ") {\n"
        "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
        "index < nthreads; index += blockDim.x * gridDim.x) {\n"
     << command_string << "\n"
     << "}\n}";
  return ss.str();
}
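
// Illustrative only: for input_size = 2, output_size = 1 and
// command_string = "out0[index] = in0[index] * in1[index];", GetSource
// returns a kernel string along these lines (modulo whitespace; the actual
// kernel name comes from GetUniqueName(), written as <unique_name> here):
//
//   extern "C" __global__ void <unique_name>(const size_t nthreads,
//       const float* in0,
//       const float* in1,
//       float* out0) {
//     for (int index = blockIdx.x * blockDim.x + threadIdx.x;
//          index < nthreads; index += blockDim.x * gridDim.x) {
//       out0[index] = in0[index] * in1[index];
//     }
//   }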
} // namespace

/**
 * A GPU operator that can generate limited elementwise operations.
 *
 * ElementwiseRTCOp allows one to do a simple and limited thing: it takes in
 * multiple inputs and multiple outputs, as well as a raw string argument
 * rtc_src. The runtime then generates the following kernel code:
 *
 *   __global__ void kernel_name(const size_t nthreads, ...) {
 *     for(int index = blockIdx.x * blockDim.x + threadIdx.x;
 *         index < nthreads; index += blockDim.x * gridDim.x) {
 *       rtc_src
 *     }
 *   }
 * where the "..." part is auto-generated, so one can refer to the inputs and
 * outputs as in0, in1, ..., out0, out1, ... in the rtc_src string.
 *
 * For example, if one wants to do a vector multiplication, one can take two
 * inputs and one output, and write rtc_src as
 *     out0[index] = in0[index] * in1[index];
 *
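 * As a rough sketch (the blob names "X", "Y" and "Z" are purely
 * illustrative), the corresponding operator entry inside a NetDef, written in
 * protobuf text format, could look like:
 *
 *     op {
 *       type: "ElementwiseRTC"
 *       engine: "NVRTC"
 *       input: "X"
 *       input: "Y"
 *       output: "Z"
 *       arg {
 *         name: "rtc_src"
 *         s: "out0[index] = in0[index] * in1[index];"
 *       }
 *       device_option { device_type: 1 }  # 1 == PROTO_CUDA
 *     }
 *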
 * This op is currently highly experimental. We do not have a gradient
 * registered for it either.
 */
class ElementwiseRTCOp final : public Operator<CUDAContext> {
 public:
  ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws) {
    const string src = OperatorBase::GetSingleArgument<string>("rtc_src", "");
    CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
    func_.Compile(InputSize(), OutputSize(), src);
  }
  ~ElementwiseRTCOp() override {}

  bool RunOnDevice() override {
    static_assert(
        sizeof(void*) == sizeof(size_t),
        "The argbuffer relies on the assumption that void* and "
        "size_t have the same size.");
    vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
    size_t* argBuffer = argBuffer_vec.data();
    CAFFE_ENFORCE(
        Input(0).numel() < std::numeric_limits<int>::max(),
        "The kernel function currently only supports int index.");
    argBuffer[0] = Input(0).numel();
    void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
    for (int i = 0; i < InputSize(); ++i) {
      ptr_buffer[i] = const_cast<float*>(Input(i).data<float>());
    }
    for (int i = 0; i < OutputSize(); ++i) {
      Output(i)->ResizeLike(Input(0));
      ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>();
    }
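    // Rather than passing each kernel argument individually, the packed
    // buffer is described with the CU_LAUNCH_PARAM_BUFFER_POINTER /
    // _BUFFER_SIZE / _END markers, which the CUDA driver accepts through
    // cuLaunchKernel's "extra" argument instead of the usual kernelParams
    // array; LaunchEx passes this config through to the kernel launch.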
    size_t argBufferSize = argBuffer_vec.size() * sizeof(size_t);
    void* config[] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER,
        argBuffer,
        CU_LAUNCH_PARAM_BUFFER_SIZE,
        &argBufferSize,
        CU_LAUNCH_PARAM_END};
    func_.LaunchEx(
        CAFFE_GET_BLOCKS(Input(0).numel()),
        1,
        1,
        CAFFE_CUDA_NUM_THREADS,
        1,
        1,
        0,
        context_.cuda_stream(),
        config);
    return true;
  }

 private:
  ElementwiseRTCFunction func_;
};

namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
}

} // namespace caffe2