1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
|
#include <assert.h>
#include "caffe2/operators/elementwise_linear_op.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/operator_fallback_gpu.h"
#include "caffe2/utils/cub_namespace.cuh"
#include <cub/block/block_reduce.cuh>
namespace caffe2 {
namespace {
__global__ void ElementwiseLinearKernel(const int N, const int D,
const float* X_data, const float* a_data, const float* b_data,
float* Y_data) {
CUDA_1D_KERNEL_LOOP(i, N * D) {
int d = i % D;
Y_data[i] = X_data[i] * a_data[d] + b_data[d];
}
}
__global__ void ElementwiseLinearGradientKernel(const int N, const int D,
const float* g_o_data, const float* X_data, const float* a_data,
float* g_X_data, float* g_a_data, float* g_b_data) {
int d = blockIdx.x; // One block per D
float g_a_sum = 0;
float g_b_sum = 0;
for (int n = threadIdx.x; n < N; n += blockDim.x) {
const float gox = g_o_data[n * D + d];
g_X_data[n * D + d] = gox * a_data[d];
g_a_sum += gox * X_data[n * D + d];
g_b_sum += gox;
}
typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float g_a_sum_tot = BlockReduce(temp_storage).Sum(g_a_sum);
__syncthreads();
float g_b_sum_tot = BlockReduce(temp_storage).Sum(g_b_sum);
if (threadIdx.x == 0) {
g_a_data[d] = g_a_sum_tot;
g_b_data[d] = g_b_sum_tot;
}
}
} // namespace
template<>
bool ElementwiseLinearOp<float, CUDAContext>::RunOnDevice(){
const auto& X = Input(0);
const auto& a = Input(1);
const auto& b = Input(2);
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
const int D = X.size_from_dim(canonical_axis);
CAFFE_ENFORCE_EQ(a.dim(), 1, a.dim());
CAFFE_ENFORCE_EQ(a.dim(0), D, a.dim());
CAFFE_ENFORCE_EQ(b.dim(), 1, b.dim());
CAFFE_ENFORCE_EQ(b.dim(0), D, b.dim());
auto* Y = Output(0, X.sizes(), at::dtype<float>());
ElementwiseLinearKernel<<<
CAFFE_GET_BLOCKS(N * D),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
N,
D,
X.data<float>(),
a.data<float>(),
b.data<float>(),
Y->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
template<>
bool ElementwiseLinearGradientOp<float, CUDAContext>::RunOnDevice(){
const auto& g_o = Input(0);
const auto& X = Input(1);
const auto& a = Input(2);
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
const int D = X.size_from_dim(canonical_axis);
CAFFE_ENFORCE_EQ(a.dim(), 1, a.dim());
CAFFE_ENFORCE_EQ(a.dim(0), D, a.dim());
auto* g_X = Output(0, X.sizes(), at::dtype<float>());
auto * g_a = Output(1, a.sizes(), at::dtype<float>());
auto * g_b = Output(2, a.sizes(), at::dtype<float>());
float* g_a_data = g_a->template mutable_data<float>();
float* g_b_data = g_b->template mutable_data<float>();
ElementwiseLinearGradientKernel<<<
D,
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
N,
D,
g_o.data<float>(),
X.data<float>(),
a.data<float>(),
g_X->template mutable_data<float>(),
g_a_data,
g_b_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
REGISTER_CUDA_OPERATOR(ElementwiseLinear,
ElementwiseLinearOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(ElementwiseLinearGradient,
ElementwiseLinearGradientOp<float, CUDAContext>);
} // namespace caffe2
|