#include "caffe2/utils/math/broadcast.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math/utils.h"
namespace caffe2 {
namespace math {
namespace {
// Per-channel affine transform for NCHW tensors: Y = X * scale[c] + bias[c].
// Launch layout: one-dimensional grid of N * C * M blocks, where M is the
// number of CAFFE_CUDA_NUM_THREADS-sized tiles covering one HxW image plane.
template <typename T>
__global__ void AffineChannelNCHWCUDAKernel(
    const int C,
    const int M,
    const int HxW,
    const T* X,
    const T* scale,
    const T* bias,
    T* Y);

// float specialization.
template <>
__global__ void AffineChannelNCHWCUDAKernel<float>(
    const int C,
    const int M,
    const int HxW,
    const float* X,
    const float* scale,
    const float* bias,
    float* Y) {
  // M consecutive blocks cover one (n, c) plane; recover plane and channel.
  const int plane = blockIdx.x / M;
  const int channel = plane % C;
  const int hw = (blockIdx.x % M) * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (hw < HxW) { // guard the partial tile at the end of each plane
    const int offset = plane * HxW + hw;
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
    // Route the read-only operands through the read-only data cache (SM35+).
    Y[offset] =
        fmaf(__ldg(X + offset), __ldg(scale + channel), __ldg(bias + channel));
#else
    Y[offset] = fmaf(X[offset], scale[channel], bias[channel]);
#endif
  }
}
// Per-channel affine transform for NHWC tensors: Y = X * scale[c] + bias[c].
// Launch layout: grid.x indexes the (n, hw) pixel, grid.y tiles the channel
// dimension in CAFFE_CUDA_NUM_THREADS-sized chunks.
template <typename T>
__global__ void AffineChannelNHWCCUDAKernel(
    const int C,
    const T* X,
    const T* scale,
    const T* bias,
    T* Y);

// float specialization.
template <>
__global__ void AffineChannelNHWCCUDAKernel<float>(
    const int C,
    const float* X,
    const float* scale,
    const float* bias,
    float* Y) {
  const int channel = blockIdx.y * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (channel < C) { // guard the partial channel tile
    const int offset = blockIdx.x * C + channel;
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
    // Route the read-only operands through the read-only data cache (SM35+).
    Y[offset] =
        fmaf(__ldg(X + offset), __ldg(scale + channel), __ldg(bias + channel));
#else
    Y[offset] = fmaf(X[offset], scale[channel], bias[channel]);
#endif
  }
}
} // namespace
// Instantiates the host-side AffineChannel<T, CUDAContext, Order> entry
// points for both storage orders. Each launch is followed by
// C10_CUDA_KERNEL_LAUNCH_CHECK() to surface launch-configuration errors.
// (Comments cannot live inside the macro body: a `//` would swallow the
// line-continuation backslash.)
#define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T)                            \
  template <>                                                                \
  CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NCHW>( \
      const int N,                                                           \
      const int C,                                                           \
      const int HxW,                                                         \
      const T* X,                                                            \
      const T* scale,                                                        \
      const T* bias,                                                         \
      T* Y,                                                                  \
      CUDAContext* context) {                                                \
    const int tiles_per_plane = DivUp(HxW, CAFFE_CUDA_NUM_THREADS);          \
    AffineChannelNCHWCUDAKernel<T>                                           \
        <<<N * C * tiles_per_plane,                                          \
           CAFFE_CUDA_NUM_THREADS,                                           \
           0,                                                                \
           context->cuda_stream()>>>(                                        \
            C, tiles_per_plane, HxW, X, scale, bias, Y);                     \
    C10_CUDA_KERNEL_LAUNCH_CHECK();                                          \
  }                                                                          \
  template <>                                                                \
  CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NHWC>( \
      const int N,                                                           \
      const int C,                                                           \
      const int HxW,                                                         \
      const T* X,                                                            \
      const T* scale,                                                        \
      const T* bias,                                                         \
      T* Y,                                                                  \
      CUDAContext* context) {                                                \
    const int channel_tiles = DivUp(C, CAFFE_CUDA_NUM_THREADS);              \
    AffineChannelNHWCCUDAKernel<T>                                           \
        <<<dim3(N * HxW, channel_tiles),                                     \
           CAFFE_CUDA_NUM_THREADS,                                           \
           0,                                                                \
           context->cuda_stream()>>>(C, X, scale, bias, Y);                  \
    C10_CUDA_KERNEL_LAUNCH_CHECK();                                          \
  }
CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(float)
#undef CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL
} // namespace math
} // namespace caffe2