#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/integral_image_op.h"
namespace caffe2 {
namespace {
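
// The integral image Y of an input X is defined by
//
//   Y(r, c) = sum over all r' < r and c' < c of X(r', c'),
//
// with Y(0, :) = Y(:, 0) = 0, so an (H, W) input yields an (H + 1, W + 1)
// output. The sum is separable, so it is computed in two passes: a running
// sum along each row (RowPassKernel) followed by an in-place running sum
// down each column (ColPassKernel). For a single 2x2 image,
//
//   X = [[1, 2],            [[0, 0, 0],
//        [3, 4]]   =>   Y =  [0, 1, 3],
//                            [0, 4, 10]]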
__global__ void RowPassKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing
    int row = i % rows_out;
    int chan = (i / rows_out) % chans;
    int ind = i / rows_out / chans;
    // Input is (H, W) and output is (H + 1, W + 1)
    int rows_in = rows_out - 1;
    int cols_in = cols_out - 1;
    // Row pointer to input data
    // Input data is shifted (-1, -1) relative to output data, hence row - 1
    const float* row_in_data =
        in + cols_in * ((row - 1) + rows_in * (chan + ind * chans));
    // Row pointer to output data
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // The first row and first column of the output are all zeros
    row_out_data[0] = 0.;
    if (row == 0) {
      for (int c = 1; c < cols_out; ++c) {
        row_out_data[c] = 0.;
      }
    } else {
      for (int c = 1; c < cols_out; ++c) {
        // Recall that input data is shifted (-1, -1) relative to the output,
        // hence c - 1
        row_out_data[c] = row_out_data[c - 1] + row_in_data[c - 1];
      }
    }
  }
}
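
// Backward counterpart of RowPassKernel: a left-to-right running sum over
// the first W columns of each row of dY (the last column is dropped),
// reducing the (H + 1, W + 1) output gradient to an (H + 1, W) buffer.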
__global__ void RowPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing
    int row = i % rows_out;
    int chan = (i / rows_out) % chans;
    int ind = i / rows_out / chans;
    // Input is (H + 1, W + 1) and output is (H + 1, W)
    int rows_in = rows_out;
    int cols_in = cols_out + 1;
    // Row pointer to input data
    const float* row_in_data =
        in + cols_in * (row + rows_in * (chan + ind * chans));
    // Row pointer to output data
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // Running sum along the row
    row_out_data[0] = row_in_data[0];
    for (int c = 1; c < cols_out; ++c) {
      row_out_data[c] = row_out_data[c - 1] + row_in_data[c];
    }
  }
}
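
// Second pass of the forward computation: accumulates the row-wise running
// sums down each column, in place, turning the output of RowPassKernel into
// the final integral image. One thread walks each (batch, channel, column)
// serially from top to bottom.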
__global__ void
ColPassKernel(int count, int rows_out, int cols_out, int chans, float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing
    int col = i % cols_out;
    int chan = (i / cols_out) % chans;
    int ind = i / cols_out / chans;
    // Col pointer to output data, which is updated in place
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    for (int r = 1; r < rows_out; ++r) {
      col_out_data[r * cols_out] += col_out_data[(r - 1) * cols_out];
    }
  }
}
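
// Backward counterpart of ColPassKernel: a top-to-bottom running sum down
// each column of the (H + 1, W) row-pass buffer, dropping its last row to
// produce the final (H, W) gradient w.r.t. the input.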
__global__ void ColPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing
    int col = i % cols_out;
    int chan = (i / cols_out) % chans;
    int ind = i / cols_out / chans;
    // Input is (H + 1, W) and output is (H, W)
    int rows_in = rows_out + 1;
    int cols_in = cols_out;
    // Col pointer to input data
    const float* col_in_data =
        in + col + cols_in * rows_in * (chan + ind * chans);
    // Col pointer to output data
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Running sum down the column
    col_out_data[0] = col_in_data[0];
    for (int r = 1; r < rows_out; ++r) {
      col_out_data[r * cols_out] =
          col_out_data[(r - 1) * cols_out] + col_in_data[r * cols_in];
    }
  }
}
} // namespace
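
// The kernel launches below size the grid so that one thread owns one full
// row (row passes) or one full column (col passes) per (batch, channel)
// slice: N * C * (H + 1) threads for the forward row pass and
// N * C * (W + 1) for the forward col pass, with CAFFE_GET_BLOCKS /
// CAFFE_CUDA_NUM_THREADS choosing the launch configuration.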
template <>
bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  CAFFE_ENFORCE(X.dim() == 4, "Only supports 4D tensors for the moment");
  // Input is (N, C, H, W)
  // Output is (N, C, H + 1, W + 1)
  vector<int64_t> out_shape(X.sizes().vec());
  out_shape[2] += 1; // H + 1 output size
  out_shape[3] += 1; // W + 1 output size
  auto* Y = Output(0, out_shape, at::dtype<float>());
  const int chans = X.dim32(1);
  const int rows_out = Y->dim32(2);
  const int cols_out = Y->dim32(3);
  // Integral image over rows of input X
  const int row_pass_size = X.dim32(0) * chans * rows_out;
  RowPassKernel<<<
      CAFFE_GET_BLOCKS(row_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      row_pass_size,
      rows_out,
      cols_out,
      chans,
      X.data<float>(),
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Integral image over columns of the integral image over rows
  const int col_pass_size = X.dim32(0) * chans * cols_out;
  ColPassKernel<<<
      CAFFE_GET_BLOCKS(col_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      col_pass_size,
      rows_out,
      cols_out,
      chans,
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
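
// A minimal usage sketch of the forward op through the Caffe2 C++ API; the
// blob names and shapes are illustrative and error handling is omitted:
//
//   Workspace ws;
//   auto* x = BlobGetMutableTensor(ws.CreateBlob("X"), CUDA);
//   x->Resize(1, 3, 4, 5); // (N, C, H, W)
//   x->mutable_data<float>(); // ...fill with image data...
//
//   OperatorDef def;
//   def.set_type("IntegralImage");
//   def.add_input("X");
//   def.add_output("Y");
//   def.mutable_device_option()->set_device_type(PROTO_CUDA);
//
//   auto op = CreateOperator(def, &ws);
//   op->Run(); // "Y" now holds a (1, 3, 5, 6) integral image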
template <>
bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Original input to "forward" op
  auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
                       // (aka "gradOutput")
  auto* dX = Output(
      0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
                                         // "forward" op (aka "gradInput")
  // Row pass reduces shape of dY from (N, C, H + 1, W + 1)
  // to (N, C, H + 1, W)
  // Col pass reduces shape to (N, C, H, W)
  vector<int64_t> row_pass_shape(dY.sizes().vec());
  row_pass_shape[3] -= 1;
  ReinitializeTensor(
      &row_pass_buffer_, row_pass_shape, at::dtype<float>().device(CUDA));
  const int chans = row_pass_buffer_.dim32(1);
  const int rows_out = row_pass_buffer_.dim32(2);
  const int cols_out = row_pass_buffer_.dim32(3);
  // Row pass: running sums along the rows of dY
  const int row_pass_size = X.dim32(0) * chans * rows_out;
  RowPassGradientKernel<<<
      CAFFE_GET_BLOCKS(row_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      row_pass_size,
      rows_out,
      cols_out,
      chans,
      dY.data<float>(),
      row_pass_buffer_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  // Col pass: running sums down the columns of the row-pass buffer
  const int col_pass_size = X.dim32(0) * chans * cols_out;
  ColPassGradientKernel<<<
      CAFFE_GET_BLOCKS(col_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      col_pass_size,
      rows_out - 1, // dX has H = (H + 1) - 1 rows
      cols_out,
      chans,
      row_pass_buffer_.data<float>(),
      dX->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}

REGISTER_CUDA_OPERATOR(IntegralImage, IntegralImageOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    IntegralImageGradient,
    IntegralImageGradientOp<float, CUDAContext>);
} // namespace caffe2