File: upsample_op.cu

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (240 lines) | stat: -rw-r--r-- 7,338 bytes
// copied and pasted from pytorch to test if this passes the build.

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/upsample_op.h"
#include "caffe2/utils/GpuAtomics.cuh"
#include "caffe2/utils/math.h"

namespace caffe2 {

namespace {

inline __device__ int idx(
    const int n,
    const int num_channels,
    const int c,
    const int height,
    const int width,
    const int y,
    const int x) {
  return ((n * num_channels + c) * height + y) * width + x;
}

// input is X, output is Y
__global__ void UpsampleBilinearKernel(
    const int num_batch,
    const int num_channels,
    const int input_height,
    const int input_width,
    const int output_height,
    const int output_width,
    const float* __restrict__ X,
    float* __restrict__ Y) {


    const int size = output_height * output_width;
    CUDA_1D_KERNEL_LOOP(index, size) {
    int indexTemp = index;
    const int out_x = indexTemp % output_width;
    indexTemp /= output_width;
    const int out_y = indexTemp % output_height;
    indexTemp /= output_height;
    indexTemp /= num_channels;

    const float rheight =
         output_height > 1 ? (input_height - 1.f) / (output_height - 1.f) : 0.f;
    const float rwidth =
         output_width > 1 ? (input_width - 1.f) / (output_width - 1.f) : 0.f;

    // Compute Y axis lambdas
    const float h1r = rheight * out_y;
    const int h1 = (int)h1r;
    const int h1p = (h1 < input_height - 1) ? 1 : 0;
    const float h1lambda = h1r - h1;
    const float h0lambda = 1.f - h1lambda;

    // Compute X axis lambdas
    const float w1r = rwidth * out_x;
    const int w1 = (int)w1r;
    const int w1p = (w1 < input_width - 1) ? 1 : 0;
    const float w1lambda = w1r - w1;
    const float w0lambda = 1.f - w1lambda;

    for (int n = 0; n < num_batch; n++){
      for (int c = 0; c < num_channels; c++) {

        float X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)];
        float X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)];
        float X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)];
        float X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)];

        Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3);
      }
    }
  }
}
// input is dY, output is dX
__global__ void UpsampleBilinearGradientKernel(
    const int input_size,
    const int num_channels,
    const int input_height,
    const int input_width,
    const int output_height,
    const int output_width,
    const float* dY,
    float* dX) {
  CUDA_1D_KERNEL_LOOP(index, input_size) {
    int indexTemp = index;
    const int in_x = indexTemp % input_width;
    indexTemp /= input_width;
    const int in_y = indexTemp % input_height;
    indexTemp /= input_height;
    const int c = indexTemp % num_channels;
    indexTemp /= num_channels;
    const int n = indexTemp;

    const float rheight =
        output_height > 1 ? (output_height - 1.f) / (input_height - 1.f) : 0.f;
    const float rwidth =
        output_width > 1 ? (output_width - 1.f) / (input_width - 1.f) : 0.f;

    // Compute Y axis lambdas
    const float h1r = rheight * in_y;
    const int h1 = (int)h1r;
    const int h1p = (h1 < output_height - 1) ? 1 : 0;
    const float h1lambda = h1r - h1;
    const float h0lambda = 1.f - h1lambda;

    // Compute X axis lambdas
    const float w1r = rwidth * in_x;
    const int w1 = (int)w1r;
    const int w1p = (w1 < output_width - 1) ? 1 : 0;
    const float w1lambda = w1r - w1;
    const float w0lambda = 1.f - w1lambda;

#if __CUDA_ARCH__ >= 350
    const float dYi = __ldg(&dY[index]);
#else
    const float dYi = dY[index];
#endif

    gpu_atomic_add(
        &dX[idx(n, num_channels, c, output_height, output_width, h1, w1)],
        h0lambda * w0lambda * dYi);
    gpu_atomic_add(
        &dX[idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)],
        h0lambda * w1lambda * dYi);
    gpu_atomic_add(
        &dX[idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)],
        h1lambda * w0lambda * dYi);
    gpu_atomic_add(
        &dX[idx(
            n,
            num_channels,
            c,
            output_height,
            output_width,
            h1 + h1p,
            w1 + w1p)],
        h1lambda * w1lambda * dYi);
  }
}

} // namespace

template <>
bool UpsampleBilinearOp<float, CUDAContext>::RunOnDevice() {
  const auto& X = Input(0);

  const auto inputDims = X.sizes();
  CAFFE_ENFORCE_EQ(4, inputDims.size());
  const int batch_size = X.dim32(0), num_channels = X.dim32(1),
            input_height = X.dim32(2), input_width = X.dim32(3);
  if (InputSize() == 2) {
    const auto& scales = Input(1);
    CAFFE_ENFORCE_EQ(scales.dim(), 1);
    CAFFE_ENFORCE_EQ(scales.numel(), 2);
    float scales_data[2];
    context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
    height_scale_ = scales_data[0];
    width_scale_ = scales_data[1];
  }
  int output_width = input_width * width_scale_;
  int output_height = input_height * height_scale_;
  auto* Y = Output(
      0,
      {batch_size, num_channels, output_height, output_width},
      at::dtype<float>());

  const auto size = output_height * output_width;
  UpsampleBilinearKernel<<<
      CAFFE_GET_BLOCKS(size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      batch_size,
      num_channels,
      input_height,
      input_width,
      output_height,
      output_width,
      X.data<float>(),
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
template <>
bool UpsampleBilinearGradientOp<float, CUDAContext>::RunOnDevice() {
  const auto& dY = Input(0);
  const auto& X = Input(1);
  const auto inputDims = dY.sizes();
  CAFFE_ENFORCE_EQ(4, inputDims.size());
  const int batch_size = dY.dim32(0);
  const int num_channels = dY.dim32(1);
  const int input_height = dY.dim32(2);
  const int input_width = dY.dim32(3);
  const int output_height = X.dim32(2);
  const int output_width = X.dim32(3);
  if (InputSize() == 3) {
    const auto& scales = Input(2);
    CAFFE_ENFORCE_EQ(scales.dim(), 1);
    CAFFE_ENFORCE_EQ(scales.numel(), 2);
    float scales_data[2];
    context_.CopyToCPU<float>(2, scales.data<float>(), scales_data);
    height_scale_ = scales_data[0];
    width_scale_ = scales_data[1];
  }
  auto* dX = Output(
      0,
      {batch_size, num_channels, output_height, output_width},
      at::dtype<float>());
  math::Set<float, CUDAContext>(
      dX->numel(), 0.0f, dX->mutable_data<float>(), &context_);
  const auto size = dY.numel();
  UpsampleBilinearGradientKernel<<<
      CAFFE_GET_BLOCKS(size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      size,
      num_channels,
      input_height,
      input_width,
      output_height,
      output_width,
      dY.data<float>(),
      dX->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
REGISTER_CUDA_OPERATOR(
    UpsampleBilinear,
    UpsampleBilinearOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    UpsampleBilinearGradient,
    UpsampleBilinearGradientOp<float, CUDAContext>);
} // namespace caffe2