File: transform_gpu.cu

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (85 lines) | stat: -rw-r--r-- 2,108 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include "caffe2/core/context_gpu.h"
#include "caffe2/image/transform_gpu.h"
#include "caffe2/utils/conversions.h"

/**
 *
 * Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
 * Distributed under 2-clause BSD license; see accompanying LICENSE file
 *
 **/

namespace caffe2 {

namespace {

// input in (int8, NHWC), output in (fp32, NCHW)
template <typename In, typename Out>
__global__ void transform_kernel(
    const int C,
    const int H,
    const int W,
    const float* mean,
    const float* std,
    const In* in,
    Out* out) {
  const auto n = blockIdx.x;

  const auto nStride = C*H*W;

  // pointers to data for this image
  const In *const input_ptr = &in[n*nStride];
  Out *const output_ptr = &out[n*nStride];

  // either read or write uncoalesced - try reading
  for (int c=0; c < C; ++c) {
    for (int h=threadIdx.y; h < H; h += blockDim.y) {
      for (int w=threadIdx.x; w < W; w += blockDim.x) {
        const int in_idx = c + C*w + C*W*h;  // HWC
        const int out_idx = c*H*W + h*W + w;  // CHW

        output_ptr[out_idx] = convert::To<float,Out>(
          (convert::To<In,float>(input_ptr[in_idx])-mean[c]) * std[c]);
      }
    }
  }
}

}

template <typename T_IN, typename T_OUT, class Context>

bool TransformOnGPU(
    Tensor& X,
    Tensor* Y,
    Tensor& mean,
    Tensor& std,
    Context* context) {
  const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
  auto* input_data = X.template data<T_IN>();
  auto* output_data = Y->template mutable_data<T_OUT>();

  transform_kernel<
    T_IN, T_OUT><<<N, dim3(16, 16), 0, context->cuda_stream()>>>(
      C, H, W, mean.template data<float>(), std.template data<float>(),
      input_data, output_data);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
};

template bool TransformOnGPU<uint8_t, float, CUDAContext>(
    Tensor& X,
    Tensor* Y,
    Tensor& mean,
    Tensor& std,
    CUDAContext* context);

template bool TransformOnGPU<uint8_t, at::Half, CUDAContext>(
    Tensor& X,
    Tensor* Y,
    Tensor& mean,
    Tensor& std,
    CUDAContext* context);

}  // namespace caffe2