File: channel_stats_op.cu

package info (click to toggle)
pytorch 1.13.1+dfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (122 lines) | stat: -rw-r--r-- 2,988 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include "caffe2/operators/channel_stats_op.h"

#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math/reduce.cuh"

namespace caffe2 {

namespace {

// Per-channel sum and sum-of-squares reduction for NCHW input.
// Launch layout: one block per channel (blockIdx.x == channel), with a 2D
// thread block of (kBlockDimX, kBlockDimY) threads; threadIdx.x strides over
// the batch dimension N and threadIdx.y strides over the spatial size HxW.
// Shared memory: two BlockReduce2D::TempStorage buffers (statically declared).
template <typename T, int kBlockDimX, int kBlockDimY>
__global__ void ChannelStatsNCHWCUDAKernel(
    const int N,
    const int C,
    const int HxW,
    const T* X,
    T* sum,
    T* sumsq) {
  using Reducer = BlockReduce2D<T, kBlockDimX, kBlockDimY>;
  __shared__ typename Reducer::TempStorage sum_storage;
  __shared__ typename Reducer::TempStorage sumsq_storage;
  const int channel = blockIdx.x;
  T sum_val = 0;
  T sumsq_val = 0;
  for (int n = threadIdx.x; n < N; n += blockDim.x) {
    for (int hw = threadIdx.y; hw < HxW; hw += blockDim.y) {
      // NCHW flat offset of element (n, channel, hw).
      const int index = (n * C + channel) * HxW + hw;
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
      // Read-only-cache load on SM35+ / ROCm.
      const T x = __ldg(X + index);
#else
      const T x = X[index];
#endif
      sum_val += x;
      sumsq_val += x * x;
    }
  }
  // Combine the per-thread partials across the whole block.
  sum_val = Reducer(sum_storage).Sum(sum_val);
  sumsq_val = Reducer(sumsq_storage).Sum(sumsq_val);
  // Only one thread writes the final per-channel results.
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    sum[channel] = sum_val;
    sumsq[channel] = sumsq_val;
  }
}

// Per-channel sum and sum-of-squares reduction for NHWC (channels-last) input.
// Launch layout: one block per channel (blockIdx.x == channel), 1D thread
// block; each thread strides over the N*HxW "rows", reading the element at
// this block's channel within each row.
// Shared memory: two BlockReduce::TempStorage buffers (statically declared).
template <typename T>
__global__ void ChannelStatsNHWCCUDAKernel(
    const int N,
    const int C,
    const int HxW,
    const T* X,
    T* sum,
    T* sumsq) {
  __shared__ typename BlockReduce<T>::TempStorage sum_storage;
  __shared__ typename BlockReduce<T>::TempStorage sumsq_storage;
  const int channel = blockIdx.x;
  const int outer_size = N * HxW;
  T sum_val = 0;
  T sumsq_val = 0;
  for (int i = threadIdx.x; i < outer_size; i += blockDim.x) {
    // NHWC flat offset: row i, column `channel`.
    const int index = i * C + channel;
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
    // Read-only-cache load on SM35+ / ROCm.
    const T x = __ldg(X + index);
#else
    const T x = X[index];
#endif
    sum_val += x;
    sumsq_val += x * x;
  }
  // Combine the per-thread partials across the whole block.
  sum_val = BlockReduce<T>(sum_storage).Sum(sum_val);
  sumsq_val = BlockReduce<T>(sumsq_storage).Sum(sumsq_val);
  // Thread 0 holds the block-wide totals after the reduce.
  if (threadIdx.x == 0) {
    sum[channel] = sum_val;
    sumsq[channel] = sumsq_val;
  }
}

} // namespace

// Computes per-channel sum and sum-of-squares for NCHW float input on the
// current CUDA stream. The dispatch macro picks a 2D block shape based on the
// reduction extent HxW and launches ChannelStatsNCHWCUDAKernel with one block
// per channel (grid size C). `sum` and `sumsq` must each hold C floats.
// Always returns true; launch errors surface via the macro's own checking —
// TODO confirm the macro performs a C10_CUDA_KERNEL_LAUNCH_CHECK internally.
template <>
template <>
bool ChannelStatsOp<CUDAContext>::ComputeChannelStatsNCHW<float>(
    const int N,
    const int C,
    const int HxW,
    const float* X,
    float* sum,
    float* sumsq) {
  DISPATCH_REDUCE_KERNEL_BY_2D_BLOCK_WITH_TYPE_1(
      HxW,
      ChannelStatsNCHWCUDAKernel,
      float,
      C,
      context_.cuda_stream(),
      N,
      C,
      HxW,
      X,
      sum,
      sumsq);
  return true;
}

// Computes per-channel sum and sum-of-squares for NHWC float input on the
// current CUDA stream. Launches one block per channel (grid size C) with the
// default caffe2 block size; `sum` and `sumsq` must each hold C floats.
// Always returns true; launch-configuration errors are raised by
// C10_CUDA_KERNEL_LAUNCH_CHECK immediately after the launch.
template <>
template <>
bool ChannelStatsOp<CUDAContext>::ComputeChannelStatsNHWC<float>(
    const int N,
    const int C,
    const int HxW,
    const float* X,
    float* sum,
    float* sumsq) {
  ChannelStatsNHWCCUDAKernel<float>
      <<<C, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
          N, C, HxW, X, sum, sumsq);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

REGISTER_CUDA_OPERATOR(ChannelStats, ChannelStatsOp<CUDAContext>);

} // namespace caffe2