File: pool_op_util.cc

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (361 lines) | stat: -rw-r--r-- 11,287 bytes
parent folder | download | duplicates (2)
#include "caffe2/operators/pool_op_util.h"

#include "caffe2/utils/eigen_utils.h"

namespace caffe2 {
namespace pool_op_util {

namespace {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)

// Vectorizes 4x4p0s0 average pooling for ARM NEON
void AvgPoolNeon4x4p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 4;
  constexpr int kKernelWidth = 4;
  constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    //
    // Manually unroll by 4 (kUnroll)
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x4_t out = {};

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 0);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 1);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 2);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 3);
        }
        curInput += kLoadSizeFloat;

        out = vmulq_f32(out, vdupq_n_f32(kDiv));
        vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
      }
    }
  } else {
    //
    // Not unrolled
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth) {
        const float* curInput = inputRow + w;

        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
        float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
        float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
        outputRow[w / kKernelWidth] = v0;
      }
    }
  }
}

// Vectorizes 2x2p0s0 average pooling for ARM NEON
void MaxPoolNeon2x2p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 2;
  constexpr int kKernelWidth = 2;

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;

        float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
        float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
      }
    }
  } else {
    // Not unrolled
    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth * 2) {
        const float* curInput = inputRow + w;
        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
        float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        vst1_f32(&outputRow[w / kKernelWidth], hmax);
      }
    }
  }
}

#endif

} // namespace

// Reports whether the NEON 4x4 / stride 4 / no-padding pooling kernel may be
// used for the given pooling configuration and buffers.
//
// Returns true only on NEON builds and only when every condition listed in
// the body holds; on builds without NEON it always returns false.
bool IsNeon4x4p0s0Eligible(
    const int input_h,
    const int input_w,
    const int output_h,
    const int output_w,
    const int kh,
    const int kw,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int dilation_h,
    const int dilation_w,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Use this kernel only if:
  //   1. Kernel size is 4x4
  //   2. Stride is 4x4
  //   3. Padding is 0
  //   4. Dilation is 1
  //   5. Output height and width are positive and evenly divide the input
  //      height and width respectively
  //   6. Input width and height are divisible by 4 (should be implied by all
  //      of the above, but just check again)
  //   7. Input and output pointers are aligned by float32x4_t
  const bool kernel_ok = (kh == 4) && (kw == 4);
  const bool stride_ok = (stride_h == 4) && (stride_w == 4);
  const bool pad_ok =
      (pad_t == 0) && (pad_l == 0) && (pad_b == 0) && (pad_r == 0);
  const bool dilation_ok = (dilation_h == 1) && (dilation_w == 1);
  // The positivity checks must come first: `%` with a zero divisor is
  // undefined behavior, and an empty output has nothing to pool anyway.
  const bool output_ok = (output_h > 0) && (output_w > 0) &&
      (input_h % output_h == 0) && (input_w % output_w == 0);
  const bool input_ok = (input_w % 4 == 0) && (input_h % 4 == 0);
  const bool align_ok = isPointerAligned(X, sizeof(float32x4_t)) &&
      isPointerAligned(Y, sizeof(float32x4_t));
  return kernel_ok && stride_ok && pad_ok && dilation_ok && output_ok &&
      input_ok && align_ok;
#else
  // No NEON support in this build: silence unused-parameter warnings and
  // report the kernel as unavailable.
  (void)input_h;
  (void)input_w;
  (void)output_h;
  (void)output_w;
  (void)kh;
  (void)kw;
  (void)stride_h;
  (void)stride_w;
  (void)pad_t;
  (void)pad_l;
  (void)pad_b;
  (void)pad_r;
  (void)dilation_h;
  (void)dilation_w;
  (void)X;
  (void)Y;
  return false;
#endif
}

// Reports whether the NEON 2x2 / stride 2 / no-padding pooling kernel may be
// used for the given pooling configuration and buffers.
//
// Returns true only on NEON builds and only when every condition listed in
// the body holds; on builds without NEON it always returns false.
bool IsNeon2x2p0s0Eligible(
    const int input_h,
    const int input_w,
    const int output_h,
    const int output_w,
    const int kh,
    const int kw,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int dilation_h,
    const int dilation_w,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Use this kernel only if:
  //   1. Kernel size is 2x2
  //   2. Stride is 2x2
  //   3. Padding is 0
  //   4. Dilation is 1
  //   5. Output height and width are positive and evenly divide the input
  //      height and width respectively
  //   6. Input width and height are divisible by 4 (the kernel reads four
  //      floats per vector load)
  //   7. Input and output pointers are aligned by float32x4_t
  const bool kernel_ok = (kh == 2) && (kw == 2);
  const bool stride_ok = (stride_h == 2) && (stride_w == 2);
  const bool pad_ok =
      (pad_t == 0) && (pad_l == 0) && (pad_b == 0) && (pad_r == 0);
  const bool dilation_ok = (dilation_h == 1) && (dilation_w == 1);
  // The positivity checks must come first: `%` with a zero divisor is
  // undefined behavior, and an empty output has nothing to pool anyway.
  const bool output_ok = (output_h > 0) && (output_w > 0) &&
      (input_h % output_h == 0) && (input_w % output_w == 0);
  const bool input_ok = (input_w % 4 == 0) && (input_h % 4 == 0);
  const bool align_ok = isPointerAligned(X, sizeof(float32x4_t)) &&
      isPointerAligned(Y, sizeof(float32x4_t));
  return kernel_ok && stride_ok && pad_ok && dilation_ok && output_ok &&
      input_ok && align_ok;
#else
  // No NEON support in this build: silence unused-parameter warnings and
  // report the kernel as unavailable.
  (void)input_h;
  (void)input_w;
  (void)output_h;
  (void)output_w;
  (void)kh;
  (void)kw;
  (void)stride_h;
  (void)stride_w;
  (void)pad_t;
  (void)pad_l;
  (void)pad_b;
  (void)pad_r;
  (void)dilation_h;
  (void)dilation_w;
  (void)X;
  (void)Y;
  return false;
#endif
}

// Applies the NEON 4x4 / stride 4 / no-padding average pooling kernel to
// each of the N * C planes of X in turn. Planes are read H * W floats apart
// from X and written (H / 4) * (W / 4) floats apart into Y.
//
// On builds without NEON this function does nothing.
void RunNeonAveragePool4x4p0s0NCHW(
    int N,
    int C,
    int H,
    int W,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  const int in_plane_size = H * W;
  const int out_plane_size = (H / 4) * (W / 4);
  const float* src = X;
  float* dst = Y;
  for (int n = 0; n < N; ++n) {
    // Advance both cursors by one plane after every channel.
    for (int c = 0; c < C; ++c, src += in_plane_size, dst += out_plane_size) {
      AvgPoolNeon4x4p0s0Plane(H, W, src, dst);
    }
  }
#else
  // NEON unavailable: nothing to do, just silence unused-parameter warnings.
  (void)N;
  (void)C;
  (void)H;
  (void)W;
  (void)X;
  (void)Y;
#endif
}

// Applies the NEON 2x2 / stride 2 / no-padding max pooling kernel to each of
// the N * C planes of X in turn. Planes are read H * W floats apart from X
// and written (H / 2) * (W / 2) floats apart into Y.
//
// On builds without NEON this function does nothing.
void RunNeonMaxPool2x2p0s0NCHW(
    int N,
    int C,
    int H,
    int W,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  const int in_plane_size = H * W;
  const int out_plane_size = (H / 2) * (W / 2);
  const float* src = X;
  float* dst = Y;
  for (int n = 0; n < N; ++n) {
    // Advance both cursors by one plane after every channel.
    for (int c = 0; c < C; ++c, src += in_plane_size, dst += out_plane_size) {
      MaxPoolNeon2x2p0s0Plane(H, W, src, dst);
    }
  }
#else
  // NEON unavailable: nothing to do, just silence unused-parameter warnings.
  (void)N;
  (void)C;
  (void)H;
  (void)W;
  (void)X;
  (void)Y;
#endif
}

} // namespace pool_op_util
} // namespace caffe2