File: nnpack_ops.cc

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (352 lines) | stat: -rw-r--r-- 12,656 bytes
parent folder | download | duplicates (2)
#include "caffe2/core/common.h"

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/operators/leaky_relu_op.h"
#include "caffe2/utils/cpuid.h"
#include "caffe2/utils/math.h"
#include "nnpack.h"

// Thread count handed to pthreadpool_create() when nnpack_threadpool() builds
// the pool shared by the NNPACK operators in this file.
C10_DEFINE_int(
    caffe2_nnpack_num_threads,
    1,
    "The number of nnpack pthreadpool threads.");
// When true and the build defines CAFFE2_USE_MKL, nnpack_threadpool() uses
// mkl_get_max_threads() instead of caffe2_nnpack_num_threads. Without MKL the
// flag only causes a VLOG(1) message and caffe2_nnpack_num_threads is used.
C10_DEFINE_bool(
    caffe2_nnpack_use_mkl_num_threads,
    true,
    "If MKL is built, this sets nnpack to use the same number of threads as "
    "MKL does. This overrides caffe2_nnpack_num_threads if set.");

namespace caffe2 {
////////////////////////////////////////////////////////////////////////////////
// Helper Functions
////////////////////////////////////////////////////////////////////////////////

namespace {

// Reports whether NNPACK can be used in this process, i.e. whether
// nnp_initialize() succeeds.
bool has_nnpack() {
  // Calling nnp_initialize more than once is harmless: every call after the
  // first is a noop.
  return nnp_initialize() == nnp_status_success;
}

// Translates the operator's "algo" argument into the matching NNPACK
// convolution algorithm. "AUTO" and any name not listed here both yield
// nnp_convolution_algorithm_auto.
nnp_convolution_algorithm get_nnp_convolution_algorithm(
    const std::string& algo) {
  if (algo == "WINOGRAD") {
    return nnp_convolution_algorithm_wt8x8;
  } else if (algo == "FT16") {
    return nnp_convolution_algorithm_ft16x16;
  } else if (algo == "FT8") {
    return nnp_convolution_algorithm_ft8x8;
  }
  // Explicit "AUTO" and unrecognised names share the same default.
  return nnp_convolution_algorithm_auto;
}

// Translates the operator's "kts" argument into an NNPACK kernel transform
// strategy. Only "TUPLE" selects the tuple-based strategy; "BLOCK" and every
// other value select the block-based one.
nnp_convolution_transform_strategy get_nnp_convolution_transform_strategy(
    const std::string& kts) {
  return kts == "TUPLE" ? nnp_convolution_transform_strategy_tuple_based
                        : nnp_convolution_transform_strategy_block_based;
}

////////////////////////////////////////////////////////////////////////////////
// Thread Pool
////////////////////////////////////////////////////////////////////////////////

// Pool shared by every NNPACK operator in this file; created on first use and
// never destroyed here.
static pthreadpool_t nnpack_threadpool_ = nullptr;

// Returns the shared pthreadpool, creating it on the first call.
//
// Creation initializes NNPACK (throwing via CAFFE_ENFORCE if that fails) and
// sizes the pool from FLAGS_caffe2_nnpack_num_threads, or from
// mkl_get_max_threads() when FLAGS_caffe2_nnpack_use_mkl_num_threads is set
// and the build defines CAFFE2_USE_MKL.
//
// NOTE(review): the null check and assignment are not synchronized — confirm
// that first use cannot happen concurrently from several threads.
pthreadpool_t nnpack_threadpool() {
  // Fast path: the pool already exists.
  if (nnpack_threadpool_ != nullptr) {
    return nnpack_threadpool_;
  }

  const enum nnp_status nnpack_status = nnp_initialize();
  CAFFE_ENFORCE(
      nnpack_status == nnp_status_success, "NNPack is not supported here!");

  int pool_size = FLAGS_caffe2_nnpack_num_threads;
  if (FLAGS_caffe2_nnpack_use_mkl_num_threads) {
#ifdef CAFFE2_USE_MKL
    pool_size = mkl_get_max_threads();
#else
    VLOG(1) << "I am asked to use MKL num of threads for NNPACK but this "
               "Caffe2 is not built with MKL. Skipping.";
#endif
  }

  nnpack_threadpool_ = pthreadpool_create(pool_size);
  return nnpack_threadpool_;
}
}

////////////////////////////////////////////////////////////////////////////////
// NNPACK Ops
////////////////////////////////////////////////////////////////////////////////

// Conv operator implemented with NNPACK.
//
// Operator arguments read here:
//   "algo" - convolution algorithm name (default "AUTO"), mapped by
//            get_nnp_convolution_algorithm.
//   "kts"  - kernel transform strategy name (default "TUPLE"), mapped by
//            get_nnp_convolution_transform_strategy.
//
// Construction fails with an unsupported-feature error unless the storage
// order is NCHW, dilation is 1 in both directions, and NNPACK initializes.
class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        algo_(get_nnp_convolution_algorithm(
            OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))),
        kts_(get_nnp_convolution_transform_strategy(
            OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding "
        "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
    OPERATOR_NEEDS_FEATURE(
        dilation_h() == 1 && dilation_w() == 1,
        "The NNPack convolution does not support dilation yet.");
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Runs the convolution of Input(0) (data) with Input(1) (filter) and
  // Input(2) (bias) into Output(0).
  //
  // A batch of one goes through nnp_convolution_inference (which honours the
  // stride); larger batches go through nnp_convolution_output and therefore
  // require stride 1. Any validation or NNPACK failure throws through
  // CAFFE_ENFORCE; otherwise returns true.
  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto& filter = Input(1);
    auto& bias = Input(2);
    auto* Y = Output(0);

    // Check ranks before indexing individual dimensions so that malformed
    // inputs produce these messages rather than an out-of-range dim access.
    CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
    CAFFE_ENFORCE(filter.dim() == 4, "Filter dim should be 4");

    const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
    const int M = filter.dim32(0);

    CAFFE_ENFORCE(C % this->group_ == 0, "");
    CAFFE_ENFORCE(M % this->group_ == 0, "");
    CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
    CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h(), "");
    CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
    CAFFE_ENFORCE(bias.numel() == M, "");

    ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, M);
    const int oH = Y->dim32(2), oW = Y->dim32(3);

    if (N > 1) {
      CAFFE_ENFORCE_EQ(
          this->stride_h(),
          1,
          "NNPack only supports stride = 1 when doing batch feedforward");
      CAFFE_ENFORCE_EQ(
          this->stride_w(),
          1,
          "NNPack only supports stride = 1 when doing batch feedforward");
    }

    const nnp_size input_size = {.width = static_cast<size_t>(W),
                                 .height = static_cast<size_t>(H)};
    // filter is MCHW
    const nnp_size kernel_size = {
        .width = static_cast<size_t>(filter.dim32(3)),
        .height = static_cast<size_t>(filter.dim32(2))};
    const nnp_padding padding = {
        .top = static_cast<size_t>(this->pad_t()),
        .right = static_cast<size_t>(this->pad_r()),
        .bottom = static_cast<size_t>(this->pad_b()),
        .left = static_cast<size_t>(this->pad_l())};
    const nnp_size output_subsample = {
        .width = static_cast<size_t>(this->stride_w()),
        .height = static_cast<size_t>(this->stride_h())};

    // Loop-invariant base pointers and per-group element strides. Strides are
    // computed in 64-bit so large feature maps cannot overflow int.
    const float* X_data = X.template data<float>();
    const float* filter_data = filter.template data<float>();
    const float* bias_data = bias.template data<float>();
    float* Y_data = Y->template mutable_data<float>();
    const int64_t X_group_stride =
        static_cast<int64_t>(H) * W * (C / group_);
    const int64_t Y_group_stride =
        static_cast<int64_t>(oH) * oW * (M / group_);
    const int64_t filter_group_stride = filter.numel() / group_;
    const int64_t bias_group_stride = bias.numel() / group_;

    if (N == 1) {
      VLOG(1) << "Running inference mode";
      for (auto g = 0; g < group_; ++g) {
        const auto status = nnp_convolution_inference(
            algo_,
            kts_,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            output_subsample,
            X_data + g * X_group_stride,
            filter_data + g * filter_group_stride,
            bias_data + g * bias_group_stride,
            Y_data + g * Y_group_stride,
            nnpack_threadpool(),
            nullptr);
        CAFFE_ENFORCE(nnp_status_success == status, "");
      }
    } else {
      VLOG(1) << "Running batched mode";
      // NOTE(review): the per-group offsets advance by one group's channels
      // within a single image. With N > 1 and group_ > 1 that presumably does
      // not match a batch-major [N][C/group][H][W] layout expected by
      // nnp_convolution_output — confirm before relying on grouped batched
      // convolution.
      for (auto g = 0; g < group_; ++g) {
        const auto status = nnp_convolution_output(
            algo_,
            N,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            X_data + g * X_group_stride,
            filter_data + g * filter_group_stride,
            bias_data + g * bias_group_stride,
            Y_data + g * Y_group_stride,
            nnpack_threadpool(),
            nullptr);
        CAFFE_ENFORCE(nnp_status_success == status, "");
      }
    }
    return true;
  }

 private:
  // NNPACK algorithm selected by the "algo" argument.
  const nnp_convolution_algorithm algo_;
  // Kernel transform strategy selected by the "kts" argument; only passed to
  // the single-image (inference) path.
  const nnp_convolution_transform_strategy kts_;
};

// MaxPool operator implemented with NNPACK.
//
// Construction fails with an unsupported-feature error unless the storage
// order is NCHW, the kernel is 2x2, the stride is 2x2, all four pads are zero,
// and NNPACK initializes.
class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding "
        "TransposeOp with axes=[0, 3, 1, 2] before NNPack MaxPool.");
    OPERATOR_NEEDS_FEATURE(
        this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_t() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_l() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_r() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_b() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Max-pools Input(0) into Output(0) via nnp_max_pooling_output. Throws
  // through CAFFE_ENFORCE if the input is not 4-D or NNPACK reports a failure;
  // otherwise returns true.
  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    CAFFE_ENFORCE(X.dim() == 4, "");
    ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));

    // Input X is in NCHW order
    const size_t batch_size = X.dim32(0);
    const size_t input_channels = X.dim32(1);
    const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
                                 .height = static_cast<size_t>(X.dim32(2))};
    // pooling kernel
    const nnp_size pooling_size = {
        .width = static_cast<size_t>(this->kernel_w()),
        .height = static_cast<size_t>(this->kernel_h())};
    const nnp_padding padding = {
        .top = static_cast<size_t>(this->pad_t()),
        .right = static_cast<size_t>(this->pad_r()),
        .bottom = static_cast<size_t>(this->pad_b()),
        .left = static_cast<size_t>(this->pad_l())};
    const nnp_size pooling_stride = {
        .width = static_cast<size_t>(this->stride_w()),
        .height = static_cast<size_t>(this->stride_h())};

    const auto status = nnp_max_pooling_output(
        batch_size,
        input_channels,
        input_size,
        padding,
        pooling_size,
        pooling_stride,
        X.template data<float>(),
        Y->template mutable_data<float>(),
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == status, "");
    return true;
  }

 private:
};

// Relu operator implemented with NNPACK. Construction fails with an
// unsupported-feature error if NNPACK cannot be initialized.
class NNPACKReluOp final : public Operator<CPUContext> {
 public:
  NNPACKReluOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Applies ReLU (negative slope 0) to Input(0), writing X.numel() floats to
  // Output(0). Throws through CAFFE_ENFORCE if NNPACK reports a failure;
  // otherwise returns true.
  bool RunOnDevice() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    // Give the output the input's shape before asking for its buffer;
    // otherwise NNPACK writes X.numel() floats into storage that was never
    // sized for them whenever the op is not run in place.
    Y->ResizeLike(X);
    // The whole tensor is treated as one batch entry with X.numel() channels.
    const auto status = nnp_relu_output(
        1,
        X.numel(),
        X.template data<float>(),
        Y->template mutable_data<float>(),
        0.0,
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == status, "");
    return true;
  }

 private:
};

// LeakyRelu operator implemented with NNPACK. Inherits from LeakyReluOp and
// uses its alpha_ member as the negative slope. Construction fails with an
// unsupported-feature error if NNPACK cannot be initialized.
class NNPACKLeakyReluOp final : public LeakyReluOp<float, CPUContext> {
 public:
  NNPACKLeakyReluOp(const OperatorDef& operator_def, Workspace* ws)
      : LeakyReluOp<float, CPUContext>(operator_def, ws) {
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Applies leaky ReLU with negative slope alpha_ to Input(0), writing
  // X.numel() floats to Output(0). Throws through CAFFE_ENFORCE if NNPACK
  // reports a failure; otherwise returns true.
  bool RunOnDevice() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    // Give the output the input's shape before asking for its buffer;
    // otherwise NNPACK writes X.numel() floats into storage that was never
    // sized for them whenever the op is not run in place.
    Y->ResizeLike(X);
    // The whole tensor is treated as one batch entry with X.numel() channels.
    const auto status = nnp_relu_output(
        1,
        X.numel(),
        X.template data<float>(),
        Y->template mutable_data<float>(),
        alpha_,
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == status, "");
    return true;
  }

 private:
};

// Register each class defined in this file as the "NNPACK" engine
// implementation of the corresponding CPU operator (Conv, MaxPool, Relu,
// LeakyRelu).
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, NNPACK, NNPACKMaxPoolOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, NNPACK, NNPACKReluOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(LeakyRelu, NNPACK, NNPACKLeakyReluOp);

} // namespace caffe2