File: elementwise_dnnlowp_op.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (137 lines) | stat: -rw-r--r-- 6,252 bytes
#pragma once

#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/elementwise_ops.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/dnnlowp_op.h"
#include "caffe2/quantization/server/sigmoid.h"

namespace caffe2 {

template <typename T, class Functor>
class UnaryElementwiseWithArgsDNNLowPOp : public Operator<CPUContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  UnaryElementwiseWithArgsDNNLowPOp(
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<CPUContext>(operator_def, ws), functor_() {}

  bool RunOnDevice() override {
    if (!arguments_parsed_) {
      dnnlowp::ParseDNNLowPOperatorArguments(this);
      dnnlowp::SetStaticQuantizationParams(
          this, 0, functor_.GetOutputQuantizationParams());
      arguments_parsed_ = true;
    }

    auto& input = this->template Input<int8::Int8TensorCPU>(0).t;
    auto& output = Outputs()[0]->template GetMutable<int8::Int8TensorCPU>()->t;
    output.ResizeLike(input);
    functor_(
        input.size(),
        input.template data<T>(),
        output.template mutable_data<T>());

    dnnlowp::PropagateOutputTensorQuantizationParams(
        this, 0, functor_.GetOutputQuantizationParams());
    return true;
  }

 private:
  Functor functor_;
  bool arguments_parsed_{false};
};

template <typename T, typename FP32_OP>
class BinaryElementwiseDNNLowPOp : public DNNLowPOp<T, FP32_OP> {
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  BinaryElementwiseDNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : DNNLowPOp<T, FP32_OP>(operator_def, ws),
        OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0),
        OP_SINGLE_ARG(int, "axis", axis_, -1),
        OP_SINGLE_ARG(string, "axis_str", axis_str_, ""),
        OP_SINGLE_ARG(string, "order", order_, "NCHW") {
    // Figure out the correct axis to use.
    if (enable_broadcast_) {
      if (axis_ != -1) {
        // Get axis from an explicit axis argument.
        CAFFE_ENFORCE_EQ(
            axis_str_.size(),
            0,
            "Args axis and axis_str cannot be used simultaneously.");
      } else if (axis_str_.size()) {
        // Get the axis index semantically.
        CAFFE_ENFORCE_EQ(
            axis_str_.size(), 1, "Unsupported axis string", axis_str_);
        size_t semantic_axis_ = order_.find(axis_str_);
        CAFFE_ENFORCE_NE(
            semantic_axis_,
            string::npos,
            "Unrecognizable axis string ",
            axis_str_,
            " from order string ",
            order_);
        axis_ = semantic_axis_;
      }
    } else {
      CAFFE_ENFORCE(
          axis_ == -1 && axis_str_.size() == 0,
          "Do not specify axis or axis_str if broadcast is not enabled.");
    }
  }

 protected:
  bool enable_broadcast_;
  int axis_;
  string axis_str_;
  string order_;

  dnnlowp::RequantizationParams requantization_params_;
}; // BinaryElementwiseDNNLowPOp

// For arithmetic operators, Eigen provides a good way to vectorize even
// when broadcasting.
#define DECLARE_EIGEN_FUNCTOR(name, eigen_op, input_type, output_type)       \
  struct Eigen##name##Functor {                                              \
    template <int b_is_scalar, typename T, typename R>                       \
    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
      if (b_is_scalar) {                                                     \
        EigenVectorArrayMap<R>(out, n) =                                     \
            eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0]));           \
      } else {                                                               \
        EigenVectorArrayMap<R>(out, n) = eigen_op(                           \
            (ConstEigenVectorArrayMap<T>(a, n)),                             \
            (ConstEigenVectorArrayMap<T>(b, n)));                            \
      }                                                                      \
    }                                                                        \
    template <typename T, typename R>                                        \
    void RunWithBroadcast(                                                   \
        const T* a,                                                          \
        const T* b,                                                          \
        R* out,                                                              \
        size_t pre,                                                          \
        size_t n,                                                            \
        CPUContext*) {                                                       \
      EigenArrayMap<R>(out, n, pre) = eigen_op(                              \
          (ConstEigenArrayMap<T>(a, n, pre).colwise()),                      \
          (ConstEigenVectorArrayMap<T>(b, n)));                              \
    }                                                                        \
    template <typename T, typename R>                                        \
    void RunWithBroadcast2(                                                  \
        const T* a,                                                          \
        const T* b,                                                          \
        R* out,                                                              \
        size_t pre,                                                          \
        size_t n,                                                            \
        size_t post,                                                         \
        CPUContext*) {                                                       \
      for (const auto i : c10::irange(pre)) {                                        \
        EigenArrayMap<R>(out + i * n * post, post, n) = eigen_op(            \
            (ConstEigenArrayMap<T>(a + i * n * post, post, n).rowwise()),    \
            (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n)));   \
      }                                                                      \
    }                                                                        \
  };
} // namespace caffe2