File: elementwise_dnnlowp_op.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (137 lines) | stat: -rw-r--r-- 6,252 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#pragma once

#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/elementwise_ops.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/dnnlowp_op.h"
#include "caffe2/quantization/server/sigmoid.h"

namespace caffe2 {

/**
 * Quantized unary elementwise operator that delegates the actual computation
 * to a default-constructed Functor.
 *
 * As used below, Functor must provide:
 *   - operator()(n, in, out), invoked with the input's element count, the
 *     input buffer as T, and the output buffer as T;
 *   - GetOutputQuantizationParams(), whose result is attached to output 0.
 *
 * Input 0 and output 0 are both accessed as int8::Int8TensorCPU.
 */
template <typename T, class Functor>
class UnaryElementwiseWithArgsDNNLowPOp : public Operator<CPUContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  UnaryElementwiseWithArgsDNNLowPOp(
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<CPUContext>(operator_def, ws), functor_() {}

  // Applies the functor to input 0, writing into output 0. Always returns
  // true; failures, if any, surface from the helpers called here.
  bool RunOnDevice() override {
    ParseArgumentsOnce();

    const auto& src = this->template Input<int8::Int8TensorCPU>(0).t;
    auto* dst_holder =
        Outputs()[0]->template GetMutable<int8::Int8TensorCPU>();
    auto& dst = dst_holder->t;

    // The output takes on the input's shape before the functor fills it.
    dst.ResizeLike(src);
    functor_(
        src.size(), src.template data<T>(), dst.template mutable_data<T>());

    // Re-query the functor after it has run and attach its quantization
    // parameters to output 0.
    dnnlowp::PropagateOutputTensorQuantizationParams(
        this, 0, functor_.GetOutputQuantizationParams());
    return true;
  }

 private:
  // One-time setup performed on the first RunOnDevice() call: parses the
  // DNNLOWP operator arguments and registers the functor's output
  // quantization parameters for output 0. The flag is only set once both
  // calls have returned.
  void ParseArgumentsOnce() {
    if (arguments_parsed_) {
      return;
    }
    dnnlowp::ParseDNNLowPOperatorArguments(this);
    dnnlowp::SetStaticQuantizationParams(
        this, 0, functor_.GetOutputQuantizationParams());
    arguments_parsed_ = true;
  }

  Functor functor_;
  bool arguments_parsed_{false};
};

/**
 * Common base for quantized binary elementwise operators.
 *
 * The constructor reads and validates the broadcast-related operator
 * arguments:
 *   - "broadcast" (bool,   default 0):      enables broadcasting.
 *   - "axis"      (int,    default -1):     explicit broadcast axis.
 *   - "axis_str"  (string, default ""):     single-character axis name that
 *                                           is looked up in "order".
 *   - "order"     (string, default "NCHW"): layout string used to resolve
 *                                           "axis_str".
 *
 * Validation rules enforced below:
 *   - Without "broadcast", neither "axis" nor "axis_str" may be given.
 *   - With "broadcast", "axis" and "axis_str" are mutually exclusive.
 *   - "axis_str", when used, must be exactly one character that occurs in
 *     "order"; axis_ is then set to its position in "order".
 *
 * A violated rule is reported through the CAFFE_ENFORCE* macros.
 */
template <typename T, typename FP32_OP>
class BinaryElementwiseDNNLowPOp : public DNNLowPOp<T, FP32_OP> {
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  BinaryElementwiseDNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : DNNLowPOp<T, FP32_OP>(operator_def, ws),
        OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0),
        OP_SINGLE_ARG(int, "axis", axis_, -1),
        OP_SINGLE_ARG(string, "axis_str", axis_str_, ""),
        OP_SINGLE_ARG(string, "order", order_, "NCHW") {
    // Figure out the correct axis to use.
    if (enable_broadcast_) {
      if (axis_ != -1) {
        // Get axis from an explicit axis argument.
        CAFFE_ENFORCE_EQ(
            axis_str_.size(),
            0,
            "Args axis and axis_str cannot be used simultaneously.");
      } else if (!axis_str_.empty()) {
        // Get the axis index semantically.
        // The trailing space keeps the offending value separated from the
        // message text in the generated error string.
        CAFFE_ENFORCE_EQ(
            axis_str_.size(), 1, "Unsupported axis string ", axis_str_);
        const size_t semantic_axis = order_.find(axis_str_);
        CAFFE_ENFORCE_NE(
            semantic_axis,
            string::npos,
            "Unrecognizable axis string ",
            axis_str_,
            " from order string ",
            order_);
        // semantic_axis is a valid index into order_ at this point (npos was
        // rejected above), so the narrowing conversion is made explicit.
        axis_ = static_cast<int>(semantic_axis);
      }
    } else {
      CAFFE_ENFORCE(
          axis_ == -1 && axis_str_.empty(),
          "Do not specify axis or axis_str if broadcast is not enabled.");
    }
  }

 protected:
  // Value of the "broadcast" argument.
  bool enable_broadcast_;
  // Broadcast axis: the "axis" argument, or the position of "axis_str" in
  // "order" when resolved semantically; -1 when neither was given.
  int axis_;
  // Raw "axis_str" argument (empty when not given).
  string axis_str_;
  // Raw "order" argument used to resolve axis_str_.
  string order_;

  // Not read or written by this class itself; presumably populated by
  // derived operators -- confirm against the subclasses.
  dnnlowp::RequantizationParams requantization_params_;
}; // BinaryElementwiseDNNLowPOp

// For arithmetic operators, Eigen provides a good way to vectorize even
// when broadcasting.
//
// DECLARE_EIGEN_FUNCTOR(name, eigen_op, input_type, output_type) defines a
// struct named Eigen<name>Functor with three member templates. Each one
// evaluates eigen_op(lhs, rhs) on Eigen array views of the raw buffers and
// assigns the result to a view of `out`:
//
//   Run<b_is_scalar>(n, a, b, out, ctx)
//     b_is_scalar non-zero: combines the length-n view of `a` with b[0].
//     otherwise:            combines the length-n views of `a` and `b`.
//
//   RunWithBroadcast(a, b, out, pre, n, ctx)
//     Views `a` and `out` as (n x pre) arrays and combines a.colwise() with
//     the length-n vector view of `b`.
//
//   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
//     For every i in [0, pre), views the block starting at element offset
//     i * n * post of `a` and `out` as a (post x n) array and combines
//     a.rowwise() with `b` viewed as a (1 x n) row array.
//
// Notes:
//   - eigen_op is applied with call syntax to two parenthesized operands; it
//     is presumably a function-like macro or callable supplied at the use
//     site -- confirm there.
//   - input_type and output_type are not referenced anywhere in the
//     expansion below.
//   - The trailing CPUContext* parameter of each method is unnamed and
//     unused.
//   - The memory layout of the 2-D views is whatever EigenArrayMap /
//     ConstEigenArrayMap specify; those aliases are defined elsewhere.
//   - The expansion uses c10::irange; no include for it is visible in this
//     header, so it presumably arrives transitively -- TODO confirm.
#define DECLARE_EIGEN_FUNCTOR(name, eigen_op, input_type, output_type)       \
  struct Eigen##name##Functor {                                              \
    template <int b_is_scalar, typename T, typename R>                       \
    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
      if (b_is_scalar) {                                                     \
        EigenVectorArrayMap<R>(out, n) =                                     \
            eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0]));           \
      } else {                                                               \
        EigenVectorArrayMap<R>(out, n) = eigen_op(                           \
            (ConstEigenVectorArrayMap<T>(a, n)),                             \
            (ConstEigenVectorArrayMap<T>(b, n)));                            \
      }                                                                      \
    }                                                                        \
    template <typename T, typename R>                                        \
    void RunWithBroadcast(                                                   \
        const T* a,                                                          \
        const T* b,                                                          \
        R* out,                                                              \
        size_t pre,                                                          \
        size_t n,                                                            \
        CPUContext*) {                                                       \
      EigenArrayMap<R>(out, n, pre) = eigen_op(                              \
          (ConstEigenArrayMap<T>(a, n, pre).colwise()),                      \
          (ConstEigenVectorArrayMap<T>(b, n)));                              \
    }                                                                        \
    template <typename T, typename R>                                        \
    void RunWithBroadcast2(                                                  \
        const T* a,                                                          \
        const T* b,                                                          \
        R* out,                                                              \
        size_t pre,                                                          \
        size_t n,                                                            \
        size_t post,                                                         \
        CPUContext*) {                                                       \
      for (const auto i : c10::irange(pre)) {                                        \
        EigenArrayMap<R>(out + i * n * post, post, n) = eigen_op(            \
            (ConstEigenArrayMap<T>(a + i * n * post, post, n).rowwise()),    \
            (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n)));   \
      }                                                                      \
    }                                                                        \
  };
} // namespace caffe2