File: elementwise_add_dnnlowp_op.cc

package: pytorch 1.13.1+dfsg-4 (Debian bookworm, area: main)
#include "elementwise_dnnlowp_op.h"

#include "caffe2/operators/elementwise_add_op.h"
#include "caffe2/quantization/server/sigmoid.h"

#include "dnnlowp_partition.h"
#include "op_wrapper.h"
#include "utility_dnnlowp_ops.h"

namespace caffe2 {

using namespace std;
using namespace dnnlowp;

using AddFp32Op =
    BinaryElementwiseOp<NumericTypes, CPUContext, AddFunctor<CPUContext>>;

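// Quantized (DNNLOWP) implementation of elementwise Add. Inputs are brought to
// a shared intermediate quantization, summed as int32, and requantized to the
// output parameters; a vectorized AVX2 path handles the common uint8 case.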
template <typename T>
class AddDNNLowPOp : public BinaryElementwiseDNNLowPOp<T, AddFp32Op> {
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, AddFp32Op);
  using BinaryElementwiseDNNLowPOp<T, AddFp32Op>::axis_;
  using BinaryElementwiseDNNLowPOp<T, AddFp32Op>::enable_broadcast_;
  using BinaryElementwiseDNNLowPOp<T, AddFp32Op>::requantization_params_;

  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  AddDNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : BinaryElementwiseDNNLowPOp<T, AddFp32Op>(operator_def, ws) {}

  bool RunOnDevice() override {
    if (!GetQuantizationParameters_()) {
      return false;
    }

    const auto& A = InputTensorCPU_(0);
    const auto& B = InputTensorCPU_(1);
    auto* C = OutputTensorCPU_(0);
    CAFFE_ENFORCE(
        &B != C || !enable_broadcast_,
        "In-place is allowed only with the first tensor when broadcasting");
    C->ResizeLike(A);

    T* C_quantized = GetQuantizedOutputData_();

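    // Fast path: both inputs are already quantized to uint8, have the same
    // number of elements (no broadcasting), and the CPU supports AVX2/FMA.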
    if (A.template IsType<T>() && B.template IsType<T>() &&
        A.numel() == B.numel() && is_same<T, uint8_t>::value &&
        GetCpuId().avx2() && GetCpuId().fma()) {
      // fast path
      // NOTE: this path does the addition in floating point, unlike the slow
      // path below, which does everything in fixed point, so the two paths can
      // produce slightly different results.
#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        constexpr int VLEN = 8;
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        int j_begin, j_end;
        tie(j_begin, j_end) = Get1DPartition(
            A.numel(),
            dnnlowp_get_num_threads(),
            dnnlowp_get_thread_num(),
            VLEN);

        internal::ElementWiseSumAVX2<T, false /*ReluFused*/>(
            A.template data<T>() + j_begin,
            B.template data<T>() + j_begin,
            C_quantized + j_begin,
            j_end - j_begin,
            in_qparams_[0].scale,
            in_qparams_[0].zero_point,
            in_qparams_[1].scale,
            in_qparams_[1].zero_point,
            out_qparams_.scale,
            out_qparams_.zero_point);
      } // omp parallel

      RunOnDeviceEpilogue_();

      return true;
    }

    // Slow path: (re)quantize both inputs to the shared intermediate
    // quantization parameters so they can be summed directly as int32.
    vector<int32_t> A_quantized(A.numel()), B_quantized(B.numel());
    for (int i = 0; i < 2; ++i) {
      int32_t* quantized_in = i == 0 ? A_quantized.data() : B_quantized.data();
      if (InputTensorCPU_(i).template IsType<T>()) {
        float real_multiplier =
            in_qparams_[i].scale / intermediate_qparams_.scale;
        RequantizationParams in_requantization_params =
            qfactory_->ChooseRequantizationMultiplier(
                real_multiplier, intermediate_qparams_);

        const T* input_data = InputTensorCPU_(i).template data<T>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Requantize<int32_t>(
              input_data[j] - in_qparams_[i].zero_point,
              in_requantization_params);
        }
      } else {
        assert(A.template IsType<float>());
        const float* input_data = InputTensorCPU_(i).template data<float>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Quantize<uint32_t>(
              input_data[j],
              intermediate_qparams_.zero_point,
              intermediate_qparams_.scale,
              qfactory_->GetEltwiseQuantizePrecision());
        }
      }
    }

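    // Each addend carries one copy of the intermediate zero point, so subtract
    // it once per input to recover the offset-free sum before requantizing.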
    int32_t intermediate_zero_point =
        intermediate_qparams_.zero_point * InputSize();

    if (!enable_broadcast_) {
      CAFFE_ENFORCE_EQ(
          A.sizes(),
          B.sizes(),
          "Dimension mismatch - did you forget to set broadcast=1?");
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[i] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else if (B.numel() == 1) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[0] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else {
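      // General legacy broadcast: A is viewed as (pre, n, post) and B, which
      // has n elements, is added along the middle dimension.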
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      size_t pre, n, post;
      std::tie(pre, n, post) =
          elementwise_ops_utils::ComputeLegacyBroadcastSizes(A, B, axis_);
#ifdef _OPENMP
#pragma omp parallel for
#endif
      // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
      for (int i = 0; i < pre; ++i) {
        // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
        for (int j = 0; j < n; ++j) {
          // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
          for (int k = 0; k < post; ++k) {
            int32_t raw = A_quantized[((i * n) + j) * post + k] +
                B_quantized[j] - intermediate_zero_point;
            C_quantized[((i * n) + j) * post + k] =
                fbgemm::Requantize<T>(raw, requantization_params_);
          }
        }
      }
    }

    RunOnDeviceEpilogue_();

    return true;
  }

 private:
  bool GetQuantizationParameters_() {
    // Find global min and max of all inputs
    float global_min = numeric_limits<float>::max(),
          global_max = numeric_limits<float>::lowest();

    for (int i = 0; i < InputSize(); ++i) {
      in_qparams_[i] =
          GetInputTensorQuantizationParamsOf(this, i, qfactory_.get());

      global_min = std::min(global_min, in_qparams_[i].Min());
      global_max = std::max(global_max, in_qparams_[i].Max());
    }

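    // Choose intermediate quantization parameters covering the union of the
    // input ranges at the (wider) elementwise precision, so both inputs can be
    // represented on a common scale without saturation.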
    intermediate_qparams_ = qfactory_->ChooseQuantizationParams(
        global_min,
        global_max,
        qfactory_->GetEltwiseQuantizePrecision(),
        qfactory_->GetPreserveActivationSparsity());

    GetOutputQuantizationParams_();

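    // The int32 sum lives on the intermediate scale; requantize it to the
    // output scale with multiplier intermediate_scale / out_scale.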
    float real_multiplier = intermediate_qparams_.scale / out_qparams_.scale;
    requantization_params_ = qfactory_->ChooseRequantizationMultiplier(
        real_multiplier, out_qparams_);

    return true;
  }

  dnnlowp::TensorQuantizationParams intermediate_qparams_;
}; // class AddDNNLowPOp

REGISTER_CPU_OPERATOR_WITH_ENGINE(Add, DNNLOWP, AddDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Int8Add, DNNLOWP, AddDNNLowPOp<uint8_t>);
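
// Usage note (sketch, not part of the upstream file): this implementation is
// picked up by creating an "Int8Add" operator, or an "Add" operator whose
// OperatorDef engine is set to "DNNLOWP", in a Caffe2 NetDef.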

} // namespace caffe2