#ifndef CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_
#define CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_

#include <fbgemm/FbgemmConvert.h>

#include <cmath>

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
#include "fp16_fma.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
namespace int8 {
namespace {
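// Keeps a scale value away from zero: anything smaller in magnitude than
// 1e-10 is replaced by +/-1e-10 (preserving sign), and a rate-limited
// warning is logged.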
static float ClampScale(float s) {
  const float MinScale(1e-10f);
  if (std::fabs(s) < MinScale) {
    LOG_EVERY_N(WARNING, 1000) << "Too small scale detected: " << s
                               << ", clamping to +/-" << MinScale;
    return std::signbit(s) ? -MinScale : MinScale;
  } else {
    return s;
  }
}
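
// Quantizes N floats to uint8 the way the NNPI device would:
//   out[i] = clamp(round(in[i] / Y_scale + Y_offset), 0, 255)
// with the inputs, the inverse scale, and the offset all rounded to fp16,
// and the multiply-add performed as an fp16 fma, so the result emulates the
// device's half-precision pipeline.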
void Int8QuantizeNNPI(
    const float* in,
    uint8_t* out,
    const int64_t N,
    const float Y_scale,
    const int32_t Y_offset) {
  const int32_t qmin = std::numeric_limits<uint8_t>::min();
  const int32_t qmax = std::numeric_limits<uint8_t>::max();
  float inv_scale = ClampScale(1.0f / Y_scale);
  float inv_scale_fp16 = 0;
  fbgemm::RoundToFloat16(
      &inv_scale, &inv_scale_fp16, 1, false /* no clamping */);
  // Round -Y_offset to fp16, then negate it back below so offsetv seeds the
  // fma accumulator with +Y_offset at fp16 precision.
  float offset_tmp = -Y_offset;
  fbgemm::RoundToFloat16(
      &offset_tmp, &offset_tmp, 1, false /* no clamping */);
  std::vector<float> in_fp16(N);
  fbgemm::RoundToFloat16(in, in_fp16.data(), N, false /* no clamping */);
  std::vector<float> inv_scalev(N, inv_scale_fp16);
  std::vector<float> offsetv(N, -offset_tmp);
  fake_fp16::fma_fp16(N, in_fp16.data(), inv_scalev.data(), offsetv.data());
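  // offsetv now holds in[i] * (1 / Y_scale) + Y_offset accumulated at fp16
  // precision; round to the nearest integer before saturating.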
  for (const auto i : c10::irange(N)) {
    offsetv[i] = std::round(offsetv[i]);
  }
  fbgemm::RoundToFloat16(
      offsetv.data(), offsetv.data(), N, false /* no clamping */);
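  // Saturate to the uint8 range; an fp16 infinity maps to the nearest bound.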
  for (const auto i : c10::irange(N)) {
    float halfRes = offsetv[i];
    if (std::isinf(halfRes)) {
      halfRes = halfRes > 0 ? qmax : qmin;
    }
    if (halfRes > qmax) {
      halfRes = qmax;
    }
    if (halfRes < qmin) {
      halfRes = qmin;
    }
    out[i] = static_cast<uint8_t>(halfRes);
  }
}
} // namespace
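
// Quantizes a float tensor into an Int8TensorCPU using the fp16-faithful
// kernel above. Y_scale and Y_zero_point are read from operator arguments
// and stored on the output tensor alongside the quantized data.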
class Int8QuantizeNNPIOp final : public Operator<CPUContext> {
 public:
  using Operator<CPUContext>::Operator;

  bool RunOnDevice() override {
    const auto& X = Input(0);
    auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
    Y->t.ResizeLike(X);
    int32_t Y_offset =
        this->template GetSingleArgument<int>("Y_zero_point", 0);
    auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
    Y->scale = Y_scale;
    Y->zero_point = Y_offset;
    Int8QuantizeNNPI(
        X.data<float>(),
        Y->t.mutable_data<uint8_t>(),
        X.numel(),
        Y_scale,
        Y_offset);
    return true;
  }
};
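
// Example usage (a minimal sketch; it assumes the operator is registered
// under the name "Int8QuantizeNNPI" in a separate .cc file, which is not
// shown in this header, and the blob names are illustrative):
//
//   OperatorDef def;
//   def.set_type("Int8QuantizeNNPI");
//   def.add_input("X");
//   def.add_output("X_int8");
//   AddArgument<float>("Y_scale", 0.05f, &def);
//   AddArgument<int>("Y_zero_point", 128, &def);
//   // Create the op from a Workspace holding a float tensor "X", then Run()
//   // it; "X_int8" will hold the quantized Int8TensorCPU.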
} // namespace int8
} // namespace caffe2
#endif // CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_