#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <string>
#include <vector>

#ifdef __x86_64__
#include <immintrin.h>
#endif

#include <fbgemm/QuantUtils.h>

#include "caffe2/quantization/server/dynamic_histogram.h"
#include "caffe2/utils/cpuid.h"
namespace dnnlowp {

using fbgemm::RequantizationParams;
using fbgemm::TensorQuantizationParams;

// Represents a quantization scheme that provides quantization parameters
// based on the distribution of the data to be quantized.
class QuantizationFactory {
 public:
  enum QuantizationKind {
    // A simple quantization scheme that determines quantization parameters
    // by just looking at the min/max of the data.
    MIN_MAX_QUANTIZATION,
    // Minimizes the L2 norm of the quantization error
    L2_MIN_QUANTIZATION,
    // Fast search to remove histogram outliers and approximate the L2 minimum
    L2_MIN_QUANTIZATION_APPROX,
    // Minimizes the Kullback-Leibler divergence
    KL_MIN_QUANTIZATION,
    // Takes the 99th percentile (only works with sparsity-preserving
    // quantization)
    P99_QUANTIZATION,
    // Minimizes the L1 norm of the quantization error
    L1_MIN_QUANTIZATION,
  };

  /// Get the default factory whose policy is determined by gflags.
  static QuantizationFactory* GetDefaultInstance();

  /// Choose a quantization scale and zero_point that maps the
  /// floating-point range [min, max] to the integer range of the
  /// specified precision.
  TensorQuantizationParams ChooseQuantizationParams(
      float min,
      float max,
      int precision,
      bool preserve_sparsity,
      bool is_signed = false) const {
    TensorQuantizationParams qparams = fbgemm::ChooseQuantizationParams(
        min,
        max,
        is_signed ? -(1 << (precision - 1)) : 0,
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
        preserve_sparsity,
        force_scale_power_of_two_);
    qparams.precision = precision;
    return qparams;
  }
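  // Worked example (hypothetical values, not part of the original header):
  // with min = -2.5f, max = 2.5f, precision = 8, and is_signed = false, the
  // target integer range is [0, 255], so fbgemm::ChooseQuantizationParams
  // picks approximately
  //   scale      = (max - min) / (255 - 0) = 5.0f / 255 ~= 0.0196f
  //   zero_point = 0 - round(min / scale) ~= 128
  // so that real_value ~= scale * (quantized_value - zero_point).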

  /// Choose a quantization scale and zero_point that maps the
  /// floating-point range [min, max] to the default integer range of
  /// this quantization factory.
  TensorQuantizationParams
  ChooseQuantizationParams(float min, float max, bool is_weight = false) const {
    return ChooseQuantizationParams(
        min,
        max,
        is_weight ? GetWeightPrecision() : GetActivationPrecision(),
        is_weight ? GetPreserveWeightSparsity()
                  : GetPreserveActivationSparsity());
  }

  /// Choose quantization parameters based on the values in an array,
  /// minimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      bool is_weight = false) const;

  /// Choose quantization parameters based on a histogram of the values,
  /// minimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity,
      bool is_weight = false) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      bool is_weight = false) const;
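  // Usage sketch (illustrative; assumes dynamic_histogram.h provides a
  // Histogram(nbins, min, max) constructor and an Add() method, which is
  // an assumption, not something this header guarantees):
  //   Histogram hist(2048, observed_min, observed_max);
  //   for (int i = 0; i < len; ++i) hist.Add(values[i]);
  //   TensorQuantizationParams qparams = qfactory->ChooseQuantizationParams(
  //       hist, QuantizationFactory::KL_MIN_QUANTIZATION,
  //       /*precision=*/8, /*preserve_sparsity=*/false);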

  // Given a real_multiplier, produces a pair (quantized_multiplier,
  // right_shift) where quantized_multiplier is an int32 representing a
  // fixed-point value (in practice we only produce positive values) and
  // right_shift is an amount to shift right by, so that the floating-point
  // multiplication of some int32 input value by real_multiplier,
  //
  //   return static_cast<int32>(int32_value * real_multiplier);
  //
  // is best approximated by the integer-arithmetic-only code
  //
  //   return RoundingRightShift(
  //       Multiplication(int32_value, quantized_multiplier),
  //       right_shift);
  //
  // Note: all of this code only needs to run offline to generate the
  // quantized neural network workload, not at runtime on the device on
  // which quantized neural networks need to run, so it is not
  // performance-critical at all.
  RequantizationParams ChooseRequantizationMultiplier(
      float real_multiplier,
      TensorQuantizationParams target_qparams) const;
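  // Worked example (hypothetical; assumes the common convention where
  // quantized_multiplier is a Q0.31 fixed-point value in [1/2, 1)):
  // real_multiplier = 0.375 = 0.75 * 2^-1 can be encoded as
  //   quantized_multiplier = round(0.75 * 2^31) = 1610612736
  //   right_shift          = 1
  // since 1610612736 * 2^-31 * 2^-1 = 0.375 exactly.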

  int GetActivationPrecision() const {
    return activation_precision_;
  }

  int GetWeightPrecision() const {
    return weight_precision_;
  }

  int GetEltwiseQuantizePrecision() const {
    return eltwise_quantize_precision_;
  }

  bool GetPreserveActivationSparsity() const {
    return preserve_activation_sparsity_;
  }

  bool GetPreserveWeightSparsity() const {
    return preserve_weight_sparsity_;
  }

  QuantizationKind GetActivationKind() const {
    return activation_kind_;
  }

  QuantizationKind GetWeightKind() const {
    return weight_kind_;
  }

  void SetWeightP99Threshold(float threshold) {
    weight_p99_threshold_ = threshold;
  }

  void SetActivationP99Threshold(float threshold) {
    activation_p99_threshold_ = threshold;
  }

  explicit QuantizationFactory(
      // precision used for activations in main operations like matmul
      int activation_precision = 8,
      // precision used for weights
      int weight_precision = 8,
      // precision used for the requantization multiplier
      int requantization_multiplier_precision = 32,
      // precision used for element-wise addition
      int eltwise_quantize_precision = 16,
      // preserve zeros of activations in quantization
      bool preserve_activation_sparsity = false,
      // preserve zeros of weights in quantization
      bool preserve_weight_sparsity = false,
      // restrict scales to powers of two
      bool force_scale_power_of_two = false,
      QuantizationKind activation_kind = MIN_MAX_QUANTIZATION,
      QuantizationKind weight_kind = MIN_MAX_QUANTIZATION,
      // P99 percentage to select from the full histogram for weights
      float weight_p99_threshold = 0.99,
      // P99 percentage to select from the full histogram for activations
      float activation_p99_threshold = 0.99);
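  // Construction sketch (illustrative, not from the original header): a
  // factory with the default precisions that selects weight parameters by
  // minimizing the L2 norm of the quantization error:
  //   QuantizationFactory qfactory(
  //       8, 8, 32, 16, false, false, false,
  //       QuantizationFactory::MIN_MAX_QUANTIZATION, // activations
  //       QuantizationFactory::L2_MIN_QUANTIZATION); // weights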

 private:
  int activation_precision_;
  int weight_precision_;
  int requantization_multiplier_precision_;
  int eltwise_quantize_precision_;
  bool preserve_activation_sparsity_;
  bool preserve_weight_sparsity_;
  bool force_scale_power_of_two_;
  QuantizationKind activation_kind_, weight_kind_;
  float weight_p99_threshold_;
  float activation_p99_threshold_;
}; // class QuantizationFactory

/**
 * Parse a string into a QuantizationKind.
 */
QuantizationFactory::QuantizationKind StringToKind(const std::string& s);

/**
 * Adjust the histogram so that the range [*min, *max] includes zero,
 * returning the adjusted bin counts; *min and *max are updated in place.
 */
std::vector<float>
adjust_hist_to_include_zero(const Histogram& hist, float* min, float* max);
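
// Illustrative sketch (not part of the original header): end-to-end use of
// the default factory to choose quantization parameters for an activation
// tensor. Every name below other than QuantizationFactory and
// ChooseQuantizationParams is hypothetical.
inline TensorQuantizationParams ExampleChooseActivationQParams(
    const float* values,
    int len) {
  // The default instance's policy (precision, kind, sparsity preservation)
  // is controlled by gflags; see GetDefaultInstance() above.
  QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
  // is_weight = false selects the activation precision and sparsity policy.
  return qfactory->ChooseQuantizationParams(values, len, /*is_weight=*/false);
}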
} // namespace dnnlowp