File: dnnlowp.h

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

#ifdef __x86_64__
#include <immintrin.h>
#endif

#include <fbgemm/QuantUtils.h>

#include "caffe2/quantization/server/dynamic_histogram.h"
#include "caffe2/utils/cpuid.h"

namespace dnnlowp {

using fbgemm::RequantizationParams;
using fbgemm::TensorQuantizationParams;

// Represents a quantization scheme that provides quantization parameters
// based on the distribution of the data to be quantized.
class QuantizationFactory {
 public:
  enum QuantizationKind {
    // A simple quantization scheme that determines quantization parameters
    // by looking only at the min/max values.
    MIN_MAX_QUANTIZATION,
    // Minimizes L2 norm of quantization error
    L2_MIN_QUANTIZATION,
    // Fast search that removes histogram outliers and approximates the L2 minimum
    L2_MIN_QUANTIZATION_APPROX,
    // Minimizes Kullback-Leibler divergence
    KL_MIN_QUANTIZATION,
    // Take the 99th percentile (only works with sparsity-preserving quantization)
    P99_QUANTIZATION,
    // Minimizes L1 norm of quantization error
    L1_MIN_QUANTIZATION,
  };

  /// Get the default factory whose policy is determined by gflags
  static QuantizationFactory* GetDefaultInstance();
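  // Illustrative usage sketch (not part of the original header): the default
  // factory is a process-wide instance configured through gflags, so callers
  // typically fetch it instead of constructing their own factory, e.g.
  //
  //   dnnlowp::QuantizationFactory* qfactory =
  //       dnnlowp::QuantizationFactory::GetDefaultInstance();
  //   int act_bits = qfactory->GetActivationPrecision(); // 8 by default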

  /// Choose a quantization scale and zero_point that map the
  /// floating-point range [min, max] to the integer range of the specified
  /// precision.
  TensorQuantizationParams ChooseQuantizationParams(
      float min,
      float max,
      int precision,
      bool preserve_sparsity,
      bool is_signed = false) const {
    TensorQuantizationParams qparams = fbgemm::ChooseQuantizationParams(
        min,
        max,
        is_signed ? -(1 << (precision - 1)) : 0,
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
        preserve_sparsity,
        force_scale_power_of_two_);
    qparams.precision = precision;
    return qparams;
  }
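  // Illustrative sketch (not part of the original header): for precision = 8
  // the unsigned integer range is [0, 255] and the signed range is
  // [-128, 127], so the chosen scale is roughly (max - min) / 255. Given a
  // factory pointer qfactory (e.g. from GetDefaultInstance()):
  //
  //   // activations observed in [-1.0f, 3.0f], unsigned 8-bit quantization
  //   TensorQuantizationParams qparams = qfactory->ChooseQuantizationParams(
  //       -1.0f, 3.0f, /*precision=*/8, /*preserve_sparsity=*/false);
  //   // qparams.scale is about 4.0f / 255 and qparams.zero_point is the
  //   // integer that the real value 0.0f maps to.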

  /// Choose a quantization scale and zero_point that map the
  /// floating-point range [min, max] to the default integer range of
  /// this quantization factory.
  TensorQuantizationParams
  ChooseQuantizationParams(float min, float max, bool is_weight = false) const {
    return ChooseQuantizationParams(
        min,
        max,
        is_weight ? GetWeightPrecision() : GetActivationPrecision(),
        is_weight ? GetPreserveWeightSparsity()
                  : GetPreserveActivationSparsity());
  }
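  // Illustrative sketch (not part of the original header): this overload takes
  // precision and sparsity settings from the factory itself, so the caller
  // only decides whether the tensor is a weight. With placeholder ranges
  // act_min/act_max and w_min/w_max:
  //
  //   TensorQuantizationParams act_qparams =
  //       qfactory->ChooseQuantizationParams(act_min, act_max);
  //   TensorQuantizationParams wt_qparams =
  //       qfactory->ChooseQuantizationParams(w_min, w_max, /*is_weight=*/true);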

  /// Choose quantization parameters based on the values in an array,
  /// optimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      bool is_weight = false) const;
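  // Illustrative sketch (not part of the original header): choosing parameters
  // directly from a buffer of float values; the quantization kind is the one
  // the factory is configured with for weights or activations, e.g.
  //
  //   std::vector<float> w = LoadWeights(); // hypothetical helper
  //   TensorQuantizationParams wt_qparams = qfactory->ChooseQuantizationParams(
  //       w.data(), static_cast<int>(w.size()), /*is_weight=*/true);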

  /// Choose quantization parameters based on a histogram of values,
  /// optimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity,
      bool is_weight = false) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      bool is_weight = false) const;
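  // Illustrative sketch (not part of the original header), assuming the
  // Histogram class from dynamic_histogram.h provides an (nbins, min, max)
  // constructor and an Add() method: histogram-based kinds such as
  // KL_MIN_QUANTIZATION choose a sub-range of [min, max] that ignores
  // outlier bins, e.g.
  //
  //   Histogram hist(/*nbins=*/2048, observed_min, observed_max);
  //   for (float v : activations) {
  //     hist.Add(v);
  //   }
  //   TensorQuantizationParams qparams = qfactory->ChooseQuantizationParams(
  //       hist, QuantizationFactory::KL_MIN_QUANTIZATION,
  //       /*precision=*/8, /*preserve_sparsity=*/false);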

  // Given a real_multiplier, produces a pair (quantized_multiplier,
  // right_shift) where quantized_multiplier is an int32 representing a
  // fixed-point value (in practice we only produce positive values) and
  // right_shift is an amount to shift right by, so that the floating-point
  // multiplication of some int32 input value by real_multiplier,
  //
  //   return static_cast<int32>(int32_value * real_multiplier);
  //
  // is best approximated by the integer-arithmetic-only code
  //
  //   return RoundingRightShift(
  //       Multiplication(int32_value, quantized_multiplier),
  //       right_shift);
  //
  // Note: all this code only needs to run offline to generate the quantized
  // neural network workload, not at runtime on the device on which quantized
  // neural networks need to run. So it's not performance-critical at all.
  RequantizationParams ChooseRequantizationMultiplier(
      float real_multiplier,
      TensorQuantizationParams target_qparams) const;
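  // Illustrative sketch (not part of the original header): the decomposition
  // is real_multiplier ~= quantized_multiplier * 2^(-right_shift), with
  // quantized_multiplier stored in requantization_multiplier_precision bits.
  // For example, real_multiplier = 1/256 = 0.00390625 admits the pair
  // quantized_multiplier = 2^30, right_shift = 38, since
  // 2^30 * 2^(-38) = 2^(-8) = 0.00390625. Requantizing an int32 accumulator
  // then becomes a widening integer multiply followed by a rounding right
  // shift instead of a floating-point multiply.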

  int GetActivationPrecision() const {
    return activation_precision_;
  }

  int GetWeightPrecision() const {
    return weight_precision_;
  }

  int GetEltwiseQuantizePrecision() const {
    return eltwise_quantize_precision_;
  }

  bool GetPreserveActivationSparsity() const {
    return preserve_activation_sparsity_;
  }

  bool GetPreserveWeightSparsity() const {
    return preserve_weight_sparsity_;
  }

  QuantizationKind GetActivationKind() const {
    return activation_kind_;
  }
  QuantizationKind GetWeightKind() const {
    return weight_kind_;
  }

  void SetWeightP99Threshold(float threshold) {
    weight_p99_threshold_ = threshold;
  }
  void SetActivationP99Threshold(float threshold) {
    activation_p99_threshold_ = threshold;
  }

  explicit QuantizationFactory(
      // precision used for activations in main operations like matmul
      int activation_precision = 8,
      // precision used for weights
      int weight_precision = 8,
      // precision used for the requantization multiplier
      int requantization_multiplier_precision = 32,
      // precision used for element-wise addition
      int eltwise_quantize_precision = 16,
      // preserve zeros of activations in quantization
      bool preserve_activation_sparsity = false,
      // preserve zeros of weights in quantization
      bool preserve_weight_sparsity = false,
      // restrict scales to powers of two
      bool force_scale_power_of_two = false,
      QuantizationKind activation_kind = MIN_MAX_QUANTIZATION,
      QuantizationKind weight_kind = MIN_MAX_QUANTIZATION,
      // percentile (e.g. 0.99 for P99) to select from the full histogram for weights
      float weight_p99_threshold = 0.99,
      // percentile (e.g. 0.99 for P99) to select from the full histogram for activations
      float activation_p99_threshold = 0.99);
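  // Illustrative sketch (not part of the original header): constructing a
  // factory explicitly instead of using the gflags-configured default, here
  // with L2-error-minimizing quantization for weights, e.g.
  //
  //   dnnlowp::QuantizationFactory qfactory(
  //       /*activation_precision=*/8,
  //       /*weight_precision=*/8,
  //       /*requantization_multiplier_precision=*/32,
  //       /*eltwise_quantize_precision=*/16,
  //       /*preserve_activation_sparsity=*/false,
  //       /*preserve_weight_sparsity=*/false,
  //       /*force_scale_power_of_two=*/false,
  //       dnnlowp::QuantizationFactory::MIN_MAX_QUANTIZATION,
  //       dnnlowp::QuantizationFactory::L2_MIN_QUANTIZATION);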

 private:
  int activation_precision_;
  int weight_precision_;
  int requantization_multiplier_precision_;
  int eltwise_quantize_precision_;
  bool preserve_activation_sparsity_;
  bool preserve_weight_sparsity_;
  bool force_scale_power_of_two_;
  QuantizationKind activation_kind_, weight_kind_;
  float weight_p99_threshold_;
  float activation_p99_threshold_;
}; // class QuantizationFactory

/**
 * Parse a string to QuantizationKind
 */
QuantizationFactory::QuantizationKind StringToKind(const std::string& s);

std::vector<float>
adjust_hist_to_include_zero(const Histogram& hist, float* min, float* max);

} // namespace dnnlowp