1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
|
#ifndef CAFFE2_CORE_QTENSOR_H_
#define CAFFE2_CORE_QTENSOR_H_
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <c10/util/typeid.h>
#include <algorithm>
#include <climits>
#include <cstddef>
#include <limits>
#include <vector>
namespace caffe2 {
/**
 * @brief Quantized tensor whose elements are stored as interleaved bit planes.
 *
 * Every element occupies precision() bits, plus one extra sign bit when
 * is_signed() is true. Storage is requested from Context::New() the first
 * time mutable_data() is called.
 *
 * @tparam Context Provides Context::New(nbytes), used to allocate storage.
 */
template <class Context>
class C10_EXPORT QTensor {
 public:
  QTensor() = default;
  virtual ~QTensor() = default;

  /**
   * @brief Creates a quantized tensor of the given dimension.
   *
   * Note that the actual data allocation is not going to be carried out until
   * the first time mutable_data() is called.
   *
   * The underlying storage of the quantized tensor interleaves elements
   * by bit depth.
   *
   * Labeled memory for tensor of size 6, precision 3
   *   [ E1[0] E2[0] E3[0] E4[0] E5[0] E6[0] ] // Least significant Bits
   *   [ E1[1] E2[1] E3[1] E4[1] E5[1] E6[1] ]
   *   [ E1[2] E2[2] E3[2] E4[2] E5[2] E6[2] ]
   *
   * In the case of sign bits (see enable_sign argument), an extra bit
   * per element is added:
   *
   * Labeled memory for tensor of size 6, precision 3, sign bit enabled
   *   [ E1[0] E2[0] E3[0] E4[0] E5[0] E6[0] ]
   *   [ E1[1] E2[1] E3[1] E4[1] E5[1] E6[1] ]
   *   [ E1[2] E2[2] E3[2] E4[2] E5[2] E6[2] ]
   *   [ E1[s] E2[s] E3[s] E4[s] E5[s] E6[s] ]
   *   Where 's' is 1 if E is negative
   *
   * The reason for this layout is the ability to efficiently multiply
   * many low precision integers as a sum of popcnt(A & B) * 1 << bit.
   * Explained here: https://arxiv.org/abs/1606.06160
   *
   * @param dims      Shape of the tensor.
   * @param precision Number of value bits stored per element.
   * @param signbit   When true, one extra sign plane is stored.
   */
  // TODO: changing at::ArrayRef<int> to at::ArrayRef<int64_t>?
  explicit QTensor(
      at::ArrayRef<int> dims,
      const unsigned char precision,
      const bool signbit = false)
      : precision_(precision), signed_(signbit) {
    Resize(dims);
  }

  /**
   * Changes the shape of the tensor. Does nothing when the shape is unchanged.
   *
   * The existing buffer is kept when the new element count still fits in the
   * current capacity; otherwise it is released and re-created lazily by the
   * next mutable_data() call.
   */
  void Resize(at::ArrayRef<int> dim_source) {
    if (dims_ != dim_source) {
      const auto source_size = c10::multiply_integers(dim_source);
      const auto required_bits =
          static_cast<size_t>(source_size * (precision_ + signed_));
      if (required_bits > capacity_) {
        data_ptr_.clear();
        capacity_ = 0;
      }
      dims_ = dim_source.vec();
      size_ = static_cast<size_t>(source_size);
    }
  }

  /**
   * Writes one bit of one element.
   *
   * @param bit   Bit plane to write; must be < precision() (+1 when signed).
   * @param index Element index; must be < aligned_size().
   * @param value New value of the bit.
   *
   * Fails a CAFFE_ENFORCE when `bit` or `index` is out of range.
   */
  void
  SetBitAtIndex(const unsigned char bit, const size_t index, const bool value) {
    CAFFE_ENFORCE(
        bit < precision_ + signed_,
        "Attempted to a set a bit that is not allocated.");
    // An out-of-range index would land in another bit plane or past the
    // end of the buffer.
    CAFFE_ENFORCE(
        index < aligned_size(),
        "Attempted to set a bit at an element index that is out of range.");
    // mutable_data() allocates on first use and refreshes capacity_.
    unsigned char* base = mutable_data();
    CAFFE_ENFORCE(bit * aligned_size() < capacity_);
    // Start of the plane holding bit depth `bit`.
    unsigned char* plane = base + (aligned_size() * bit) / CHAR_BIT;
    const size_t byte_idx = index / CHAR_BIT;
    // Within a byte, the lowest element index maps to the highest bit.
    const size_t shift = CHAR_BIT - (index % CHAR_BIT) - 1;
    const auto mask = static_cast<unsigned char>(1u << shift);
    if (value) {
      plane[byte_idx] |= mask;
    } else {
      plane[byte_idx] &= static_cast<unsigned char>(~mask);
    }
  }

  /**
   * Reads one bit of one element.
   *
   * @param bit   Bit plane to read; must be < precision() (+1 when signed).
   * @param index Element index; must be < aligned_size().
   * @return The stored bit.
   *
   * Fails a CAFFE_ENFORCE when `bit` or `index` is out of range, or when no
   * storage is currently allocated.
   */
  bool GetBitAtIndex(const unsigned char bit, const size_t index) const {
    CAFFE_ENFORCE(
        bit < precision_ + signed_,
        "Attempted to read a bit that is not allocated.");
    CAFFE_ENFORCE(
        index < aligned_size(),
        "Attempted to read a bit at an element index that is out of range.");
    const unsigned char* base = data();
    // data() never allocates, so the buffer may legitimately be absent.
    CAFFE_ENFORCE(
        base != nullptr,
        "Attempted to read a bit from a QTensor with no allocated data.");
    // Start of the plane holding bit depth `bit`.
    const unsigned char* plane = base + (aligned_size() * bit) / CHAR_BIT;
    const size_t byte_idx = index / CHAR_BIT;
    const size_t shift = CHAR_BIT - (index % CHAR_BIT) - 1;
    return (plane[byte_idx] & (1u << shift)) != 0;
  }

  /// Sets the number of value bits per element and releases the buffer.
  void SetPrecision(const unsigned char precision) {
    precision_ = precision;
    data_ptr_.clear();
    // Keep capacity_ in sync with the (now absent) buffer.
    capacity_ = 0;
  }

  /// Enables or disables the sign plane and releases the buffer.
  void SetSigned(const bool make_signed = true) {
    signed_ = make_signed;
    data_ptr_.clear();
    // Keep capacity_ in sync with the (now absent) buffer.
    capacity_ = 0;
  }

  void SetScale(const double scale) {
    scale_ = scale;
  }

  void SetBias(const double bias) {
    bias_ = bias;
  }

  /**
   * Returns a writable pointer to the storage, allocating nbytes() bytes via
   * Context::New() when no buffer is held.
   *
   * A buffer larger than currently required is accepted, because Resize()
   * keeps the buffer when the tensor shrinks.
   */
  unsigned char* mutable_data() {
    const size_t required_bits = nbytes() * CHAR_BIT;
    if (!data_ptr_) {
      data_ptr_ = Context::New(nbytes());
      capacity_ = required_bits;
    }
    CAFFE_ENFORCE(capacity_ >= required_bits);
    return static_cast<unsigned char*>(data_ptr_.get());
  }

  /// Returns a read-only pointer to the storage; never allocates.
  inline const unsigned char* data() const {
    return static_cast<const unsigned char*>(data_ptr_.get());
  }

  /// Number of elements (product of the dimensions passed to Resize()).
  inline size_t size() const {
    return size_;
  }

  /// Bit alignment applied to each bit plane.
  inline unsigned char alignment() const {
    return alignment_;
  }

  /// Number of value bits per element (excluding the sign bit).
  inline unsigned char precision() const {
    return precision_;
  }

  inline at::ArrayRef<int> sizes() const {
    return dims_;
  }

  // TODO: deprecate?
  inline at::ArrayRef<int> dims() const {
    return dims_;
  }

  inline bool is_signed() const {
    return signed_;
  }

  /**
   * Returns the number of dimensions of the data.
   */
  inline int ndim() const {
    return static_cast<int>(dims_.size());
  }

  /// size() rounded up to the next multiple of alignment().
  inline size_t aligned_size() const {
    return alignment_ * ((size_ + alignment_ - 1) / alignment_);
  }

  /// Number of bytes needed to hold every bit plane at the current shape.
  inline size_t nbytes() const {
    return (aligned_size() * (precision_ + signed_)) / CHAR_BIT;
  }

  inline double scale() const {
    return scale_;
  }

  inline double bias() const {
    return bias_;
  }

  /**
   * Returns the i-th dimension of the qtensor in int.
   */
  inline int dim32(const int i) const {
    TORCH_DCHECK_LT(i, static_cast<int>(dims_.size())) << "Exceeding ndim limit " << dims_.size();
    TORCH_DCHECK_GE(i, 0) << "Cannot have negative index";
    CAFFE_ENFORCE_LT(dims_[i], std::numeric_limits<int>::max());
    return static_cast<int>(dims_[i]);
  }

  /**
   * Returns the 'canonical' version of a (usually) user-specified axis,
   * allowing for negative indexing (e.g., -1 for the last axis).
   *
   * @param axis_index the axis index.
   *        If 0 <= index < ndim(), return index.
   *        If -ndim <= index <= -1, return (ndim() - (-index)),
   *        e.g., the last axis index (ndim() - 1) if index == -1,
   *        the second to last if index == -2, etc.
   *        Dies on out of range index.
   */
  inline int canonical_axis_index(int axis_index) const {
    CAFFE_ENFORCE_GE(axis_index, -ndim());
    CAFFE_ENFORCE_LT(axis_index, ndim());
    if (axis_index < 0) {
      return axis_index + ndim();
    }
    return axis_index;
  }

  /**
   * Return product of all dimensions starting from K.
   */
  inline int64_t size_from_dim(int k) const {
    int64_t r = 1;
    for (const auto i : c10::irange(k, dims_.size())) {
      r *= dims_[i];
    }
    return r;
  }

  /**
   * Product of all dims up to (but not including) k.
   * Requires 0 <= k < ndim().
   */
  inline int64_t size_to_dim(int k) const {
    // Explicit form of the former signed/unsigned comparison: a negative k
    // is rejected, exactly as it was after implicit conversion.
    CAFFE_ENFORCE(k >= 0 && static_cast<size_t>(k) < dims_.size());
    int64_t r = 1;
    for (const auto i : c10::irange(k)) {
      r *= dims_[i];
    }
    return r;
  }

 protected:
  std::vector<int> dims_;
  size_t size_ = 0;

  // Precision in bits.
  unsigned char precision_ = CHAR_BIT;
  // Bit alignment.
  unsigned char alignment_ = CHAR_BIT;

  // Allocated data.
  at::DataPtr data_ptr_;

  // value = scale_ * (x + bias_)
  // Default to the identity mapping so the getters never return an
  // indeterminate value before SetScale()/SetBias() are called.
  double scale_ = 1.0;
  double bias_ = 0.0;
  bool signed_ = false;

  // Capacity in bits.
  size_t capacity_ = 0;
};
} // namespace caffe2
#endif // CAFFE2_CORE_QTENSOR_H_
|