1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
|
/*
Copyright (c) 2016, Taiga Nomi, Edgar Riba
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
namespace tiny_dnn {
namespace core {
namespace kernels {
// Returns the maximum value of T as reported by std::numeric_limits<T>.
template <class T>
T highest() {
  typedef std::numeric_limits<T> limits;
  // `max` is wrapped in parentheses so that a function-like `max` macro, if
  // one happens to be defined, is not expanded here.
  return (limits::max)();
}
// Returns the most negative value of T: numeric_limits<T>::min() for integer
// types, and the negated numeric_limits<T>::max() otherwise (for non-integer
// types min() is not the value wanted here, so max() is negated instead).
template <class T>
T lowest() {
  typedef std::numeric_limits<T> limits;
  if (limits::is_integer) {
    return (limits::min)();
  }
  return static_cast<T>(-(limits::max)());
}
// Maps a float onto the integer grid of the quantized type T WITHOUT
// clamping the result to T's range. The arithmetic is done in double and the
// result is returned as int64_t so that values which would overflow int32
// can still be represented and detected by the caller.
//
// input      value to quantize
// range_min  float value associated with the lowest quantized level
// range_max  float value associated with the highest quantized level
//
// Returns 0 when range_min == range_max.
template <class T>
int64_t float_to_quantized_unclamped(float_t input, float_t range_min, float_t range_max) {
  // A zero-width range offers no scale to divide by.
  if (range_min == range_max) return 0;

  const int bit_count = sizeof(T) * 8;
  const int64_t step_count = static_cast<int64_t>(1) << bit_count;

  // The range is stretched by steps / (steps - 1) before dividing; the net
  // scale is therefore (steps - 1) / (range_max - range_min), up to rounding.
  const double adjust = (step_count / (step_count - 1.0));
  const double adjusted_range = ((range_max - range_min) * adjust);
  const double scale = (step_count / adjusted_range);

  // Round the input and the range minimum separately, then take the
  // difference as the number of steps above the lowest level.
  const double scaled_input = round(input * scale);
  const double scaled_min = round(range_min * scale);
  const int64_t steps_above_min = static_cast<int64_t>(scaled_input - scaled_min);

  // Shift so that range_min lands on T's lowest representable value.
  return steps_above_min + static_cast<int64_t>(lowest<T>());
}
// Narrows a 64-bit integer to int32_t.
//
// src is expected to already lie within the int32_t range; this is checked
// with assert(), so the check is compiled out when NDEBUG is defined.
inline int32_t int64_to_int32(int64_t src) {
  // `max` / `min` are parenthesized, as elsewhere in this file, so that
  // function-like `min`/`max` macros cannot break the expression.
  assert(src <= (std::numeric_limits<int32_t>::max)() &&
         src >= (std::numeric_limits<int32_t>::min)());
  return static_cast<int32_t>(src);
}
// Converts a float into the final quantized type T. The unclamped 64-bit
// result is saturated to [lowest<T>(), highest<T>()], so overflows and
// underflows end up on the nearest representable level.
template <class T>
T float_to_quantized(float_t input, float_t range_min, float_t range_max) {
  int64_t value = float_to_quantized_unclamped<T>(input, range_min, range_max);

  const int64_t floor_value = static_cast<int64_t>(lowest<T>());
  const int64_t ceiling_value = static_cast<int64_t>(highest<T>());

  // Saturate: first raise to the floor, then cap at the ceiling.
  if (value < floor_value) value = floor_value;
  if (value > ceiling_value) value = ceiling_value;

  return static_cast<T>(static_cast<int32_t>(value));
}
// Converts a quantized value of type T back to a float, given the float
// range [range_min, range_max] that the quantized levels span.
// Returns range_min when the range has zero width.
template <class T>
float quantized_to_float(T input, float_t range_min, float_t range_max) {
  if (range_min == range_max) return range_min;

  const int bit_count = sizeof(T) * 8;
  const int64_t step_count = static_cast<int64_t>(1) << bit_count;

  // Same range stretch as used when quantizing, so the two directions agree.
  const double adjust = (step_count / (step_count - 1.0));
  const double adjusted_range = ((range_max - range_min) * adjust);
  const double float_per_step = (adjusted_range / step_count);

  // Number of levels the input sits above T's lowest representable value.
  const int64_t lowest_level = static_cast<int64_t>(lowest<T>());
  const double steps_above_lowest = static_cast<double>(input) - lowest_level;

  return static_cast<float_t>(range_min + (steps_above_lowest * float_per_step));
}
template <class T>
float float_for_one_quantized_level(float_t range_min, float_t range_max) {
const int64_t highest_ = static_cast<int64_t>(highest<T>());
const int64_t lowest_ = static_cast<int64_t>(lowest<T>());
const float float_for_one_quantized_level =
(range_max - range_min) / (highest_ - lowest_);
return float_for_one_quantized_level;
}
template <class T1, class T2, class T3>
void quantization_range_for_multiplication(float_t min_a, float_t max_a, float_t min_b,
float_t max_b, float_t* min_c,
float_t* max_c) {
const float_t a_float_for_one_quant_level =
float_for_one_quantized_level<T1>(min_a, max_a);
const float_t b_float_for_one_quant_level =
float_for_one_quantized_level<T2>(min_b, max_b);
const int64_t c_highest = static_cast<int64_t>(highest<T3>());
const int64_t c_lowest = static_cast<int64_t>(lowest<T3>());
const float c_float_for_one_quant_level =
a_float_for_one_quant_level * b_float_for_one_quant_level;
*min_c = c_float_for_one_quant_level * c_lowest;
*max_c = c_float_for_one_quant_level * c_highest;
}
// Re-expresses a single quantized value: decodes `input` (type T1, range
// [min_input, max_input]) to float and re-encodes it as T2 in the range
// [min_new, max_new].
template <class T1, class T2>
inline T2 requantize_in_new_range(T1 input, float_t min_input, float_t max_input,
                                  float_t min_new, float_t max_new) {
  const float_t decoded = quantized_to_float<T1>(input, min_input, max_input);
  const T2 reencoded = float_to_quantized<T2>(decoded, min_new, max_new);
  return reencoded;
}
// Generic element-wise requantization: each of the `count` values in `input`
// is decoded from [min_input, max_input] to float and re-encoded as T2 in
// [min_output, max_output], written to the same position in `output`.
template <class T1, class T2>
inline void requantize_many_in_new_range(T1* input, size_t count, float_t min_input,
                                         float_t max_input, float_t min_output,
                                         float_t max_output, T2* output) {
  T1* src = input;
  T2* dst = output;
  for (size_t remaining = count; remaining > 0; --remaining, ++src, ++dst) {
    const float_t decoded = quantized_to_float<T1>(*src, min_input, max_input);
    *dst = float_to_quantized<T2>(decoded, min_output, max_output);
  }
}
// Because converting 32-bit accumulated results down to eight bit is a common
// case, we have a specialized code path to handle it as efficiently as
// possible using only fixed-point math for the inner loop.
//
// input       `count` int32 values quantized over [min_input, max_input]
// output      receives `count` uint8 values quantized over
//             [min_output, max_output]; every result is clamped to [0, 255]
//
// If the output range has zero width, every output element is set to 0,
// which is what the generic float_to_quantized path produces for uint8_t in
// that case.
template <>
inline void requantize_many_in_new_range<int32_t, uint8_t>(
    int32_t* input, size_t count, float_t min_input, float_t max_input,
    float_t min_output, float_t max_output, uint8_t* output) {
  // Initially we calculate all the constants we need once, before we go into
  // the inner loop.
  const int fp_shift = 16;
  const float input_range = max_input - min_input;
  const float output_range = max_output - min_output;

  // Every constant below divides by output_range. With a zero-width range the
  // quotients are inf/NaN, and converting those to int64_t is undefined
  // behaviour, so handle the degenerate range up front.
  if (output_range == 0.0f) {
    for (size_t index = 0; index < count; ++index) {
      output[index] = 0;
    }
    return;
  }

  // Output levels per float unit, as a float and in fixed point.
  const float recip_output_range = (255.0f / output_range);
  const int64_t recip_output_range_fp =
      static_cast<int64_t>(recip_output_range * (1 << fp_shift));
  // Fixed-point factor applied to the raw int32 input; the matching >> 32
  // happens inside the loop.
  const int64_t range_scale_fp =
      static_cast<int64_t>(255.0f * (1 << fp_shift) * input_range / output_range);
  // Fixed-point offset added to every scaled input: min_input expressed in
  // output levels, plus half of range_scale_fp.
  const int64_t input_offset_fp =
      static_cast<int64_t>((min_input * recip_output_range_fp) + (range_scale_fp >> 1));
  // min_output expressed in (whole) output levels.
  const int64_t output_offset_fp = static_cast<int64_t>(round((min_output * 255.0f) / output_range));
  // Half of one unit in fixed point, used to round to nearest when shifting.
  const int64_t rounding_delta = 1 << (fp_shift - 1);
  // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
  // that could be easily adapted for a SIMD implementation. It should also be
  // possible to perform all the calculations in 32-bit rather than 64, but
  // that's not been implemented yet.
  for (size_t index = 0; index < count; ++index) {
    const int64_t input_value = static_cast<int64_t>(input[index]);
    const int64_t fp_value =
        ((input_value * range_scale_fp) >> 32) + input_offset_fp;
    // Add (or subtract, for negative values) half a unit before dropping the
    // fractional bits.
    const int64_t round_intermediate =
        ((fp_value >= 0) ? (fp_value + rounding_delta)
                         : (fp_value - rounding_delta)) >>
        fp_shift;
    int64_t quantized_int64 = (round_intermediate - output_offset_fp);
    // Saturate to the uint8_t range.
    quantized_int64 = std::max<int64_t>(quantized_int64, 0LL);
    quantized_int64 = std::min<int64_t>(quantized_int64, 255LL);
    output[index] = static_cast<uint8_t>(static_cast<int32_t>(quantized_int64));
  }
}
// Quantizes every element of `input` into the pre-allocated `result`, using
// [min, max] as the float range of the quantized type T.
// REQUIRES: 'result->size() == input.size()' -- result is indexed up to
// input.size() and is not resized here.
template <class T>
void float_tensor_to_quantized_in_place(const vec_t& input, float_t min, float_t max,
                                        std::vector<T>* result) {
  std::vector<T>& quantized = *result;
  for (size_t pos = 0, total = input.size(); pos != total; ++pos) {
    quantized[pos] = float_to_quantized<T>(input[pos], min, max);
  }
}
// Returns a new vector holding the quantized (type T) form of every element
// of `input`, using [min, max] as the float range.
template <class T>
std::vector<T> float_tensor_to_quantized(const vec_t& input, float_t min, float_t max) {
  const T zero = static_cast<T>(0);
  // Allocate the output at its final size first; the in-place helper only
  // writes elements and never resizes.
  std::vector<T> quantized(input.size(), zero);
  float_tensor_to_quantized_in_place<T>(input, min, max, &quantized);
  return quantized;
}
// REQUIRES: 'result->NumElements() == input.NumElements()'
template <class T>
void quantized_tensor_to_float_in_place(const std::vector<T>& input, float_t min, float_t max,
vec_t* result) {
const size_t data_size = input.size();
for (size_t i = 0; i < data_size; ++i) {
(*result)[i] = quantized_to_float<T>(input[i], min, max);
}
}
// Returns a new float vector holding the de-quantized form of every element
// of `input`, using [min, max] as the float range of type T.
template <class T>
vec_t quantized_tensor_to_float(const std::vector<T>& input, float_t min, float_t max) {
  const float_t zero = static_cast<float_t>(0);
  // Allocate the output at its final size first; the in-place helper only
  // writes elements and never resizes.
  vec_t decoded(input.size(), zero);
  quantized_tensor_to_float_in_place<T>(input, min, max, &decoded);
  return decoded;
}
// Requantizes `input` (type T1, float range [min_input, max_input]) into
// `output` (type T2), shrinking the float range to the values actually
// present in `input`.
//
// input      quantized source values
// min_new    out: lower end of the shrunken range (never above zero)
// max_new    out: upper end of the shrunken range
// output     receives one T2 value per input element; it is grown if it is
//            smaller than `input`, and never shrunk
//
// NOTE: with an empty input the scan below leaves the sentinels untouched, so
// *min_new / *max_new are derived from highest<T1>() / lowest<T1>() and do
// not describe a meaningful range.
template <class T1, class T2>
void quantize_down_and_shrink_range( std::vector<T1>& input, float_t min_input, float_t max_input,
                                     float_t* min_new, float_t* max_new, std::vector<T2>* output){
  // Start from the opposite extremes so that any element replaces them. The
  // sentinels are taken directly as T1 rather than via int32_t, which would
  // truncate for wider types.
  T1 actual_min_quantized = highest<T1>();
  T1 actual_max_quantized = lowest<T1>();
  for (size_t i = 0; i < input.size(); ++i) {
    const T1 value = input[i];
    actual_min_quantized = (std::min)(actual_min_quantized, value);
    actual_max_quantized = (std::max)(actual_max_quantized, value);
  }
  // We want to make sure that the minimum is no larger than zero, so that the
  // convolution operation can run efficiently.
  *min_new = (std::min)(0.0f, quantized_to_float(actual_min_quantized, min_input,
                                                 max_input));
  *max_new = quantized_to_float(actual_max_quantized, min_input, max_input);

  // The requantization writes input.size() elements through a raw pointer;
  // make sure the destination can hold them.
  if (output->size() < input.size()) {
    output->resize(input.size());
  }

  // Dispatch on the actual template arguments (the <int32_t, uint8_t> case
  // still resolves to its fixed-point specialization). data() is used instead
  // of &v[0] so that empty vectors are not indexed.
  requantize_many_in_new_range<T1, T2>(input.data(), input.size(),
                                       min_input, max_input, *min_new,
                                       *max_new, output->data());
}
} // namespace kernels
} // namespace core
} // namespace tiny_dnn
|