1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
|
/* Bit and bytes utilities.
Bytes swap functions, reverse order of bytes:
- _Py_bswap16(uint16_t)
- _Py_bswap32(uint32_t)
- _Py_bswap64(uint64_t)
*/
#ifndef Py_INTERNAL_BITUTILS_H
#define Py_INTERNAL_BITUTILS_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif
#if defined(__GNUC__) \
&& ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8))
/* __builtin_bswap16() is available since GCC 4.8,
__builtin_bswap32() is available since GCC 4.3,
__builtin_bswap64() is available since GCC 4.3. */
# define _PY_HAVE_BUILTIN_BSWAP
#endif
#ifdef _MSC_VER
# include <intrin.h> // _byteswap_uint64()
#endif
static inline uint16_t
_Py_bswap16(uint16_t word)
{
#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap16)
return __builtin_bswap16(word);
#elif defined(_MSC_VER)
Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));
return _byteswap_ushort(word);
#else
// Portable implementation which doesn't rely on circular bit shift
return ( ((word & UINT16_C(0x00FF)) << 8)
| ((word & UINT16_C(0xFF00)) >> 8));
#endif
}
static inline uint32_t
_Py_bswap32(uint32_t word)
{
#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap32)
return __builtin_bswap32(word);
#elif defined(_MSC_VER)
Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));
return _byteswap_ulong(word);
#else
// Portable implementation which doesn't rely on circular bit shift
return ( ((word & UINT32_C(0x000000FF)) << 24)
| ((word & UINT32_C(0x0000FF00)) << 8)
| ((word & UINT32_C(0x00FF0000)) >> 8)
| ((word & UINT32_C(0xFF000000)) >> 24));
#endif
}
static inline uint64_t
_Py_bswap64(uint64_t word)
{
#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap64)
return __builtin_bswap64(word);
#elif defined(_MSC_VER)
return _byteswap_uint64(word);
#else
// Portable implementation which doesn't rely on circular bit shift
return ( ((word & UINT64_C(0x00000000000000FF)) << 56)
| ((word & UINT64_C(0x000000000000FF00)) << 40)
| ((word & UINT64_C(0x0000000000FF0000)) << 24)
| ((word & UINT64_C(0x00000000FF000000)) << 8)
| ((word & UINT64_C(0x000000FF00000000)) >> 8)
| ((word & UINT64_C(0x0000FF0000000000)) >> 24)
| ((word & UINT64_C(0x00FF000000000000)) >> 40)
| ((word & UINT64_C(0xFF00000000000000)) >> 56));
#endif
}
// Population count: count the number of 1's in 'x'
// (number of bits set to 1), also known as the hamming weight.
//
// Implementation note. CPUID is not used, to test if x86 POPCNT instruction
// can be used, to keep the implementation simple. For example, Visual Studio
// __popcnt() is not used this reason. The clang and GCC builtin function can
// use the x86 POPCNT instruction if the target architecture has SSE4a or
// newer.
static inline int
_Py_popcount32(uint32_t x)
{
#if (defined(__clang__) || defined(__GNUC__))
#if SIZEOF_INT >= 4
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
return __builtin_popcount(x);
#else
// The C standard guarantees that unsigned long will always be big enough
// to hold a uint32_t value without losing information.
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
return __builtin_popcountl(x);
#endif
#else
// 32-bit SWAR (SIMD Within A Register) popcount
// Binary: 0 1 0 1 ...
const uint32_t M1 = 0x55555555;
// Binary: 00 11 00 11. ..
const uint32_t M2 = 0x33333333;
// Binary: 0000 1111 0000 1111 ...
const uint32_t M4 = 0x0F0F0F0F;
// Put count of each 2 bits into those 2 bits
x = x - ((x >> 1) & M1);
// Put count of each 4 bits into those 4 bits
x = (x & M2) + ((x >> 2) & M2);
// Put count of each 8 bits into those 8 bits
x = (x + (x >> 4)) & M4;
// Sum of the 4 byte counts.
// Take care when considering changes to the next line. Portability and
// correctness are delicate here, thanks to C's "integer promotions" (C99
// §6.3.1.1p2). On machines where the `int` type has width greater than 32
// bits, `x` will be promoted to an `int`, and following C's "usual
// arithmetic conversions" (C99 §6.3.1.8), the multiplication will be
// performed as a multiplication of two `unsigned int` operands. In this
// case it's critical that we cast back to `uint32_t` in order to keep only
// the least significant 32 bits. On machines where the `int` type has
// width no greater than 32, the multiplication is of two 32-bit unsigned
// integer types, and the (uint32_t) cast is a no-op. In both cases, we
// avoid the risk of undefined behaviour due to overflow of a
// multiplication of signed integer types.
return (uint32_t)(x * 0x01010101U) >> 24;
#endif
}
// Return the index of the most significant 1 bit in 'x'. This is the smallest
// integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0.
static inline int
_Py_bit_length(unsigned long x)
{
#if (defined(__clang__) || defined(__GNUC__))
if (x != 0) {
// __builtin_clzl() is available since GCC 3.4.
// Undefined behavior for x == 0.
return (int)sizeof(unsigned long) * 8 - __builtin_clzl(x);
}
else {
return 0;
}
#elif defined(_MSC_VER)
// _BitScanReverse() is documented to search 32 bits.
Py_BUILD_ASSERT(sizeof(unsigned long) <= 4);
unsigned long msb;
if (_BitScanReverse(&msb, x)) {
return (int)msb + 1;
}
else {
return 0;
}
#else
const int BIT_LENGTH_TABLE[32] = {
0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
};
int msb = 0;
while (x >= 32) {
msb += 6;
x >>= 6;
}
msb += BIT_LENGTH_TABLE[x];
return msb;
#endif
}
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_BITUTILS_H */
|