/* Sort.c -- Sort functions
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Sort.h"
#include "CpuArch.h"
#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
|| (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
)
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
// #pragma message("Z7_PREFETCH : __builtin_prefetch")
#define Z7_PREFETCH(a) __builtin_prefetch((a))
#endif
#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)
#include "7zWindows.h"
// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate a "prefetcht2" instruction for a
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate a "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in the code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().
// The new msvc-x86 compiler generates a "prefetcht0" instruction for a PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we use PreFetchCacheLine() only if we are sure that the
// generated instruction is supported by all cpus of that ISA.
#if defined(MY_CPU_AMD64) \
|| defined(MY_CPU_ARM64) \
|| defined(MY_CPU_IA64)
// we need additional parentheses around (a) in the PreFetchCacheLine call, because
// the PreFetchCacheLine macro doesn't parenthesize its argument:
//   #define PreFetchCacheLine(l, a)  _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif
#endif // _WIN32
#define PREFETCH_NO(p,k,s,size)
#ifndef Z7_PREFETCH
#define SORT_PREFETCH(p,k,s,size)
#else
// #define PREFETCH_LEVEL 2 // use it if the cache line is 32 bytes
#define PREFETCH_LEVEL 3 // fast for most cases (64-byte cache line prefetch)
// #define PREFETCH_LEVEL 4 // can be faster for big arrays (128-byte prefetch)
#if PREFETCH_LEVEL == 0
#define SORT_PREFETCH(p,k,s,size)
#else // PREFETCH_LEVEL != 0
/*
if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY):
  we prefetch one value per cache line.
  Use it if the array is aligned to the cache line size (64 bytes)
  or if the array is small (smaller than the L1 cache).
if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY):
  we prefetch all cache lines that can be required.
  It can be faster for big unaligned arrays.
*/
#define USE_PREFETCH_FOR_ALIGNED_ARRAY
// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif
#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// last offset that can be required in a PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET (PREFETCH_RANGE / 2)
#endif
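// Added note (worked arithmetic, assuming the default PREFETCH_LEVEL == 3 and s == k * 2):
// PREFETCH_OFFSET(k,s) == (k << 4) == k * 16, the first index of the descendants of
// node (k) that lie 4 tree levels below it. Those descendants occupy indices
// [k * 16, k * 16 + 15], i.e. 16 UInt32 values == 64 bytes == one cache line,
// if the array is 64-byte aligned. For unaligned arrays that range can straddle
// two cache lines, which is why the !USE_PREFETCH_FOR_ALIGNED_ARRAY variants
// below prefetch both ends of the range.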
#if PREFETCH_LEVEL <= 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2)); \
}}
#endif
#else // PREFETCH_LEVEL > 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - 16)); \
    Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
    Z7_PREFETCH((p + s2)); \
}}
#endif
#endif // PREFETCH_LEVEL > 3
#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH
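// Added illustration: with the defaults above (Z7_PREFETCH available,
// PREFETCH_LEVEL == 3, USE_PREFETCH_FOR_ALIGNED_ARRAY), SORT_PREFETCH(p,k,s,size)
// effectively expands to:
//   { const size_t s2 = ((k) << 4) + 0;  if (s2 <= size) { Z7_PREFETCH((p + s2)); } }
// i.e. it prefetches the cache line holding the descendants 4 levels below node (k).
// The exact expansion depends on the configuration selected above.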
#if defined(MY_CPU_ARM64) \
/* || defined(MY_CPU_AMD64) */ \
/* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov, if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif
#ifdef Z7_FAST_CMOV_SUPPORTED
// we want to use cmov here, if cmov is fast (new arm64 cpus):
// we want the compiler to use a conditional move for this branch.
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// use this branch, if the cpu doesn't support a fast conditional move.
// it reads the maximum value from the array instead (slower memory access):
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif
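// Added note: at the call site below (HeapSortDown), (s) has already been advanced
// to the larger child before GET_MAX_VAL is used, so p[s] == max(p[k * 2], p[k * 2 + 1]).
// Both variants therefore set n0 to the larger child value; they differ only in whether
// that value comes from a register conditional move or from re-reading the array.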
#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
  for (;;) { \
    UInt32 n0, n1; \
    size_t s = k * 2; \
    if (s >= size) { \
      if (s == size) { \
        n0 = p[s]; \
        p[k] = n0; \
        if (temp < n0) k = s; \
      } \
      break; \
    } \
    n0 = p[k * 2]; \
    n1 = p[k * 2 + 1]; \
    s += n0 < n1; \
    GET_MAX_VAL(n0, n1, p[s]) \
    if (temp >= n0) break; \
    macro_prefetch(p, k, s, size) \
    p[k] = n0; \
    k = s; \
  } \
  p[k] = temp; \
}
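// Added reference sketch (disabled, not part of the build): a plain-function
// equivalent of HeapSortDown() with PREFETCH_NO, to show the sift-down logic
// without the macro plumbing. The name HeapSortDown_Ref is hypothetical and
// is not used anywhere in this file.
#if 0
static void HeapSortDown_Ref(UInt32 *p, size_t k, size_t size, UInt32 temp)
{
  // sift (temp) down from node (k): the children of node (x) are at (x * 2)
  // and (x * 2 + 1), and (size) is the index of the last valid item.
  for (;;)
  {
    size_t s = k * 2;            // index of the first child of (k)
    if (s >= size)
    {
      // 0 or 1 child remains
      if (s == size && temp < p[s])
      {
        p[k] = p[s];             // the single child is larger: move it up
        k = s;
      }
      break;
    }
    s += p[s] < p[s + 1];        // select the larger of the two children
    if (temp >= p[s])
      break;                     // heap property already holds here
    p[k] = p[s];                 // move the larger child up
    k = s;                       // continue from the child's slot
  }
  p[k] = temp;                   // place the sifted value into its final slot
}
#endif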
/*
stage-1 : O(n) :
we build an intermediate, partially sorted binary tree:
p[0] : an additional item for better alignment of the tree structure in memory.
            p[1]
      p[2]        p[3]
  p[4]   p[5]  p[6]   p[7]
  ...
p[x] >= p[x * 2]
p[x] >= p[x * 2 + 1]

stage-2 : O(n * log2(n)) :
we move the largest item p[0] from the head of the tree to the end of the array,
and insert the last item into the sorted binary tree.
*/
// (p) must be aligned to the cache line size (64 bytes) for best performance
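// Usage sketch (hypothetical caller, not part of this file):
//   UInt32 a[5] = { 3, 1, 4, 1, 5 };
//   HeapSort(a, 5);   // a becomes { 1, 1, 3, 4, 5 } (ascending order)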
void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
  if (size < 2)
    return;
  if (size == 2)
  {
    const UInt32 a0 = p[0];
    const UInt32 a1 = p[1];
    const unsigned k = a1 < a0;
    p[k] = a0;
    p[k ^ 1] = a1;
    return;
  }
  {
    // stage-1 : O(n)
    // we transform the array into a partially sorted binary tree.
    size_t i = --size / 2;
    // (size) now is the index of the last item in the tree.
    // if (i)
    {
      do
      {
        const UInt32 temp = p[i];
        size_t k = i;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
      while (--i);
    }
    {
      const UInt32 temp = p[0];
      const UInt32 a1 = p[1];
      if (temp < a1)
      {
        size_t k = 1;
        p[0] = a1;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
    }
  }
  if (size < 3)
  {
    // size == 2
    const UInt32 a0 = p[0];
    p[0] = p[2];
    p[2] = a0;
    return;
  }
  if (size != 3)
  {
    // stage-2 : O(size * log2(size)) :
    // we move the largest item p[0] from the head to the end of the array,
    // and insert the last item into the sorted binary tree.
    do
    {
      const UInt32 temp = p[size];
      size_t k = p[2] < p[3] ? 3 : 2;
      p[size--] = p[0];
      p[0] = p[1];
      p[1] = p[k];
      HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
    }
    while (size != 3);
  }
  {
    const UInt32 a2 = p[2];
    const UInt32 a3 = p[3];
    const size_t k = a2 < a3;
    p[2] = p[1];
    p[3] = p[0];
    p[k] = a3;
    p[k ^ 1] = a2;
  }
}