/* Sort.c -- Sort functions
: Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Sort.h"
#include "CpuArch.h"

#if (  (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
    || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
    )
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
  // #pragma message("Z7_PREFETCH : __builtin_prefetch")
  #define Z7_PREFETCH(a)  __builtin_prefetch((a))
#endif

#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)

#include "7zWindows.h"

// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate "prefetcht2" instruction for
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().

// Newer msvc-x86 compilers generate a "prefetcht0" instruction for a PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we use PreFetchCacheLine() only if we are sure that the
// generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
    || defined(MY_CPU_ARM64) \
    || defined(MY_CPU_IA64)
// we need additional parentheses around (a) in the PreFetchCacheLine call, because
// the PreFetchCacheLine macro doesn't parenthesize its arguments:
//   #define PreFetchCacheLine(l, a)  _mm_prefetch((CHAR CONST *) a, l)
  // #pragma message("Z7_PREFETCH : PreFetchCacheLine")
  #define Z7_PREFETCH(a)  PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif

#endif // _WIN32


#define PREFETCH_NO(p,k,s,size)

#ifndef Z7_PREFETCH
  #define SORT_PREFETCH(p,k,s,size)
#else

// #define PREFETCH_LEVEL 2  // use it if the cache line is 32 bytes
#define PREFETCH_LEVEL 3  // fast for most cases (64-byte cache line prefetch)
// #define PREFETCH_LEVEL 4  // can be faster for big arrays (128-byte prefetch)

#if PREFETCH_LEVEL == 0

  #define SORT_PREFETCH(p,k,s,size)

#else // PREFETCH_LEVEL != 0

/*
if  defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
    we prefetch one value per cache line.
    Use it if the array is aligned to the cache line size (64 bytes)
    or if the array is small (less than the L1 cache size).

if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
    we prefetch all cache lines that can be required.
    It can be faster for big unaligned arrays.
*/
  #define USE_PREFETCH_FOR_ALIGNED_ARRAY

// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
  // x86 supports (lea r1*8+offset)
  #define PREFETCH_OFFSET(k,s)  ((s) << PREFETCH_LEVEL)
#else
  #define PREFETCH_OFFSET(k,s)  ((k) << (PREFETCH_LEVEL + 1))
#endif

#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
  #define PREFETCH_ADD_OFFSET   0
#else
  // last offset that can be required in a PREFETCH_LEVEL step:
  #define PREFETCH_RANGE        ((2 << PREFETCH_LEVEL) - 1)
  #define PREFETCH_ADD_OFFSET   PREFETCH_RANGE / 2
#endif

#if PREFETCH_LEVEL <= 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
  #define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2)); \
  }}
#else /* for unaligned array */
  #define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
      Z7_PREFETCH((p + s2)); \
  }}
#endif

#else // PREFETCH_LEVEL > 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
  #define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2 - 16)); \
      Z7_PREFETCH((p + s2)); \
  }}
#else /* for unaligned array */
  #define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
      Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
      Z7_PREFETCH((p + s2)); \
  }}
#endif

#endif // PREFETCH_LEVEL > 3
#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH


#if defined(MY_CPU_ARM64) \
    /* || defined(MY_CPU_AMD64) */ \
    /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
  // we want to use cmov only if cmov is very fast:
  // - this cmov version is slower for clang-x64.
  // - this cmov version is faster for gcc-arm64 on some fast arm64 cpus.
  #define Z7_FAST_CMOV_SUPPORTED
#endif
 
#ifdef Z7_FAST_CMOV_SUPPORTED
  // we want cmov here; cmov is fast on new arm64 cpus.
  // we want the compiler to use a conditional move for this branch:
  #define GET_MAX_VAL(n0, n1, max_val_slow)  if (n0 < n1) n0 = n1;
#else
  // use this branch if the cpu doesn't support fast conditional moves;
  // it re-reads the value from the array instead:
  #define GET_MAX_VAL(n0, n1, max_val_slow)  n0 = max_val_slow;
#endif
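
// Note (added): both variants leave n0 == max(n0, n1). The slow-cmov
// branch can simply re-read p[s], because (s) has already been advanced
// to the index of the larger child ("s += n0 < n1") before GET_MAX_VAL
// is used in HeapSortDown below.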

#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
  for (;;) { \
    UInt32 n0, n1; \
    size_t s = k * 2; \
    if (s >= size) { \
      if (s == size) { \
        n0 = p[s]; \
        p[k] = n0; \
        if (temp < n0) k = s; \
      } \
      break; \
    } \
    n0 = p[k * 2]; \
    n1 = p[k * 2 + 1]; \
    s += n0 < n1; \
    GET_MAX_VAL(n0, n1, p[s]) \
    if (temp >= n0) break; \
    macro_prefetch(p, k, s, size) \
    p[k] = n0; \
    k = s; \
  } \
  p[k] = temp; \
}
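
/*
A hypothetical plain-function transcription of the HeapSortDown macro above,
added for readability only. It is kept out of the build and omits the
prefetch hook; it sifts (temp) down from node (k) in the 1-based tree
p[1..size].
*/
#if 0
static void HeapSortDown_Ref(UInt32 *p, size_t k, size_t size, UInt32 temp)
{
  for (;;)
  {
    UInt32 n0, n1;
    size_t s = k * 2;
    if (s >= size)
    {
      // node (k) has at most one child: p[size], when s == size
      if (s == size)
      {
        n0 = p[s];
        p[k] = n0;
        if (temp < n0)
          k = s;
      }
      break;
    }
    n0 = p[s];
    n1 = p[s + 1];
    s += n0 < n1;  // (s) becomes the index of the larger child
    if (n0 < n1)
      n0 = n1;     // n0 = max of the two children
    if (temp >= n0)
      break;
    p[k] = n0;     // move the larger child up, then descend
    k = s;
  }
  p[k] = temp;
}
#endif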


/*
stage-1 : O(n) :
  we generate an intermediate, partially sorted binary tree:
  p[0]  : an additional item for better alignment of the tree structure in memory.
  p[1]
  p[2]       p[3]
  p[4] p[5]  p[6] p[7]
  ...
  p[x] >= p[x * 2]
  p[x] >= p[x * 2 + 1]

stage-2 : O(n * log2(n)) :
  we move the largest item p[0] from the head of the tree to the end of the array
  and insert the last item into the sorted binary tree.
*/
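
// Note (added): the fix-up block at the end of stage-1 in HeapSort below
// also orders p[0] >= p[1], so when stage-2 begins, p[0] already holds
// the maximum of the whole array.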

// (p) should be aligned to the cache line size (64 bytes) for best performance

void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
  if (size < 2)
    return;
  if (size == 2)
  {
    const UInt32 a0 = p[0];
    const UInt32 a1 = p[1];
    const unsigned k = a1 < a0;
    p[k] = a0;
    p[k ^ 1] = a1;
    return;
  }
  {
    // stage-1 : O(n)
    // we transform the array into a partially sorted binary tree.
    size_t i = --size / 2;
    // (size) is now the index of the last item in the tree.
    // if (i)
    {
      do
      {
        const UInt32 temp = p[i];
        size_t k = i;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
      while (--i);
    }
    {
      const UInt32 temp = p[0];
      const UInt32 a1 = p[1];
      if (temp < a1)
      {
        size_t k = 1;
        p[0] = a1;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
    }
  }

  if (size < 3)
  {
    // size == 2
    const UInt32 a0 = p[0];
    p[0] = p[2];
    p[2] = a0;
    return;
  }
  if (size != 3)
  {
    // stage-2 : O(size * log2(size)) :
    // we move the largest item p[0] from the head to the end of the array,
    // and insert the last item into the sorted binary tree.
    do
    {
      const UInt32 temp = p[size];
      size_t k = p[2] < p[3] ? 3 : 2;
      p[size--] = p[0];
      p[0] = p[1];
      p[1] = p[k];
      HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
    }
    while (size != 3);
  }
  {
    const UInt32 a2 = p[2];
    const UInt32 a3 = p[3];
    const size_t k = a2 < a3;
    p[2] = p[1];
    p[3] = p[0];
    p[k] = a3;
    p[k ^ 1] = a2;
  }
}
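
/*
A minimal usage sketch (added; a hypothetical test driver, not part of
the original file). It assumes a hosted environment with <stdio.h> and
is kept out of the build:
*/
#if 0
#include <stdio.h>

int main(void)
{
  UInt32 a[8] = { 5, 1, 4, 1, 5, 9, 2, 6 };
  size_t i;
  HeapSort(a, sizeof(a) / sizeof(a[0]));
  for (i = 0; i < 8; i++)
    printf("%u ", (unsigned)a[i]);
  printf("\n");  // expected output: 1 1 2 4 5 5 6 9
  return 0;
}
#endif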