///
/// @file Sieve.hpp
/// @brief This file implements a highly optimized prime sieving
/// algorithm for computing the special leaves (sometimes named
/// hard special leaves) in the combinatorial prime counting
/// algorithms (e.g. Lagarias-Miller-Odlyzko, Deleglise-Rivat,
/// Gourdon).
///
/// The Sieve class contains a sieve of Eratosthenes
/// implementation with 30 numbers per byte i.e. the 8 bits of
/// each byte correspond to the offsets: { 1, 7, 11, 13, 17,
/// 19, 23, 29 }. Unlike a traditional prime sieve this sieve
/// is designed for use in the combinatorial prime counting
/// algorithms: this sieve removes primes as well as multiples
/// of primes and it counts the number of elements that have
/// been crossed off for the first time in the sieve array.
///
/// Since there is a large number of leaves for which we have
/// to count the number of unsieved elements in the sieve
/// array, Lagarias-Miller-Odlyzko have suggested using a
/// binary indexed tree data structure (a.k.a. Fenwick tree) to
/// speedup counting. However using a binary indexed tree is
/// bad for performance as it causes many cache misses and
/// branch mispredictions. For this reason this implementation
/// instead uses a linear counter array whose elements contain
/// the total count of unsieved elements in a certain interval.
///
/// In-depth description of this algorithm:
/// https://github.com/kimwalisch/primecount/blob/master/doc/Hard-Special-Leaves.md
///
/// Copyright (C) 2025 Kim Walisch, <kim.walisch@gmail.com>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///
#ifndef SIEVE_HPP
#define SIEVE_HPP
#include <macros.hpp>
#include <cpu_arch_macros.hpp>
#include <popcnt.hpp>
#include <Vector.hpp>
#include <stdint.h>
#if defined(ENABLE_ARM_SVE) || \
defined(ENABLE_MULTIARCH_ARM_SVE)
#include <arm_sve.h>
#elif defined(ENABLE_AVX512_VPOPCNT) || \
defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
#include <immintrin.h>
#endif
namespace primecount {
/// Sieve of Eratosthenes with 30 numbers per byte, optimized for
/// counting unsieved elements; see the file header comment above
/// for an in-depth description.
class Sieve
{
public:
Sieve(uint64_t low, uint64_t segment_size, uint64_t wheel_size);
/// Count 1 bits (unsieved elements) inside [start, stop]
uint64_t count(uint64_t start, uint64_t stop) const;
/// Cross off the multiples of the i-th sieving prime
/// (implemented in Sieve.cpp)
void cross_off(uint64_t prime, uint64_t i);
/// Same as cross_off(), but additionally keeps the counting data
/// up to date (implemented in Sieve.cpp)
void cross_off_count(uint64_t prime, uint64_t i);
/// Align a segment size to this sieve's granularity
/// (implemented in Sieve.cpp)
static uint64_t align_segment_size(uint64_t size);
/// Get the total count of unsieved elements.
/// total_count_ is updated outside of this header,
/// presumably by cross_off_count() in Sieve.cpp.
uint64_t get_total_count() const
{
return total_count_;
}
/// Initialize the sieve for the segment [low, high[:
/// reset the sieve array, cross off the multiples of
/// primes[4] through primes[c] (the loop starts at i = 4
/// because the primes 2, 3 and 5 are already excluded by
/// the modulo 30 wheel), then initialize the counter array
/// that speeds up counting of unsieved elements.
template <typename T>
void pre_sieve(const Vector<T>& primes, uint64_t c, uint64_t low, uint64_t high)
{
reset_sieve(low, high);
for (uint64_t i = 4; i <= c; i++)
cross_off(primes[i], i);
init_counter(low, high);
}
/// Count 1 bits inside [0, stop].
/// This method is safe to run on any CPU without runtime
/// CPUID checks. In most cases (e.g. when compiled
/// without -march=native) this will call the portable
/// but slow count_popcnt64() method.
///
ALWAYS_INLINE uint64_t count(uint64_t stop)
{
#if defined(ENABLE_AVX512_VPOPCNT)
#define SIEVE_COUNT_ALGO_NAME "AVX512 bit counting"
return count_avx512(stop);
#elif defined(ENABLE_ARM_SVE)
#define SIEVE_COUNT_ALGO_NAME "ARM SVE bit counting"
return count_arm_sve(stop);
#else
#define SIEVE_COUNT_ALGO_NAME "POPCNT64 bit counting"
return count_popcnt64(stop);
#endif
}
#if defined(ENABLE_PORTABLE_POPCNT64)
/// Count 1 bits inside [start, stop]
uint64_t count_popcnt64(uint64_t start, uint64_t stop) const;
/// Count 1 bits inside [0, stop].
/// Successive calls must use non-decreasing stop values
/// (enforced by the ASSERT below): the result is cached in
/// count_ / prev_stop_ so only [prev_stop_ + 1, stop] needs
/// to be counted.
ALWAYS_INLINE uint64_t count_popcnt64(uint64_t stop)
{
ASSERT(stop >= prev_stop_);
uint64_t start = prev_stop_ + 1;
prev_stop_ = stop;
if (start > stop)
return count_;
// Quickly count the number of unsieved elements (in
// the sieve array) up to a value that is close to
// the stop number i.e. (stop - start) < counter_.dist.
// We do this using the counter array, each element
// of the counter array contains the number of
// unsieved elements in the interval:
// [i * counter_.dist, (i + 1) * counter_.dist[.
while (counter_.stop <= stop)
{
start = counter_.stop;
counter_.stop += counter_.dist;
counter_.sum += counter_[counter_.i++];
count_ = counter_.sum;
}
// Here the remaining distance is relatively small i.e.
// (stop - start) < counter_.dist, hence we simply
// count the remaining number of unsieved elements by
// linearly iterating over the sieve array.
ASSERT(start <= stop);
ASSERT(stop - start < segment_size());
// Each 64-bit word of the sieve array corresponds to
// 240 numbers (8 bytes * 30 numbers per byte).
uint64_t start_idx = start / 240;
uint64_t stop_idx = stop / 240;
uint64_t m1 = unset_smaller[start % 240];
uint64_t m2 = unset_larger[stop % 240];
// Branchfree bitmask calculation:
// m1 = (start_idx != stop_idx) ? m1 : m1 & m2;
m1 &= (-(start_idx != stop_idx) | m2);
// m2 = (start_idx != stop_idx) ? m2 : 0;
m2 &= -(start_idx != stop_idx);
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
uint64_t start_bits = sieve64[start_idx] & m1;
uint64_t stop_bits = sieve64[stop_idx] & m2;
uint64_t cnt = popcnt64(start_bits);
cnt += popcnt64(stop_bits);
for (uint64_t i = start_idx + 1; i < stop_idx; i++)
cnt += popcnt64(sieve64[i]);
count_ += cnt;
return count_;
}
#endif
#if defined(ENABLE_AVX512_VPOPCNT) || \
defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
/// Count 1 bits inside [start, stop]
#if defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
__attribute__ ((target ("avx512f,avx512vpopcntdq")))
#endif
uint64_t count_avx512(uint64_t start, uint64_t stop) const;
/// Count 1 bits inside [0, stop].
/// Same algorithm as count_popcnt64(stop), but the linear
/// popcount over the sieve array is vectorized using
/// AVX512-VPOPCNTDQ. Successive calls must use
/// non-decreasing stop values (enforced by the ASSERT).
#if defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
__attribute__ ((target ("avx512f,avx512vpopcntdq")))
#endif
ALWAYS_INLINE uint64_t count_avx512(uint64_t stop)
{
ASSERT(stop >= prev_stop_);
uint64_t start = prev_stop_ + 1;
prev_stop_ = stop;
if (start > stop)
return count_;
// Quickly count the number of unsieved elements (in
// the sieve array) up to a value that is close to
// the stop number i.e. (stop - start) < counter_.dist.
// We do this using the counter array, each element
// of the counter array contains the number of
// unsieved elements in the interval:
// [i * counter_.dist, (i + 1) * counter_.dist[.
while (counter_.stop <= stop)
{
start = counter_.stop;
counter_.stop += counter_.dist;
counter_.sum += counter_[counter_.i++];
count_ = counter_.sum;
}
// Here the remaining distance is relatively small i.e.
// (stop - start) < counter_.dist, hence we simply
// count the remaining number of unsieved elements by
// linearly iterating over the sieve array.
ASSERT(start <= stop);
ASSERT(stop - start < segment_size());
// Each 64-bit word of the sieve array corresponds to
// 240 numbers (8 bytes * 30 numbers per byte).
uint64_t start_idx = start / 240;
uint64_t stop_idx = stop / 240;
uint64_t m1 = unset_smaller[start % 240];
uint64_t m2 = unset_larger[stop % 240];
// Branchfree bitmask calculation:
// m1 = (start_idx != stop_idx) ? m1 : m1 & m2;
m1 &= (-(start_idx != stop_idx) | m2);
// m2 = (start_idx != stop_idx) ? m2 : 0;
m2 &= -(start_idx != stop_idx);
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
uint64_t start_bits = sieve64[start_idx] & m1;
uint64_t stop_bits = sieve64[stop_idx] & m2;
// Seed the vector accumulator with the popcounts of the
// (masked) first and last sieve words.
__m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits);
__m512i vcnt = _mm512_popcnt_epi64(vec);
uint64_t i = start_idx + 1;
// Compute this for loop using AVX512.
// for (i = start_idx + 1; i < stop_idx; i++)
// cnt += popcnt64(sieve64[i]);
for (; i + 8 < stop_idx; i += 8)
{
vec = _mm512_loadu_epi64(&sieve64[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
}
// Count the bits of the at most 8 remaining sieve words
// using a masked (zeroing) partial vector load.
__mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx));
vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
count_ += _mm512_reduce_add_epi64(vcnt);
return count_;
}
#elif defined(ENABLE_ARM_SVE) || \
defined(ENABLE_MULTIARCH_ARM_SVE)
/// Count 1 bits inside [start, stop]
#if defined(ENABLE_MULTIARCH_ARM_SVE)
__attribute__ ((target ("arch=armv8-a+sve")))
#endif
uint64_t count_arm_sve(uint64_t start, uint64_t stop) const;
/// Count 1 bits inside [0, stop].
/// Same algorithm as count_popcnt64(stop), but the linear
/// popcount over the sieve array is vectorized using
/// ARM SVE. Successive calls must use non-decreasing stop
/// values (enforced by the ASSERT).
#if defined(ENABLE_MULTIARCH_ARM_SVE)
__attribute__ ((target ("arch=armv8-a+sve")))
#endif
ALWAYS_INLINE uint64_t count_arm_sve(uint64_t stop)
{
ASSERT(stop >= prev_stop_);
uint64_t start = prev_stop_ + 1;
prev_stop_ = stop;
if (start > stop)
return count_;
// Quickly count the number of unsieved elements (in
// the sieve array) up to a value that is close to
// the stop number i.e. (stop - start) < counter_.dist.
// We do this using the counter array, each element
// of the counter array contains the number of
// unsieved elements in the interval:
// [i * counter_.dist, (i + 1) * counter_.dist[.
while (counter_.stop <= stop)
{
start = counter_.stop;
counter_.stop += counter_.dist;
counter_.sum += counter_[counter_.i++];
count_ = counter_.sum;
}
// Here the remaining distance is relatively small i.e.
// (stop - start) < counter_.dist, hence we simply
// count the remaining number of unsieved elements by
// linearly iterating over the sieve array.
ASSERT(start <= stop);
ASSERT(stop - start < segment_size());
// Each 64-bit word of the sieve array corresponds to
// 240 numbers (8 bytes * 30 numbers per byte).
uint64_t start_idx = start / 240;
uint64_t stop_idx = stop / 240;
uint64_t m1 = unset_smaller[start % 240];
uint64_t m2 = unset_larger[stop % 240];
// Branchfree bitmask calculation:
// m1 = (start_idx != stop_idx) ? m1 : m1 & m2;
m1 &= (-(start_idx != stop_idx) | m2);
// m2 = (start_idx != stop_idx) ? m2 : 0;
m2 &= -(start_idx != stop_idx);
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
uint64_t start_bits = sieve64[start_idx] & m1;
uint64_t stop_bits = sieve64[stop_idx] & m2;
ASSERT(svcntd() >= 2);
// Pack stop_bits & start_bits into the first 2 vector
// lanes (svinsr shifts in stop_bits at lane 0); the
// zeroing predicate limits the popcount to these 2 lanes.
svuint64_t vec = svinsr_n_u64(svdup_u64(start_bits), stop_bits);
svuint64_t vcnt = svcnt_u64_z(svwhilelt_b64(0, 2), vec);
uint64_t i = start_idx + 1;
// Compute this for loop using ARM SVE.
// for (i = start_idx + 1; i < stop_idx; i++)
// cnt += popcnt64(sieve64[i]);
for (; i + svcntd() < stop_idx; i += svcntd())
{
vec = svld1_u64(svptrue_b64(), &sieve64[i]);
vec = svcnt_u64_x(svptrue_b64(), vec);
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec);
}
// Count the bits of the at most svcntd() remaining sieve
// words using a predicated partial vector load.
svbool_t pg = svwhilelt_b64(i, stop_idx);
vec = svld1_u64(pg, &sieve64[i]);
vec = svcnt_u64_z(pg, vec);
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec);
count_ += svaddv_u64(svptrue_b64(), vcnt);
return count_;
}
#endif
private:
// Helper methods, implemented in Sieve.cpp
void add(uint64_t prime);
void allocate_counter(uint64_t low);
void init_counter(uint64_t low, uint64_t high);
void reset_counter();
void reset_sieve(uint64_t low, uint64_t high);
uint64_t segment_size() const;
// Bitmask lookup tables with one entry per offset in [0, 240[
// (a 64-bit sieve word spans 240 numbers): unset_smaller[n]
// zeroes the bits corresponding to numbers smaller than n,
// unset_larger[n] zeroes the bits corresponding to numbers
// larger than n. Defined in Sieve.cpp.
static const Array<uint64_t, 240> unset_smaller;
static const Array<uint64_t, 240> unset_larger;
// Per sieving prime sieving state. NOTE(review): multiple &
// index appear to track the next multiple to cross off and its
// position on the modulo 30 wheel — maintained by cross_off()
// in Sieve.cpp, confirm there.
struct Wheel
{
uint32_t multiple;
uint32_t index;
Wheel() = default;
Wheel(uint32_t m, uint32_t i)
: multiple(m),
index(i)
{ }
};
// Counter data structure that speeds up counting: counter[j]
// contains the number of unsieved elements in the interval
// [j * dist, (j + 1) * dist[ (see the comments in the count_*()
// methods above). stop, sum and i track the progress of the
// current sequence of count(stop) queries.
struct Counter
{
uint64_t stop = 0;
uint64_t dist = 0;
// Not used in this header; presumably dist == 2^log2_dist,
// set in Sieve.cpp — confirm there.
uint64_t log2_dist = 0;
uint64_t sum = 0;
uint64_t i = 0;
Vector<uint32_t> counter;
uint32_t& operator[](std::size_t pos)
{
return counter[pos];
}
uint32_t operator[](std::size_t pos) const
{
return counter[pos];
}
};
// NOTE(review): start_ is not referenced in this header;
// presumably the segment's lower bound, set in Sieve.cpp.
uint64_t start_ = 0;
// stop value of the most recent count(stop) query;
// count(stop) requires non-decreasing stop values.
uint64_t prev_stop_ = 0;
// Number of unsieved elements inside [0, prev_stop_]
uint64_t count_ = 0;
// Returned by get_total_count(); updated in Sieve.cpp
uint64_t total_count_ = 0;
// Sieve of Eratosthenes array, 30 numbers per byte
Vector<uint8_t> sieve_;
// Sieving state, presumably one entry per sieving prime
Vector<Wheel> wheel_;
Counter counter_;
};
} // namespace
#endif