1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684
|
/* *****************************************************************************
/
/ This file is part of Solirte, a solenoid ion ray-tracing engine.
/ (c) 2008 Michael Brown.
/ michael.brown@anu.edu.au
/
/ An object that hands out blocks of random data.
/ *************************************************************************** */
#ifndef RANDPOOL_H
#define RANDPOOL_H
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <string.h>
#include "u128simd.h"
#ifndef CFG_TIMETRACK_DISABLE
#include "../platform/timetrack.h"
#endif
#define CFG_RANDPOOL_IMPL_SSE2 1
#define CFG_RANDPOOL_IMPL_ALTIVEC 2
#define CFG_RANDPOOL_IMPL_64BIT 3
#define CFG_RANDPOOL_IMPL_32BITSLOW 4
#define CFG_RANDPOOL_IMPL_MMX 5
#if (CFG_HAVE_SSE2 == 1)
#define CFG_RANDPOOL_IMPL CFG_RANDPOOL_IMPL_SSE2
#endif
#if ((CFG_BITS == 64) && !defined(CFG_RANDPOOL_IMPL))
#define CFG_RANDPOOL_IMPL CFG_RANDPOOL_IMPL_64BIT
#endif
#if ((CFG_HAVE_MMX == 1) && !defined(CFG_RANDPOOL_IMPL))
#define CFG_RANDPOOL_IMPL CFG_RANDPOOL_IMPL_MMX
#endif
#if (!defined(CFG_RANDPOOL_IMPL))
#define CFG_RANDPOOL_IMPL CFG_RANDPOOL_IMPL_32BITSLOW
#endif
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_MMX)
#include "sfmt_mmx.h"
#endif
#if CFG_COMPILER == CFG_COMPILER_MSVC
// Shut up one of the more braindead MSVC warnings.
#pragma warning(push)
#pragma warning(disable: 4127)
#endif
template <class SFMTConsts>
class CRandPool
{
public:
static const size_t N = SFMTConsts::MEXP / 128 + 1;
static const size_t N32 = N * 4;
#ifndef CFG_TIMETRACK_DISABLE
static struct TMetrics
{
static THREADVAR uint32_t poolmoves;
static THREADVAR uint32_t gen_rand_all_count;
static THREADVAR uint64_t gen_rand_all_time;
TMetrics(void)
{
CPerfCounters::SetName(poolmoves, "CRandPool<" + GenToString(SFMTConsts::MEXP) + ">.poolmoves");
CPerfCounters::SetName(gen_rand_all_count, "CPRNG<" + GenToString(SFMTConsts::MEXP) + ">.gen_rand_all.count");
CPerfCounters::SetName(gen_rand_all_time, "CPRNG<" + GenToString(SFMTConsts::MEXP) + ">.gen_rand_all.time");
}
void ForceConstruction(void) { }
} Metrics;
#endif
typedef CDataBuffer_ConstView<uint32_t, N * 16> TPoolConstViewU32;
typedef CDataBuffer_VarView<uint32_t, N * 16> TPoolVarViewU32;
typedef CDataBuffer_ConstView<uint64_t, N * 16> TPoolConstViewU64;
typedef CDataBuffer_VarView<uint64_t, N * 16> TPoolVarViewU64;
#if CFG_HAVE_U128 == CFG_SIMD_INTRINSIC
typedef CDataBuffer_ConstView<uint128_t, N * 16> TPoolViewConstU128;
typedef CDataBuffer_VarView<uint128_t, N * 16> TPoolViewVarU128;
#endif
#if CFG_HAVE_U128FP64 == CFG_SIMD_INTRINSIC
typedef CDataBuffer_ConstView<uint128fp64_t, N * 16> TPoolConstViewU128FP64;
typedef CDataBuffer_VarView<uint128fp64_t, N * 16> TPoolVarViewU128FP64;
#endif
class CRandomBlock
{
private:
CDataBuffer<N * 16> FPool;
int FRefCount;
protected:
public:
CRandomBlock(void) { FRefCount = 1; }
int AddRef(void) { return ++FRefCount; }
int GetRefCount(void) { return FRefCount; }
int Release(void)
{
if (--FRefCount == 0)
{
delete this;
return 0;
}
return FRefCount;
}
template<class ElementType> inline CDataBuffer_ConstView<ElementType, N * 16> GetConstView(void)
{ return FPool.template GetConstView<ElementType>(); }
template<class ElementType> inline CDataBuffer_VarView<ElementType, N * 16> GetVarView(void)
{ return FPool.template GetVarView<ElementType>(); }
void *GetDataPointer(void) { return &FPool; }
static void *operator new(size_t size) { return _aligned_malloc(size, 16); }
static void operator delete(void *p) { _aligned_free(p); }
};
class CRandomBlockPtr
{
private:
CRandomBlock *FBlock;
protected:
public:
CRandomBlockPtr(void) { FBlock = new CRandomBlock(); }
CRandomBlockPtr(CRandomBlock *Src) : FBlock(Src) { /* No AddRef */ }
CRandomBlockPtr(const CRandomBlockPtr &Src) : FBlock(Src.FBlock) { FBlock->AddRef(); }
~CRandomBlockPtr(void) { FBlock->Release(); }
int GetRefCount(void) { return FBlock->GetRefCount(); }
template<class ElementType> inline CDataBuffer_ConstView<ElementType, N * 16> GetConstView(void)
{ return FBlock->template GetConstView<ElementType>(); }
template<class ElementType> inline CDataBuffer_VarView<ElementType, N * 16> GetVarView(void)
{ return FBlock->template GetVarView<ElementType>(); }
void *GetDataPointer(void) { return FBlock->GetDataPointer(); }
void MakeUnique(void)
{
if (GetRefCount() != 1)
{
#ifndef CFG_TIMETRACK_DISABLE
Metrics.poolmoves++;
#endif
CRandomBlock *NewBlock = new CRandomBlock;
memcpy(NewBlock->GetDataPointer(), FBlock->GetDataPointer(), N * sizeof(uint128_t));
FBlock->Release();
FBlock = NewBlock;
}
}
};
protected:
// The 128-bit internal state array
CRandomBlockPtr FPool;
private:
// A function used in the initialization (init_by_array).
static inline uint32_t func1(uint32_t x)
{
return (x ^ (x >> 27)) * (uint32_t)1664525UL;
}
// A function used in the initialization (init_by_array).
static inline uint32_t func2(uint32_t x)
{
return (x ^ (x >> 27)) * (uint32_t)1566083941UL;
}
// A function used in the initialization (init_gen_rand).
static inline uint32_t func3(uint32_t x, uint32_t i)
{
return 1812433253UL * (x ^ (x >> 30)) + i;
}
// This function corrects for byte ordering on big endian machines. Note
// that this function does slow things down a bit so should be avoided where
// performance is important.
static inline size_t IdxOf32(size_t LittleEndianIdx)
{
return EndianFix<2>(LittleEndianIdx);
}
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_32BITSLOW)
// -------------------------------------------------------------------------
// Implementation for register-challenged 32-bit architectures. In other
// words, x86. There simply isn't enough register to go around to buffer the
// C and D values, so we have to pull them from memory (hopefully the cache)
// instead. Note that this is really only needed if the compiler is not
// supported by the inline assembler MMX version, which is faster.
//
// This also serves a double duty as the "default" implementation in case we
// don't have anything better to use. There's currently two architectures
// that are not fully optimal (and use this implementation): 32-bit SPARC,
// and 32-bit PPC systems that don't support AltiVec (such as POWER). There
// should be a register-buffering 32-bit implementation for these
// architectures.
// -------------------------------------------------------------------------
static std::string GetRandpoolImpl(void) { return "32bitslow"; }
// Emulation of 128-bit shifts using 32-bit integer operations.
template<int NumBits, int Idx, int SrcIdx> static inline uint32_t Shr128_sub(uint32_t *SrcPtr)
{
// Low bit of SrcIdx ends up at SrcIdx * 32 - NumBits.
// Get position relative to the low bit of Idx (= Idx * 32).
static const int SrcLow = SrcIdx * 32 - NumBits - Idx * 32;
static const int UseLeftShift = ((SrcLow > 0) && (SrcLow < 32)) ? 1 : 0;
static const int UseRightShift = ((SrcLow < 0) && (SrcLow > -32)) ? 1 : 0;
static const int UseNoShift = (SrcLow == 0) ? 1 : 0;
static const int LShiftBits = (UseLeftShift == 1) ? SrcLow : 0;
static const int RShiftBits = (UseRightShift == 1) ? -SrcLow : 0;
if (UseLeftShift == 1) return SrcPtr[EndianFix_static_int<2, SrcIdx>::Value] << LShiftBits;
if (UseRightShift == 1) return SrcPtr[EndianFix_static_int<2, SrcIdx>::Value] >> RShiftBits;
if (UseNoShift == 1) return SrcPtr[EndianFix_static_int<2, SrcIdx>::Value];
return 0;
}
template<int NumBits, int Idx, int SrcIdx> static inline uint32_t Shl128_sub(uint32_t *SrcPtr)
{
return Shr128_sub<-NumBits, Idx, SrcIdx>(SrcPtr);
}
template<int NumBits, int Idx> static uint32_t inline Shr128Xor(uint32_t x, uint32_t *Y128Ptr)
{
return x ^
Shr128_sub<NumBits, Idx, 0>(Y128Ptr) ^ Shr128_sub<NumBits, Idx, 1>(Y128Ptr) ^
Shr128_sub<NumBits, Idx, 2>(Y128Ptr) ^ Shr128_sub<NumBits, Idx, 3>(Y128Ptr);
}
template<int NumBits, int Idx> static uint32_t inline Shl128Xor(uint32_t x, uint32_t *Y128Ptr)
{
return x ^
Shl128_sub<NumBits, Idx, 0>(Y128Ptr) ^ Shl128_sub<NumBits, Idx, 1>(Y128Ptr) ^
Shl128_sub<NumBits, Idx, 2>(Y128Ptr) ^ Shl128_sub<NumBits, Idx, 3>(Y128Ptr);
}
// The recursion function for 32-bit architectures with high register
// pressure (ie: x86). Zero registers are used for caching.
template<int COffset, int DOffset>
static void inline mm_recursion_32_0(uint32_t *BufA, uint32_t *BufB)
{
uint32_t A0Res;
A0Res = Shl128Xor<SFMTConsts::SL2 * 8, 0>(BufA[EndianFix_static_int<2, 0>::Value], BufA);
A0Res ^= (BufB[EndianFix_static_int<2, 0>::Value] >> SFMTConsts::SR1) & SFMTConsts::MSK1;
A0Res = Shr128Xor<SFMTConsts::SR2 * 8, 0>(A0Res, BufA + COffset * 4);
A0Res ^= BufA[EndianFix_static_int<2, 0>::Value + DOffset * 4] << SFMTConsts::SL1;
if (SFMTConsts::SL2 == 0) BufA[EndianFix_static_int<2, 0>::Value] = A0Res;
uint32_t A1Res;
A1Res = Shl128Xor<SFMTConsts::SL2 * 8, 1>(BufA[EndianFix_static_int<2, 1>::Value], BufA);
A1Res ^= (BufB[EndianFix_static_int<2, 1>::Value] >> SFMTConsts::SR1) & SFMTConsts::MSK2;
A1Res = Shr128Xor<SFMTConsts::SR2 * 8, 1>(A1Res, BufA + COffset * 4);
A1Res ^= BufA[EndianFix_static_int<2, 1>::Value + DOffset * 4] << SFMTConsts::SL1;
if ((SFMTConsts::SL2 > 0) && (SFMTConsts::SL2 <= 32)) BufA[EndianFix_static_int<2, 0>::Value] = A0Res;
if (SFMTConsts::SL2 == 0) BufA[EndianFix_static_int<2, 1>::Value] = A1Res;
uint32_t A2Res;
A2Res = Shl128Xor<SFMTConsts::SL2 * 8, 2>(BufA[EndianFix_static_int<2, 2>::Value], BufA);
A2Res ^= (BufB[EndianFix_static_int<2, 2>::Value] >> SFMTConsts::SR1) & SFMTConsts::MSK3;
A2Res = Shr128Xor<SFMTConsts::SR2 * 8, 2>(A2Res, BufA + COffset * 4);
A2Res ^= BufA[EndianFix_static_int<2, 2>::Value + DOffset * 4] << SFMTConsts::SL1;
if ((SFMTConsts::SL2 > 32) && (SFMTConsts::SL2 <= 64)) BufA[EndianFix_static_int<2, 0>::Value] = A0Res;
if ((SFMTConsts::SL2 > 0) && (SFMTConsts::SL2 <= 32)) BufA[EndianFix_static_int<2, 1>::Value] = A1Res;
if (SFMTConsts::SL2 == 0) BufA[EndianFix_static_int<2, 2>::Value] = A2Res;
uint32_t A3Res;
A3Res = Shl128Xor<SFMTConsts::SL2 * 8, 3>(BufA[EndianFix_static_int<2, 3>::Value], BufA);
A3Res ^= (BufB[EndianFix_static_int<2, 3>::Value] >> SFMTConsts::SR1) & SFMTConsts::MSK4;
A3Res = Shr128Xor<SFMTConsts::SR2 * 8, 3>(A3Res, BufA + COffset * 4);
A3Res ^= BufA[EndianFix_static_int<2, 3>::Value + DOffset * 4] << SFMTConsts::SL1;
if (SFMTConsts::SL2 > 64) BufA[EndianFix_static_int<2, 0>::Value] = A0Res;
if (SFMTConsts::SL2 > 32) BufA[EndianFix_static_int<2, 1>::Value] = A1Res;
if (SFMTConsts::SL2 > 0) BufA[EndianFix_static_int<2, 2>::Value] = A2Res;
BufA[EndianFix_static_int<2, 3>::Value] = A3Res;
}
NOINLINE void gen_rand_all(void)
{
#ifndef CFG_TIMETRACK_DISABLE
CTimedScope ScopeTimer(Metrics.gen_rand_all_time);
Metrics.gen_rand_all_count++;
#endif
uint32_t *APtr = FPool.template GetVarView<uint32_t>().GetPointer();
uint32_t *BPtr = APtr + SFMTConsts::POS1 * 4;
// The two initial loops (unusual C and D offsets).
mm_recursion_32_0<N - 2, N - 1>(APtr, BPtr);
mm_recursion_32_0<N - 2, -1>(APtr + 4, BPtr + 4);
int NumLoops = N - SFMTConsts::POS1 - 2;
APtr += (NumLoops + 2) * 4;
BPtr += (NumLoops + 2) * 4;
// The main loop. No unrolling.
int RepLoop = 0;
for (;;)
{
for (int MainLoop = -NumLoops; MainLoop != 0; MainLoop++)
mm_recursion_32_0<-2, -1>(APtr + MainLoop * 4, BPtr + MainLoop * 4);
if (RepLoop == 0)
{
// Need to adjust BPtr to handle wrapping of the buffer.
RepLoop = 1;
NumLoops = SFMTConsts::POS1;
APtr += NumLoops * 4;
BPtr += (NumLoops - N) * 4;
}
else
break;
}
}
#endif
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_MMX)
// -------------------------------------------------------------------------
// MMX implementation. Since current compilers can't handle the register
// pressure for this implementation, it has to be done via assembler (inline
// if we're using MSVC, external if we're using something else). All we do
// is bounce out to the correct function.
// -------------------------------------------------------------------------
static std::string GetRandpoolImpl(void) { return "mmx"; }
NOINLINE void gen_rand_all(void)
{
#ifndef CFG_TIMETRACK_DISABLE
CTimedScope ScopeTimer(Metrics.gen_rand_all_time);
Metrics.gen_rand_all_count++;
#endif
SFMT_recursion_mmx<SFMTConsts>::SFMT_recursion(FPool.GetDataPointer());
}
#endif
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_64BIT)
// -------------------------------------------------------------------------
// Implementation for 64 bit architectures. Here, we buffer C and D in
// registers. It is assumed that there are sufficient registers that this
// can be done. In every 64-bit platform of relevance (x86-64, SPARC, PPC,
// and IA64) this is the case. Note that in the case of x86-64 we'd probably
// be using SSE2 anyhow.
// -------------------------------------------------------------------------
static std::string GetRandpoolImpl(void) { return "64bit"; }
// Emulation of 128-bit shifts using 64-bit integer operations.
template<int NumBits, int Idx, int SrcIdx> FORCEINLINE uint64_t Shr128_sub(uint64_t YVal)
{
// Low bit of SrcIdx ends up at SrcIdx * 64 - NumBits.
// Get position relative to the low bit of Idx (= Idx * 64).
static const int SrcLow = SrcIdx * 64 - NumBits - Idx * 64;
static const int UseLeftShift = ((SrcLow > 0) && (SrcLow < 64)) ? 1 : 0;
static const int UseRightShift = ((SrcLow < 0) && (SrcLow > -64)) ? 1 : 0;
static const int UseNoShift = (SrcLow == 0) ? 1 : 0;
static const int LShiftBits = (UseLeftShift == 1) ? SrcLow : 0;
static const int RShiftBits = (UseRightShift == 1) ? -SrcLow : 0;
if (UseLeftShift == 1) return YVal << LShiftBits;
if (UseRightShift == 1) return YVal >> RShiftBits;
if (UseNoShift == 1) return YVal;
return 0;
}
template<int NumBits, int Idx, int SrcIdx> FORCEINLINE uint64_t Shl128_sub(uint64_t YVal)
{
return Shr128_sub<-NumBits, Idx, SrcIdx>(YVal);
}
template<int NumBits, int Idx> uint64_t FORCEINLINE Shr128Xor(uint64_t x, uint64_t YLow, uint64_t YHigh)
{
return x ^ Shr128_sub<NumBits, Idx, 0>(YLow) ^ Shr128_sub<NumBits, Idx, 1>(YHigh);
}
template<int NumBits, int Idx> uint64_t FORCEINLINE Shl128Xor(uint64_t x, uint64_t YLow, uint64_t YHigh)
{
return x ^ Shl128_sub<NumBits, Idx, 0>(YLow) ^ Shl128_sub<NumBits, Idx, 1>(YHigh);
}
// The recursion function for 64-bit architectures. It's assumed that there
// are enough registers, which is true for any architecture worth worrying
// about (x86-64, SPARC, IA64, PPC).
void FORCEINLINE mm_recursion(uint128_t *BufA, uint128_t *BufB, uint64_t &ALowOut, uint64_t &AHighOut,
uint64_t CLow, uint64_t CHigh, uint64_t DLow, uint64_t DHigh)
{
uint64_t ALowIn = BufA->GetU64<0>();
uint64_t AHighIn = BufA->GetU64<1>();
uint64_t A0Res;
A0Res = Shl128Xor<SFMTConsts::SL2 * 8, 0>(ALowIn, ALowIn, AHighIn);
A0Res ^= (BufB->GetU64<0>() >> SFMTConsts::SR1) &
(static_cast<uint64_t>(SFMTConsts::MSK1) | (static_cast<uint64_t>(SFMTConsts::MSK2) << 32)) &
U64_Shr32Mask<SFMTConsts::SR1>::Value;
A0Res = Shr128Xor<SFMTConsts::SR2 * 8, 0>(A0Res, CLow, CHigh);
A0Res ^= (DLow << SFMTConsts::SL1) & U64_Shl32Mask<SFMTConsts::SL1>::Value;
if (SFMTConsts::SL2 == 0) { ALowOut = A0Res; BufA->SetU64<0>(A0Res); }
uint64_t A1Res;
A1Res = Shl128Xor<SFMTConsts::SL2 * 8, 1>(AHighIn, ALowIn, AHighIn);
A1Res ^= (BufB->GetU64<1>() >> SFMTConsts::SR1) &
(static_cast<uint64_t>(SFMTConsts::MSK3) | (static_cast<uint64_t>(SFMTConsts::MSK4) << 32)) &
U64_Shr32Mask<SFMTConsts::SR1>::Value;
A1Res = Shr128Xor<SFMTConsts::SR2 * 8, 1>(A1Res, CLow, CHigh);
A1Res ^= (DHigh << SFMTConsts::SL1) & U64_Shl32Mask<SFMTConsts::SL1>::Value;
if (SFMTConsts::SL2 > 0) { ALowOut = A0Res; BufA->SetU64<0>(A0Res); }
AHighOut = A1Res; BufA->SetU64<1>(A1Res);
}
#endif
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_SSE2)
// -------------------------------------------------------------------------
// This is the SSE2 implementation. Implementing AltiVec in u128simd.h
// should allow this to work fine for AltiVec as well.
// -------------------------------------------------------------------------
static std::string GetRandpoolImpl(void) { return "sse2"; }
// The main recursion function.
uint128_t FORCEINLINE mm_recursion(uint128_t *BufA, uint128_t *BufB,
uint128_t ValC, uint128_t ValD, uint128_t Mask)
{
uint128_t ValA = *BufA;
ValA = SIMDOP_xor(SIMDOP_xor(SIMDOP_xor(SIMDOP_xor(
ValA,
SIMDOps::Shl8<SFMTConsts::SL2>(ValA)),
SIMDOP_and(SIMDOps::Shr_u32<SFMTConsts::SR1>(BufB[0]), Mask)),
SIMDOps::Shr8<SFMTConsts::SR2>(ValC)),
SIMDOps::Shl_u32<SFMTConsts::SL1>(ValD));
*BufA = ValA;
return ValA;
}
#endif
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_64BIT) || (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_SSE2)
// -------------------------------------------------------------------------
// This is the generic gen_rand for architectures where C and D are buffered
// in registers.
// -------------------------------------------------------------------------
static const int PartLoops1 = (N - SFMTConsts::POS1) % 3;
static const int Loops1 = (N - SFMTConsts::POS1) / 3;
static const int Loops2 = SFMTConsts::POS1 / 3;
static const int PartLoops2 = SFMTConsts::POS1 % 3;
NOINLINE void gen_rand_all(void)
{
#ifndef CFG_TIMETRACK_DISABLE
CTimedScope ScopeTimer(Metrics.gen_rand_all_time);
Metrics.gen_rand_all_count++;
#endif
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_SSE2)
uint128_t Mask = SIMDOps::uint128_t_const<SFMTConsts::MSK1, SFMTConsts::MSK2, SFMTConsts::MSK3, SFMTConsts::MSK4>::Value();
uint128_t AVal, CVal, DVal;
#elif (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_64BIT)
uint64_t ALow, AHigh, CLow, CHigh, DLow, DHigh;
#endif
uint128_t *APtr = FPool.template GetVarView<uint128_t>().GetPointer();
uint128_t *BPtr;
// Now would be a perfect time to use Duff's device. Except that both GCC
// and MSVC choke badly on it and generate terrible code. So, we have to
// manually unroll the partial loops. Additionally, it appears to be
// impossible to express the functions in a way that makes everything work
// well in both GCC and MSVC in both SSE2 and 64-bit mode. So, the code
// has to be duplicated to coerce the compiler into not thrashing the
// stack. Also, we should be able to use "APtr + 1" etc, except MSVC is
// broken and emits C4328 on this construct.
#define COMPOSE(x, y) x##y
#if (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_SSE2)
#define MM_RECURSION(A, AOffset, BOffset, C, D) COMPOSE(A, Val) = mm_recursion(&APtr[AOffset], &BPtr[BOffset], COMPOSE(C, Val), COMPOSE(D, Val), Mask);
#define MM_LOAD(X, AOffset) COMPOSE(X, Val) = APtr[AOffset];
#elif (CFG_RANDPOOL_IMPL == CFG_RANDPOOL_IMPL_64BIT)
#define MM_RECURSION(A, AOffset, BOffset, C, D) mm_recursion(&APtr[AOffset], &BPtr[BOffset], COMPOSE(A, Low), COMPOSE(A, High), COMPOSE(C, Low), COMPOSE(C, High), COMPOSE(D, Low), COMPOSE(D, High));
#define MM_LOAD(X, AOffset) COMPOSE(X, Low) = APtr[AOffset].GetU64<0>(); COMPOSE(X, High) = APtr[AOffset].GetU64<1>();
#endif
BPtr = APtr;
if (PartLoops1 == 2)
{
MM_LOAD(D, N - 2)
MM_LOAD(A, N - 1)
MM_RECURSION(C, 0, SFMTConsts::POS1 + 0, D, A)
MM_RECURSION(D, 1, SFMTConsts::POS1 + 1, A, C)
APtr += 2;
}
if (PartLoops1 == 1)
{
MM_LOAD(A, N - 2)
MM_LOAD(C, N - 1)
MM_RECURSION(D, 0, SFMTConsts::POS1 + 0, A, C)
APtr += 1;
}
APtr += 3 * Loops1;
BPtr = APtr + SFMTConsts::POS1;
int LoopCount = -3 * Loops1;
for (;;)
{
for (; LoopCount < 0; LoopCount += 3)
{
MM_RECURSION(A, LoopCount + 0, LoopCount + 0, C, D)
MM_RECURSION(C, LoopCount + 1, LoopCount + 1, D, A)
MM_RECURSION(D, LoopCount + 2, LoopCount + 2, A, C)
}
if (BPtr == APtr + SFMTConsts::POS1)
{
// Handle BPtr wrapping around the end of the array.
LoopCount = -Loops2 * 3;
BPtr = APtr - N + 2 * SFMTConsts::POS1 - SFMTConsts::POS1 % 3;
APtr += Loops2 * 3;
}
else
break;
}
// Again, Duff's device would allow us to avoid this annoying manual
// unrolling (not to mention this rather harder to follow logic).
if (PartLoops2 >= 1) MM_RECURSION(A, LoopCount + 0, LoopCount + 0, C, D)
if (PartLoops2 >= 2) MM_RECURSION(C, LoopCount + 1, LoopCount + 1, D, A)
}
#endif
// Certifies the period (2^MEXP), potentially changing one of the bits to
// make sure.
void PeriodCertification(void)
{
CDataBuffer_VarView<uint128_t, N * 16> PoolU128 = FPool.template GetVarView<uint128_t>();
uint128_t ParityCheck = SIMDOps::Parity(SIMDOP_and(
SIMDOps::uint128_t_const<SFMTConsts::PARITY1, SFMTConsts::PARITY2, SFMTConsts::PARITY3, SFMTConsts::PARITY4>::Value(), PoolU128.Load(0)));
PoolU128.Store(0, SIMDOP_xor(PoolU128.Load(0), SIMDOps::AndNot(SIMDOps::uint128_t_const<
lowestbit_u128<SFMTConsts::PARITY1, SFMTConsts::PARITY2, SFMTConsts::PARITY3, SFMTConsts::PARITY4>::y0,
lowestbit_u128<SFMTConsts::PARITY1, SFMTConsts::PARITY2, SFMTConsts::PARITY3, SFMTConsts::PARITY4>::y1,
lowestbit_u128<SFMTConsts::PARITY1, SFMTConsts::PARITY2, SFMTConsts::PARITY3, SFMTConsts::PARITY4>::y2,
lowestbit_u128<SFMTConsts::PARITY1, SFMTConsts::PARITY2, SFMTConsts::PARITY3, SFMTConsts::PARITY4>::y3>::Value(),
ParityCheck)));
}
public:
CRandPool(void)
{
#ifndef CFG_TIMETRACK_DISABLE
Metrics.ForceConstruction();
#endif
}
/**
* This function returns the identification string.
* The string shows the word size, the Mersenne exponent,
* and all parameters of this generator.
*/
static std::string GetIDString(void)
{
std::ostringstream os;
os << "SFMT" << '-' << SFMTConsts::MEXP << ':' << SFMTConsts::POS1 << '-' << SFMTConsts::SL1 << '-' << SFMTConsts::SL2 <<
'-' << SFMTConsts::SR1 << '-' << SFMTConsts::SR2 << ':' << std::hex << std::setfill('0') <<
std::setw(8) << SFMTConsts::MSK1 << '-' <<
std::setw(8) << SFMTConsts::MSK2 << '-' <<
std::setw(8) << SFMTConsts::MSK3 << '-' <<
std::setw(8) << SFMTConsts::MSK4 << ':' << GetRandpoolImpl();
return os.str();
}
/**
* This function initializes the internal state array with a 32-bit
* integer seed.
*
* @param seed a 32-bit integer used as the seed.
*/
NOINLINE void init_gen_rand(uint32_t seed)
{
CDataBuffer_VarView<uint32_t, N * 16> PoolU32 = FPool.template GetVarView<uint32_t>();
uint32_t x0 = seed;
for (uint32_t i = 0; i < N; i++)
{
uint32_t x1 = func3(x0, i * 4 + 1);
uint32_t x2 = func3(x1, i * 4 + 2);
uint32_t x3 = func3(x2, i * 4 + 3);
PoolU32.Store(i * 4 + EndianFix_static<2, 0>::Value, x0);
PoolU32.Store(i * 4 + EndianFix_static<2, 1>::Value, x1);
PoolU32.Store(i * 4 + EndianFix_static<2, 2>::Value, x2);
PoolU32.Store(i * 4 + EndianFix_static<2, 3>::Value, x3);
x0 = func3(x3, i * 4 + 4);
}
PeriodCertification();
SIMDOps::EmptyForFP();
}
/**
* This function initializes the internal state array,
* with an array of 32-bit integers used as the seeds
* @param init_key the array of 32-bit integers, used as a seed.
* @param key_length the length of init_key.
*/
NOINLINE void init_by_array(uint32_t *init_key, size_t key_length)
{
uint32_t *FPool32 = FPool.template GetVarView<uint32_t>().GetPointer();
size_t lag;
if (N32 >= 623)
lag = 11;
else if (N32 >= 68)
lag = 7;
else if (N32 >= 39)
lag = 5;
else
lag = 3;
size_t mid = (N32 - lag) / 2;
memset(FPool32, 0x8b, N * 16);
size_t count;
if (key_length + 1 > N32)
count = key_length + 1;
else
count = N32;
uint32_t r = func1(FPool32[IdxOf32(0)] ^ FPool32[IdxOf32(mid)]
^ FPool32[IdxOf32(N32 - 1)]);
FPool32[IdxOf32(mid)] += r;
r += key_length;
FPool32[IdxOf32(mid + lag)] += r;
FPool32[IdxOf32(0)] = r;
size_t i, j;
count--;
for (i = 1, j = 0; (j < count) && (j < key_length); j++)
{
r = func1(FPool32[IdxOf32(i)] ^ FPool32[IdxOf32((i + mid) % N32)]
^ FPool32[IdxOf32((i + N32 - 1) % N32)]);
FPool32[IdxOf32((i + mid) % N32)] += r;
r += init_key[j] + i;
FPool32[IdxOf32((i + mid + lag) % N32)] += r;
FPool32[IdxOf32(i)] = r;
i = (i + 1) % N32;
}
for (; j < count; j++)
{
r = func1(FPool32[IdxOf32(i)] ^ FPool32[IdxOf32((i + mid) % N32)]
^ FPool32[IdxOf32((i + N32 - 1) % N32)]);
FPool32[IdxOf32((i + mid) % N32)] += r;
r += i;
FPool32[IdxOf32((i + mid + lag) % N32)] += r;
FPool32[IdxOf32(i)] = r;
i = (i + 1) % N32;
}
for (j = 0; j < N32; j++)
{
r = func2(FPool32[IdxOf32(i)] + FPool32[IdxOf32((i + mid) % N32)]
+ FPool32[IdxOf32((i + N32 - 1) % N32)]);
FPool32[IdxOf32((i + mid) % N32)] ^= r;
r -= i;
FPool32[IdxOf32((i + mid + lag) % N32)] ^= r;
FPool32[IdxOf32(i)] = r;
i = (i + 1) % N32;
}
PeriodCertification();
SIMDOps::EmptyForFP();
}
static const size_t BufSize = N;
inline CRandomBlockPtr GetRandomBlock(void)
{
FPool.MakeUnique();
gen_rand_all();
return FPool;
}
};
#ifndef CFG_TIMETRACK_DISABLE
template<class SFMTParams> typename CRandPool<SFMTParams>::TMetrics CRandPool<SFMTParams>::Metrics;
template<class SFMTParams> THREADVAR uint32_t CRandPool<SFMTParams>::TMetrics::poolmoves = 0;
template<class SFMTParams> THREADVAR uint32_t CRandPool<SFMTParams>::TMetrics::gen_rand_all_count = 0;
template<class SFMTParams> THREADVAR uint64_t CRandPool<SFMTParams>::TMetrics::gen_rand_all_time = 0;
#endif
#if CFG_COMPILER == CFG_COMPILER_MSVC
#pragma warning(pop)
#endif
#endif
|