1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
|
__device__ unsigned int mulhilo32(
unsigned int a,
unsigned int b,
unsigned int* result_high) {
*result_high = __umulhi(a, b);
return a * b;
}
__device__ uint4 single_round(uint4 ctr, uint2 key) {
constexpr unsigned long kPhiloxSA = 0xD2511F53;
constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
unsigned int hi0;
unsigned int hi1;
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
return ret;
}
__device__ uint4 philox(
unsigned long long seed,
unsigned long long subsequence,
unsigned long long offset) {
constexpr unsigned long kPhilox10A = 0x9E3779B9;
constexpr unsigned long kPhilox10B = 0xBB67AE85;
uint2 key = {};
key.x = (unsigned int)seed;
key.y = (unsigned int)(seed >> 32);
uint4 counter = make_uint4(0, 0, 0, 0);
counter.x = (unsigned int)(offset);
counter.y = (unsigned int)(offset >> 32);
counter.z = (unsigned int)(subsequence);
counter.w = (unsigned int)(subsequence >> 32);
uint4 output = {};
uint2 key_ = key;
uint4 counter_ = counter;
for (int i = 0; i < 9; i++) {
counter_ = single_round(counter_, key_);
key_.x += (kPhilox10A);
key_.y += (kPhilox10B);
}
output = single_round(counter_, key_);
return output;
}
__device__ float uniformf(unsigned int x) {
constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32.
float result = x * kRanInvM32;
return result == 1 ? 0.0f : result;
}
__device__ double uniform(unsigned int x, unsigned int y) {
constexpr double kRan2Pow53Inv = 1.1102230246251565e-16;
const unsigned long long z =
(unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
double result = z * kRan2Pow53Inv + (kRan2Pow53Inv / 2.0);
return result == 1 ? 0.0 : result;
}
__device__ double rng_uniform(const uint4& rng_result, int rng_component) {
return uniform(
(&rng_result.x)[rng_component * 2],
(&rng_result.x)[rng_component * 2 + 1]);
}
__device__ float rng_uniformf(const uint4& rng_result, int rng_component) {
return uniformf((&rng_result.x)[rng_component]);
}
|