// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#define vboolf vboolf_impl
#define vboold vboold_impl
#define vint vint_impl
#define vuint vuint_impl
#define vllong vllong_impl
#define vfloat vfloat_impl
#define vdouble vdouble_impl
namespace embree
{
/* 8-wide AVX-512 64-bit double type */
template<>
struct vdouble<8>
{
ALIGNED_STRUCT_(64);
typedef vboold8 Bool;
enum { size = 8 }; // number of SIMD elements
union { // data
__m512d v;
double i[8];
};
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline vdouble() {}
__forceinline vdouble(const vdouble8& t) { v = t.v; }
__forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; }
__forceinline vdouble(const __m512d& t) { v = t; }
__forceinline operator __m512d() const { return v; }
__forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); }
__forceinline vdouble(double i) {
v = _mm512_set1_pd(i);
}
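// Note: _mm512_set4_pd and _mm512_set_pd fill the vector from the highest lane
// down, so the constructors below pass their arguments in reverse to keep the
// first argument in lane 0 (the 4-element constructor repeats its pattern in
// both 256-bit halves).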
__forceinline vdouble(double a, double b, double c, double d) {
v = _mm512_set4_pd(d,c,b,a);
}
__forceinline vdouble(double a0, double a1, double a2, double a3,
double a4, double a5, double a6, double a7)
{
v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0);
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {}
__forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {}
__forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}
__forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}
////////////////////////////////////////////////////////////////////////////////
/// Loads and Stores
////////////////////////////////////////////////////////////////////////////////
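// load/store expect 64-byte aligned addresses, loadu/storeu accept unaligned
// pointers, and store_nt issues a non-temporal (streaming) store that bypasses
// the caches and also requires a 64-byte aligned destination.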
static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) {
_mm512_stream_pd((double*)ptr, a);
}
static __forceinline vdouble8 loadu(const void* addr) {
return _mm512_loadu_pd((double*)addr);
}
static __forceinline vdouble8 load(const vdouble8* addr) {
return _mm512_load_pd((double*)addr);
}
static __forceinline vdouble8 load(const double* addr) {
return _mm512_load_pd(addr);
}
static __forceinline void store(void* ptr, const vdouble8& v) {
_mm512_store_pd(ptr, v);
}
static __forceinline void storeu(void* ptr, const vdouble8& v) {
_mm512_storeu_pd(ptr, v);
}
static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) {
_mm512_mask_storeu_pd(ptr, mask, f);
}
static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) {
_mm512_mask_store_pd(addr, mask, v2);
}
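// compact() uses the AVX-512 compress operation: the lanes selected by 'mask'
// are packed contiguously into the low lanes of the result, while the
// remaining lanes keep the value of the pass-through operand.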
static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) {
return _mm512_mask_compress_pd(v, mask, v);
}
static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) {
return _mm512_mask_compress_pd(a, mask, b);
}
static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); }
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; }
__forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; }
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); }
__forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); }
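// asDouble/asLLong reinterpret the underlying bits; no numeric conversion is performed.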
__forceinline vdouble8 operator +(const vdouble8& a) { return a; }
__forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); }
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); }
__forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); }
__forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; }
__forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); }
__forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); }
__forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; }
__forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); }
__forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); }
__forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; }
__forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); }
__forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); }
__forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; }
__forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); }
__forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); }
__forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; }
__forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); }
__forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); }
__forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; }
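// The shift operators below act on the raw 64-bit patterns: the doubles are
// reinterpreted as integers, shifted, and cast back. They are bitwise
// operations, not arithmetic on the double values.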
__forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); }
__forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); }
__forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); }
__forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); }
__forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); }
__forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); }
__forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); }
__forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); }
__forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); }
__forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); }
__forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); }
__forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); }
__forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); }
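// Masked arithmetic/logic: lanes selected by the mask receive the result of
// the operation on a and b, all other lanes are copied from c.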
__forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); }
__forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); }
__forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); }
__forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); }
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
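// Fused multiply-add variants: madd = a*b+c, msub = a*b-c, nmadd = -(a*b)+c,
// nmsub = -(a*b)-c, each computed with a single rounding step.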
__forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); }
__forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); }
__forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); }
__forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); }
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; }
__forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; }
__forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; }
__forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; }
__forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; }
__forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; }
__forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; }
__forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; }
__forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; }
__forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; }
__forceinline vdouble8& operator <<=(vdouble8& a, const unsigned int b) { return a = a << b; }
__forceinline vdouble8& operator >>=(vdouble8& a, const unsigned int b) { return a = a >> b; }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); }
__forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; }
__forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
__forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); }
__forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; }
__forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
__forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); }
__forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; }
__forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
__forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); }
__forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; }
__forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
__forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); }
__forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; }
__forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
__forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); }
__forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; }
__forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
__forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
__forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
__forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
__forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
__forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); }
__forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); }
__forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); }
__forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); }
__forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); }
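// select() blends t and f: the masked OR of t with itself (which is just t) is
// merged into f, so lanes whose mask bit is set take t and the rest take f.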
__forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
return _mm512_mask_or_pd(f,m,t,t);
}
////////////////////////////////////////////////////////////////////////////////
// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
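// shuffle<i0,i1> swizzles the two doubles inside each 128-bit lane,
// shuffle<i0,i1,i2,i3> permutes the four doubles inside each 256-bit half, and
// shuffle4<i0,i1> selects which 256-bit half of v ends up in each half of the
// result.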
template<int i0, int i1>
__forceinline vdouble8 shuffle(const vdouble8& v) {
return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
}
template<int i>
__forceinline vdouble8 shuffle(const vdouble8& v) {
return shuffle<i, i>(v);
}
template<int i0, int i1, int i2, int i3>
__forceinline vdouble8 shuffle(const vdouble8& v) {
return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
}
template<int i0, int i1>
__forceinline vdouble8 shuffle4(const vdouble8& v) {
return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
}
template<int i>
__forceinline vdouble8 shuffle4(const vdouble8& v) {
return shuffle4<i, i>(v);
}
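// align_shift_right concatenates a (high) and b (low) as 64-bit elements and
// shifts the pair right by i elements; toScalar below returns lane 0.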
template<int i>
__forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i));
}
__forceinline double toScalar(const vdouble8& v) {
return _mm_cvtsd_f64(_mm512_castpd512_pd128(v));
}
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
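// The reductions build a tree: vreduce_*2 combines neighbouring lanes within
// each pair, vreduce_*4 combines the pairs within each 256-bit half, and
// vreduce_* combines the two halves, leaving the full result in every lane;
// reduce_* then returns lane 0.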
__forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); }
__forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
__forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
__forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); }
__forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
__forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
__forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); }
__forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
__forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
__forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); }
__forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); }
__forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); }
////////////////////////////////////////////////////////////////////////////////
/// Permutation Operations
////////////////////////////////////////////////////////////////////////////////
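// permute() reorders lanes using a run-time index vector; reverse() uses the
// reverse_step constant (presumably lane indices 7..0, mirroring the
// ReverseStepTy constructor above) to flip the lane order.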
__forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) {
return _mm512_permutexvar_pd(index, v);
}
__forceinline vdouble8 reverse(const vdouble8& a) {
return permute(a, vllong8(reverse_step));
}
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v)
{
cout << "<" << v[0];
for (size_t i=1; i<8; i++) cout << ", " << v[i];
cout << ">";
return cout;
}
}
#undef vboolf
#undef vboold
#undef vint
#undef vuint
#undef vllong
#undef vfloat
#undef vdouble