1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358
|
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#define vboolf vboolf_impl
#define vboold vboold_impl
#define vint vint_impl
#define vuint vuint_impl
#define vllong vllong_impl
#define vfloat vfloat_impl
#define vdouble vdouble_impl
namespace embree
{
/* 8-wide AVX-512 64-bit long long type */
template<>
struct vllong<8>
{
ALIGNED_STRUCT_(64);
typedef vboold8 Bool;
enum { size = 8 }; // number of SIMD elements
union { // data
__m512i v; // SIMD register view
long long i[8]; // per-element scalar view (used by operator [])
};
////////////////////////////////////////////////////////////////////////////////
/// Constructors, Assignment & Cast Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline vllong() {} // intentionally uninitialized (performance)
__forceinline vllong(const vllong8& t) { v = t.v; }
__forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
__forceinline vllong(const __m512i& t) { v = t; }
__forceinline operator __m512i() const { return v; }
__forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } // low 4 elements only
// broadcast a single scalar into all 8 lanes
__forceinline vllong(long long i) {
v = _mm512_set1_epi64(i);
}
// (a,b,c,d) repeated twice: lanes are a,b,c,d,a,b,c,d
// (_mm512_set4_epi64 takes its arguments high-to-low, hence d,c,b,a)
__forceinline vllong(long long a, long long b, long long c, long long d) {
v = _mm512_set4_epi64(d,c,b,a);
}
// one explicit value per lane; a0 is lane 0 (_mm512_set_epi64 is high-to-low)
__forceinline vllong(long long a0, long long a1, long long a2, long long a3,
long long a4, long long a5, long long a6, long long a7)
{
v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0);
}
// duplicate a 4-wide vector into both 256-bit halves
__forceinline vllong(const vllong<4>& i) {
v = _mm512_broadcast_i64x4(i);
}
////////////////////////////////////////////////////////////////////////////////
/// Constants
////////////////////////////////////////////////////////////////////////////////
__forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {}
__forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {}
__forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} // lanes hold 0,1,...,7
__forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} // lanes hold 7,6,...,0
////////////////////////////////////////////////////////////////////////////////
/// Loads and Stores
////////////////////////////////////////////////////////////////////////////////
// non-temporal (cache-bypassing) store; ptr must be 64-byte aligned
static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
_mm512_stream_si512((__m512i*)ptr,a);
}
// unaligned load
static __forceinline vllong8 loadu(const void* addr) {
return _mm512_loadu_si512(addr);
}
// 64-byte aligned loads
static __forceinline vllong8 load(const vllong8* addr) {
return _mm512_load_si512(addr);
}
static __forceinline vllong8 load(const long long* addr) {
return _mm512_load_si512(addr);
}
// load 8 bytes and zero-extend each to a 64-bit lane
// NOTE(review): dereferencing (__m128i*)ptr reads 16 bytes and implies 16-byte
// alignment even though only the low 8 bytes are consumed — confirm callers
// guarantee readable, suitably aligned storage here
static __forceinline vllong8 load(const unsigned char* ptr) {
return _mm512_cvtepu8_epi64(*(__m128i*)ptr);
}
// 64-byte aligned store
static __forceinline void store(void* ptr, const vllong8& v) {
_mm512_store_si512(ptr,v);
}
// unaligned store
static __forceinline void storeu(void* ptr, const vllong8& v) {
_mm512_storeu_si512(ptr,v);
}
// masked stores: only lanes whose mask bit is set are written to memory
static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
_mm512_mask_storeu_epi64(ptr,mask,f);
}
static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
_mm512_mask_store_epi64(addr,mask,v2);
}
// pack the mask-selected lanes of v contiguously into the low lanes;
// the remaining upper lanes keep their values from v
static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
return _mm512_mask_compress_epi64(v,mask,v);
}
// pack the mask-selected lanes of b into the low lanes; upper lanes come from a
static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
return _mm512_mask_compress_epi64(a,mask,b);
}
// inverse of compact: scatter the low lanes of a to the mask-set positions,
// lanes with a clear mask bit come from b
static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) {
return _mm512_mask_expand_epi64(b,mask,a);
}
////////////////////////////////////////////////////////////////////////////////
/// Array Access
////////////////////////////////////////////////////////////////////////////////
__forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; }
__forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; }
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
// per-lane sign-bit test: mask bit k is set iff the MSB of lane k is set
__forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); }
__forceinline vllong8 operator +(const vllong8& a) { return a; }
// negation computed as 0 - a
__forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); }
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
// per-lane arithmetic/bitwise operators; the scalar overloads broadcast the
// scalar to all lanes first
__forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); }
__forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); }
__forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; }
__forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); }
__forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); }
__forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; }
// low 64 bits of the per-lane product
__forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); }
__forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); }
__forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; }
__forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); }
__forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); }
__forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; }
__forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); }
__forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); }
__forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; }
__forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); }
__forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); }
__forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; }
// uniform shift count for all lanes; operator >> is an arithmetic
// (sign-extending) right shift
__forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); }
__forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); }
// per-lane variable shift counts
__forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); }
__forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); }
// explicitly named shifts: sll = shift left logical, sra = shift right
// arithmetic, srl = shift right logical (zero-filling)
__forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); }
__forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); }
__forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); }
// per-lane signed min/max
__forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); }
__forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); }
__forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); }
__forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); }
__forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); }
__forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); }
// masked operations: lane k = m[k] ? (a op b)[k] : c[k]
__forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); }
__forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); }
__forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); }
__forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); }
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
// compound assignments, all defined in terms of the binary operators above
__forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; }
__forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; }
__forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; }
__forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; }
__forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; }
__forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; }
__forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; }
__forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; }
__forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; }
__forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; }
__forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; }
__forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; }
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
// per-lane signed comparisons; the result is an 8-bit mask (vboold8) with
// bit k set iff the comparison holds for lane k
__forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); }
__forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; }
__forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
__forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); }
__forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; }
__forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
__forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); }
__forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; }
__forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
__forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); }
__forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; }
__forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
__forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); }
__forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; }
__forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
__forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); }
__forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; }
// named comparison variants, same semantics as the operators above
__forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
__forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
__forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
__forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
__forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
// masked comparisons: lanes whose mask bit is clear yield false
__forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); }
__forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); }
__forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); }
__forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); }
__forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); }
__forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); }
/* select(m,t,f): per-lane blend — lane k = m[k] ? t[k] : f[k].
   Uses the dedicated blend intrinsic (_mm512_mask_blend_epi64 returns its
   second operand where the mask bit is set, the first otherwise), which
   states the intent directly instead of the equivalent masked-or trick. */
__forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) {
return _mm512_mask_blend_epi64(m,f,t);
}
////////////////////////////////////////////////////////////////////////////////
// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
// select elements i0/i1 within each 128-bit lane (each lane holds 2 elements);
// the imm8 replicates the (i1,i0) pattern for all four lanes of permute_pd
template<int i0, int i1>
__forceinline vllong8 shuffle(const vllong8& v) {
return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
}
// broadcast element i within each 128-bit lane
template<int i>
__forceinline vllong8 shuffle(const vllong8& v) {
return shuffle<i, i>(v);
}
// permute the 4 elements within each 256-bit half by indices (i0,i1,i2,i3)
template<int i0, int i1, int i2, int i3>
__forceinline vllong8 shuffle(const vllong8& v) {
return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
}
// select 256-bit halves: low half = half i0, high half = half i1
// (each half is expressed as its pair of 128-bit chunks for shuffle_i64x2)
template<int i0, int i1>
__forceinline vllong8 shuffle4(const vllong8& v) {
return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
}
// broadcast 256-bit half i into both halves
template<int i>
__forceinline vllong8 shuffle4(const vllong8& v) {
return shuffle4<i, i>(v);
}
/* Concatenates a (high) and b (low) into a 16-element sequence and shifts it
   right by i 64-bit elements, returning the low 8 elements; i must be in [0,8].
   Fix: dropped the stray semicolon after the function body, which produced an
   empty declaration and triggers -Wextra-semi / pedantic warnings. */
template<int i>
__forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
return _mm512_alignr_epi64(a, b, i);
}
/* Extracts lane 0 of v as a scalar long long. */
__forceinline long long toScalar(const vllong8& v)
{
const __m128i lane0 = _mm512_castsi512_si128(v); // low 128 bits, no data movement
return _mm_cvtsi128_si64(lane0);                 // element 0 of that chunk
}
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
// butterfly reductions: each step combines the vector with a permutation of
// itself, doubling the span — *2 combines adjacent element pairs, *4 combines
// pairs of pairs within each 256-bit half, and the unsuffixed version combines
// the two halves, leaving the full reduction broadcast to every lane
__forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); }
__forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
__forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
__forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); }
__forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
__forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
__forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); }
__forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
__forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); }
__forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); }
__forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); }
__forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); }
__forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); }
__forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
__forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
// scalar reductions: full vector reduction, then extract lane 0
__forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); }
__forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); }
__forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); }
__forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); }
__forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); }
////////////////////////////////////////////////////////////////////////////////
/// Permutation Operations
////////////////////////////////////////////////////////////////////////////////
/* Cross-lane permutation: lane k of the result takes the element of v
   selected by the low bits of index lane k. */
__forceinline vllong8 permute(const vllong8& v, const vllong8& index)
{
return _mm512_permutexvar_epi64(index.v, v.v);
}
/* Returns the elements of a in reversed lane order. */
__forceinline vllong8 reverse(const vllong8& a)
{
const vllong8 indices(reverse_step); // lanes 7,6,...,0
return permute(a, indices);
}
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
/* Streams a vllong8 as "<v0, v1, ..., v7>". */
__forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v)
{
cout << "<";
for (size_t k=0; k<8; k++) {
if (k != 0) cout << ", ";
cout << v[k];
}
return cout << ">";
}
}
#undef vboolf
#undef vboold
#undef vint
#undef vuint
#undef vllong
#undef vfloat
#undef vdouble
|