/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include <arm_neon.h>
#include <cstdint>
namespace NEO {
// Sixteen unsigned 16-bit lanes held as a pair of 128-bit NEON registers.
// Lanes 0-7 live in value.val[0], lanes 8-15 in value.val[1].
struct uint16x16_t {
    enum { numChannels = 16 };

    uint16x8x2_t value;

    // Default construction zeroes every lane.
    uint16x16_t() {
        value.val[0] = vdupq_n_u16(0);
        value.val[1] = vdupq_n_u16(0);
    }

    // Builds the vector from its two halves: lo -> lanes 0-7, hi -> lanes 8-15.
    uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
        value.val[0] = lo;
        value.val[1] = hi;
    }

    // Broadcasts the scalar a into all sixteen lanes.
    uint16x16_t(uint16_t a) {
        value.val[0] = vdupq_n_u16(a);
        value.val[1] = vdupq_n_u16(a);
    }

    // Loads sixteen lanes from alignedPtr; see load() for the alignment check.
    explicit uint16x16_t(const void *alignedPtr) {
        load(alignedPtr);
    }

    // Returns the value of lane `element` (0..numChannels-1).
    // An out-of-range index trips DEBUG_BREAK_IF and then yields 0, so the
    // function never returns an indeterminate value regardless of how the
    // debug macro is configured.
    inline uint16_t get(unsigned int element) const {
        DEBUG_BREAK_IF(element >= numChannels);
        if (element >= numChannels) {
            return 0u;
        }

        // Pick the register holding the lane, then the lane within it.
        const uint16x8_t half = value.val[element / 8u];

        // vgetq_lane requires constant immediate
        switch (element % 8u) {
        case 0:
            return vgetq_lane_u16(half, 0);
        case 1:
            return vgetq_lane_u16(half, 1);
        case 2:
            return vgetq_lane_u16(half, 2);
        case 3:
            return vgetq_lane_u16(half, 3);
        case 4:
            return vgetq_lane_u16(half, 4);
        case 5:
            return vgetq_lane_u16(half, 5);
        case 6:
            return vgetq_lane_u16(half, 6);
        default: // 7 is the only remaining value of element % 8
            return vgetq_lane_u16(half, 7);
        }
    }

    // All lanes set to 0.
    static inline uint16x16_t zero() {
        return uint16x16_t(static_cast<uint16_t>(0u));
    }

    // All lanes set to 1.
    static inline uint16x16_t one() {
        return uint16x16_t(static_cast<uint16_t>(1u));
    }

    // All lanes set to 0xffff (every bit set).
    static inline uint16x16_t mask() {
        return uint16x16_t(static_cast<uint16_t>(0xffffu));
    }

    // Reads sixteen consecutive uint16_t values starting at alignedPtr.
    // DEBUG_BREAK_IF flags pointers that are not 32-byte aligned.
    inline void load(const void *alignedPtr) {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
    }

    // Writes the sixteen lanes to consecutive uint16_t slots at alignedPtr.
    // DEBUG_BREAK_IF flags pointers that are not 32-byte aligned.
    inline void store(void *alignedPtr) const {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
    }

    // True when at least one bit in any lane is set.
    inline operator bool() const {
        const uint64x2_t lo = vreinterpretq_u64_u16(value.val[0]);
        const uint64x2_t hi = vreinterpretq_u64_u16(value.val[1]);
        // OR the two registers together, then fold 128 bits down to 64.
        const uint64x2_t combined = vorrq_u64(lo, hi);
        const uint64_t folded = vget_lane_u64(vorr_u64(vget_high_u64(combined), vget_low_u64(combined)), 0);
        return folded != 0u;
    }

    // Lane-wise subtraction (modulo 2^16) of a from this vector.
    inline uint16x16_t &operator-=(const uint16x16_t &a) {
        value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);
        return *this;
    }

    // Lane-wise addition (modulo 2^16) of a to this vector.
    inline uint16x16_t &operator+=(const uint16x16_t &a) {
        value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);
        return *this;
    }

    // Lane-wise unsigned comparison: each result lane is 0xffff where
    // a >= b and 0 otherwise.
    inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
        return uint16x16_t(vcgeq_u16(a.value.val[0], b.value.val[0]),
                           vcgeq_u16(a.value.val[1], b.value.val[1]));
    }

    // Lane-wise bitwise AND; intended for combining comparison masks.
    inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
        return uint16x16_t(vandq_u16(a.value.val[0], b.value.val[0]),
                           vandq_u16(a.value.val[1], b.value.val[1]));
    }

    // NOTE: uint16x16_t::blend behaves like mask ? a : b
    // The selection is bitwise: each set bit in mask takes the bit from a,
    // each clear bit takes the bit from b.
    inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
        return uint16x16_t(vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]),
                           vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]));
    }
};
} // namespace NEO