/*
 * Copyright (C) 2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */
#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include <arm_neon.h>
#include <cstdint>
namespace NEO {
// 16-lane vector of uint16_t built from two 128-bit NEON registers.
// value.val[0] holds lanes 0-7 and value.val[1] holds lanes 8-15.
struct uint16x16_t {
    enum { numChannels = 16 };

    uint16x8x2_t value;

    // Default construction zeroes every lane.
    uint16x16_t() {
        value.val[0] = vdupq_n_u16(0);
        value.val[1] = vdupq_n_u16(0);
    }

    // Builds the vector from its two halves: lo -> lanes 0-7, hi -> lanes 8-15.
    uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
        value.val[0] = lo;
        value.val[1] = hi;
    }

    // Broadcasts a to all 16 lanes.
    uint16x16_t(uint16_t a) {
        value.val[0] = vdupq_n_u16(a);
        value.val[1] = vdupq_n_u16(a);
    }

    // Loads 16 lanes from memory; see load() for the alignment check.
    explicit uint16x16_t(const void *alignedPtr) {
        load(alignedPtr);
    }

    // Returns the value stored in lane `element` (0..numChannels-1).
    // An out-of-range index trips DEBUG_BREAK_IF and yields 0, so the result
    // is well defined even if the debug check does not stop execution.
    inline uint16_t get(unsigned int element) const {
        DEBUG_BREAK_IF(element >= numChannels);
        if (element >= static_cast<unsigned int>(numChannels)) {
            return 0u;
        }
        // Pick the 8-lane half first, then the lane inside that half.
        const uint16x8_t half = value.val[element / 8u];
        // vgetq_lane requires constant immediate
        switch (element % 8u) {
        case 0:
            return vgetq_lane_u16(half, 0);
        case 1:
            return vgetq_lane_u16(half, 1);
        case 2:
            return vgetq_lane_u16(half, 2);
        case 3:
            return vgetq_lane_u16(half, 3);
        case 4:
            return vgetq_lane_u16(half, 4);
        case 5:
            return vgetq_lane_u16(half, 5);
        case 6:
            return vgetq_lane_u16(half, 6);
        default:
            return vgetq_lane_u16(half, 7);
        }
    }

    // All lanes set to 0.
    static inline uint16x16_t zero() {
        return uint16x16_t(static_cast<uint16_t>(0u));
    }

    // All lanes set to 1.
    static inline uint16x16_t one() {
        return uint16x16_t(static_cast<uint16_t>(1u));
    }

    // All lanes set to 0xffff (every bit set).
    static inline uint16x16_t mask() {
        return uint16x16_t(static_cast<uint16_t>(0xffffu));
    }

    // Reads 16 consecutive uint16_t values from alignedPtr into the vector.
    // DEBUG_BREAK_IF fires when the pointer is not 32-byte aligned.
    inline void load(const void *alignedPtr) {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
    }

    // Writes the 16 lanes to alignedPtr as consecutive uint16_t values.
    // DEBUG_BREAK_IF fires when the pointer is not 32-byte aligned.
    inline void store(void *alignedPtr) const {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
    }

    // True when at least one bit in any lane is set.
    inline operator bool() const {
        const uint64x2_t lo = vreinterpretq_u64_u16(value.val[0]);
        const uint64x2_t hi = vreinterpretq_u64_u16(value.val[1]);
        // OR-reduce 256 bits -> 128 bits -> 64 bits, then test for non-zero.
        const uint64x2_t merged = vorrq_u64(lo, hi);
        const uint64_t folded = vget_lane_u64(vorr_u64(vget_high_u64(merged), vget_low_u64(merged)), 0);
        return folded != 0u;
    }

    // Lane-wise subtraction (wraps modulo 2^16).
    inline uint16x16_t &operator-=(const uint16x16_t &a) {
        value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);
        return *this;
    }

    // Lane-wise addition (wraps modulo 2^16).
    inline uint16x16_t &operator+=(const uint16x16_t &a) {
        value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);
        return *this;
    }

    // Lane-wise unsigned a >= b: each result lane is 0xffff when true, 0 otherwise.
    inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
        uint16x16_t result;
        result.value.val[0] = vcgeq_u16(a.value.val[0], b.value.val[0]);
        result.value.val[1] = vcgeq_u16(a.value.val[1], b.value.val[1]);
        return result;
    }

    // Bitwise AND of the two vectors; intended for combining lane masks.
    // NOTE: as an overloaded operator both operands are always evaluated.
    inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
        uint16x16_t result;
        result.value.val[0] = vandq_u16(a.value.val[0], b.value.val[0]);
        result.value.val[1] = vandq_u16(a.value.val[1], b.value.val[1]);
        return result;
    }

    // NOTE: uint16x16_t::blend behaves like mask ? a : b
    // (bit-wise select: bits set in mask come from a, cleared bits from b).
    inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
        uint16x16_t result;
        result.value.val[0] = vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]);
        result.value.val[1] = vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]);
        return result;
    }
};
} // namespace NEO
 |