File: uint16_avx2.h

package info (click to toggle)
intel-compute-runtime 25.35.35096.9-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 79,324 kB
  • sloc: cpp: 926,243; lisp: 3,433; sh: 715; makefile: 162; python: 21
file content (112 lines) | stat: -rw-r--r-- 3,133 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/*
 * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"

#include <cstdint>
#include <immintrin.h>

namespace NEO {

#if __AVX2__
/**
 * Sixteen unsigned 16-bit lanes held in a single 256-bit register.
 *
 * Comparison and logical operators work lane-wise and return a lane mask
 * (0xffff where the predicate holds, 0x0000 where it does not) instead of a
 * single bool. Use operator bool() to collapse a vector to "any bit set".
 */
struct uint16x16_t { // NOLINT(readability-identifier-naming)
    enum { numChannels = 16 };

    __m256i value;

    // Default construction yields all lanes zero.
    uint16x16_t() {
        value = _mm256_setzero_si256();
    }

    // Wraps an existing register without modification.
    uint16x16_t(__m256i value) : value(value) {
    }

    // Broadcasts a into every lane.
    uint16x16_t(uint16_t a) {
        // The intrinsic takes a signed short; the cast only reinterprets the bit pattern.
        value = _mm256_set1_epi16(static_cast<short>(a)); // AVX
    }

    // Loads all lanes from memory; alignedPtr must satisfy the same contract as load().
    explicit uint16x16_t(const void *alignedPtr) {
        load(alignedPtr);
    }

    // Returns the lane at index element; element must be < numChannels.
    inline uint16_t get(unsigned int element) const {
        DEBUG_BREAK_IF(element >= numChannels);
        // Spill to a properly typed array rather than reading the register
        // object through a uint16_t pointer, which would rely on type punning.
        alignas(32) uint16_t lanes[numChannels];
        _mm256_store_si256(reinterpret_cast<__m256i *>(lanes), value); // AVX
        return lanes[element];
    }

    // All lanes 0.
    static inline uint16x16_t zero() {
        return uint16x16_t(static_cast<uint16_t>(0u));
    }

    // All lanes 1.
    static inline uint16x16_t one() {
        return uint16x16_t(static_cast<uint16_t>(1u));
    }

    // All lanes 0xffff (every bit set).
    static inline uint16x16_t mask() {
        return uint16x16_t(static_cast<uint16_t>(0xffffu));
    }

    // Loads 32 bytes from alignedPtr, which is checked for 32-byte alignment.
    inline void load(const void *alignedPtr) {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        value = _mm256_load_si256(reinterpret_cast<const __m256i *>(alignedPtr)); // AVX
    }

    // Loads 32 bytes from ptr with no alignment requirement.
    inline void loadUnaligned(const void *ptr) {
        value = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr)); // AVX
    }

    // Stores 32 bytes to alignedPtr, which is checked for 32-byte alignment.
    inline void store(void *alignedPtr) const {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        _mm256_store_si256(reinterpret_cast<__m256i *>(alignedPtr), value); // AVX
    }

    // Stores 32 bytes to ptr with no alignment requirement.
    inline void storeUnaligned(void *ptr) const {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), value); // AVX
    }

    // True when at least one bit in any lane is set.
    inline operator bool() const {
        // testz returns 1 only when (value & value) is all zeros.
        return _mm256_testz_si256(value, value) == 0; // AVX
    }

    // Lane-wise subtraction; each lane wraps modulo 2^16.
    inline uint16x16_t &operator-=(const uint16x16_t &a) {
        value = _mm256_sub_epi16(value, a.value); // AVX2
        return *this;
    }

    // Lane-wise addition; each lane wraps modulo 2^16.
    inline uint16x16_t &operator+=(const uint16x16_t &a) {
        value = _mm256_add_epi16(value, a.value); // AVX2
        return *this;
    }

    // Lane-wise unsigned a >= b; returns 0xffff in lanes where it holds, 0 elsewhere.
    inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
        uint16x16_t result;
        // AVX2 has no unsigned 16-bit compare, and the signed one misorders
        // lanes with bit 15 set. Use the identity a >= b <=> max_u(a, b) == a.
        result.value =
            _mm256_cmpeq_epi16(_mm256_max_epu16(a.value, b.value), a.value); // AVX2
        return result;
    }

    // Bitwise AND of the two registers; acts as logical AND when both operands are lane masks.
    inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
        uint16x16_t result;
        result.value = _mm256_and_si256(a.value, b.value); // AVX2
        return result;
    }

    // NOTE: uint16x16_t::blend behaves like mask ? a : b
    // Selection is per byte on the top bit of each mask byte, so mask should be
    // a full lane mask (0xffff / 0x0000) such as the result of operator>=.
    inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
        uint16x16_t result;

        // Have to swap arguments to get intended calling semantics
        result.value =
            _mm256_blendv_epi8(b.value, a.value, mask.value); // AVX2
        return result;
    }
};
#endif // __AVX2__
} // namespace NEO