File: simd.h

package info (click to toggle)
ruby-oj 3.16.15-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 14,208 kB
  • sloc: ansic: 20,028; ruby: 11,642; sh: 70; makefile: 17
file content (172 lines) | stat: -rw-r--r-- 5,556 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#ifndef OJ_SIMD_H
#define OJ_SIMD_H

// SIMD architecture detection and configuration
// This header provides unified SIMD support across different CPU architectures
// with cross-platform runtime detection (Windows/Linux/Mac)

// SIMD implementation enum - used for runtime selection
typedef enum _simd_implementation { SIMD_NONE, SIMD_NEON, SIMD_SSE2, SIMD_SSE42 } SIMD_Implementation;

// Define in oj.c.
extern SIMD_Implementation SIMD_Impl;

// Runtime CPU detection function (implemented in oj.c)
SIMD_Implementation oj_get_simd_implementation(void);

// =============================================================================
// Compiler compatibility macros
// =============================================================================

// Branch prediction hints
#if defined(__GNUC__) || defined(__clang__)
#define OJ_LIKELY(x) __builtin_expect(!!(x), 1)
#define OJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define OJ_LIKELY(x) (x)
#define OJ_UNLIKELY(x) (x)
#endif

// Prefetch hints
#if defined(__GNUC__) || defined(__clang__)
#define OJ_PREFETCH(addr) __builtin_prefetch(addr, 0, 0)
#elif defined(_MSC_VER)
#include <intrin.h>
#define OJ_PREFETCH(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0)
#else
#define OJ_PREFETCH(addr) ((void)0)
#endif

// Count trailing zeros (for SSE2 mask scanning)
#if defined(__GNUC__) || defined(__clang__)
#define OJ_CTZ(x) __builtin_ctz(x)
#elif defined(_MSC_VER)
#include <intrin.h>
static __inline int oj_ctz_msvc(unsigned int x) {
    unsigned long index;
    _BitScanForward(&index, x);
    return (int)index;
}
#define OJ_CTZ(x) oj_ctz_msvc(x)
#else
// Fallback: naive implementation
static inline int oj_ctz_fallback(unsigned int x) {
    int count = 0;
    while ((x & 1) == 0 && count < 32) {
        x >>= 1;
        count++;
    }
    return count;
}
#define OJ_CTZ(x) oj_ctz_fallback(x)
#endif

// =============================================================================
// x86/x86_64 SIMD detection
// =============================================================================
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define HAVE_SIMD_X86 1

// Include appropriate SIMD headers
#if defined(_MSC_VER)
// MSVC: use intrin.h for all intrinsics
#include <intrin.h>
#define HAVE_SIMD_SSE4_2 1
#define HAVE_SIMD_SSE2 1
#elif defined(__GNUC__) || defined(__clang__)
// GCC/Clang: check for header availability and include them
// We include headers but use target attributes to enable instructions per-function
// Include cpuid.h for __get_cpuid fallback when __builtin_cpu_supports is unavailable
#if __has_include(<cpuid.h>)
#include <cpuid.h>
#endif
#if defined(__SSE4_2__) || defined(__SSE2__)
// If any SSE is enabled globally, x86intrin.h should be available
#include <x86intrin.h>
#define HAVE_SIMD_SSE4_2 1
#define HAVE_SIMD_SSE2 1
#else
// Try to include headers anyway for target attribute functions
#if __has_include(<x86intrin.h>)
#include <x86intrin.h>
#define HAVE_SIMD_SSE4_2 1
#define HAVE_SIMD_SSE2 1
#elif __has_include(<nmmintrin.h>)
#include <nmmintrin.h>
#define HAVE_SIMD_SSE4_2 1
#define HAVE_SIMD_SSE2 1
#elif __has_include(<emmintrin.h>)
#include <emmintrin.h>
#define HAVE_SIMD_SSE2 1
#endif
#endif
#endif

// Target attribute macros for function-level SIMD enabling
#if defined(__clang__) || defined(__GNUC__)
#define OJ_TARGET_SSE42 __attribute__((target("sse4.2")))
#define OJ_TARGET_SSE2 __attribute__((target("sse2")))
#else
// MSVC doesn't need target attributes - intrinsics are always available
#define OJ_TARGET_SSE42
#define OJ_TARGET_SSE2
#endif

#endif  // x86/x86_64

// =============================================================================
// ARM NEON detection
// =============================================================================
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#define HAVE_SIMD_NEON 1
#define SIMD_MINIMUM_THRESHOLD 6
#include <arm_neon.h>
#endif

// =============================================================================
// SIMD type string for debugging/logging
// =============================================================================
#if defined(HAVE_SIMD_SSE4_2) || defined(HAVE_SIMD_SSE2)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "x86 (runtime detected)"
#elif defined(HAVE_SIMD_NEON)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "NEON"
#else
#define SIMD_TYPE "none"
#endif

#if defined(HAVE_SIMD_SSE4_2)

#define SIMD_MINIMUM_THRESHOLD 6

extern void initialize_sse42(void);

static inline OJ_TARGET_SSE42 __m128i vector_lookup_sse42(__m128i input, __m128i *lookup_table, int tab_size) {
    // Extract high 4 bits to determine which 16-byte chunk (0-15)
    __m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0F));

    // Extract low 4 bits for index within the chunk (0-15)
    __m128i low_index = _mm_and_si128(input, _mm_set1_epi8(0x0F));

    // Perform lookups in all 16 tables
    __m128i results[16];
    for (int i = 0; i < tab_size; i++) {
        results[i] = _mm_shuffle_epi8(lookup_table[i], low_index);
    }

    // Create masks for each chunk and blend results
    __m128i final_result = _mm_setzero_si128();

    for (int i = 0; i < tab_size; i++) {
        __m128i mask          = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8(i));
        __m128i masked_result = _mm_and_si128(mask, results[i]);
        final_result          = _mm_or_si128(final_result, masked_result);
    }

    return final_result;
}

#endif

#endif /* OJ_SIMD_H */