File: XnSIMD-Neon.h

#ifndef _XN_SIMD_NEON_H_
#define _XN_SIMD_NEON_H_

#include "arm_neon.h"
#include <XnOS.h>
#include <stdio.h>

typedef int16x8_t XN_INT128;

typedef XnInt32 XN_INT32;
typedef XnInt16 XN_INT16;
typedef XnUInt64 XN_UINT64;
typedef XnUInt16 XN_UINT16;
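
/*
 * This header provides ARM NEON implementations of the XnSIMD wrapper
 * functions; each function notes the SSE intrinsic it corresponds to, and
 * XN_INT128 plays the role of __m128i.
 */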

static __inline __attribute__ ((__always_inline__))
int16x8_t XnPacksSigned16(int16x8_t a, int16x8_t b) // _mm_packs_epi16
{
	return vreinterpretq_s16_s8(vcombine_s8(vqmovn_s16(a), vqmovn_s16(b)));
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnSetZero128()  //_mm_setzero_si128
{
	return vdupq_n_s16(0);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnOr128(int16x8_t a, int16x8_t b) //_mm_or_si128
{
	return vorrq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnShiftLeft128(int16x8_t a, const int32_t imm2) //_mm_slli_si128
{
	return vreinterpretq_s16_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s16(a), 16 - imm2));
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnShiftRight128(int16x8_t a, const int32_t imm2) //_mm_srli_si128
{
	return vreinterpretq_s16_s8(vextq_s8(vreinterpretq_s8_s16(a), vdupq_n_s8(0), imm2));
}
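
/*
 * Note: vextq_s8 and the vshrq_n_* intrinsics used by the shift helpers take
 * the shift amount as an instruction immediate, so imm2/count must resolve to
 * a compile-time constant at every call site (the always_inline attribute
 * helps the compiler see a literal argument after inlining).
 */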

static __inline __attribute__ ((__always_inline__))
int16x8_t XnShiftRight16(int16x8_t a, const XN_INT32 count) //_mm_srli_epi16
{
	return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), count));
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnShiftRight16Sign(int16x8_t a, const XN_INT32 imm2) // _mm_srai_epi16
{
	return vshrq_n_s16(a, imm2);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnAnd128(int16x8_t a, int16x8_t b) // _mm_and_si128
{
	return vandq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnAndNot128(int16x8_t a, int16x8_t b) // _mm_andnot_si128
{
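	// vbicq_s16(b, a) computes b & ~a, matching _mm_andnot_si128(a, b) = (~a) & b.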
	return vbicq_s16(b, a);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnHAdd16(int16x8_t a, int16x8_t b) // _mm_hadd_epi16
{
	int16x8x2_t tmp = vuzpq_s16(a, b);
	return vaddq_s16(tmp.val[0], tmp.val[1]);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnAdd16(int16x8_t a, int16x8_t b) // _mm_add_epi16
{
	return vaddq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnAdd16AndSaturates(int16x8_t a, int16x8_t b) // _mm_adds_epi16
{
	return vqaddq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
uint16x8_t XnAddUnsigned16AndSaturates(uint16x8_t a, uint16x8_t b) //_mm_adds_epu16
{
	return vqaddq_u16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnSub16(int16x8_t a, int16x8_t b)  //_mm_sub_epi16
{
	return vsubq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnSubSigned16(int16x8_t a, int16x8_t b)  //_mm_subs_epi16
{
	return vqsubq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
uint16x8_t XnSubUnSigned16(uint16x8_t a, uint16x8_t b)  //_mm_subs_epu16
{
	return vqsubq_u16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnMult16(int16x8_t a, int16x8_t b)  //_mm_mullo_epi16
{
	return vmulq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
uint16x8_t XnMultUnSigned16(uint16x8_t a, uint16x8_t b)  //_mm_mullo_epi16
{
	return vmulq_u16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnMin16(int16x8_t a, int16x8_t b)  //_mm_min_epi16
{
	return vminq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnMax16(int16x8_t a, int16x8_t b)  //_mm_max_epi16
{
	return vmaxq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnSetOne16(XN_INT16 a) //_mm_set1_epi16
{
	return vdupq_n_s16(a);
}

static __inline __attribute__ ((__always_inline__))
uint16x8_t XnCompareEqual(int16x8_t a, int16x8_t b)  //_mm_cmpeq_epi16
{
	return vceqq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
uint16x8_t XnCompareLessThan(int16x8_t a, int16x8_t b)  //_mm_cmplt_epi16
{
	return vcltq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
uint16x8_t XnCompareGreaterThan(int16x8_t a, int16x8_t b)  //_mm_cmpgt_epi16
{
	return vcgtq_s16(a, b);
}

static __inline __attribute__ ((__always_inline__))
int16x8_t XnSet16(XN_INT16 a7, XN_INT16 a6,
				  XN_INT16 a5, XN_INT16 a4,
				  XN_INT16 a3, XN_INT16 a2,
				  XN_INT16 a1, XN_INT16 a0) // _mm_set_epi16
{
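	// Arguments are given a7..a0 to mirror _mm_set_epi16, so a0 lands in lane 0.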
	// The array may be optimized away by compiler for const input.
	const XN_INT16	temp[8] = {a0, a1, a2, a3, a4, a5, a6, a7};

	return vld1q_s16(temp);
}

static __inline __attribute__ ((__always_inline__))
uint16x8_t XnAverageUnsigned16(uint16x8_t a, uint16x8_t b)  //_mm_avg_epu16
{
	return vrhaddq_u16(a, b);
}
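
/*
 * Usage sketch (illustrative only; the variable names are hypothetical and not
 * part of OpenNI2). Code written against the SSE-style wrappers calls these
 * NEON versions the same way, e.g.:
 *
 *   XN_INT128 base   = XnSetOne16(0x0FFF);               // _mm_set1_epi16
 *   XN_INT128 sum    = XnAdd16AndSaturates(base, base);  // _mm_adds_epi16
 *   XN_INT128 masked = XnAnd128(sum, base);              // _mm_and_si128
 */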

#endif