1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
|
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a x are extracted. If the length is zero but the
/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
(char)(len), (char)(idx)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
/// \param __y
/// Specifies the index of the least significant bit at [13:8] and the
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
/// Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
/// const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length \a len and by the index \a idx specifying the
/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a x with the specified bitfields replaced by the
/// lower bits of source operand \a y. The upper 64 bits of the return value
/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a __y with length
/// specified by bits [69:64]. These are inserted into the destination at the
/// index specified by bits [77:72]; all other bits are ignored. If bits
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
/// index and length is greater than 64, the result is undefined. If the
/// length and index are both zero, bits [63:0] of parameter \a __y are
/// inserted into parameter \a __x. If the length is zero but the index is
/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a __x with the specified bitfields replaced by the
/// lower bits of source operand \a __y. The upper 64 bits of the return
/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
/// Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
/// \param __a
/// The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(double *__p, __m128d __a)
{
__builtin_ia32_movntsd(__p, (__v2df)__a);
}
/// Stores a 32-bit single-precision floating-point value in a 32-bit
/// memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
/// \param __a
/// The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(float *__p, __m128 __a)
{
__builtin_ia32_movntss(__p, (__v4sf)__a);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __AMMINTRIN_H */
|