1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281
|
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __WORD_H__
#define __WORD_H__
/* for posix_memalign */
#define _XOPEN_SOURCE 600
#define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
#include <string.h>
#if defined(__sun) && defined(__SVR4)
extern int posix_memalign(void **, size_t, size_t);
#endif
#include <assert.h>
#include <stdint.h>
#include "arch_intrinsics.h"
#include <decaf/common.h>
#ifndef _BSD_SOURCE
#define _BSD_SOURCE 1
#endif
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE 1
#endif
#include "portable_endian.h"
#include <stdlib.h>
#include <sys/types.h>
#include <inttypes.h>
#if defined(__ARM_NEON__)
#include <arm_neon.h>
#elif defined(__SSE2__)
#if !defined(__GNUC__) || __clang__ || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
#include <immintrin.h>
#else
#include <emmintrin.h>
#endif
#endif
#if (ARCH_WORD_BITS == 64)
typedef uint64_t word_t, mask_t;
typedef __uint128_t dword_t;
typedef int32_t hsword_t;
typedef int64_t sword_t;
typedef __int128_t dsword_t;
#elif (ARCH_WORD_BITS == 32)
typedef uint32_t word_t, mask_t;
typedef uint64_t dword_t;
typedef int16_t hsword_t;
typedef int32_t sword_t;
typedef int64_t dsword_t;
#else
#error "For now, libdecaf only supports 32- and 64-bit architectures."
#endif
/* Scalar limbs are keyed off of the API word size instead of the arch word size. */
#if CRYPTONITE_DECAF_WORD_BITS == 64
#define SC_LIMB(x) (x##ull)
#elif CRYPTONITE_DECAF_WORD_BITS == 32
#define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
#else
#error "For now, libdecaf only supports 32- and 64-bit architectures."
#endif
#ifdef __ARM_NEON__
typedef uint32x4_t vecmask_t;
#elif __clang__
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
#else /* GCC, hopefully? */
typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
typedef int64_t int64x2_t __attribute__((vector_size(16)));
typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
typedef int64_t int64x4_t __attribute__((vector_size(32)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16)));
typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
typedef int32_t int32x2_t __attribute__((vector_size(8)));
typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
typedef int32_t int32x8_t __attribute__((vector_size(32)));
typedef word_t vecmask_t __attribute__((vector_size(32)));
#endif
#if __AVX2__
#define VECTOR_ALIGNED __attribute__((aligned(32)))
typedef uint32x8_t big_register_t;
typedef uint64x4_t uint64xn_t;
typedef uint32x8_t uint32xn_t;
static CRYPTONITE_DECAF_INLINE big_register_t
br_set_to_mask(mask_t x) {
uint32_t y = (uint32_t)x;
big_register_t ret = {y,y,y,y,y,y,y,y};
return ret;
}
#elif __SSE2__
#define VECTOR_ALIGNED __attribute__((aligned(16)))
typedef uint32x4_t big_register_t;
typedef uint64x2_t uint64xn_t;
typedef uint32x4_t uint32xn_t;
static CRYPTONITE_DECAF_INLINE big_register_t
br_set_to_mask(mask_t x) {
uint32_t y = x;
big_register_t ret = {y,y,y,y};
return ret;
}
#elif __ARM_NEON__
#define VECTOR_ALIGNED __attribute__((aligned(16)))
typedef uint32x4_t big_register_t;
typedef uint64x2_t uint64xn_t;
typedef uint32x4_t uint32xn_t;
static CRYPTONITE_DECAF_INLINE big_register_t
br_set_to_mask(mask_t x) {
return vdupq_n_u32(x);
}
#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
#define VECTOR_ALIGNED __attribute__((aligned(8)))
typedef uint64_t big_register_t, uint64xn_t;
typedef uint32_t uint32xn_t;
static CRYPTONITE_DECAF_INLINE big_register_t
br_set_to_mask(mask_t x) {
return (big_register_t)x;
}
#else
#define VECTOR_ALIGNED __attribute__((aligned(4)))
typedef uint64_t uint64xn_t;
typedef uint32_t uint32xn_t;
typedef uint32_t big_register_t;
static CRYPTONITE_DECAF_INLINE big_register_t
br_set_to_mask(mask_t x) {
return (big_register_t)x;
}
#endif
typedef struct {
uint64xn_t unaligned;
} __attribute__((packed)) unaligned_uint64xn_t;
typedef struct {
uint32xn_t unaligned;
} __attribute__((packed)) unaligned_uint32xn_t;
#if __AVX2__
static CRYPTONITE_DECAF_INLINE big_register_t
br_is_zero(big_register_t x) {
return (big_register_t)(x == br_set_to_mask(0));
}
#elif __SSE2__
static CRYPTONITE_DECAF_INLINE big_register_t
br_is_zero(big_register_t x) {
return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
//return (big_register_t)(x == br_set_to_mask(0));
}
#elif __ARM_NEON__
static CRYPTONITE_DECAF_INLINE big_register_t
br_is_zero(big_register_t x) {
return vceqq_u32(x,x^x);
}
#else
#define br_is_zero word_is_zero
#endif
/**
* Really call memset, in a way that prevents the compiler from optimizing it out.
* @param p The object to zeroize.
* @param c The char to set it to (probably zero).
* @param s The size of the object.
*/
#if defined(__DARWIN_C_LEVEL) || defined(__STDC_LIB_EXT1__)
#define HAS_MEMSET_S
#endif
#if !defined(__STDC_WANT_LIB_EXT1__) || __STDC_WANT_LIB_EXT1__ != 1
#define NEED_MEMSET_S_EXTERN
#endif
#ifdef HAS_MEMSET_S
#ifdef NEED_MEMSET_S_EXTERN
extern int memset_s(void *, size_t, int, size_t);
#endif
static CRYPTONITE_DECAF_INLINE void
really_memset(void *p, char c, size_t s) {
memset_s(p, s, c, s);
}
#else
/* PERF: use words? */
static CRYPTONITE_DECAF_INLINE void
really_memset(void *p, char c, size_t s) {
volatile char *pv = (volatile char *)p;
size_t i;
for (i=0; i<s; i++) pv[i] = c;
}
#endif
/**
* Allocate memory which is sufficiently aligned to be used for the
* largest vector on the system (for now that's a big_register_t).
*
* Man malloc says that it does this, but at least for AVX2 on MacOS X,
* it's lying.
*
* @param size The size of the region to allocate.
* @return A suitable pointer, which can be free'd with free(),
* or NULL if no memory can be allocated.
*/
static CRYPTONITE_DECAF_INLINE void *
malloc_vector(size_t size) {
void *out = NULL;
int ret = posix_memalign(&out, sizeof(big_register_t), size);
if (ret) {
return NULL;
} else {
return out;
}
}
/* PERF: vectorize vs unroll */
#ifdef __clang__
#if 100*__clang_major__ + __clang_minor__ > 305
#define UNROLL _Pragma("clang loop unroll(full)")
#endif
#endif
#ifndef UNROLL
#define UNROLL
#endif
/* The plan on booleans:
*
* The external interface uses cryptonite_decaf_bool_t, but this might be a different
* size than our particular arch's word_t (and thus mask_t). Also, the caller
* isn't guaranteed to pass it as nonzero. So bool_to_mask converts word sizes
* and checks nonzero.
*
* On the flip side, mask_t is always -1 or 0, but it might be a different size
* than cryptonite_decaf_bool_t.
*
* On the third hand, we have success vs boolean types, but that's handled in
* common.h: it converts between cryptonite_decaf_bool_t and cryptonite_decaf_error_t.
*/
static CRYPTONITE_DECAF_INLINE cryptonite_decaf_bool_t mask_to_bool (mask_t m) {
return (cryptonite_decaf_sword_t)(sword_t)m;
}
static CRYPTONITE_DECAF_INLINE mask_t bool_to_mask (cryptonite_decaf_bool_t m) {
/* On most arches this will be optimized to a simple cast. */
mask_t ret = 0;
unsigned int limit = sizeof(cryptonite_decaf_bool_t)/sizeof(mask_t);
if (limit < 1) limit = 1;
for (unsigned int i=0; i<limit; i++) {
ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
}
return ret;
}
static CRYPTONITE_DECAF_INLINE void ignore_result ( cryptonite_decaf_bool_t boo ) {
(void)boo;
}
#endif /* __WORD_H__ */
|