File: word.h

package info (click to toggle)
haskell-cryptonite 0.26-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,160 kB
  • sloc: ansic: 21,001; haskell: 16,572; makefile: 8
file content (281 lines) | stat: -rw-r--r-- 8,714 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
/* Copyright (c) 2014 Cryptography Research, Inc.
 * Released under the MIT License.  See LICENSE.txt for license information.
 */

#ifndef __WORD_H__
#define __WORD_H__

/* for posix_memalign */
#define _XOPEN_SOURCE 600
#define __STDC_WANT_LIB_EXT1__ 1 /* for memset_s */
#include <string.h>
#if defined(__sun) && defined(__SVR4)
extern int posix_memalign(void **, size_t, size_t);
#endif

#include <assert.h>
#include <stdint.h>
#include "arch_intrinsics.h"

#include <decaf/common.h>

#ifndef _BSD_SOURCE
#define _BSD_SOURCE 1
#endif

#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE 1
#endif

#include "portable_endian.h"

#include <stdlib.h>
#include <sys/types.h>
#include <inttypes.h>

#if defined(__ARM_NEON__)
#include <arm_neon.h>
#elif defined(__SSE2__)
    #if !defined(__GNUC__) || __clang__ || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
        #include <immintrin.h>
    #else
        #include <emmintrin.h>
    #endif
#endif

#if (ARCH_WORD_BITS == 64)
    typedef uint64_t word_t, mask_t;
    typedef __uint128_t dword_t;
    typedef int32_t hsword_t;
    typedef int64_t sword_t;
    typedef __int128_t dsword_t;
#elif (ARCH_WORD_BITS == 32)
    typedef uint32_t word_t, mask_t;
    typedef uint64_t dword_t;
    typedef int16_t hsword_t;
    typedef int32_t sword_t;
    typedef int64_t dsword_t;
#else
    #error "For now, libdecaf only supports 32- and 64-bit architectures."
#endif
    
/* Scalar limbs are keyed off of the API word size instead of the arch word size. */
#if CRYPTONITE_DECAF_WORD_BITS == 64
    #define SC_LIMB(x) (x##ull)
#elif CRYPTONITE_DECAF_WORD_BITS == 32
    #define SC_LIMB(x) ((uint32_t)x##ull),(x##ull>>32)
#else
    #error "For now, libdecaf only supports 32- and 64-bit architectures."
#endif

#ifdef __ARM_NEON__
    typedef uint32x4_t vecmask_t;
#elif __clang__
    typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
    typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
    typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
    typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
    typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
    typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
    typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
    typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
    typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
    typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
    typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
#else /* GCC, hopefully? */
    typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
    typedef int64_t  int64x2_t __attribute__((vector_size(16)));
    typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
    typedef int64_t  int64x4_t __attribute__((vector_size(32)));
    typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
    typedef int32_t  int32x4_t __attribute__((vector_size(16)));
    typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
    typedef int32_t  int32x2_t __attribute__((vector_size(8)));
    typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
    typedef int32_t  int32x8_t __attribute__((vector_size(32)));
    typedef word_t vecmask_t __attribute__((vector_size(32)));
#endif

#if __AVX2__
    #define VECTOR_ALIGNED __attribute__((aligned(32)))
    typedef uint32x8_t big_register_t;
    typedef uint64x4_t uint64xn_t;
    typedef uint32x8_t uint32xn_t;

    static CRYPTONITE_DECAF_INLINE big_register_t
    br_set_to_mask(mask_t x) {
        uint32_t y = (uint32_t)x;
        big_register_t ret = {y,y,y,y,y,y,y,y};
        return ret;
    }
#elif __SSE2__
    #define VECTOR_ALIGNED __attribute__((aligned(16)))
    typedef uint32x4_t big_register_t;
    typedef uint64x2_t uint64xn_t;
    typedef uint32x4_t uint32xn_t;

    static CRYPTONITE_DECAF_INLINE big_register_t
    br_set_to_mask(mask_t x) {
        uint32_t y = x;
        big_register_t ret = {y,y,y,y};
        return ret;
    }
#elif __ARM_NEON__
    #define VECTOR_ALIGNED __attribute__((aligned(16)))
    typedef uint32x4_t big_register_t;
    typedef uint64x2_t uint64xn_t;
    typedef uint32x4_t uint32xn_t;
    
    static CRYPTONITE_DECAF_INLINE big_register_t
    br_set_to_mask(mask_t x) {
        return vdupq_n_u32(x);
    }
#elif _WIN64 || __amd64__ || __X86_64__ || __aarch64__
    #define VECTOR_ALIGNED __attribute__((aligned(8)))
    typedef uint64_t big_register_t, uint64xn_t;

    typedef uint32_t uint32xn_t;
    static CRYPTONITE_DECAF_INLINE big_register_t
    br_set_to_mask(mask_t x) {
        return (big_register_t)x;
    }
#else
    #define VECTOR_ALIGNED __attribute__((aligned(4)))
    typedef uint64_t uint64xn_t;
    typedef uint32_t uint32xn_t;
    typedef uint32_t big_register_t;

    static CRYPTONITE_DECAF_INLINE big_register_t
    br_set_to_mask(mask_t x) {
        return (big_register_t)x;
    }
#endif

typedef struct {
    uint64xn_t unaligned;
} __attribute__((packed)) unaligned_uint64xn_t;

typedef struct {
    uint32xn_t unaligned;
} __attribute__((packed)) unaligned_uint32xn_t;

#if __AVX2__
    static CRYPTONITE_DECAF_INLINE big_register_t
    br_is_zero(big_register_t x) {
        return (big_register_t)(x == br_set_to_mask(0));
    }
#elif __SSE2__
    static CRYPTONITE_DECAF_INLINE big_register_t
    br_is_zero(big_register_t x) {
        return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
        //return (big_register_t)(x == br_set_to_mask(0));
    }
#elif __ARM_NEON__
    static CRYPTONITE_DECAF_INLINE big_register_t
    br_is_zero(big_register_t x) {
        return vceqq_u32(x,x^x);
    }
#else
    #define br_is_zero word_is_zero
#endif

/**
 * Really call memset, in a way that prevents the compiler from optimizing it out.
 * @param p The object to zeroize.
 * @param c The char to set it to (probably zero).
 * @param s The size of the object.
 */
#if defined(__DARWIN_C_LEVEL) || defined(__STDC_LIB_EXT1__)
#define HAS_MEMSET_S
#endif

#if !defined(__STDC_WANT_LIB_EXT1__) || __STDC_WANT_LIB_EXT1__ != 1
#define NEED_MEMSET_S_EXTERN
#endif

#ifdef HAS_MEMSET_S
    #ifdef NEED_MEMSET_S_EXTERN
        extern int memset_s(void *, size_t, int, size_t);
    #endif
    static CRYPTONITE_DECAF_INLINE void
    really_memset(void *p, char c, size_t s) {
        memset_s(p, s, c, s);
    }
#else
    /* PERF: use words? */
    static CRYPTONITE_DECAF_INLINE void
    really_memset(void *p, char c, size_t s) {
        volatile char *pv = (volatile char *)p;
        size_t i;
        for (i=0; i<s; i++) pv[i] = c;
    }
#endif

/**
 * Allocate memory which is sufficiently aligned to be used for the
 * largest vector on the system (for now that's a big_register_t).
 *
 * Man malloc says that it does this, but at least for AVX2 on MacOS X,
 * it's lying.
 *
 * @param size The size of the region to allocate.
 * @return A suitable pointer, which can be free'd with free(),
 * or NULL if no memory can be allocated.
 */
static CRYPTONITE_DECAF_INLINE void *
malloc_vector(size_t size) {
    void *out = NULL;
    
    int ret = posix_memalign(&out, sizeof(big_register_t), size);
    
    if (ret) {
        return NULL;
    } else {
        return out;
    }
}

/* PERF: vectorize vs unroll */
#ifdef __clang__
#if 100*__clang_major__ + __clang_minor__ > 305
#define UNROLL _Pragma("clang loop unroll(full)")
#endif
#endif

#ifndef UNROLL
#define UNROLL
#endif

/* The plan on booleans:
 *
 * The external interface uses cryptonite_decaf_bool_t, but this might be a different
 * size than our particular arch's word_t (and thus mask_t).  Also, the caller
 * isn't guaranteed to pass it as nonzero.  So bool_to_mask converts word sizes
 * and checks nonzero.
 *
 * On the flip side, mask_t is always -1 or 0, but it might be a different size
 * than cryptonite_decaf_bool_t.
 *
 * On the third hand, we have success vs boolean types, but that's handled in
 * common.h: it converts between cryptonite_decaf_bool_t and cryptonite_decaf_error_t.
 */
static CRYPTONITE_DECAF_INLINE cryptonite_decaf_bool_t mask_to_bool (mask_t m) {
    return (cryptonite_decaf_sword_t)(sword_t)m;
}

static CRYPTONITE_DECAF_INLINE mask_t bool_to_mask (cryptonite_decaf_bool_t m) {
    /* On most arches this will be optimized to a simple cast. */
    mask_t ret = 0;
    unsigned int limit = sizeof(cryptonite_decaf_bool_t)/sizeof(mask_t);
    if (limit < 1) limit = 1;
    for (unsigned int i=0; i<limit; i++) {
        ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
    }
    return ret;
}

static CRYPTONITE_DECAF_INLINE void ignore_result ( cryptonite_decaf_bool_t boo ) {
    (void)boo;
}

#endif /* __WORD_H__ */