1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
|
/* This file was automatically imported with
import_gcry.py. Please don't modify it */
/* asm-poly1305-aarch64.h - Poly1305 macros for ARMv8/AArch64 assembly
*
* Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GCRY_ASM_POLY1305_AARCH64_H
#define GCRY_ASM_POLY1305_AARCH64_H
#include "asm-common-aarch64.h"
/* le_to_host(reg): bring a register that was loaded from little-endian data
 * into host byte order, in place.  The expansion carries its own trailing
 * ';' so it can be dropped into a ';'-separated instruction sequence. */
#ifndef __AARCH64EL__
/* __AARCH64EL__ not defined: byte-reverse the register. */
#define le_to_host(reg) rev reg, reg;
#else
/* __AARCH64EL__ defined: the loaded value is used as is; expands to nothing. */
#define le_to_host(reg) /* nothing */
#endif
/**********************************************************************
poly1305 for stitched chacha20-poly1305 Aarch64 implementations
**********************************************************************/
/* Register allocation shared by all POLY1305_* macros in this header.
 * x8-x17 are volatile under AAPCS64; x18 is skipped; x19-x26 are
 * callee-saved and are saved/restored by POLY1305_PUSH_REGS /
 * POLY1305_POP_REGS. */
/* Base pointers: state block and message source. */
#define POLY_RSTATE x8
#define POLY_RSRC x9
/* Accumulator limbs: h = h0 + h1 * 2^64 + h2 * 2^128 (see the adds/adcs/adc
 * chains in the BLOCK_PART macros).  POLY_R_H2d is the 32-bit (w) view of
 * POLY_R_H2. */
#define POLY_R_H0 x10
#define POLY_R_H1 x11
#define POLY_R_H2 x12
#define POLY_R_H2d w12
/* Multiplier limbs r0, r1, and r1 + (r1 >> 2) as computed by
 * POLY1305_LOAD_STATE. */
#define POLY_R_R0 x13
#define POLY_R_R1 x14
#define POLY_R_R1_MUL5 x15
/* 128-bit partial-product accumulators: X0 is the 2^0 column,
 * X1 is the 2^64 column. */
#define POLY_R_X0_HI x16
#define POLY_R_X0_LO x17
#define POLY_R_X1_HI x19
#define POLY_R_X1_LO x20
/* Constant 1 (set by POLY1305_LOAD_STATE), added into h2 for every block.
 * POLY_R_ONEd is the 32-bit (w) view of POLY_R_ONE. */
#define POLY_R_ONE x21
#define POLY_R_ONEd w21
/* Scratch registers: message words and partial products. */
#define POLY_TMP0 x22
#define POLY_TMP1 x23
#define POLY_TMP2 x24
#define POLY_TMP3 x25
/* Not referenced by any macro in this header other than being saved and
 * restored by PUSH_REGS/POP_REGS.
 * NOTE(review): presumably a round counter for the including ChaCha20 code
 * -- confirm at the include site. */
#define POLY_CHACHA_ROUND x26
/* Byte offsets of the fields accessed relative to POLY_RSTATE.  Every offset
 * skips a leading 4 * 4 bytes (NOTE(review): presumably four 32-bit words at
 * the start of the state structure -- confirm against its C definition).
 * R0, R1, H0 and H1 are accessed as 64-bit values; H2d is accessed as a
 * 32-bit value (see POLY1305_LOAD_STATE / POLY1305_STORE_STATE). */
#define POLY_S_R0 (4 * 4 + 0 * 8)
#define POLY_S_R1 (4 * 4 + 1 * 8)
#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)
#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)
#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)
/* POLY1305_PUSH_REGS(): save x19-x26 on the stack.
 *
 * Four pre-decrement stp instructions push one register pair each, lowering
 * sp by 16 bytes per pair (64 bytes in total), so sp keeps its 16-byte
 * alignment.  Push order is x19/x20, x21/x22, x23/x24, x25/x26;
 * POLY1305_POP_REGS pops in the reverse order and must be used to undo this.
 *
 * After every pair the unwind information is updated through the CFI_*
 * macros, which are not defined in this file (NOTE(review): presumably
 * provided by asm-common-aarch64.h).  CFI_REG_ON_STACK is given a bare
 * register number and the slot offset of that register within the pair just
 * stored (0 or 8). */
#define POLY1305_PUSH_REGS() \
stp x19, x20, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(19, 0); \
CFI_REG_ON_STACK(20, 8); \
stp x21, x22, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(21, 0); \
CFI_REG_ON_STACK(22, 8); \
stp x23, x24, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(23, 0); \
CFI_REG_ON_STACK(24, 8); \
stp x25, x26, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(25, 0); \
CFI_REG_ON_STACK(26, 8);
/* POLY1305_POP_REGS(): restore x19-x26 saved by POLY1305_PUSH_REGS.
 *
 * Four post-increment ldp instructions pop one register pair each in the
 * reverse of the push order (x25/x26 first, x19/x20 last), raising sp by
 * 16 bytes per pair (64 bytes in total).
 *
 * After every pair the unwind information is updated through the CFI_*
 * macros, which are not defined in this file (NOTE(review): presumably
 * provided by asm-common-aarch64.h).  Note that CFI_RESTORE is given a
 * register name (x25), whereas CFI_REG_ON_STACK in POLY1305_PUSH_REGS is
 * given a bare register number (25). */
#define POLY1305_POP_REGS() \
ldp x25, x26, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x25); \
CFI_RESTORE(x26); \
ldp x23, x24, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x23); \
CFI_RESTORE(x24); \
ldp x21, x22, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x21); \
CFI_RESTORE(x22); \
ldp x19, x20, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x19); \
CFI_RESTORE(x20);
/* POLY1305_LOAD_STATE(): bring the Poly1305 state at [POLY_RSTATE] into
 * registers and set up the two derived values used by the BLOCK_PART macros.
 *
 * In:     POLY_RSTATE = address of the state block
 * Out:    POLY_R_R0, POLY_R_R1   = r limbs (64-bit loads)
 *         POLY_R_H0, POLY_R_H1   = low h limbs (64-bit loads)
 *         POLY_R_H2              = top h limb (32-bit load, upper half zero)
 *         POLY_R_R1_MUL5         = r1 + (r1 >> 2)
 *         POLY_R_ONE             = 1
 * Flags:  untouched
 *
 * r1 + (r1 >> 2) equals 5 * (r1 / 4) whenever the two low bits of r1 are
 * zero (NOTE(review): presumably guaranteed by Poly1305 key clamping --
 * confirm where the state is initialised). */
#define POLY1305_LOAD_STATE() \
	/* constant that is added into h2 for every block */ \
	mov POLY_R_ONE, #1; \
	/* multiplier r */ \
	ldr POLY_R_R0, [POLY_RSTATE, #(POLY_S_R0)]; \
	ldr POLY_R_R1, [POLY_RSTATE, #(POLY_S_R1)]; \
	/* accumulator h */ \
	ldr POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
	ldr POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
	ldr POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; \
	/* derived multiplier for the columns that wrap around 2^130 */ \
	add POLY_R_R1_MUL5, POLY_R_R1, POLY_R_R1, lsr #2;
/* POLY1305_STORE_STATE(): write the accumulator h back to the state block at
 * [POLY_RSTATE].  Only h is stored; the r fields are left as they are.
 *
 * In:     POLY_RSTATE, POLY_R_H0, POLY_R_H1, POLY_R_H2
 * Out:    memory at POLY_S_H0, POLY_S_H1 (64-bit each), POLY_S_H2d (32-bit)
 * Flags:  untouched */
#define POLY1305_STORE_STATE() \
	/* top limb: only the low 32 bits are kept */ \
	str POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; \
	/* the two 64-bit limbs */ \
	str POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
	str POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)];
/* POLY1305_BLOCK_PART1 .. POLY1305_BLOCK_PART29 together process one 16-byte
 * message block: h = (h + m + 2^128) * r, partially reduced mod 2^130-5.
 * Each macro expands to a single instruction (PART3/PART4 may expand to
 * nothing), NOTE(review): presumably so that the steps can be interleaved
 * ("stitched") with ChaCha20 instructions by the including file.
 *
 * The parts must be issued in numeric order, and the carry flag is live
 * between some of them; any code placed in between must not modify NZCV
 * while it is live.
 *
 * PART1-PART7: load the block and add it to h.
 *   PART1/PART2  load the low/high 64-bit half of the block from
 *                POLY_RSRC + src_offset into POLY_TMP0/POLY_TMP1.
 *   PART3/PART4  convert both halves from little-endian to host order.
 *   PART5-PART7  h0 += TMP0; h1 += TMP1 + carry; h2 += POLY_R_ONE + carry
 *                (32-bit add).  The carry flag is live from PART5 to PART7. */
#define POLY1305_BLOCK_PART1(src_offset) \
/* a = h + m */ \
ldr POLY_TMP0, [POLY_RSRC, #((src_offset) + 0 * 8)];
#define POLY1305_BLOCK_PART2(src_offset) \
ldr POLY_TMP1, [POLY_RSRC, #((src_offset) + 1 * 8)];
#define POLY1305_BLOCK_PART3() \
le_to_host(POLY_TMP0);
#define POLY1305_BLOCK_PART4() \
le_to_host(POLY_TMP1);
#define POLY1305_BLOCK_PART5() \
adds POLY_R_H0, POLY_R_H0, POLY_TMP0;
#define POLY1305_BLOCK_PART6() \
adcs POLY_R_H1, POLY_R_H1, POLY_TMP1;
#define POLY1305_BLOCK_PART7() \
adc POLY_R_H2d, POLY_R_H2d, POLY_R_ONEd;
/* PART8-PART23: multiply h (h0, h1, h2) by r (r0, r1).
 *
 * Partial products are collected in two 128-bit accumulators:
 *   X1 (2^64 column) = h0 * r1 + h1 * r0            PART8,9,11,12,13,16
 *   X0 (2^0 column)  = h0 * r0 + h1 * POLY_R_R1_MUL5 PART10,14,15,17,18,20
 * Products that would land at 2^128 or above and involve r1 use
 * POLY_R_R1_MUL5 instead of r1, which folds them back by 2^130 = 5
 * (mod 2^130-5).
 *
 * The h2 products use mul only (low 64 bits); this relies on h2 being small
 * (NOTE(review): should hold while h is kept partially reduced -- confirm).
 *   PART19  H1 = h2 * POLY_R_R1_MUL5   (overwrites h1; last read in PART17)
 *   PART21  H2 = h2 * r0
 *   PART22  H1 += X1_LO
 *   PART23  H0 = H2 + X1_HI + carry    (overwrites h0; last read in PART15)
 *
 * After PART23 the product is X0 + H1 * 2^64 + H0 * 2^128, i.e. POLY_R_H0
 * now holds the TOP limb, not the bottom one.
 *
 * Carry flag is live: PART12 -> PART16, PART18 -> PART20, PART22 -> PART23.
 * The mul/umulh instructions issued inside those windows do not alter
 * flags. */
#define POLY1305_BLOCK_PART8() \
/* h = a * r (partial mod 2^130-5): */ \
mul POLY_R_X1_LO, POLY_R_H0, POLY_R_R1; /* lo: h0 * r1 */
#define POLY1305_BLOCK_PART9() \
mul POLY_TMP0, POLY_R_H1, POLY_R_R0; /* lo: h1 * r0 */
#define POLY1305_BLOCK_PART10() \
mul POLY_R_X0_LO, POLY_R_H0, POLY_R_R0; /* lo: h0 * r0 */
#define POLY1305_BLOCK_PART11() \
umulh POLY_R_X1_HI, POLY_R_H0, POLY_R_R1; /* hi: h0 * r1 */
#define POLY1305_BLOCK_PART12() \
adds POLY_R_X1_LO, POLY_R_X1_LO, POLY_TMP0;
#define POLY1305_BLOCK_PART13() \
umulh POLY_TMP1, POLY_R_H1, POLY_R_R0; /* hi: h1 * r0 */
#define POLY1305_BLOCK_PART14() \
mul POLY_TMP2, POLY_R_H1, POLY_R_R1_MUL5; /* lo: h1 * r1 mod 2^130-5 */
#define POLY1305_BLOCK_PART15() \
umulh POLY_R_X0_HI, POLY_R_H0, POLY_R_R0; /* hi: h0 * r0 */
#define POLY1305_BLOCK_PART16() \
adc POLY_R_X1_HI, POLY_R_X1_HI, POLY_TMP1;
#define POLY1305_BLOCK_PART17() \
umulh POLY_TMP3, POLY_R_H1, POLY_R_R1_MUL5; /* hi: h1 * r1 mod 2^130-5 */
#define POLY1305_BLOCK_PART18() \
adds POLY_R_X0_LO, POLY_R_X0_LO, POLY_TMP2;
#define POLY1305_BLOCK_PART19() \
mul POLY_R_H1, POLY_R_H2, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */
#define POLY1305_BLOCK_PART20() \
adc POLY_R_X0_HI, POLY_R_X0_HI, POLY_TMP3;
#define POLY1305_BLOCK_PART21() \
mul POLY_R_H2, POLY_R_H2, POLY_R_R0; /* h2 * r0 */
#define POLY1305_BLOCK_PART22() \
adds POLY_R_H1, POLY_R_H1, POLY_R_X1_LO;
#define POLY1305_BLOCK_PART23() \
adc POLY_R_H0, POLY_R_H2, POLY_R_X1_HI;
/* PART24-PART29: fold the product back into three limbs h0, h1, h2.
 *
 * On entry (state left by PART23) the product is
 *   X0_HI:X0_LO + POLY_R_H1 * 2^64 + POLY_R_H0 * 2^128,
 * so POLY_R_H0 temporarily holds the top limb.
 *
 *   PART24      H2 = H0 & 3          bits 128..129 become the new h2
 *   PART25/26   H0 = (H0 >> 2) * 5   everything from bit 130 upwards is
 *                                    folded down using 2^130 = 5 (mod 2^130-5)
 *   PART27      H0 += X0_LO          new h0
 *   PART28      H1 += X0_HI + carry  new h1
 *   PART29      H2 += carry          new h2 (32-bit add)
 *
 * The carry flag is live from PART27 to PART29.  The result is only
 * partially reduced: h2 may exceed 3 after the final carry. */
#define POLY1305_BLOCK_PART24() \
/* carry propagation */ \
and POLY_R_H2, POLY_R_H0, #3;
#define POLY1305_BLOCK_PART25() \
lsr POLY_R_H0, POLY_R_H0, #2;
#define POLY1305_BLOCK_PART26() \
add POLY_R_H0, POLY_R_H0, POLY_R_H0, lsl #2;
#define POLY1305_BLOCK_PART27() \
adds POLY_R_H0, POLY_R_H0, POLY_R_X0_LO;
#define POLY1305_BLOCK_PART28() \
adcs POLY_R_H1, POLY_R_H1, POLY_R_X0_HI;
#define POLY1305_BLOCK_PART29() \
adc POLY_R_H2d, POLY_R_H2d, wzr;
//#define TESTING_POLY1305_ASM
#ifdef TESTING_POLY1305_ASM
/* for testing only. */
/*
 * _gcry_poly1305_aarch64_blocks1: stand-alone driver that runs the
 * POLY1305_BLOCK_PART* macros back to back, one 16-byte block per loop
 * iteration.  Only assembled when TESTING_POLY1305_ASM is defined.
 *
 * Every block has POLY_R_ONE added into h2, so only full 16-byte blocks can
 * be processed with this routine.
 *
 * Clobbers x2, x8-x17 and the flags; x19-x26 are saved and restored.
 */
.align 3
.globl _gcry_poly1305_aarch64_blocks1
ELF(.type _gcry_poly1305_aarch64_blocks1,%function;)
_gcry_poly1305_aarch64_blocks1:
	/* input:
	 *	x0: poly1305-state
	 *	x1: src
	 *	x2: nblks (number of 16-byte blocks; 0 is allowed)
	 * output:
	 *	x0: always 0
	 */
	CFI_STARTPROC()
	POLY1305_PUSH_REGS();
	mov POLY_RSTATE, x0;
	mov POLY_RSRC, x1;
	POLY1305_LOAD_STATE();
	/* The loop below decrements before testing, so entering it with
	 * nblks == 0 would wrap the counter and run ~2^64 iterations, reading
	 * far past src.  Skip straight to the common exit instead; the state
	 * that was just loaded is stored back unchanged. */
	cbz x2, .L_gcry_poly1305_aarch64_done1;
.L_gcry_poly1305_aarch64_loop1:
	/* Load the block, then advance src; PART3 onwards no longer
	 * dereference POLY_RSRC. */
	POLY1305_BLOCK_PART1(0 * 16);
	POLY1305_BLOCK_PART2(0 * 16);
	add POLY_RSRC, POLY_RSRC, #16;
	/* h += m + 2^128 */
	POLY1305_BLOCK_PART3();
	POLY1305_BLOCK_PART4();
	POLY1305_BLOCK_PART5();
	POLY1305_BLOCK_PART6();
	POLY1305_BLOCK_PART7();
	/* h *= r */
	POLY1305_BLOCK_PART8();
	POLY1305_BLOCK_PART9();
	POLY1305_BLOCK_PART10();
	POLY1305_BLOCK_PART11();
	POLY1305_BLOCK_PART12();
	POLY1305_BLOCK_PART13();
	POLY1305_BLOCK_PART14();
	POLY1305_BLOCK_PART15();
	POLY1305_BLOCK_PART16();
	POLY1305_BLOCK_PART17();
	POLY1305_BLOCK_PART18();
	POLY1305_BLOCK_PART19();
	POLY1305_BLOCK_PART20();
	POLY1305_BLOCK_PART21();
	POLY1305_BLOCK_PART22();
	POLY1305_BLOCK_PART23();
	/* partial reduction / carry propagation */
	POLY1305_BLOCK_PART24();
	POLY1305_BLOCK_PART25();
	POLY1305_BLOCK_PART26();
	POLY1305_BLOCK_PART27();
	POLY1305_BLOCK_PART28();
	POLY1305_BLOCK_PART29();
	/* nblks--; repeat while blocks remain */
	subs x2, x2, #1;
	b.ne .L_gcry_poly1305_aarch64_loop1;
.L_gcry_poly1305_aarch64_done1:
	POLY1305_STORE_STATE();
	mov x0, #0;
	POLY1305_POP_REGS();
	ret_spec_stop;
	CFI_ENDPROC()
ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;)
#endif
#endif /* GCRY_ASM_POLY1305_AARCH64_H */
|