1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
|
// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>
.text
#define state0 v0
#define state1 v1
#define state2 v2
#define state3 v3
#define copy0 v4
#define copy0_q q4
#define copy1 v5
#define copy2 v6
#define copy3 v7
#define copy3_d d7
#define one_d d16
#define one_q q16
#define one_v v16
#define tmp v17
#define rot8 v18
/*
* ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
* number of blocks of output with nonce 0, taking an input key and 8-bytes
* counter. Importantly does not spill to the stack.
*
* This implementation avoids d8-d15 because they are callee-save in user
* space.
*
* void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
* const uint8_t *key,
* uint32_t *counter,
* size_t nblocks)
*
* x0: output bytes
* x1: 32-byte key input
* x2: 8-byte counter input/output
* x3: number of 64-byte block to write to output
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
/* copy0 = "expand 32-byte k" */
mov_q x8, 0x3320646e61707865
mov_q x9, 0x6b20657479622d32
mov copy0.d[0], x8
mov copy0.d[1], x9
/* copy1,copy2 = key */
ld1 { copy1.4s, copy2.4s }, [x1]
/* copy3 = counter || zero nonce */
ld1 { copy3.2s }, [x2]
movi one_v.2s, #1
uzp1 one_v.4s, one_v.4s, one_v.4s
.Lblock:
/* copy state to auxiliary vectors for the final add after the permute. */
mov state0.16b, copy0.16b
mov state1.16b, copy1.16b
mov state2.16b, copy2.16b
mov state3.16b, copy3.16b
mov w4, 20
.Lpermute:
/*
* Permute one 64-byte block where the state matrix is stored in the four NEON
* registers state0-state3. It performs matrix operations on four words in parallel,
* but requires shuffling to rearrange the words after each round.
*/
.Ldoubleround:
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
rev32 state3.8h, state3.8h
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
/* state1[0,1,2,3] = state1[1,2,3,0] */
ext state1.16b, state1.16b, state1.16b, #4
/* state2[0,1,2,3] = state2[2,3,0,1] */
ext state2.16b, state2.16b, state2.16b, #8
/* state3[0,1,2,3] = state3[1,2,3,0] */
ext state3.16b, state3.16b, state3.16b, #12
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
rev32 state3.8h, state3.8h
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
/* state1[0,1,2,3] = state1[3,0,1,2] */
ext state1.16b, state1.16b, state1.16b, #12
/* state2[0,1,2,3] = state2[2,3,0,1] */
ext state2.16b, state2.16b, state2.16b, #8
/* state3[0,1,2,3] = state3[1,2,3,0] */
ext state3.16b, state3.16b, state3.16b, #4
subs w4, w4, #2
b.ne .Ldoubleround
/* output0 = state0 + state0 */
add state0.4s, state0.4s, copy0.4s
/* output1 = state1 + state1 */
add state1.4s, state1.4s, copy1.4s
/* output2 = state2 + state2 */
add state2.4s, state2.4s, copy2.4s
/* output2 = state3 + state3 */
add state3.4s, state3.4s, copy3.4s
st1 { state0.16b - state3.16b }, [x0]
/*
* ++copy3.counter, the 'add' clears the upper half of the SIMD register
* which is the expected behaviour here.
*/
add copy3_d, copy3_d, one_d
/* output += 64, --nblocks */
add x0, x0, 64
subs x3, x3, #1
b.ne .Lblock
/* counter = copy3.counter */
st1 { copy3.2s }, [x2]
/* Zero out the potentially sensitive regs, in case nothing uses these again. */
movi state0.16b, #0
movi state1.16b, #0
movi state2.16b, #0
movi state3.16b, #0
movi copy1.16b, #0
movi copy2.16b, #0
ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
emit_aarch64_feature_1_and
|