#ifdef BASE64_NEON32_USE_ASM
static BASE64_FORCE_INLINE void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
	// This function duplicates the functionality of enc_loop_neon32_inner,
	// but entirely with inline assembly. This gives a significant speedup
	// over using NEON intrinsics, which do not always generate very good
	// code. The logic of the assembly is directly lifted from the
	// intrinsics version, so it can be used as a guide to this code.

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;
	uint8x16_t mask0, mask1, mask2, mask3;

	// A lookup table containing the absolute offsets for all ranges.
	const uint8x16_t lut = {
		 65U,  71U, 252U, 252U,
		252U, 252U, 252U, 252U,
		252U, 252U, 252U, 252U,
		237U, 240U,   0U,   0U
	};
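	// Each entry is the value added (modulo 256) to a 6-bit input to
	// reach its Base64 character: index 0 maps 0..25 to 'A'..'Z' (+65),
	// index 1 maps 26..51 to 'a'..'z' (+71), indices 2..11 map 52..61
	// to '0'..'9' (252 = -4), index 12 maps 62 to '+' (237 = -19), and
	// index 13 maps 63 to '/' (240 = -16).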
	// Numeric constants.
	const uint8x16_t n51 = vdupq_n_u8(51);
	const uint8x16_t n25 = vdupq_n_u8(25);
	const uint8x16_t n63 = vdupq_n_u8(63);
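	// n63 masks values down to 6 bits; n25 and n51 mark the upper ends
	// of the 'A'..'Z' and 'a'..'z' ranges, and are used below to build
	// the LUT indices and the range #0 comparison mask.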
	__asm__ (
		// Load 48 bytes and deinterleave. The bytes are loaded to
		// hard-coded registers q12, q13 and q14, to ensure that they
		// are contiguous. Increment the source pointer.
		"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
		"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"

		// Reshuffle the bytes using temporaries.
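		// Each 3-byte group (b0, b1, b2), held in matching lanes of
		// q12..q14, is split into four 6-bit values:
		//   t0 = b0 >> 2
		//   t1 = ((b0 << 4) | (b1 >> 4)) & 0x3F
		//   t2 = ((b1 << 2) | (b2 >> 6)) & 0x3F
		//   t3 = b2 & 0x3F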
"vshr.u8 %q[t0], q12, #2 \n\t"
"vshr.u8 %q[t1], q13, #4 \n\t"
"vshr.u8 %q[t2], q14, #6 \n\t"
"vsli.8 %q[t1], q12, #4 \n\t"
"vsli.8 %q[t2], q13, #2 \n\t"
"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
"vand.u8 %q[t3], q14, %q[n63] \n\t"
// t0..t3 are the reshuffled inputs. Create LUT indices.
"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
"vqsub.u8 q15, %q[t3], %q[n51] \n\t"
// Create the mask for range #0.
"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
// Subtract -1 to correct the LUT indices.
"vsub.u8 q12, %q[m0] \n\t"
"vsub.u8 q13, %q[m1] \n\t"
"vsub.u8 q14, %q[m2] \n\t"
"vsub.u8 q15, %q[m3] \n\t"
		// Lookup the delta values.
		"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
		"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
		"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
		"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
		"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
		"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
		"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
		"vtbl.u8 d31, {%q[lut]}, d31 \n\t"

		// Add the delta values.
		"vadd.u8 q12, %q[t0] \n\t"
		"vadd.u8 q13, %q[t1] \n\t"
		"vadd.u8 q14, %q[t2] \n\t"
		"vadd.u8 q15, %q[t3] \n\t"

		// Store 64 bytes and interleave. Increment the dest pointer.
		"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
		"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"

		// Outputs (modified).
		: [src] "+r"  (*s),
		  [dst] "+r"  (*o),
		  [t0]  "=&w" (tmp0),
		  [t1]  "=&w" (tmp1),
		  [t2]  "=&w" (tmp2),
		  [t3]  "=&w" (tmp3),
		  [m0]  "=&w" (mask0),
		  [m1]  "=&w" (mask1),
		  [m2]  "=&w" (mask2),
		  [m3]  "=&w" (mask3)

		// Inputs (not modified).
		: [lut] "w" (lut),
		  [n25] "w" (n25),
		  [n51] "w" (n51),
		  [n63] "w" (n63)

		// Clobbers.
		: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
		  "cc", "memory"
	);
}
#endif
static BASE64_FORCE_INLINE void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
	enc_loop_neon32_inner_asm(s, o);
#else
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Reshuffle:
	uint8x16x4_t out = enc_reshuffle(src);

	// Translate reshuffled bytes to the Base64 alphabet:
	out = enc_translate(out);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
#endif
}
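// Plain-C sketch of what one inner round computes, kept here only as
// documentation of the expected behavior; the function name is
// illustrative and nothing in this file calls it. Sixteen groups of
// three input bytes become sixteen groups of four Base64 characters,
// and the cursors advance by 48 and 64 bytes.
static inline void
enc_loop_neon32_inner_reference (const uint8_t **s, uint8_t **o)
{
	static const char alphabet[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789+/";

	for (int i = 0; i < 16; i++) {
		const uint8_t b0 = (*s)[3 * i + 0];
		const uint8_t b1 = (*s)[3 * i + 1];
		const uint8_t b2 = (*s)[3 * i + 2];

		// Split three 8-bit bytes into four 6-bit values and map
		// each one through the alphabet.
		(*o)[4 * i + 0] = alphabet[b0 >> 2];
		(*o)[4 * i + 1] = alphabet[((b0 << 4) | (b1 >> 4)) & 0x3F];
		(*o)[4 * i + 2] = alphabet[((b1 << 2) | (b2 >> 6)) & 0x3F];
		(*o)[4 * i + 3] = alphabet[b2 & 0x3F];
	}

	*s += 48;
	*o += 64;
}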
static inline void
enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round
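	// The inner call is unrolled by hand in blocks of 8, 4, 2 and 1
	// rounds to reduce the per-iteration loop overhead.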
	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_neon32_inner(s, o);
		break;
	}
}
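// Usage sketch (names are illustrative, not part of this file): the
// caller passes cursors plus the remaining input length, then handles
// any sub-48-byte tail with a scalar encoder afterwards.
//
//	const uint8_t *src    = input;
//	uint8_t       *out    = buffer;
//	size_t         srclen = input_size;
//	size_t         outlen = 0;
//
//	enc_loop_neon32(&src, &srclen, &out, &outlen);
//	// Here srclen < 48 holds the remainder; outlen counts the
//	// Base64 bytes written so far.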