//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//
#include "../assembly.h"
//
// __arm_sc_memset
//
#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend2 x4
#define zva_val x5
DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
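/* Replicate the fill byte into every byte lane of q0 (via z0 when SVE is
available). */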
#ifdef __ARM_FEATURE_SVE
mov z0.b, valw
#else
bfi valw, valw, #8, #8
bfi valw, valw, #16, #16
bfi val, val, #32, #32
fmov d0, val
fmov v0.d[1], val
#endif
add dstend2, dstin, count
cmp count, 96
b.hi 7f // set_long
cmp count, 16
b.hs 4f // set_medium
mov val, v0.D[0] /* val is not yet replicated in the SVE path. */
/* Set 0..15 bytes. */
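/* Stores are issued from both the start and the end of the buffer, so they
may overlap; this covers every length in the range without a byte loop. */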
tbz count, 3, 1f
str val, [dstin]
str val, [dstend2, -8]
ret
nop
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend2, -4]
ret
2: cbz count, 3f
strb valw, [dstin]
tbz count, 1, 3f
strh valw, [dstend2, -2]
3: ret
/* Set 16..96 bytes. */
4: // set_medium
str q0, [dstin]
tbnz count, 6, 6f // set96
str q0, [dstend2, -16]
tbz count, 5, 5f
str q0, [dstin, 16]
str q0, [dstend2, -32]
5: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
6: // set96
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend2, -32]
ret
.p2align 4
7: // set_long
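/* Reduce valw back to the original fill byte (it may have been replicated
into the upper bits above) so it can be tested against zero below. */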
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
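/* Use DC ZVA only when the fill value is zero and there are at least 160
bytes to set; ccmp folds both tests into the single branch below. */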
cmp count, 160
ccmp valw, 0, 0, hs
b.ne 9f // no_zva
#ifndef SKIP_ZVA_CHECK
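/* DCZID_EL0 bits [3:0] hold log2 of the ZVA block size in 4-byte words;
the value 4 means 64-byte lines, which is what zva_loop below assumes. */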
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne 9f // no_zva
#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
sub count, dstend2, dst /* Count is now 64 too large. */
sub count, count, 128 /* Adjust count and bias for loop. */
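/* The loop below zeroes whole 64-byte lines; the final 64 bytes are written
by the two 32-byte stores after the loop, which may overlap bytes the loop
already zeroed. */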
.p2align 4
8: // zva_loop
add dst, dst, 64
dc zva, dst
subs count, count, 64
b.hi 8b // zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret
9: // no_zva
sub count, dstend2, dst /* Count is 16 too large. */
sub dst, dst, 16 /* Dst is biased by -32. */
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
10: // no_zva_loop
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
b.hi 10b // no_zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)
//
// __arm_sc_memchr
//
#define srcin x0
#define chrin w1
#define cntin x2
#define result x0
#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6
/*
* Core algorithm:
*
* For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
* per byte. For each tuple, bit 0 is set if the relevant byte matched the
* requested character and bit 1 is not used (faster than using a 32-bit
* syndrome). Since the bits in the syndrome reflect exactly the order in which
* things occur in the original string, counting trailing zeros allows us to
* identify exactly which byte has matched.
*/
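/*
* For example, a match at byte 5 of a chunk (with no earlier match) sets bit
* 10 of the syndrome: counting trailing zeros gives 10, and 10 >> 1 recovers
* the byte offset 5 (the "lsr #1" in the tail code below).
*/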
DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, 4f
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
*/
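/*
* The mask bytes repeat 0x01, 0x04, 0x10, 0x40, so after the two pairwise
* addp reductions below each input byte ends up owning a distinct two-bit
* field of the final 64-bit syndrome.
*/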
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
/* Work with aligned 32-byte chunks */
bic src, srcin, #31
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
b.eq 0f
/*
* Input string is not 32-byte aligned. We calculate the syndrome
* value for the aligned 32-byte block containing the first bytes
* and mask the irrelevant part.
*/
ld1 {vdata1.16b, vdata2.16b}, [src], #32
sub tmp, soff, #32
adds cntin, cntin, tmp
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Clear the soff*2 lower bits */
lsl tmp, soff, #1
lsr synd, synd, tmp
lsl synd, synd, tmp
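/* Matches in the bytes before srcin belong to memory we were not asked to
search, so they must not be reported. */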
/* The first block can also be the last */
b.ls 2f
/* Have we found something already? */
cbnz synd, 3f
0: // loop
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
b.ls 1f
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
cbz synd, 0b
1: // end
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
b.hi 3f
2: // masklast
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
sub tmp, tmp, #32
neg tmp, tmp, lsl #1
lsl synd, synd, tmp
lsr synd, synd, tmp
3: // tail
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
sub src, src, #32
/* Check that we have found a character */
cmp synd, #0
/* And count the leading zeros */
clz synd, synd
/* Compute the potential result: synd now holds twice the byte offset of the
first match */
add result, src, synd, lsr #1
/* Select result or NULL */
csel result, xzr, result, eq
ret
4: // zero_length
mov result, #0
ret
END_COMPILERRT_FUNCTION(__arm_sc_memchr)