1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
|
/* Optimized strncat implementation for PowerPC64/POWER7.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* The algorithm is as follows:
   if n > 7 and the address of s2 is 8-byte aligned ((s2 & 0x7UL) == 0),
     perform aligned doubleword catenation
   else
     perform byte-by-byte catenation
   The doubleword path detects the terminating NUL with the cmpb
   instruction.  */
/* char * [r3] strncat (char *s1 [r3],
			const char *s2 [r4],
			size_t size [r5])  */
#include <sysdep.h>
/* Symbol name under which this implementation is assembled.  A file that
   includes this one may pre-define STRNCAT to choose a different name.  */
#ifndef STRNCAT
# undef strncat
# define STRNCAT strncat
#endif
/* Routine called (with s1 in r3) to find the length of s1.  It may be
   pre-defined by an including file.
   NOTE(review): the default __strlen_ppc is not defined in this file --
   confirm it is available in every build configuration that assembles
   this file.  */
#ifndef STRLEN
# define STRLEN __strlen_ppc
#endif
/* Size of the stack frame created by the function: FRAME_MIN_SIZE plus
   32 bytes.  The function stores r29, r30 and r31 in the three doublewords
   just below the caller's stack pointer before creating the frame.  */
#define FRAMESIZE (FRAME_MIN_SIZE+32)
/* Assemble for the POWER7 instruction set.  */
.machine power7
/* char *strncat (char *s1, const char *s2, size_t n)

   Append at most n bytes of s2 to the end of s1, always leaving s1
   NUL-terminated, and return s1.

   In:	r3 = s1, r4 = s2, r5 = n
   Out:	r3 = s1

   Non-volatile registers (saved in the prologue, restored at L(done)):
	r29 = number of bytes that may still be copied
	r30 = original s1 (the return value)
	r31 = read cursor in s2
   Scratch: r0, r6-r10, cr0, cr7, ctr.

   The function calls STRLEN, so it saves LR and creates a stack frame.
   CFI annotations describe the frame so that unwinding through this
   function (for instance from inside STRLEN) works.  */
EALIGN(STRNCAT, 4, 0)
	CALL_MCOUNT 3

	mflr	r0			/* r0 = return address.  */

	/* Save the non-volatile registers we use just below the caller's
	   stack pointer; these slots end up inside our own frame.  */
	std	r29, -24(r1)
	cfi_offset(r29, -24)
	std	r30, -16(r1)
	cfi_offset(r30, -16)
	std	r31, -8(r1)
	cfi_offset(r31, -8)
	std	r0, 16(r1)		/* Save LR in the caller's frame.  */
	cfi_offset(lr, 16)
	stdu	r1, -FRAMESIZE(r1)	/* Create the stack frame.  */
	cfi_adjust_cfa_offset(FRAMESIZE)

	dcbt	0, r3			/* Prefetch s1.  */
	dcbt	0, r4			/* Prefetch s2.  */

	mr.	r29, r5			/* r29 = n; cr0 = (n ? 0).  */
	mr	r30, r3			/* r30 = s1, kept for the return.  */
	beq	cr0, L(done)		/* n == 0: nothing to append.  */
	mr	r31, r4			/* r31 = s2.  */
	bl	STRLEN			/* r3 = strlen (s1).  */
	nop				/* Slot after the call (TOC restore).  */
	cmpldi	cr7, r29, 7
	add	r3, r30, r3		/* r3 = address of s1's NUL, i.e. the
					   first destination byte.  */
	bgt	cr7, L(alignment)	/* n > 7: consider the doubleword
					   path.  */
	cmpldi	cr7, r29, 3
	addi	r9, r3, -1		/* r9 = destination - 1; stores below
					   use displacement 1.  */
	bgt	cr7, L(bytes_unroll)	/* 4 <= n <= 7: unrolled byte loop.  */

	/* Copy at most r29 bytes one at a time, stopping after a NUL.
	   On entry: r9 = address of the byte before the next destination
	   byte, r31 = next source byte, r29 = byte budget (non-zero).  */
L(byte_by_byte):
	lbz	r10, 0(r31)
	addi	r8, r9, 1		/* r8 = address of the byte stored
					   next.  */
	cmpdi	cr7, r10, 0
	stb	r10, 1(r9)
	beq	cr7, L(done)		/* Copied the NUL; s1 is terminated.  */
	mtctr	r29			/* ctr = byte budget; one byte of it
					   has already been copied.  */
	b	L(branch2)

	.p2align 4
L(branch1):
	lbzu	r10, 1(r31)
	cmpdi	cr7, r10, 0
	stbu	r10, 1(r8)
	beq	cr7, L(done)		/* Copied the NUL; s1 is terminated.  */
L(branch2):
	mr	r9, r8			/* r9 = address of the last byte
					   stored.  */
	bdnz	L(branch1)
	/* Budget exhausted without copying a NUL: fall through and
	   terminate the result.  */

	/* Store the terminating NUL just after the last copied byte.
	   On entry: r9 = address of the last byte stored.  */
L(nullTerminate):
	li	r10, 0
	stb	r10, 1(r9)

	.p2align 4
	/* Common exit: tear down the frame and return the original s1.  */
L(done):
	cfi_remember_state
	addi	r1, r1, FRAMESIZE	/* Pop the stack frame.  */
	cfi_adjust_cfa_offset(-FRAMESIZE)
	mr	r3, r30			/* Return value: original s1.  */
	ld	r0, 16(r1)		/* Reload the saved LR.  */
	ld	r29, -24(r1)		/* The saved copies now lie just
					   below r1.  */
	cfi_restore(r29)
	ld	r30, -16(r1)
	cfi_restore(r30)
	ld	r31, -8(r1)
	cfi_restore(r31)
	mtlr	r0
	cfi_restore(lr)
	blr
	cfi_restore_state		/* Code below still runs with the
					   frame in place.  */

	.p2align 4
	/* n > 7: use doubleword copies only if s2 is 8-byte aligned.  */
L(alignment):
	rldicl.	r9, r31, 0, 61		/* r9 = s2 & 7.  */
	beq	cr0, L(dwordAligned)

	.p2align 4
	/* Byte copy, unrolled four times.  ctr = n / 4 groups; the
	   remaining n % 4 bytes are handled via L(leftNbytes).
	   On entry: r3 = first destination byte, r31 = s2, r29 = n.  */
L(bytes_unroll):
	addi	r9, r3, -1		/* r9 = destination - 1.  */
	srdi	r10, r29, 2
	mtctr	r10			/* ctr = n / 4.  */
	b	L(L10)

	.p2align 4
	/* Bytes 1..3 of the current group of four.  */
L(L44):
	lbz	r10, 1(r31)
	cmpdi	cr7, r10, 0
	stb	r10, 2(r9)
	beq	cr7, L(done)
	addi	r31, r31, 4		/* Advance s2 past this group; the
					   remaining two loads use negative
					   displacements.  */
	lbz	r10, -2(r31)
	cmpdi	cr7, r10, 0
	stb	r10, 3(r9)
	beq	cr7, L(done)
	lbz	r10, -1(r31)
	cmpdi	cr7, r10, 0
	stbu	r10, 4(r9)		/* Store and advance r9 by 4.  */
	beq	cr7, L(done)
	bdz	L(leftNbytes)		/* All full groups copied.  */
	/* Byte 0 of the next group of four.  */
L(L10):
	lbz	r10, 0(r31)
	cmpdi	cr7, r10, 0
	stb	r10, 1(r9)
	bne	cr7, L(L44)
	b	L(done)			/* Copied the NUL; s1 is terminated.  */

	.p2align 4
	/* s2 is 8-byte aligned and n > 7: copy a doubleword at a time
	   while the doubleword holds no NUL byte.  ctr = n / 8.
	   r3 = destination cursor, r31 = source cursor, r29 = bytes left.  */
L(dwordAligned):
	srdi	r8, r29, 3		/* r8 = n / 8.  */
	li	r7, 0			/* r7 = NUL, used for the byte compare
					   and the final store.  */
	li	r10, 0			/* r10 = all-zero pattern for cmpb.  */
	mtctr	r8
L(loop8):
	ld	r9, 0(r31)		/* Next doubleword of s2.  */
	cmpb	r6, r9, r10		/* r6 has 0xff in every byte position
					   where r9 holds a zero byte.  */
	cmpdi	r6, 0
	/* NOTE(review): the '+' hint predicts this branch taken, i.e. a NUL
	   in every doubleword -- confirm that this is intended.  */
	bne+	L(a8)			/* NUL inside: finish byte-wise.  */
	std	r9, 0(r3)		/* No NUL: append all eight bytes.  */
	addi	r3, r3, 8
	addi	r31, r31, 8
	subi	r29, r29, 8
	bdnz	L(loop8)

	/* Copy the tail byte by byte: at most r29 bytes, stopping before
	   a NUL.  */
L(a8):
	cmpdi	r29, 0
	beq+	L(align8align)		/* Budget used up: just terminate.  */
	mtctr	r29
L(unaligned0):
	lbz	r9, 0(r31)
	cmpw	r9, r7
	beq+	L(align8align)		/* Reached the NUL of s2.  */
	stb	r9, 0(r3)
	addi	r3, r3, 1
	addi	r31, r31, 1
	bdnz	L(unaligned0)
L(align8align):
	stb	r7, 0(r3)		/* Terminate s1 with NUL.  */
	b	L(done)			/* Shared epilogue.  */

	.p2align 4
	/* The unrolled loop copied n / 4 groups; handle the last n % 4
	   bytes.  r9 = address of the last byte stored.  */
L(leftNbytes):
	rldicl.	r29, r29, 0, 62		/* r29 = n & 3.  */
	bne	cr0, L(byte_by_byte)	/* 1..3 bytes left to copy.  */
	b	L(nullTerminate)	/* None left: terminate the result.  */
END(STRNCAT)
|