/* Copyright (C) 2000-2016 Free Software Foundation, Inc.
Contributed by Richard Henderson (rth@tamu.edu)
EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/*
 * Assembler setup for the routine below:
 *   .arch ev6        assemble for the EV6 instruction set (the code below
 *                    uses wh64, which presumably needs it -- confirm
 *                    against the assembler documentation).
 *   .set noat        the assembler must not use the AT scratch register
 *                    on its own; the PROF prologue below names AT
 *                    explicitly.
 *   .set noreorder   keep instructions in the order written; the routine
 *                    is hand-scheduled and padded with nops.
 */
.arch ev6
.set noat
.set noreorder
/*
 * memset -- fill a memory region with a byte value.
 *
 * Register usage, as read from the code below:
 *   $16  in:  destination address
 *   $17  in:  fill value; only the low byte is used, and it is replicated
 *             into all eight bytes of $17 before any store
 *   $18  in:  byte count
 *   $0   out: the original destination address
 *   $6        destination + count (address just past the region)
 *   $5        running store pointer
 *   $3        misalignment first, later the number of whole quads left
 *   $1, $2, $4, $7  scratch
 *
 * Overall flow:
 *   1. count <= 0                         -> $end (nothing written)
 *   2. region lies inside one aligned quad -> $within_quad
 *   3. otherwise: merge a partial head quad if the destination is
 *      misaligned, store whole quads ($loop, or the unrolled $do_wh64
 *      loop when at least 16 quads remain), then merge a partial tail
 *      quad at $no_quad.
 *
 * The trailing "# E / U / L" annotations on each instruction are the
 * original author's scheduling notes.
 */
ENTRY(memset)
/* When PROF is defined, load gp and call _mcount before the body. */
#ifdef PROF
ldgp gp, 0(pv)
lda AT, _mcount
jsr AT, (AT), _mcount
.prologue 1
#else
.prologue 0
#endif
/*
* Serious stalling happens. The only way to mitigate this is to
* undertake a major re-write to interleave the constant materialization
* with other parts of the fall-through code. This is important, even
* though it makes maintenance tougher.
* Do this later.
*/
/*
 * Build the 8-byte fill pattern in $17 (the "ch" pictures in the
 * trailing comments show which bytes hold the fill character), while
 * also setting the return value and computing the end address.
 * NOTE(review): ble is a signed test, so a count with the top bit set
 * takes the same early exit as a zero count.
 */
and $17, 255, $1 # E : 00000000000000ch
insbl $17, 1, $2 # U : 000000000000ch00
mov $16, $0 # E : return value
ble $18, $end # U : zero length requested?
/* $6 = dest + count, i.e. the first address past the region. */
addq $18, $16, $6 # E : max address to write to
or $1, $2, $17 # E : 000000000000chch
insbl $1, 2, $3 # U : 0000000000ch0000
insbl $1, 3, $4 # U : 00000000ch000000
or $3, $4, $3 # E : 00000000chch0000
inswl $17, 4, $5 # U : 0000chch00000000
/*
 * $1 = (dest ^ end) with the low three bits cleared: zero exactly when
 * dest and end fall in the same aligned quadword.
 */
xor $16, $6, $1 # E : will complete write be within one quadword?
inswl $17, 6, $2 # U : chch000000000000
or $17, $3, $17 # E : 00000000chchchch
or $2, $5, $2 # E : chchchch00000000
bic $1, 7, $1 # E : fit within a single quadword?
and $16, 7, $3 # E : Target addr misalignment
or $17, $2, $17 # E : chchchchchchchch
beq $1, $within_quad # U :
nop # E :
beq $3, $aligned # U : target is 0mod8
/*
* Target address is misaligned, and won't fit within a quadword.
*/
/*
 * Head merge: load the quad containing dest, keep its bytes below the
 * start offset, fill the rest with the pattern, and store it back.
 * $3 becomes (misalignment - 8), a negative number, so that
 *   $18 += $3   removes the head bytes from the count, and
 *   $16 -= $3   advances dest to the next quad boundary.
 */
ldq_u $4, 0($16) # L : Fetch first partial
mov $16, $5 # E : Save the address
insql $17, $16, $2 # U : Insert new bytes
subq $3, 8, $3 # E : Invert (for addressing uses)
addq $18, $3, $18 # E : $18 is new count ($3 is negative)
mskql $4, $16, $4 # U : clear relevant parts of the quad
subq $16, $3, $16 # E : $16 is new aligned destination
or $2, $4, $1 # E : Final bytes
nop
stq_u $1,0($5) # L : Store result
nop
nop
.align 4
$aligned:
/*
* We are now guaranteed to be quad aligned, with at least
* one partial quad to write.
*/
/* Split the remaining count: $3 = whole quads, $18 = leftover bytes. */
sra $18, 3, $3 # U : Number of remaining quads to write
and $18, 7, $18 # E : Number of trailing bytes to write
mov $16, $5 # E : Save dest address
beq $3, $no_quad # U : tail stuff only
/*
* It's worth the effort to unroll this and use wh64 if possible.
* At this point, entry values are:
* $16 Current destination address
* $5 A copy of $16
* $6 The max quadword address to write to
* $18 Number trailer bytes
* $3 Number quads to write
*/
/*
 * $2 = dest mod 64, $1 = $2 - 64 (negative distance, in bytes, to the
 * next 64-byte boundary), $4 = quads - 16.
 * The blt below takes the simple loop when fewer than 16 quads remain,
 * so the unrolled path runs for 16 or more quads (128 bytes or more),
 * not strictly more than 128 bytes.
 */
and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
blt $4, $loop # U :
/*
* We know we've got at least 16 quads, minimum of one trip
* through unrolled loop. Do a quad at a time to get us 0mod64
* aligned.
*/
/*
 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative here, so
 * the beq below appears never to be taken; a destination that is
 * already 64-byte aligned runs the $alignmod64 loop eight times. That
 * also means $4 is always set by that loop before the first wh64.
 */
nop # E :
nop # E :
nop # E :
beq $1, $bigalign # U :
/*
 * Store one quad per trip until $1 reaches zero. $4 is set to the
 * address following the quad just stored, so on loop exit it equals
 * the new $5.
 */
$alignmod64:
stq $17, 0($5) # L :
subq $3, 1, $3 # E : For consistency later
addq $1, 8, $1 # E : Increment towards zero for alignment
addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
nop
nop
addq $5, 8, $5 # E : Inc address
blt $1, $alignmod64 # U :
$bigalign:
/*
* $3 - number quads left to go
* $5 - target address (aligned 0mod64)
* $17 - mask of stuff to store
* Scratch registers available: $7, $2, $4, $1
* We know that we'll be taking a minimum of one trip through.
* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
* Assumes the wh64 needs to be for 2 trips through the loop in the future.
* The wh64 is issued on for the starting destination address for trip +2
* through the loop, and if there are less than two trips left, the target
* address will be for the current trip.
*/
/*
 * Each trip stores eight quads (64 bytes) at $5 and then advances $5 by
 * 64 and drops $3 by 8. The next wh64 address in $4 is $5 + 128 when
 * at least 24 quads remain at the top of the trip, otherwise $5 + 64
 * (selected by the cmovlt on $3 - 24). The loop repeats while $3 was
 * at least 16 at the top of the trip, i.e. while at least 8 quads
 * remain afterwards.
 */
$do_wh64:
wh64 ($4) # L1 : memory subsystem write hint
subq $3, 24, $2 # E : For determining future wh64 addresses
stq $17, 0($5) # L :
nop # E :
addq $5, 128, $4 # E : speculative target of next wh64
stq $17, 8($5) # L :
stq $17, 16($5) # L :
addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
stq $17, 24($5) # L :
stq $17, 32($5) # L :
cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
nop
stq $17, 40($5) # L :
stq $17, 48($5) # L :
subq $3, 16, $2 # E : Repeat the loop at least once more?
nop
stq $17, 56($5) # L :
addq $5, 64, $5 # E :
subq $3, 8, $3 # E :
bge $2, $do_wh64 # U :
nop
nop
nop
beq $3, $no_quad # U : Might have finished already
.align 4
/*
* Simple loop for trailing quadwords, or for small amounts
* of data (where we can't use an unrolled loop and wh64)
*/
/* Both ways of reaching $loop arrive with $3 nonzero. */
$loop:
stq $17, 0($5) # L :
subq $3, 1, $3 # E : Decrement number quads left
addq $5, 8, $5 # E : Inc address
bne $3, $loop # U : more?
$no_quad:
/*
* Write 0..7 trailing bytes.
*/
/*
 * Tail merge: if $18 (leftover byte count) is nonzero, load the quad
 * at $5, replace its bytes below the end offset ($6 & 7) with the fill
 * pattern, keep the bytes at and above that offset, and store it back.
 */
nop # E :
beq $18, $end # U : All done?
ldq $7, 0($5) # L :
mskqh $7, $6, $2 # U : Mask final quad
insqh $17, $6, $4 # U : New bits
or $2, $4, $1 # E : Put it all together
stq $1, 0($5) # L : And back to memory
ret $31,($26),1 # L0 :
/*
 * Whole region sits inside one aligned quad. Build the result in two
 * steps from the old quad in $1:
 *   first  -- old bytes below the start offset plus fill bytes from the
 *             start offset upward ($2);
 *   second -- keep that only below the end offset ($4) and take the old
 *             bytes at and above the end offset ($2), then combine.
 */
$within_quad:
ldq_u $1, 0($16) # L :
insql $17, $16, $2 # U : New bits
mskql $1, $16, $4 # U : Clear old
or $2, $4, $2 # E : New result
mskql $2, $6, $4 # U :
mskqh $1, $6, $2 # U :
or $2, $4, $1 # E :
stq_u $1, 0($16) # L :
/* Common exit; $0 already holds the original destination address. */
$end:
nop
nop
nop
ret $31,($26),1 # L0 :
END(memset)
libc_hidden_builtin_def (memset)