1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
|
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
*/
#include <linux/linkage.h>
#include <asm/cache.h>
/*
* The memset implementation below is optimized to use prefetchw and prealloc
* instruction in case of CPU with 64B L1 data cache line (L1_CACHE_SHIFT == 6)
* If you want to implement optimized memset for other possible L1 data cache
* line lengths (32B and 128B) you should rewrite code carefully checking
* we don't call any prefetchw/prealloc instruction for L1 cache lines which
* don't belongs to memset area.
*/
#if L1_CACHE_SHIFT == 6
.macro PREALLOC_INSTR reg, off
prealloc [\reg, \off]
.endm
.macro PREFETCHW_INSTR reg, off
prefetchw [\reg, \off]
.endm
#else
.macro PREALLOC_INSTR reg, off
.endm
.macro PREFETCHW_INSTR reg, off
.endm
#endif
ENTRY_CFI(memset)
mov.f 0, r2
;;; if size is zero
jz.d [blink]
mov r3, r0 ; don't clobber ret val
PREFETCHW_INSTR r0, 0 ; Prefetch the first write location
;;; if length < 8
brls.d.nt r2, 8, .Lsmallchunk
mov.f lp_count,r2
and.f r4, r0, 0x03
rsub lp_count, r4, 4
lpnz @.Laligndestination
;; LOOP BEGIN
stb.ab r1, [r3,1]
sub r2, r2, 1
.Laligndestination:
;;; Destination is aligned
and r1, r1, 0xFF
asl r4, r1, 8
or r4, r4, r1
asl r5, r4, 16
or r5, r5, r4
mov r4, r5
sub3 lp_count, r2, 8
cmp r2, 64
bmsk.hi r2, r2, 5
mov.ls lp_count, 0
add3.hi r2, r2, 8
;;; Convert len to Dwords, unfold x8
lsr.f lp_count, lp_count, 6
lpnz @.Lset64bytes
;; LOOP START
PREALLOC_INSTR r3, 64 ; alloc next line w/o fetching
#ifdef CONFIG_ARC_HAS_LL64
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
#else
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
#endif
.Lset64bytes:
lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes
lpnz .Lset32bytes
;; LOOP START
#ifdef CONFIG_ARC_HAS_LL64
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
#else
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
st.ab r4, [r3, 4]
#endif
.Lset32bytes:
and.f lp_count, r2, 0x1F ;Last remaining 31 bytes
.Lsmallchunk:
lpnz .Lcopy3bytes
;; LOOP START
stb.ab r1, [r3, 1]
.Lcopy3bytes:
j [blink]
END_CFI(memset)
ENTRY_CFI(memzero)
; adjust bzero args to memset args
mov r2, r1
b.d memset ;tail call so need to tinker with blink
mov r1, 0
END_CFI(memzero)
|