1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
|
/*
* Copyright (C) 1999-2002 Hewlett-Packard Co.
* Contributed by Stephane Eranian <eranian@hpl.hp.com>
*
* This file is part of the ELILO, the EFI Linux boot loader.
*
* ELILO is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* ELILO is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ELILO; see the file COPYING. If not, write to the Free
* Software Foundation, 59 Temple Place - Suite 330, Boston, MA
* 02111-1307, USA.
*
* Please check out the elilo.txt for complete documentation on how
* to use this program.
*
* This code is derived from the Linux/ia64 source code.
*/
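/*
*
* Optimized memset()-style routine: fills a buffer with a byte value.
*
* Return: none (unlike the C library memset(), no value is handed
* back to the caller)
*
* Inputs:
* in0: address of buffer
* in1: byte value to use for storing (only the low 8 bits are used)
* in2: length of the buffer, in bytes
*
*/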
//
// Register aliases used by Memset (expanded by the C preprocessor).
//
// arguments: r32-r34 are the three input registers declared by the
// alloc at the top of Memset (in0, in1, in2).
//
// buf: address of the next byte to store; advanced by every store
#define buf r32
// val: fill value; Memset replicates its low byte across the register
#define val r33
// len: number of bytes still to be written
#define len r34
//
// local registers
// NOTE(review): r14 and r18-r21 are presumably picked from the scratch
// register range so nothing has to be saved or restored; confirm
// against the IA-64 software conventions.
//
// saved_pfs: receives the ar.pfs value produced by alloc
#define saved_pfs r14
// cnt: number of 16-byte iterations for the main store loop
#define cnt r18
// buf2: second store pointer, kept 8 bytes ahead of buf in the main loop
#define buf2 r19
// saved_lc: caller's ar.lc, restored before the routine returns
#define saved_lc r20
// tmp: iteration count minus one, the value loaded into ar.lc
#define tmp r21
//
// Memset: store the low byte of val into len consecutive bytes at buf.
//
// Strategy:
//   len == 0 : return immediately.
//   len <= 16: one-byte-per-iteration counted loop.
//   len > 16 : store 1/2/4/8 bytes as needed until the low four bits
//              of buf are clear, run a loop that stores 16 bytes per
//              iteration, then finish with 8/4/2/1-byte stores
//              selected by the low four bits of the remaining len.
//
// Nothing is returned. buf, val and len are modified in place.
// ar.lc is used as the loop counter and restored on every path
// that changes it.
//
.text
.global Memset
.proc Memset
Memset:
.prologue
// unwind annotation: ar.pfs is kept in saved_pfs
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,3,0,0,0 // frame: 3 inputs, no locals, outputs or rotating regs
cmp.eq p8,p0=r0,len // p8 = (len == 0)
// unwind annotation: ar.lc is kept in saved_lc
.save ar.lc, saved_lc
mov saved_lc=ar.lc // preserve ar.lc (slow)
;;
.body
adds tmp=-1,len // br.cloop is repeat/until, so the loop count is len-1
tbit.nz p6,p0=buf,0 // p6 = buf is odd; consumed at long_memset
(p8) br.ret.spnt.few rp // zero length: nothing to do, ar.lc is still untouched
// NOTE(review): cmp.lt is a signed compare, so a len with the top bit
// set would take the byte loop below instead of long_memset; confirm
// whether callers can ever pass such a length.
cmp.lt p7,p0=16,len // if len > 16 then long memset
mux1 val=val,@brcst // replicate the low byte of val into all 8 bytes
(p7) br.cond.dptk.few long_memset
;;
// short memset: 1 <= len <= 16, one byte per iteration
mov ar.lc=tmp // initialize lc for small count
;; // avoid RAW and WAW on ar.lc
1: // worst case 15 cycles, avg 8 cycles
st1 [buf]=val,1 // store one byte, advance buf
br.cloop.dptk.few 1b // loop until ar.lc reaches zero
;; // avoid RAW on ar.lc
mov ar.lc=saved_lc // give the caller its loop counter back
mov ar.pfs=saved_pfs // rewrite ar.pfs with the value captured by alloc
br.ret.sptk.few rp // end of short memset
// at this point we know we have more than 16 bytes to set
// so we focus on alignment: each step tests one low bit of buf and,
// if it is set, stores that many bytes and shrinks len accordingly.
// At most 1+2+4+8 = 15 bytes are consumed, fewer than len.
long_memset:
(p6) st1 [buf]=val,1 // buf was odd: store 1 byte, buf becomes 2-byte aligned
(p6) adds len=-1,len;; // sync because buf is modified
tbit.nz p6,p0=buf,1 // p6 = buf not yet 4-byte aligned
;;
(p6) st2 [buf]=val,2 // aligned 2-byte store, buf becomes 4-byte aligned
(p6) adds len=-2,len;;
tbit.nz p6,p0=buf,2 // p6 = buf not yet 8-byte aligned
;;
(p6) st4 [buf]=val,4 // aligned 4-byte store, buf becomes 8-byte aligned
(p6) adds len=-4,len;;
tbit.nz p6,p0=buf,3 // p6 = bit 3 of buf still set
;;
(p6) st8 [buf]=val,8 // aligned 8-byte store, low four bits of buf now clear
(p6) adds len=-8,len;;
shr.u cnt=len,4 // number of 16-byte (2 x 64-bit) chunks left
;;
cmp.eq p6,p0=r0,cnt // p6 = no full 16-byte chunk
adds tmp=-1,cnt // loop count minus one for br.cloop
(p6) br.cond.dpnt.few .dotail // we have less than 16 bytes left
;;
adds buf2=8,buf // setup second base pointer
mov ar.lc=tmp
;;
2: // 16bytes/iteration
st8 [buf]=val,16 // low 8 bytes of the chunk; buf skips to the next chunk
st8 [buf2]=val,16 // high 8 bytes of the chunk; buf2 stays 8 ahead of buf
br.cloop.dptk.few 2b
;;
// tail: fewer than 16 bytes remain and buf points at the first of them.
// Bits 3..0 of len select an 8-, 4-, 2- and 1-byte store in turn. len
// itself is no longer updated, only buf advances.
.dotail: // tail correction based on len only
tbit.nz p6,p0=len,3
;;
(p6) st8 [buf]=val,8 // at least 8 bytes
tbit.nz p6,p0=len,2 // next predicate, used by the st4 after the stop
;;
(p6) st4 [buf]=val,4 // at least 4 bytes
tbit.nz p6,p0=len,1
;;
(p6) st2 [buf]=val,2 // at least 2 bytes
tbit.nz p6,p0=len,0
mov ar.lc=saved_lc // restore the caller's loop counter before returning
;;
(p6) st1 [buf]=val // only 1 byte left
// ar.pfs is never modified in this routine, so this path returns
// without rewriting it
br.ret.dptk.few rp
.endp Memset
|