File: memset.S

package info (click to toggle)
elilo 3.2-2
  • links: PTS
  • area: main
  • in suites: woody
  • size: 1,356 kB
  • ctags: 2,507
  • sloc: ansic: 9,556; sh: 639; asm: 532; makefile: 196
file content (133 lines) | stat: -rw-r--r-- 3,361 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/*
 *  Copyright (C) 1999-2002 Hewlett-Packard Co.
 *	Contributed by Stephane Eranian <eranian@hpl.hp.com>
 *
 * This file is part of the ELILO, the EFI Linux boot loader.
 *
 *  ELILO is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  ELILO is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with ELILO; see the file COPYING.  If not, write to the Free
 *  Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 *  02111-1307, USA.
 *
 * Please check out the elilo.txt for complete documentation on how
 * to use this program.
 *
 * This code is derived from the Linux/ia64 source code.
 */

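/*
 *
 * Optimized version of the standard memset() function
 *
 * Return: none (unlike the C library memset(), the buffer address
 *	   is not handed back to the caller)
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	byte value to use for storing (only the low byte is stored)
 *	in2:	length of the buffer, in bytes
 *
 */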

// arguments
//
#define buf		r32
#define val		r33
#define len		r34

//
// local registers
//
#define saved_pfs	r14
#define cnt		r18
#define buf2		r19
#define saved_lc	r20
#define tmp		r21
	.text
	.global Memset
	.proc Memset
Memset:
	.prologue
	.save ar.pfs, saved_pfs
 	alloc saved_pfs=ar.pfs,3,0,0,0	// cnt is sink here
	cmp.eq p8,p0=r0,len	// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
	;;

	.body

	adds tmp=-1,len		// br.ctop is repeat/until
	tbit.nz p6,p0=buf,0	// odd alignment
(p8)	br.ret.spnt.few rp

	cmp.lt p7,p0=16,len	// if len > 16 then long memset
	mux1 val=val,@brcst	// prepare value
(p7)	br.cond.dptk.few long_memset
	;;
	mov ar.lc=tmp		// initialize lc for small count
	;;			// avoid RAW and WAW on ar.lc
1:				// worst case 15 cyles, avg 8 cycles
	st1 [buf]=val,1
	br.cloop.dptk.few 1b
	;;				// avoid RAW on ar.lc
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.few rp	// end of short memset

	// at this point we know we have more than 16 bytes to copy
	// so we focus on alignment
long_memset:
(p6)	st1 [buf]=val,1		// 1-byte aligned
(p6)	adds len=-1,len;;	// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
(p6)	st2 [buf]=val,2		// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
(p6)	st4 [buf]=val,4		// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
(p6)	st8 [buf]=val,8		// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt.few .dotail // we have less than 16 bytes left
	;;
	adds buf2=8,buf		// setup second base pointer
	mov ar.lc=tmp
	;;
2:				// 16bytes/iteration
	st8 [buf]=val,16
	st8 [buf2]=val,16
	br.cloop.dptk.few 2b
	;;
.dotail:			// tail correction based on len only
	tbit.nz p6,p0=len,3
	;;
(p6)	st8 [buf]=val,8		// at least 8 bytes
	tbit.nz p6,p0=len,2
	;;
(p6)	st4 [buf]=val,4		// at least 4 bytes
	tbit.nz p6,p0=len,1
	;;
(p6)	st2 [buf]=val,2		// at least 2 bytes
	tbit.nz p6,p0=len,0
	mov ar.lc=saved_lc
	;;
(p6)	st1 [buf]=val		// only 1 byte left
	br.ret.dptk.few rp
	.endp Memset