File: sqrt_n2.S

package info (click to toggle)
numerix 0.22-4
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 4,380 kB
  • ctags: 4,165
  • sloc: asm: 26,210; ansic: 12,168; ml: 4,912; sh: 3,899; pascal: 414; makefile: 179
file content (205 lines) | stat: -rw-r--r-- 6,080 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
// file kernel/n/ppc32/sqrt_n2.S: O(n^2) square root of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                          Racine carre quadratique                    |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                             ; +-----------------+
                             ; |  Racine carre  |
                             ; +-----------------+

; void xn(sqrt_n2)(chiffre *a, long la, chiffre *b)
;
; entre :
; a = naturel longueur la
; b = naturel de longueur la/2
;
; contraintes :
; la > 0, la pair, BASE/16 <= a[la-1] < BASE/4
; a,b non confondus
;
; sortie :
; b <- 2*floor(sqrt(a))
; a <- a - b^2/4

#ifdef assembly_sn_sqrt_n2
#define L(x) .Lsn_sqrt_n2_##x
#ifdef debug_sqrt_n2
.globl _sn_sqrt_n2_buggy
_sn_sqrt_n2_buggy:
#else
.globl _sn_sqrt_n2
_sn_sqrt_n2:
#endif

	; variables locales
	#define _a_    r12
	#define _la_   r8
	#define _b_    r11
	#define _b0_   r31
	#define _bi_   r30
	#define _bh_   r29
	#define _ah_   r2
	#define _msub_ r6
	#define _add_  r7
	#define _u_    r6
	#define _v_    r7
	#define _x_    r3
	#define _y_    r4
	#define _z_    r5
	#define _q_    r9

	mflr   r0
	stmw   r29,  4(r1)

	add    _b_,  r5,   r4	; b <- &b[la/2]
	add    _b_,  _b_,  r4
	subi   _b0_, _b_,  8	; b0 <- &b[la/2-2]
	subi   _la_, r4,   2	; la -= 2
	slwi   r2,   _la_, 2
	add    _a_,  r3,   r2 	; a <- &a[la-2]

	; calcule la racine carre des deux chiffres de tte
	lwz    _x_,  0(_a_)	; y:x <- a[1]:a[0]
	lwz    _y_,  4(_a_)
	lis    _bh_, 0x8000	; bh <- BASE/2
	subis  _y_,  _y_, 0x1000; y:x -= (bh/2)^2
	lis    _z_,  0x2000	; z <- BASE/8 (bit de test)
	slwi   _y_,  _y_,  2	; dcale le reste de 2 bits
	inslwi _y_,  _x_,  2,30
	slwi   _x_,  _x_,  2
1:
	add    _q_,  _z_,  _bh_ ; bit de test <- 1
	srwi   _u_,  _y_,  31	; dcale le reste d un bit
	addc   _x_,  _x_,  _x_
	adde   _y_,  _y_,  _y_
	subfc  _v_,  _q_,  _y_
	addme. _u_,  _u_	; a passe ?
	blt    2f
	add    _bh_, _z_,  _q_	; si oui, valide le bit
	mr     _y_,  _v_	; mise  jour du reste
2:
	srwi.  _z_,  _z_,  1
	bne    1b

	stw    _y_,  0(_a_)	; sauve le reste dans a[0]:a[1]
	stw    _z_,  4(_a_)
	stw    _bh_,-4(_b_)	; sauve le chiffre de tte de b
	and.   _la_, _la_, _la_
	beq    L(done)

	; prpare le droulement des boucles internes
	bcl    20,31, L(here)   ; adresses de saut pour 2 chiffres
L(here):
	mflr   r2
	addi   _add_,  r2,     lo16(Lsn_addloop    - L(here)  + 4*30*4)
/*	addis  _add_,  _add_,  ha16(Lsn_addloop    - L(here)  + 4*30*4) */
	addi   _msub_, r2,     lo16(Lsn_mulsubloop - L(here)  + 4*30*10)
/*	addis  _msub_, _msub_, ha16(Lsn_mulsubloop - L(here)  + 4*30*10) */
        INVERSE(_bh_,_bi_,_y_,_z_)

	; calcule les chiffres suivants par divisions
L(loop):
	; quotient approch, peut tre trop grand d une ou deux units
	lwz    _x_,  0(_a_)
	lwz    _y_, -4(_a_)
	mr     _ah_, _x_
        DIV(_y_,_x_,_bh_,_bi_,_q_,_z_)
	stw    _q_, 0(_b0_)	; b <- b + q

	; a <- a - v*q - q^2
	mtlr   _msub_
	subf   _x_, _b0_, _b_
	addi   _x_, _x_,  124
	srwi   _x_, _x_,  7
	mtctr  _x_		; ctr <- ceil(lb/32)
	slwi   _x_, _x_,  7
	subf   _a_, _x_,  _a_	; cadre a et b sur le multiple de 32 prc.
	subf   _b_, _x_,  _b_
	li     _x_, 0
	blrl
	lwz    _z_,  4(_b0_) 	; b <- b + 2q
	addc   _q_,  _q_, _q_
	addze  _z_,  _z_
	stw    _q_,  0(_b0_)
	stw    _z_,  4(_b0_)
	subf.  _ah_, _x_, _ah_	; dernire retenue de la soustraction

	; corrige le quotient et le reste si < 0
	beq    L(q_ok)
L(corr):
	mtlr   _add_
	subic  _q_, _q_,  2	; 2q <- 2q-2, CA <- 1
	addme  _z_, _z_
	stw    _q_,  0(_b0_)
	stw    _z_,  4(_b0_)
	subf   _x_, _b0_, _b_
	addi   _x_, _x_,  124
	srwi   _x_, _x_,  7
	mtctr  _x_		; ctr <- ceil(lb/32)
	slwi   _x_, _x_,  7
	subf   _a_, _x_,  _a_	; cadre a et b sur le multiple de 32 prc.
	subf   _b_, _x_,  _b_
	mr     r10, _a_
	blrl
	addze. _ah_, _ah_	; dernire retenue
	bne    L(corr)

	# chiffre suivant
L(q_ok):
	subi   _msub_,_msub_,40 ; recule les adresses de saut
	subi   _add_, _add_, 16
	subf   _x_,   _b0_,  _b_
	andi.  _x_,   _x_, 0x7f	; si on franchit un multiple de 32
	bne    1f		; repart en fin de boucle
	addi   _msub_,_msub_,4*32*10
	addi   _add_, _add_, 4*32*4
1:
	subic. _la_, _la_, 2	; la -= 2
	subi   _b0_, _b0_, 4	; lb++
	subi   _a_,  _a_,  4	; a--
	bne    L(loop)
	
	# termin
L(done):
	mtlr   r0
	lmw    r29, 4(r1)
	blr

	#undef _a_
	#undef _la_
	#undef _b_
	#undef _b0_
	#undef _bi_
	#undef _bh_
	#undef _ah_
	#undef _msub_
	#undef _add_
	#undef _u_
	#undef _v_
	#undef _x_
	#undef _y_
	#undef _z_
	#undef _q_
	
#undef L
#endif /* assembly_sn_sqrt_n2 */