File: lbnppc.c

package info (click to toggle)
bnlib 1.1-2.1
  • links: PTS
  • area: main
  • in suites: sarge, woody
  • size: 768 kB
  • ctags: 874
  • sloc: ansic: 6,526; asm: 1,380; makefile: 197; sh: 156
file content (318 lines) | stat: -rw-r--r-- 11,706 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#include "lbnppc.h"

/*
 * lbnppc.c - Assembly primitives for the bignum library, PowerPC version.
 *
 * Copyright (c) 1995  Colin Plumb.  All rights reserved.
 * For licensing and other legal details, see the file legal.c
 *
 * Register usage during function calls is:
 * r0 - volatile
 * r1 - stack pointer, preserved
 * r2 - TOC pointer, preserved
 * r3 - First argument and return value register
 * r4-r10 - More argument registers, volatile
 * r11-r12 - Volatile
 * r13-r31 - Preserved
 * LR, CTR, XER and MQ are all volatile.
 * LR holds return address on entry.
 *
 * On the PPC 601, unrolling the loops more doesn't seem to speed things
 * up at all.  I'd be curious if other chips differed.
 */
#if __MWERKS__ < 0x800

#include "ppcasm.h"	/* PowerPC assembler */
 
/*
 * MulN1 expects (*out, *in, len, k), len >= 1
 *                r3    r4   r5   r6
 *
 * Hand-assembled code that multiplies the len-word number at "in" by
 * the single word k and stores the len+1 word product at "out", least
 * significant word first.  Nothing is returned.
 *
 * r7 holds the current input word, r8 the low half of the current
 * product, and r5 (once len has been copied to CTR) the high half,
 * which is the carry word added into the next output word.
 *
 * The branch displacements are counted by hand in table entries
 * relative to the branch itself, so adding or removing an entry
 * requires the two PPC_BC displacements to be recounted.
 */
static const unsigned mulN1[] = {
	PPC_LWZ(7,4,0), 	/* Load first word of in in r7 */
	PPC_MULLW(8,7,6),	/* Low half of multiply in r8 */
	PPC_MTCTR(5),		/* Move len into CTR, freeing r5 for the carry word */
	PPC_ADDIC(0,0,0),	/* r0 += 0 cannot carry: clears carry bit for loop */
	PPC_MULHWU(5,7,6),	/* High half of multiply in r5 */
	PPC_STW(8,3,0),		/* out[0] = r8 */
	PPC_BC(18,31,7),	/* Branch to Label (7 entries on) if --ctr == 0 */
/* Loop: */
	PPC_LWZU(7,4,4),	/* r7 = *++in */
	PPC_MULLW(8,7,6),	/* r8 = low word of product */
	PPC_ADDE(8,8,5),	/* Add carry word r5 and bit CF to r8 */
	PPC_STWU(8,3,4),	/* *++out = r8 */
	PPC_MULHWU(5,7,6),	/* r5 is high word of product, for carry word */
	PPC_BC(16,31,-5),	/* Branch to Loop (5 entries back) if --ctr != 0 */
/* Label: */
	PPC_ADDZE(5,5),		/* Add carry flag to r5 */
	PPC_STW(5,3,4),		/* out[1] = r5: top word, one past the last word stored */
	PPC_BLR()
};

/*
 * MulAdd1 expects (*out, *in, len, k), len >= 1
 *                  r3    r4   r5   r6
 *
 * Hand-assembled code that multiplies the len-word number at "in" by
 * the single word k and adds the product into the len words at "out",
 * least significant word first.  The carry word out of the top is
 * returned in r3.
 *
 * r7 holds the current input word, r0 the current output word, r8 the
 * low half of the current product, and r5 (once len has been copied to
 * CTR) the carry word.
 *
 * The branch displacements are counted by hand in table entries
 * relative to the branch itself, so adding or removing an entry
 * requires the two PPC_BC displacements to be recounted.
 */
static unsigned const mulAdd1[] = {
	PPC_LWZ(7,4,0), 	/* Load first word of in in r7 */
	PPC_LWZ(0,3,0),		/* Load first word of out into r0 */
	PPC_MULLW(8,7,6),	/* Low half of multiply in r8 */
	PPC_MTCTR(5),		/* Move len into CTR, freeing r5 for the carry word */
	PPC_MULHWU(5,7,6),	/* High half of multiply in r5 */
	PPC_ADDC(8,8,0),	/* r8 = r8 + r0, carry out left in CF */
	PPC_STW(8,3,0),		/* Store result to memory */
	PPC_BC(18,31,10),	/* Branch to Label (10 entries on) if --ctr == 0 */
/* Loop: */
	PPC_LWZU(7,4,4),	/* r7 = *++in */
	PPC_LWZU(0,3,4),	/* r0 = *++out */
	PPC_MULLW(8,7,6),	/* r8 = low word of product */
	PPC_ADDE(8,8,5), 	/* Add carry word r5 and carry bit CF to r8 */
	PPC_MULHWU(5,7,6),	/* r5 is high word of product, for carry word */
	PPC_ADDZE(5,5),		/* Add carry bit from low add to r5 */
	PPC_ADDC(8,8,0),	/* r8 = r8 + r0, carry out left in CF */
	PPC_STW(8,3,0), 	/* *out = r8 */
	PPC_BC(16,31,-8),	/* Branch to Loop (8 entries back) if --ctr != 0 */
/* Label: */
	PPC_ADDZE(3,5),		/* Add carry flag to r5 and move to r3 (return value) */
	PPC_BLR()
};

/*
 * MulSub1 expects (*out, *in, len, k), len >= 1
 *                  r3    r4   r5   r6
 *
 * Hand-assembled code that multiplies the len-word number at "in" by
 * the single word k and subtracts the product from the len words at
 * "out", least significant word first.  The borrow word out of the top
 * is returned in r3.
 *
 * Multiply and subtract is rather a pain.  If the subtract of the
 * low word of the product from out[i] generates a borrow, we want to
 * increment the carry word (initially in the range 0..0xfffffffe).
 * However, the PPC's carry bit CF is *clear* after a subtract that
 * borrows, so we want to add (1-CF) to the carry word.  This is done
 * using two instructions:
 *
 * SUBFME, subtract from minus one extended.  This computes
 *   rD = ~rS + 0xffffffff + CF.  Since rS is from 0 to 0xfffffffe,
 *   ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
 *   from 0x100000000 through 0x1ffffffff, setting the carry flag
 *   unconditionally, and
 * NOR, which is used as a bitwise invert NOT instruction.
 *
 * The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
 * = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
 * which is the bitwise complement of the value we want.
 * We want to add the complement of that result to the low word of the
 * product, which is just what a subtract would do, if only we could get
 * the carry flag clear.  But it's always set, except for SUBFE, and the
 * operation we just performed unconditionally *sets* the carry flag.  Ugh.
 * So find the complement in a separate instruction.
 *
 * The branch displacements are counted by hand in table entries
 * relative to the branch itself, so adding or removing an entry
 * requires the two PPC_BC displacements to be recounted.
 */
static unsigned const mulSub1[] = {
	PPC_LWZ(7,4,0), 	/* Load first word of in in r7 */
	PPC_LWZ(0,3,0),		/* Load first word of out into r0 */
	PPC_MTCTR(5),		/* Move len into CTR, freeing r5 for the carry word */
	PPC_MULLW(8,7,6),	/* Low half of multiply in r8 */
	PPC_MULHWU(5,7,6),	/* High half of multiply in r5 */
	PPC_SUBFC(8,8,0),	/* r8 = r0 - r8, setting CF */
	PPC_STW(8,3,0),		/* Store result to memory */
	PPC_SUBFME(5,5),	/* First of two insns to add (1-CF) to r5 */
	PPC_BC(18,31,12),	/* Branch to Label (12 entries on) if --ctr == 0 */
/* Loop: */
	PPC_LWZU(7,4,4),	/* r7 = *++in */
	PPC_LWZU(0,3,4),	/* r0 = *++out */
	PPC_NOR(5,5,5),		/* Second of two insns to add (1-CF) to r5 */
	PPC_MULLW(8,7,6),	/* r8 = low word of product */
	PPC_ADDC(8,8,5), 	/* Add carry word r5 to r8 */
	PPC_MULHWU(5,7,6),	/* r5 is high word of product, for carry word */
	PPC_ADDZE(5,5),		/* Add carry bit from low add to r5 */
	PPC_SUBFC(8,8,0),	/* r8 = r0 - r8, setting CF */
	PPC_STW(8,3,0), 	/* *out = r8 */
	PPC_SUBFME(5,5),	/* First of two insns to add (1-CF) to r5 */
	PPC_BC(16,31,-10),	/* Branch to Loop (10 entries back) if --ctr != 0 */
/* Label: */
	PPC_NOR(3,5,5),		/* Finish adding (1-CF) to r5, store in r3 (return value) */
	PPC_BLR()
};

#if 0
/*
 * Unfinished draft of a Montgomery reduction routine, kept out of the
 * build by the surrounding "#if 0".  It cannot be enabled as it stands:
 * the table contains incomplete macro invocations (a bare "PPC_" and an
 * unterminated "PPC_LWZU(") and placeholder operands (count, x, xx, yy)
 * that were never filled in.
 *
 * Args: BNWORD32 *n, BNWORD32 const *mod, unsigned mlen, BNWORD32 inv)
 *                r3                  r4            r5             r6
 * r7, r8 and r9 are the triple-width accumulator.
 * r0 and r10 are temporary registers.
 * r11 and r12 are temporary pointers into n and mod, respectively.
 * r2 (!) is another temporary register.
 *
 * NOTE(review): r2 is listed as the preserved TOC pointer in the
 * register summary at the top of this file, and nothing here saves or
 * restores it -- that would need resolving before this is enabled.
 */
static unsigned const montReduce[] = {
	PPC_MTCTR(5),	/* ??? */
	PPC_LWZ(7,3,0),		/* Load low word of n into r7 */
	PPC_LWZ(10,4,0),	/* Fetch low word of mod */
	PPC_MULLW(0,7,6),	/* Invert r7 into r0 */
	PPC_STW(0,3,0),		/* Store back for future use */
	PPC_MULHWU(8,10,7),	/* Get high word of whatnot */
	PPC_MULLW(10,10,7),	/* Get low word of it */
	PPC_ADDC(7,7,10),	/* Add low word of product to r7 */
	PPC_ADDZE(8,8),		/* Add carry to high word */
	PPC_
	

	PPC_MULHW(8,7,6),
	PPC_ADDC(7,7,0),	/* Add inverse back to r7 */
	PPC_ADDZE(8,8),
	PPC_
	
	PPC_LWZU(
/* Loop: */
	/* Multiply-accumulate one word pair into the r7/r8/r9 accumulator */
	PPC_LWZU(0,11,4),
	/*
	 * NOTE(review): the header names r12 as the pointer into mod, but
	 * this loads through r23 -- presumably a typo; confirm.
	 */
	PPC_LWZU(10,23,-4),
	PPC_MULLW(2,0,10),
	PPC_ADDC(7,7,2),
	PPC_MULHWU(0,0,10),
	PPC_ADDE(8,8,0),
	PPC_ADDZE(9,9),
	PPC_BC(16,31,-7),	/* Branch to Loop if --ctr != 0 */

	/* Placeholder operands from here on: count, x, xx and yy are undefined */
	PPC_ADDIC_(count,-1),
	PPC_LWZU(0,x,4),
	PPC_ADDC(0,7,0),
	PPC_STW(0,x,0),
	PPC_ADDZE(7,8),
	PPC_ADDZE(8,9),
	PPC_LI(9,0),
	PPC_BC(xx,2,yy),
	
};
#endif

/*
 * Transition vectors for the three routines above, overlapped into a
 * single table.
 *
 * A PowerPC transition vector for a (potentially) inter-module jump or
 * call is a pair of words: the address of the code, followed by the
 * Table Of Contents (TOC) pointer that is loaded into r2 for the
 * callee.  None of these routines touch global variables, so none of
 * them use the TOC and the second word may hold anything at all.
 *
 * That lets the vectors share storage: the vector for entry i is the
 * pair starting at lbnPPC_tv[i], so the "TOC" word of one routine is
 * simply the code address of the next, and a single uninteresting
 * filler word closes off the last vector.  The order of the entries is
 * therefore part of the interface.
 */
const unsigned *const lbnPPC_tv[] = {
	mulN1,		/* Vector 0: code address of mulN1 */
	mulAdd1,	/* Vector 1: code address of mulAdd1 (and vector 0's TOC word) */
	mulSub1,	/* Vector 2: code address of mulSub1 (and vector 1's TOC word) */
	0		/* Filler TOC word for vector 2 */
};

#else /* __MWERKS >= 0x800 */

/*
 * MulN1 expects (*out, *in, len, k), len >= 1
 *                r3    r4   r5   r6
 *
 * Multiplies the len-word number at "in" by the single word k and
 * stores the len+1 word product at "out", least significant word
 * first.  Nothing is returned.
 *
 * Once len has been copied into CTR, its register is reused to hold
 * the high half of each product, which is the carry word added into
 * the next output word.  r7 holds the current input word and r8 the
 * low half of the current product.
 *
 * The high half of the first product is computed exactly once before
 * the loop; nothing between that multiply and the loop changes r7 or k,
 * so repeating it would only waste a multiply.
 */
asm void
lbnMulN1_32(register unsigned *out, register unsigned const *in,
	register unsigned len, register unsigned k)
{
	lwz 	r7,0(in) 	/* Load first word of in in r7 */
	mtctr	len			/* Move len into CTR, freeing len for the carry word */
	mullw	r8,r7,k		/* Low half of multiply in r8 */
	addic	r0,r0,0		/* r0 += 0 cannot carry: clears carry bit for loop */
	mulhwu	len,r7,k	/* len is high word of product, for carry */
	stw 	r8,0(out)	/* *out = r8 */
	bdz-	label		/* Branch to Label if --ctr == 0 */
loop:
	lwzu	r7,4(in)	/* r7 = *++in */
	mullw	r8,r7,k		/* Low half of multiply in r8 */
	adde	r8,r8,len	/* Add carry word len and bit CF to r8 */
	stwu	r8,4(out)	/* *++out = r8 */
	mulhwu	len,r7,k	/* len is high word of product, for carry */
	bdnz+	loop		/* Branch to Loop if --ctr != 0 */
label:
	addze	len,len		/* Add carry flag to carry word */
	stw 	len,4(out)	/* out[1] = top word, one past the last word stored */
	blr
}

/*
 * MulAdd1 expects (*out, *in, len, k), len >= 1
 *                  r3    r4   r5   r6
 *
 * Multiplies the len-word number at "in" by the single word k and adds
 * the product into the len words at "out", least significant word
 * first.  The carry word out of the top is returned in r3.
 *
 * Once len has been copied into CTR, its register is reused to hold
 * the carry word.  r7 holds the current input word, r0 the current
 * output word and r8 the low half of the current product.
 */
asm unsigned
lbnMulAdd1_32(register unsigned *out, register unsigned const *in,
	register unsigned len, register unsigned k)
{
	lwz 	r7,0(in) 	/* Load first word of in in r7 */
	lwz 	r0,0(out)	/* Load first word of out into r0 */
	mullw	r8,r7,k 	/* Low half of multiply in r8 */
	mtctr	len 		/* Move len into CTR, freeing len for the carry word */
	mulhwu	len,r7,k	/* High half of multiply in len */
	addc	r8,r8,r0	/* r8 = r8 + r0, carry out left in CF */
	stw 	r8,0(out)	/* Store result to memory */
	bdz-	label		/* Branch to Label if --ctr == 0 */
loop:
	lwzu	r7,4(in)	/* r7 = *++in */
	lwzu	r0,4(out)	/* r0 = *++out */
	mullw	r8,r7,k		/* r8 = low word of product */
	adde	r8,r8,len	/* Add carry word len and carry bit CF to r8 */
	mulhwu	len,r7,k	/* len is high word of product, for carry */
	addze	len,len		/* Add carry bit from low add to len */
	addc	r8,r8,r0	/* r8 = r8 + r0, carry out left in CF */
	stw 	r8,0(out)	/* *out = r8 */
	bdnz+	loop		/* Branch to Loop if --ctr != 0 */
label:
	addze	r3,r5		/* Add carry flag to r5 (len) and move to r3 (return value) */
	blr
}

/*
 * MulSub1 expects (*out, *in, len, k), len >= 1
 *                  r3    r4   r5   r6
 *
 * Multiplies the len-word number at "in" by the single word k and
 * subtracts the product from the len words at "out", least significant
 * word first.  The borrow word out of the top is returned in r3.
 *
 * Multiply and subtract is rather a pain.  If the subtract of the
 * low word of the product from out[i] generates a borrow, we want to
 * increment the carry word (initially in the range 0..0xfffffffe).
 * However, the PPC's carry bit CF is *clear* after a subtract that
 * borrows, so we want to add (1-CF) to the carry word.  This is done
 * using two instructions:
 *
 * SUBFME, subtract from minus one extended.  This computes
 *   rD = ~rS + 0xffffffff + CF.  Since rS is from 0 to 0xfffffffe,
 *   ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
 *   from 0x100000000 through 0x1ffffffff, setting the carry flag
 *   unconditionally, and
 * NOR, which is used as a bitwise invert NOT instruction.
 *
 * The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
 * = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
 * which is the bitwise complement of the value we want.
 * We want to add the complement of that result to the low word of the
 * product, which is just what a subtract would do, if only we could get
 * the carry flag clear.  But it's always set, except for SUBFE, and the
 * operation we just performed unconditionally *sets* the carry flag.  Ugh.
 * So find the complement in a separate instruction.
 */
asm unsigned
lbnMulSub1_32(register unsigned *out, register unsigned const *in,
	register unsigned len, register unsigned k)
{
	lwz 	r7,0(in) 	/* Load first word of in in r7 */
	lwz 	r0,0(out)	/* Load first word of out into r0 */
	mtctr	len 		/* Move len into CTR, freeing len for the carry word */
	mullw	r8,r7,k 	/* Low half of multiply in r8 */
	mulhwu	len,r7,k	/* High half of multiply in len */
	subfc	r8,r8,r0	/* r8 = r0 - r8, setting CF */
	stw 	r8,0(out)	/* Store result to memory */
	subfme	len,len		/* First of two insns to add (1-CF) to len */
	bdz-	label		/* Branch to Label if --ctr == 0 */
loop:
	lwzu	r7,4(in)	/* r7 = *++in */
	lwzu	r0,4(out)	/* r0 = *++out */
	nor 	len,len,len	/* Second of two insns to add (1-CF) to len */
	mullw	r8,r7,k		/* r8 = low word of product */
	addc	r8,r8,len	/* Add carry word len to r8 */
	mulhwu	len,r7,k	/* len is high word of product, for carry */
	addze	len,len		/* Add carry bit from low add to len */
	subfc	r8,r8,r0	/* r8 = r0 - r8, setting CF */
	stw 	r8,0(out)	/* *out = r8 */
	subfme	len,len		/* First of two insns to add (1-CF) to len */
	bdnz+	loop		/* Branch to Loop if --ctr != 0 */
label:
	nor 	r3,r5,r5	/* Finish adding (1-CF) to len (r5), store in r3 (return value) */
	blr
}

#endif /* __MWERKS >= 0x800 */
/* 45678901234567890123456789012345678901234567890123456789012345678901234567 */