1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
|
#include "lbnppc.h"
/*
* lbnppc.c - Assembly primitives for the bignum library, PowerPC version.
*
* Copyright (c) 1995 Colin Plumb. All rights reserved.
* For licensing and other legal details, see the file legal.c
*
* Register usage during function calls is:
* r0 - volatile
* r1 - stack pointer, preserved
* r2 - TOC pointer, preserved
* r3 - First argument and return value register
* r4-r10 - More argument registers, volatile
* r11-r12 - Volatile
* r13-r31 - Preserved
* LR, CTR, XER and MQ are all volatile.
* LR holds return address on entry.
*
* On the PPC 601, unrolling the loops more doesn't seem to speed things
* up at all. I'd be curious if other chips differed.
*/
#if __MWERKS__ < 0x800
#include "ppcasm.h" /* PowerPC assembler */
/*
* MulN1 expects (*out, *in, len, k), count >= 1
* r3 r4 r5 r6
*/
/*
 * Hand-assembled machine code: out[0..len-1] = in[0..len-1] * k, with the
 * final carry word stored at out[len].  Entered via the lbnPPC_tv
 * transition-vector table below, not called as a C function.
 * Register use matches the comment above: r3=out, r4=in, r5=len, r6=k.
 */
static const unsigned mulN1[] = {
PPC_LWZ(7,4,0), /* Load first word of in in r7 */
PPC_MULLW(8,7,6), /* Low half of multiply in r8 */
PPC_MTCTR(5), /* Move len into CTR */
PPC_ADDIC(0,0,0), /* Clear carry bit for loop */
PPC_MULHWU(5,7,6), /* High half of multiply in r5 (carry word) */
PPC_STW(8,3,0), /* *out = r8 */
PPC_BC(18,31,7), /* Branch to Label if --ctr == 0 */
/* Loop: */
PPC_LWZU(7,4,4), /* r7 = *++in */
PPC_MULLW(8,7,6), /* r8 = low word of product */
PPC_ADDE(8,8,5), /* Add carry word r5 and bit CF to r8 */
PPC_STWU(8,3,4), /* *++out = r8 */
PPC_MULHWU(5,7,6), /* r5 is high word of product, for carry word */
PPC_BC(16,31,-5), /* Branch to Loop if --ctr != 0 */
/* Label: */
PPC_ADDZE(5,5), /* Add carry flag to r5 */
PPC_STW(5,3,4), /* out[1] = r5 */
PPC_BLR()
};
/*
* MulAdd1 expects (*out, *in, len, k), count >= 1
* r3 r4 r5 r6
*/
/*
 * Hand-assembled machine code: out[0..len-1] += in[0..len-1] * k;
 * the final carry word is returned in r3.  Entered via the lbnPPC_tv
 * transition-vector table below.  r3=out, r4=in, r5=len, r6=k.
 */
static unsigned const mulAdd1[] = {
PPC_LWZ(7,4,0), /* Load first word of in in r7 */
PPC_LWZ(0,3,0), /* Load first word of out into r0 */
PPC_MULLW(8,7,6), /* Low half of multiply in r8 */
PPC_MTCTR(5), /* Move len into CTR */
PPC_MULHWU(5,7,6), /* High half of multiply in r5 */
PPC_ADDC(8,8,0), /* r8 = r8 + r0 */
PPC_STW(8,3,0), /* Store result to memory */
PPC_BC(18,31,10), /* Branch to Label if --ctr == 0 */
/* Loop: */
PPC_LWZU(7,4,4), /* r7 = *++in */
PPC_LWZU(0,3,4), /* r0 = *++out */
PPC_MULLW(8,7,6), /* r8 = low word of product */
PPC_ADDE(8,8,5), /* Add carry word r5 and carry bit CF to r8 */
PPC_MULHWU(5,7,6), /* r5 is high word of product, for carry word */
PPC_ADDZE(5,5), /* Add carry bit from low add to r5 */
PPC_ADDC(8,8,0), /* r8 = r8 + r0 */
PPC_STW(8,3,0), /* *out = r8 */
PPC_BC(16,31,-8), /* Branch to Loop if --ctr != 0 */
/* Label: */
PPC_ADDZE(3,5), /* Add carry flag to r5 and move to r3 (return value) */
PPC_BLR()
};
/*
* MulSub1 expects (*out, *in, len, k), count >= 1
* r3 r4 r5 r6
*
* Multiply and subtract is rather a pain. If the subtract of the
* low word of the product from out[i] generates a borrow, we want to
* increment the carry word (initially in the range 0..0xfffffffe).
* However, the PPC's carry bit CF is *clear* after a subtract, so
* we want to add (1-CF) to the carry word. This is done using two
* instructions:
*
* SUBFME, subtract from minus one extended. This computes
* rD = ~rS + 0xffffffff + CF. Since rS is from 0 to 0xfffffffe,
* ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
* from 0 through 0xfffffffff, setting the carry flag unconditionally, and
* NOR, which is used as a bitwise invert NOT instruction.
*
* The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
* = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
* which is the bitwise complement of the value we want.
* We want to add the complement of that result to the low word of the
* product, which is just what a subtract would do, if only we could get
* the carry flag clear. But it's always set, except for SUBFE, and the
* operation we just performed unconditionally *sets* the carry flag. Ugh.
* So find the complement in a separate instruction.
*/
/*
 * Hand-assembled machine code: out[0..len-1] -= in[0..len-1] * k;
 * the final borrow word is returned in r3.  Entered via the lbnPPC_tv
 * transition-vector table below.  r3=out, r4=in, r5=len, r6=k.
 * See the comment above for why SUBFME+NOR is used to add (1-CF).
 */
static unsigned const mulSub1[] = {
PPC_LWZ(7,4,0), /* Load first word of in in r7 */
PPC_LWZ(0,3,0), /* Load first word of out into r0 */
PPC_MTCTR(5), /* Move len into CTR */
PPC_MULLW(8,7,6), /* Low half of multiply in r8 */
PPC_MULHWU(5,7,6), /* High half of multiply in r5 */
PPC_SUBFC(8,8,0), /* r8 = r0 - r8, setting CF */
PPC_STW(8,3,0), /* Store result to memory */
PPC_SUBFME(5,5), /* First of two insns to add (1-CF) to r5 */
PPC_BC(18,31,12), /* Branch to Label if --ctr == 0 */
/* Loop: */
PPC_LWZU(7,4,4), /* r7 = *++in */
PPC_LWZU(0,3,4), /* r0 = *++out */
PPC_NOR(5,5,5), /* Second of two insns to add (1-CF) to r5 */
PPC_MULLW(8,7,6), /* r8 = low word of product */
PPC_ADDC(8,8,5), /* Add carry word r5 to r8 */
PPC_MULHWU(5,7,6), /* r5 is high word of product, for carry word */
PPC_ADDZE(5,5), /* Add carry bit from low add to r5 */
PPC_SUBFC(8,8,0), /* r8 = r0 - r8, setting CF */
PPC_STW(8,3,0), /* *out = r8 */
PPC_SUBFME(5,5), /* First of two insns to add (1-CF) to r5 */
PPC_BC(16,31,-10), /* Branch to Loop if --ctr != 0 */
/* Label: */
PPC_NOR(3,5,5), /* Finish adding (1-CF) to r5, store in r3 (return value) */
PPC_BLR()
};
#if 0
/*
* Args: BNWORD32 *n, BNWORD32 const *mod, unsigned mlen, BNWORD32 inv)
* r3 r4 r5 r6
* r7, r8 and r9 are the triple-width accumulator.
* r0 and r10 are temporary registers.
* r11 and r12 are temporary pointers into n and mod, respectively.
* r2 (!) is another temporary register.
*/
/*
 * NOTE(review): UNFINISHED DRAFT, disabled by the surrounding #if 0.
 * Appears to be the start of a Montgomery reduction inner loop, but
 * several entries are lexically incomplete ("PPC_", "PPC_LWZU(") and
 * others use placeholder operands (count, x, xx, yy) that are not
 * defined anywhere visible.  It would not assemble as-is; keep it
 * disabled unless it is completed.  Several comments below ("???",
 * "whatnot") are the original author's own placeholders.
 */
static unsigned const montReduce[] = {
PPC_MTCTR(5), /* ??? */
PPC_LWZ(7,3,0), /* Load low word of n into r7 */
PPC_LWZ(10,4,0), /* Fetch low word of mod */
PPC_MULLW(0,7,6), /* Invert r7 into r0 */
PPC_STW(0,3,0), /* Store back for future use */
PPC_MULHWU(8,10,7), /* Get high word of whatnot */
PPC_MULLW(10,10,7), /* Get low word of it */
PPC_ADDC(7,7,10), /* Add low word of product to r7 */
PPC_ADDZE(8,8), /* Add carry to high word */
PPC_
PPC_MULHW(8,7,6),
PPC_ADDC(7,7,0), /* Add inverse back to r7 */
PPC_ADDZE(8,8),
PPC_
PPC_LWZU(
/* Loop: */
PPC_LWZU(0,11,4),
PPC_LWZU(10,23,-4),
PPC_MULLW(2,0,10),
PPC_ADDC(7,7,2),
PPC_MULHWU(0,0,10),
PPC_ADDE(8,8,0),
PPC_ADDZE(9,9),
PPC_BC(16,31,-7), /* Branch to Loop if --ctr != 0 */
PPC_ADDIC_(count,-1),
PPC_LWZU(0,x,4),
PPC_ADDC(0,7,0),
PPC_STW(0,x,0),
PPC_ADDZE(7,8),
PPC_ADDZE(8,9),
PPC_LI(9,0),
PPC_BC(xx,2,yy),
};
#endif
/*
* Three overlapped transition vectors for three functions.
* A PowerPC transition vector for a (potentially) inter-module
* jump or call consists of two words, an instruction address
* and a Table Of Contents (TOC) pointer, which is loaded into
* r2. Since none of the routines here have global variables,
* they don't need a TOC pointer, so the value is unimportant.
* This array places an uninteresting 32-bit value after each address.
*/
/*
 * Each entry's following word serves as its (don't-care) TOC word,
 * so consecutive function addresses overlap: entry i's TOC word is
 * entry i+1's code address, and the trailing 0 pads the last vector.
 */
unsigned const * const lbnPPC_tv[] = {
mulN1, /* transition vector for lbnMulN1_32 */
mulAdd1, /* lbnMulAdd1_32; also mulN1's don't-care TOC word */
mulSub1, /* lbnMulSub1_32; also mulAdd1's don't-care TOC word */
0 /* mulSub1's don't-care TOC word */
};
#else /* __MWERKS >= 0x800 */
/*
* MulN1 expects (*out, *in, len, k), count >= 1
* r3 r4 r5 r6
*/
/*
 * out[0..len-1] = in[0..len-1] * k; the final carry word is stored
 * at out[len].  len (r5) is reused as the running carry word once it
 * has been moved into CTR.
 *
 * Fix: the original contained a second, redundant "mulhwu len,r7,k"
 * between the first store and the bdz-.  It recomputed the identical
 * value (r7 and k unchanged; mulhwu alters neither CA nor CR), and the
 * hand-assembled mulN1[] version of this routine has no such
 * instruction.  Removed.
 */
asm void
lbnMulN1_32(register unsigned *out, register unsigned const *in,
register unsigned len, register unsigned k)
{
lwz r7,0(in) /* Load first word of in in r7 */
mtctr len /* Move len into CTR */
mullw r8,r7,k /* Low half of first product in r8 */
addic r0,r0,0 /* Clear carry bit (CA) for the loop's adde */
mulhwu len,r7,k /* High half of first product: the carry word */
stw r8,0(out) /* out[0] = low word */
bdz- label /* Branch to Label if --ctr == 0 (len was 1) */
loop:
lwzu r7,4(in) /* r7 = *++in */
mullw r8,r7,k /* Low word of product */
adde r8,r8,len /* Add carry word len and carry bit CA to r8 */
stwu r8,4(out) /* *++out = r8 */
mulhwu len,r7,k /* High word of product becomes carry word */
bdnz+ loop /* Branch to Loop if --ctr != 0 */
label:
addze len,len /* Fold final carry bit into carry word */
stw len,4(out) /* Store carry word at out[len] */
blr
}
/*
* MulAdd1 expects (*out, *in, len, k), count >= 1
* r3 r4 r5 r6
*/
/*
 * out[0..len-1] += in[0..len-1] * k; returns the final carry word
 * in r3.  len (r5) is reused as the running carry word once it has
 * been moved into CTR.
 */
asm unsigned
lbnMulAdd1_32(register unsigned *out, register unsigned const *in,
register unsigned len, register unsigned k)
{
lwz r7,0(in) /* Load first word of in in r7 */
lwz r0,0(out) /* Load first word of out into r0 */
mullw r8,r7,k /* Low half of multiply in r8 */
mtctr len /* Move len into CTR */
mulhwu len,r7,k /* High half of multiply in len */
addc r8,r8,r0 /* r8 = r8 + r0 */
stw r8,0(out) /* Store result to memory */
bdz- label /* Branch to Label if --ctr == 0 */
loop:
lwzu r7,4(in) /* r7 = *++in */
lwzu r0,4(out) /* r0 = *++out */
mullw r8,r7,k /* r8 = low word of product */
adde r8,r8,len /* Add carry word len and carry bit CF to r8 */
mulhwu len,r7,k /* len is high word of product, for carry */
addze len,len /* Add carry bit from low add to len */
addc r8,r8,r0 /* r8 = r8 + r0 */
stw r8,0(out) /* *out = r8 */
bdnz+ loop /* Branch to Loop if --ctr != 0 */
label:
addze r3,r5 /* Add final carry to carry word (r5 == len); return it in r3 */
blr
}
/*
* MulSub1 expects (*out, *in, len, k), count >= 1
* r3 r4 r5 r6
*
* Multiply and subtract is rather a pain. If the subtract of the
* low word of the product from out[i] generates a borrow, we want to
* increment the carry word (initially in the range 0..0xfffffffe).
* However, the PPC's carry bit CF is *clear* after a subtract, so
* we want to add (1-CF) to the carry word. This is done using two
* instructions:
*
* SUBFME, subtract from minus one extended. This computes
* rD = ~rS + 0xffffffff + CF. Since rS is from 0 to 0xfffffffe,
* ~rS is from 1 through 0xffffffff, and the sum with 0xffffffff+CF is
* from 0 through 0xfffffffff, setting the carry flag unconditionally, and
* NOR, which is used as a bitwise invert NOT instruction.
*
* The SUBFME performs the computation rD = ~rS + 0xffffffff + CF,
* = (-rS - 1) + (CF - 1) = -(rS - CF + 1) - 1 = ~(rS + 1-CF),
* which is the bitwise complement of the value we want.
* We want to add the complement of that result to the low word of the
* product, which is just what a subtract would do, if only we could get
* the carry flag clear. But it's always set, except for SUBFE, and the
* operation we just performed unconditionally *sets* the carry flag. Ugh.
* So find the complement in a separate instruction.
*/
/*
 * out[0..len-1] -= in[0..len-1] * k; returns the final borrow word
 * in r3.  len (r5) is reused as the running carry word once it has
 * been moved into CTR.  See the comment above for why SUBFME+NOR is
 * used to add (1-CF) to the carry word.
 */
asm unsigned
lbnMulSub1_32(register unsigned *out, register unsigned const *in,
register unsigned len, register unsigned k)
{
lwz r7,0(in) /* Load first word of in in r7 */
lwz r0,0(out) /* Load first word of out into r0 */
mtctr len /* Move len into CTR */
mullw r8,r7,k /* Low half of multiply in r8 */
mulhwu len,r7,k /* High half of multiply in len */
subfc r8,r8,r0 /* r8 = r0 - r8, setting CF */
stw r8,0(out) /* Store result to memory */
subfme len,len /* First of two insns to add (1-CF) to len */
bdz- label /* Branch to Label if --ctr == 0 */
loop:
lwzu r7,4(in) /* r7 = *++in */
lwzu r0,4(out) /* r0 = *++out */
nor len,len,len /* Second of two insns to add (1-CF) to len */
mullw r8,r7,k /* r8 = low word of product */
addc r8,r8,len /* Add carry word len to r8 */
mulhwu len,r7,k /* len is high word of product, for carry */
addze len,len /* Add carry bit from low add to len */
subfc r8,r8,r0 /* r8 = r0 - r8, setting CF */
stw r8,0(out) /* *out = r8 */
subfme len,len /* First of two insns to add (1-CF) to len */
bdnz+ loop /* Branch to Loop if --ctr != 0 */
label:
nor r3,r5,r5 /* Finish adding (1-CF) to len (r5); return borrow in r3 */
blr
}
#endif /* __MWERKS >= 0x800 */
/* 45678901234567890123456789012345678901234567890123456789012345678901234567 */
|