1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685
|
/*
* M_APM - mapmfmul.c
*
* Copyright (C) 1999 - 2007 Michael C. Ring
*
* Permission to use, copy, and distribute this software and its
* documentation for any purpose with or without fee is hereby granted,
* provided that the above copyright notice appear in all copies and
* that both that copyright notice and this permission notice appear
* in supporting documentation.
*
* Permission to modify the software is granted. Permission to distribute
* the modified code is granted. Modifications are to be distributed by
* using the file 'license.txt' as a template to modify the file header.
* 'license.txt' is available in the official MAPM distribution.
*
* This software is provided "as is" without express or implied warranty.
*/
/*
* This file contains the divide-and-conquer FAST MULTIPLICATION
* function as well as its support functions.
*
*/
#include "pgAdmin3.h"
#include "pgscript/utilities/mapm-lib/m_apm_lc.h"
static int M_firsttimef = TRUE;
/*
* specify the max size the FFT routine can handle
* (in MAPM, #digits = 2 * #bytes)
*
* this number *must* be an exact power of 2.
*
* **WORST** case input numbers (all 9's) has shown that
* the FFT math will overflow if the #define here is
* >= 1048576. On my system, 524,288 worked OK. I will
* factor down another factor of 2 to safeguard against
* other computers have less precise floating point math.
* If you are confident in your system, 524288 will
* theoretically work fine.
*
* the define here allows the FFT algorithm to multiply two
* 524,288 digit numbers yielding a 1,048,576 digit result.
*/
#define MAX_FFT_BYTES 262144
/*
* the Divide-and-Conquer multiplication kicks in when the size of
* the numbers exceed the capability of the FFT (#define just above).
*
* #bytes D&C call depth
* ------ --------------
* 512K 1
* 1M 2
* 2M 3
* 4M 4
* ... ...
* 2.1990E+12 23
*
* the following stack sizes are sized to meet the
* above 2.199E+12 example, though I wouldn't want to
* wait for it to finish...
*
* Each call requires 7 stack variables to be saved so
* we need a stack depth of 23 * 7 + PAD. (we use 164)
*
* For 'exp_stack', 3 integers also are required to be saved
* for each recursive call so we need a stack depth of
* 23 * 3 + PAD. (we use 72)
*
*
* If the FFT multiply is disabled, resize the arrays
* as follows:
*
* the following stack sizes are sized to meet the
* worst case expected assuming we are multiplying
* numbers with 2.14E+9 (2 ^ 31) digits.
*
* For sizeof(int) == 4 (32 bits) there may be up to 32 recursive
* calls. Each call requires 7 stack variables so we need a
* stack depth of 32 * 7 + PAD. (we use 240)
*
* For 'exp_stack', 3 integers also are required to be saved
* for each recursive call so we need a stack depth of
* 32 * 3 + PAD. (we use 100)
*/
#ifdef NO_FFT_MULTIPLY
#define M_STACK_SIZE 240
#define M_ISTACK_SIZE 100
#else
#define M_STACK_SIZE 164
#define M_ISTACK_SIZE 72
#endif
static int exp_stack[M_ISTACK_SIZE];
static int exp_stack_ptr;
static UCHAR *mul_stack_data[M_STACK_SIZE];
static int mul_stack_data_size[M_STACK_SIZE];
static int M_mul_stack_ptr;
static UCHAR *fmul_a1, *fmul_a0, *fmul_a9, *fmul_b1, *fmul_b0,
*fmul_b9, *fmul_t0;
static int size_flag, bit_limit, stmp, itmp, mii;
static M_APM M_ain;
static M_APM M_bin;
static const char *M_stack_ptr_error_msg = "\'M_get_stack_ptr\', Out of memory";
extern void M_fast_multiply(M_APM, M_APM, M_APM);
extern void M_fmul_div_conq(UCHAR *, UCHAR *, UCHAR *, int);
extern void M_fmul_add(UCHAR *, UCHAR *, int, int);
extern int M_fmul_subtract(UCHAR *, UCHAR *, UCHAR *, int);
extern void M_fmul_split(UCHAR *, UCHAR *, UCHAR *, int);
extern int M_next_power_of_2(int);
extern int M_get_stack_ptr(int);
extern void M_push_mul_int(int);
extern int M_pop_mul_int(void);
#ifdef NO_FFT_MULTIPLY
extern void M_4_byte_multiply(UCHAR *, UCHAR *, UCHAR *);
#else
extern void M_fast_mul_fft(UCHAR *, UCHAR *, UCHAR *, int);
#endif
/*
* the following algorithm is used in this fast multiply routine
* (sometimes called the divide-and-conquer technique.)
*
* assume we have 2 numbers (a & b) with 2N digits.
*
* let : a = (2^N) * A1 + A0 , b = (2^N) * B1 + B0
*
* where 'A1' is the 'most significant half' of 'a' and
* 'A0' is the 'least significant half' of 'a'. Same for
* B1 and B0.
*
* Now use the identity :
*
* 2N N N N
* ab = (2 + 2 ) A1B1 + 2 (A1-A0)(B0-B1) + (2 + 1)A0B0
*
*
* The original problem of multiplying 2 (2N) digit numbers has
* been reduced to 3 multiplications of N digit numbers plus some
* additions, subtractions, and shifts.
*
* The fast multiplication algorithm used here uses the above
* identity in a recursive process. This algorithm results in
* O(n ^ 1.585) growth.
*/
/****************************************************************************/
void M_free_all_fmul()
{
int k;
if (M_firsttimef == FALSE)
{
m_apm_free(M_ain);
m_apm_free(M_bin);
for (k = 0; k < M_STACK_SIZE; k++)
{
if (mul_stack_data_size[k] != 0)
{
MAPM_FREE(mul_stack_data[k]);
}
}
M_firsttimef = TRUE;
}
}
/****************************************************************************/
void M_push_mul_int(int val)
{
exp_stack[++exp_stack_ptr] = val;
}
/****************************************************************************/
int M_pop_mul_int()
{
return(exp_stack[exp_stack_ptr--]);
}
/****************************************************************************/
void M_fmul_split(UCHAR *x1, UCHAR *x0, UCHAR *xin, int nbytes)
{
memcpy(x1, xin, nbytes);
memcpy(x0, (xin + nbytes), nbytes);
}
/****************************************************************************/
void M_fast_multiply(M_APM rr, M_APM aa, M_APM bb)
{
void *vp;
int ii, k, nexp, sign;
if (M_firsttimef)
{
M_firsttimef = FALSE;
for (k = 0; k < M_STACK_SIZE; k++)
mul_stack_data_size[k] = 0;
size_flag = M_get_sizeof_int();
bit_limit = 8 * size_flag + 1;
M_ain = m_apm_init();
M_bin = m_apm_init();
}
exp_stack_ptr = -1;
M_mul_stack_ptr = -1;
m_apm_copy(M_ain, aa);
m_apm_copy(M_bin, bb);
sign = M_ain->m_apm_sign * M_bin->m_apm_sign;
nexp = M_ain->m_apm_exponent + M_bin->m_apm_exponent;
if (M_ain->m_apm_datalength >= M_bin->m_apm_datalength)
ii = M_ain->m_apm_datalength;
else
ii = M_bin->m_apm_datalength;
ii = (ii + 1) >> 1;
ii = M_next_power_of_2(ii);
/* Note: 'ii' must be >= 4 here. this is guaranteed
by the caller: m_apm_multiply
*/
k = 2 * ii; /* required size of result, in bytes */
M_apm_pad(M_ain, k); /* fill out the data so the number of */
M_apm_pad(M_bin, k); /* bytes is an exact power of 2 */
if (k > rr->m_apm_malloclength)
{
if ((vp = MAPM_REALLOC(rr->m_apm_data, (k + 32))) == NULL)
{
/* fatal, this does not return */
M_apm_log_error_msg(M_APM_FATAL, "\'M_fast_multiply\', Out of memory");
}
rr->m_apm_malloclength = k + 28;
rr->m_apm_data = (UCHAR *)vp;
}
#ifdef NO_FFT_MULTIPLY
M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data,
M_bin->m_apm_data, ii);
#else
/*
* if the numbers are *really* big, use the divide-and-conquer
* routine first until the numbers are small enough to be handled
* by the FFT algorithm. If the numbers are already small enough,
* call the FFT multiplication now.
*
* Note that 'ii' here is (and must be) an exact power of 2.
*/
if (size_flag == 2) /* if still using 16 bit compilers .... */
{
M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data,
M_bin->m_apm_data, ii);
}
else /* >= 32 bit compilers */
{
if (ii > (MAX_FFT_BYTES + 2))
{
M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data,
M_bin->m_apm_data, ii);
}
else
{
M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data,
M_bin->m_apm_data, ii);
}
}
#endif
rr->m_apm_sign = sign;
rr->m_apm_exponent = nexp;
rr->m_apm_datalength = 4 * ii;
M_apm_normalize(rr);
}
/****************************************************************************/
/*
* This is the recursive function to perform the multiply. The
* design intent here is to have no local variables. Any local
* data that needs to be saved is saved on one of the two stacks.
*/
void M_fmul_div_conq(UCHAR *rr, UCHAR *aa, UCHAR *bb, int sz)
{
#ifdef NO_FFT_MULTIPLY
if (sz == 4) /* multiply 4x4 yielding an 8 byte result */
{
M_4_byte_multiply(rr, aa, bb);
return;
}
#else
/*
* if the numbers are now small enough, let the FFT algorithm
* finish up.
*/
if (sz == MAX_FFT_BYTES)
{
M_fast_mul_fft(rr, aa, bb, sz);
return;
}
#endif
memset(rr, 0, (2 * sz)); /* zero out the result */
mii = sz >> 1;
itmp = M_get_stack_ptr(mii);
M_push_mul_int(itmp);
fmul_a1 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(mii);
fmul_a0 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(2 * sz);
fmul_a9 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(mii);
fmul_b1 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(mii);
fmul_b0 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(2 * sz);
fmul_b9 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(2 * sz);
fmul_t0 = mul_stack_data[itmp];
M_fmul_split(fmul_a1, fmul_a0, aa, mii);
M_fmul_split(fmul_b1, fmul_b0, bb, mii);
stmp = M_fmul_subtract(fmul_a9, fmul_a1, fmul_a0, mii);
stmp *= M_fmul_subtract(fmul_b9, fmul_b0, fmul_b1, mii);
M_push_mul_int(stmp);
M_push_mul_int(mii);
M_fmul_div_conq(fmul_t0, fmul_a0, fmul_b0, mii);
mii = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();
M_push_mul_int(itmp);
M_push_mul_int(stmp);
M_push_mul_int(mii);
/* to restore all stack variables ...
fmul_a1 = mul_stack_data[itmp];
fmul_a0 = mul_stack_data[itmp+1];
fmul_a9 = mul_stack_data[itmp+2];
fmul_b1 = mul_stack_data[itmp+3];
fmul_b0 = mul_stack_data[itmp+4];
fmul_b9 = mul_stack_data[itmp+5];
fmul_t0 = mul_stack_data[itmp+6];
*/
fmul_a1 = mul_stack_data[itmp];
fmul_b1 = mul_stack_data[itmp + 3];
fmul_t0 = mul_stack_data[itmp + 6];
memcpy((rr + sz), fmul_t0, sz); /* first 'add', result is now zero */
/* so we just copy in the bytes */
M_fmul_add(rr, fmul_t0, mii, sz);
M_fmul_div_conq(fmul_t0, fmul_a1, fmul_b1, mii);
mii = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();
M_push_mul_int(itmp);
M_push_mul_int(stmp);
M_push_mul_int(mii);
fmul_a9 = mul_stack_data[itmp + 2];
fmul_b9 = mul_stack_data[itmp + 5];
fmul_t0 = mul_stack_data[itmp + 6];
M_fmul_add(rr, fmul_t0, 0, sz);
M_fmul_add(rr, fmul_t0, mii, sz);
if (stmp != 0)
M_fmul_div_conq(fmul_t0, fmul_a9, fmul_b9, mii);
mii = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();
fmul_t0 = mul_stack_data[itmp + 6];
/*
* if the sign of (A1 - A0)(B0 - B1) is positive, ADD to
* the result. if it is negative, SUBTRACT from the result.
*/
if (stmp < 0)
{
fmul_a9 = mul_stack_data[itmp + 2];
fmul_b9 = mul_stack_data[itmp + 5];
memset(fmul_b9, 0, (2 * sz));
memcpy((fmul_b9 + mii), fmul_t0, sz);
M_fmul_subtract(fmul_a9, rr, fmul_b9, (2 * sz));
memcpy(rr, fmul_a9, (2 * sz));
}
if (stmp > 0)
M_fmul_add(rr, fmul_t0, mii, sz);
M_mul_stack_ptr -= 7;
}
/****************************************************************************/
/*
* special addition function for use with the fast multiply operation
*/
void M_fmul_add(UCHAR *r, UCHAR *a, int offset, int sz)
{
int i, j;
UCHAR carry;
carry = 0;
j = offset + sz;
i = sz;
while (TRUE)
{
r[--j] += carry + a[--i];
if (r[j] >= 100)
{
r[j] -= 100;
carry = 1;
}
else
carry = 0;
if (i == 0)
break;
}
if (carry)
{
while (TRUE)
{
r[--j] += 1;
if (r[j] < 100)
break;
r[j] -= 100;
}
}
}
/****************************************************************************/
/*
* special subtraction function for use with the fast multiply operation
*/
int M_fmul_subtract(UCHAR *r, UCHAR *a, UCHAR *b, int sz)
{
int k, jtmp, sflag, nb, borrow;
nb = sz;
sflag = 0; /* sign flag: assume the numbers are equal */
/*
* find if a > b (so we perform a-b)
* or a < b (so we perform b-a)
*/
for (k = 0; k < nb; k++)
{
if (a[k] < b[k])
{
sflag = -1;
break;
}
if (a[k] > b[k])
{
sflag = 1;
break;
}
}
if (sflag == 0)
{
memset(r, 0, nb); /* zero out the result */
}
else
{
k = nb;
borrow = 0;
while (TRUE)
{
k--;
if (sflag == 1)
jtmp = (int)a[k] - ((int)b[k] + borrow);
else
jtmp = (int)b[k] - ((int)a[k] + borrow);
if (jtmp >= 0)
{
r[k] = (UCHAR)jtmp;
borrow = 0;
}
else
{
r[k] = (UCHAR)(100 + jtmp);
borrow = 1;
}
if (k == 0)
break;
}
}
return(sflag);
}
/****************************************************************************/
int M_next_power_of_2(int n)
{
int ct, k;
if (n <= 2)
return(n);
k = 2;
ct = 0;
while (TRUE)
{
if (k >= n)
break;
k = k << 1;
if (++ct == bit_limit)
{
/* fatal, this does not return */
M_apm_log_error_msg(M_APM_FATAL,
"\'M_next_power_of_2\', ERROR :sizeof(int) too small ??");
}
}
return(k);
}
/****************************************************************************/
int M_get_stack_ptr(int sz)
{
int i, k;
UCHAR *cp;
k = ++M_mul_stack_ptr;
/* if size is 0, just need to malloc and return */
if (mul_stack_data_size[k] == 0)
{
if ((i = sz) < 16)
i = 16;
if ((cp = (UCHAR *)MAPM_MALLOC(i + 4)) == NULL)
{
/* fatal, this does not return */
M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
}
mul_stack_data[k] = cp;
mul_stack_data_size[k] = i;
}
else /* it has been malloc'ed, see if it's big enough */
{
if (sz > mul_stack_data_size[k])
{
cp = mul_stack_data[k];
if ((cp = (UCHAR *)MAPM_REALLOC(cp, (sz + 4))) == NULL)
{
/* fatal, this does not return */
M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
}
mul_stack_data[k] = cp;
mul_stack_data_size[k] = sz;
}
}
return(k);
}
/****************************************************************************/
#ifdef NO_FFT_MULTIPLY
/*
* multiply a 4 byte number by a 4 byte number
* yielding an 8 byte result. each byte contains
* a base 100 'digit', i.e.: range from 0-99.
*
* MSB LSB
*
* a,b [0] [1] [2] [3]
* result [0] ..... [7]
*/
void M_4_byte_multiply(UCHAR *r, UCHAR *a, UCHAR *b)
{
int jj;
unsigned int *ip, t1, rr[8];
memset(rr, 0, (8 * sizeof(int))); /* zero out result */
jj = 3;
ip = rr + 5;
/*
* loop for one number [b], un-roll the inner 'loop' [a]
*
* accumulate partial sums in UINT array, release carries
* and convert back to base 100 at the end
*/
while (1)
{
t1 = (unsigned int)b[jj];
ip += 2;
*ip-- += t1 * a[3];
*ip-- += t1 * a[2];
*ip-- += t1 * a[1];
*ip += t1 * a[0];
if (jj-- == 0)
break;
}
jj = 7;
while (1)
{
t1 = rr[jj] / 100;
r[jj] = (UCHAR)(rr[jj] - 100 * t1);
if (jj == 0)
break;
rr[--jj] += t1;
}
}
#endif
/****************************************************************************/
|