1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
|
.file "erfl.s"
// Copyright (c) 2001 - 2003, Intel Corporation
// All rights reserved.
//
// Contributed 2001 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 11/21/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 08/14/02 Changed mli templates to mlx
// 02/06/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
// long double erfl(long double)
//
// Overview of operation
//==============================================================
//
// Algorithm description
// ---------------------
//
// There are 4 paths:
//
// 1. Special path: x = 0, Inf, NaNs, denormal
// Return erfl(x) = +/-0.0 for zeros
// Return erfl(x) = QNaN for NaNs
// Return erfl(x) = sign(x)*1.0 for Inf
// Return erfl(x) = (A0H+A0L)*x + x^2, ((A0H+A0L) = 2.0/sqrt(Pi))
// for denormals
//
// 2. [0;1/8] path: 0.0 < |x| < 1/8
// Return erfl(x) = x*(A1H+A1L) + x^3*A3 + ... + x^15*A15
//
// 3. Main path: 1/8 <= |x| < 6.53
// For several ranges of 1/8 <= |x| < 6.53
// Return erfl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
// + y^3*A3 + y^4*A4 + ... + y^25*A25 )
// where y = (|x|/a) - b
//
// For each range there is particular set of coefficients.
// Below is the list of ranges:
// 1/8 <= |x| < 1/4 a = 0.125, b = 1.5
// 1/4 <= |x| < 1/2 a = 0.25, b = 1.5
// 1/2 <= |x| < 1.0 a = 0.5, b = 1.5
// 1.0 <= |x| < 2.0 a = 1.0, b = 1.5
// 2.0 <= |x| < 3.25 a = 2.0, b = 1.5
// 3.25 <= |x| < 4.0 a = 2.0, b = 2.0
// 4.0 <= |x| < 6.53 a = 4.0, b = 1.5
// ( [3.25;4.0] subrange separated for monotonicity issues resolve )
//
// 4. Saturation path: 6.53 <= |x| < +INF
// Return erfl(x) = sign(x)*(1.0 - tiny_value)
// (tiny_value ~ 1e-1233)
//
// Implementation notes
// --------------------
//
// 1. Special path: x = 0, INF, NaNa, denormals
//
// This branch is cut off by one fclass operation.
// Then zeros+nans, infinities and denormals processed separately.
// For denormals we had to use multiprecision A0 coefficient to reach
// necessary accuracy: (A0H+A0L)*x-x^2
//
// 2. [0;1/8] path: 0.0 < |x| < 1/8
//
// First coefficient of polynomial we must split to multiprecision too.
// Also we can parallelise computations:
// (x*(A1H+A1L)) calculated in parallel with "tail" (x^3*A3 + ... + x^15*A15)
// Furthermore the second part is factorized using binary tree technique.
//
// 3. Main path: 1/8 <= |x| < 6.53
//
// Multiprecision have to be performed only for first few
// polynomial iterations (up to 3-rd x degree)
// Here we use the same parallelisation way as above:
// Split whole polynomial to first, "multiprecision" part, and second,
// so called "tail", native precision part.
//
// 1) Multiprecision part:
// [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
// v1 and v2 terms calculated in parallel
//
// 2) Tail part:
// v3 = x^4 * ( A4 + x*A5 + ... + x^21*A25 )
// v3 is splitted to 2 even parts (10 coefficient in each one).
// These 2 parts are also factorized using binary tree technique.
//
// So Multiprecision and Tail parts cost is almost the same
// and we have both results ready before final summation.
//
// 4. Saturation path: 6.53 <= |x| < +INF
//
// We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
// just to meet IEEE requirements for different rounding modes in this case.
//
// Registers used
//==============================================================
// Floating Point registers used:
// f8 - input & output
// f32 -> f90
// General registers used:
// r2, r3, r32 -> r52
// Predicate registers used:
// p0, p6 -> p11, p14, p15
// p6 - arg is zero, denormal or special IEEE
// p7 - arg is in [4;8] binary interval
// p8 - arg is in [3.25;4] interval
// p9 - arg < 1/8
// p10 - arg is NOT in [3.25;4] interval
// p11 - arg in saturation domain
// p14 - arg is positive
// p15 - arg is negative
// Assembly macros
//==============================================================
rDataPtr = r2
rTailDataPtr = r3
rBias = r33
rSignBit = r34
rInterval = r35
rArgExp = r36
rArgSig = r37
r3p25Offset = r38
r2to4 = r39
r1p25 = r40
rOffset = r41
r1p5 = r42
rSaturation = r43
r3p25Sign = r44
rTiny = r45
rAddr1 = r46
rAddr2 = r47
rTailAddr1 = r48
rTailAddr2 = r49
rTailOffset = r50
rTailAddOffset = r51
rShiftedDataPtr = r52
//==============================================================
fA0H = f32
fA0L = f33
fA1H = f34
fA1L = f35
fA2H = f36
fA2L = f37
fA3 = f38
fA4 = f39
fA5 = f40
fA6 = f41
fA7 = f42
fA8 = f43
fA9 = f44
fA10 = f45
fA11 = f46
fA12 = f47
fA13 = f48
fA14 = f49
fA15 = f50
fA16 = f51
fA17 = f52
fA18 = f53
fA19 = f54
fA20 = f55
fA21 = f56
fA22 = f57
fA23 = f58
fA24 = f59
fA25 = f60
fArgSqr = f61
fArgCube = f62
fArgFour = f63
fArgEight = f64
fArgAbsNorm = f65
fArgAbsNorm2 = f66
fArgAbsNorm2L = f67
fArgAbsNorm3 = f68
fArgAbsNorm4 = f69
fArgAbsNorm11 = f70
fRes = f71
fResH = f72
fResL = f73
fRes1H = f74
fRes1L = f75
fRes1Hd = f76
fRes2H = f77
fRes2L = f78
fRes3H = f79
fRes3L = f80
fRes4 = f81
fTT = f82
fTH = f83
fTL = f84
fTT2 = f85
fTH2 = f86
fTL2 = f87
f1p5 = f88
f2p0 = f89
fTiny = f90
// Data tables
//==============================================================
RODATA
.align 64
LOCAL_OBJECT_START(erfl_data)
////////// Main tables ///////////
_0p125_to_0p25_data: // exp = 2^-3
// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
data8 0xACD9ED470F0BB048, 0x0000BFF4 //A3 = -6.5937529303909561891162915809e-04
data8 0xBF6A254428DDB452 //A2H = -3.1915980570631852578089571182e-03
data8 0xBC131B3BE3AC5079 //A2L = -2.5893976889070198978842231134e-19
data8 0x3FC16E2D7093CD8C //A1H = 1.3617485043469590433318217038e-01
data8 0x3C6979A52F906B4C //A1L = 1.1048096806003284897639351952e-17
data8 0x3FCAC45E37FE2526 //A0H = 2.0911767705937583938791135552e-01
data8 0x3C648D48536C61E3 //A0L = 8.9129592834861155344147026365e-18
data8 0xD1FC135B4A30E746, 0x00003F90 //A25 = 6.3189963203954877364460345654e-34
data8 0xB1C79B06DD8C988C, 0x00003F97 //A24 = 6.8478253118093953461840838106e-32
data8 0xCC7AE121D1DEDA30, 0x0000BF9A //A23 = -6.3010264109146390803803408666e-31
data8 0x8927B8841D1E0CA8, 0x0000BFA1 //A22 = -5.4098171537601308358556861717e-29
data8 0xB4E84D6D0C8F3515, 0x00003FA4 //A21 = 5.7084320046554628404861183887e-28
data8 0xC190EAE69A67959A, 0x00003FAA //A20 = 3.9090359419467121266470910523e-26
data8 0x90122425D312F680, 0x0000BFAE //A19 = -4.6551806872355374409398000522e-25
data8 0xF8456C9C747138D6, 0x0000BFB3 //A18 = -2.5670639225386507569611436435e-23
data8 0xCDCAE0B3C6F65A3A, 0x00003FB7 //A17 = 3.4045511783329546779285646369e-22
data8 0x8F41909107C62DCC, 0x00003FBD //A16 = 1.5167830861896169812375771948e-20
data8 0x82F0FCB8A4B8C0A3, 0x0000BFC1 //A15 = -2.2182328575376704666050112195e-19
data8 0x92E992C58B7C3847, 0x0000BFC6 //A14 = -7.9641369349930600223371163611e-18
LOCAL_OBJECT_END(erfl_data)
LOCAL_OBJECT_START(_0p25_to_0p5_data)
// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
data8 0xF083628E8F7CE71D, 0x0000BFF6 //A3 = -3.6699405305266733332335619531e-03
data8 0xBF978749A434FE4E //A2H = -2.2977018973732214746075186440e-02
data8 0xBC30B3FAFBC21107 //A2L = -9.0547407100537663337591537643e-19
data8 0x3FCF5F0CDAF15313 //A1H = 2.4508820238647696654332719390e-01
data8 0x3C1DFF29F5AD8117 //A1L = 4.0653155218104625249413579084e-19
data8 0x3FD9DD0D2B721F38 //A0H = 4.0411690943482225790717166092e-01
data8 0x3C874C71FEF1759E //A0L = 4.0416653425001310671815863946e-17
data8 0xA621D99B8C12595E, 0x0000BFAB //A25 = -6.7100271986703749013021666304e-26
data8 0xBD7BBACB439992E5, 0x00003FAE //A24 = 6.1225362452814749024566661525e-25
data8 0xFF2FEFF03A98E410, 0x00003FB2 //A23 = 1.3192871864994282747963195183e-23
data8 0xAE8180957ABE6FD5, 0x0000BFB6 //A22 = -1.4434787102181180110707433640e-22
data8 0xAF0566617B453AA6, 0x0000BFBA //A21 = -2.3163848847252215762970075142e-21
data8 0x8F33D3616B9B8257, 0x00003FBE //A20 = 3.0324297082969526400202995913e-20
data8 0xD58AB73354438856, 0x00003FC1 //A19 = 3.6175397854863872232142412590e-19
data8 0xD214550E2F3210DF, 0x0000BFC5 //A18 = -5.6942141660091333278722310354e-18
data8 0xE2CA60C328F3BBF5, 0x0000BFC8 //A17 = -4.9177359011428870333915211291e-17
data8 0x88D9BB274F9B3873, 0x00003FCD //A16 = 9.4959118337089189766177270051e-16
data8 0xCA4A00AB538A2DB2, 0x00003FCF //A15 = 5.6146496538690657993449251855e-15
data8 0x9CC8FFFBDDCF9853, 0x0000BFD4 //A14 = -1.3925319209173383944263942226e-13
LOCAL_OBJECT_END(_0p25_to_0p5_data)
LOCAL_OBJECT_START(_0p5_to_1_data)
// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
data8 0xDB742C8FB372DBE0, 0x00003FF6 //A3 = 3.3485993187250381721535255963e-03
data8 0xBFBEDC5644353C26 //A2H = -1.2054957547410136142751468924e-01
data8 0xBC6D7215B023455F //A2L = -1.2770012232203569059818773287e-17
data8 0x3FD492E42D78D2C4 //A1H = 3.2146553459760363047337250464e-01
data8 0x3C83A163CAC22E05 //A1L = 3.4053365952542489137756724868e-17
data8 0x3FE6C1C9759D0E5F //A0H = 7.1115563365351508462453011816e-01
data8 0x3C8B1432F2CBC455 //A0L = 4.6974407716428899960674098333e-17
data8 0x95A6B92162813FF8, 0x00003FC3 //A25 = 1.0140763985766801318711038400e-18
data8 0xFE5EC3217F457B83, 0x0000BFC6 //A24 = -1.3789434273280972156856405853e-17
data8 0x9B49651031B5310B, 0x0000BFC8 //A23 = -3.3672435142472427475576375889e-17
data8 0xDBF73927E19B7C8D, 0x00003FCC //A22 = 7.6315938248752024965922341872e-16
data8 0xF55CBA3052730592, 0x00003FCB //A21 = 4.2563559623888750271176552350e-16
data8 0xA1DC9380DA82CFF6, 0x0000BFD2 //A20 = -3.5940500736023122607663701015e-14
data8 0xAAD1AE1067F3D577, 0x00003FD2 //A19 = 3.7929451192558641569555227613e-14
data8 0xCD1DB83F3B9D2090, 0x00003FD7 //A18 = 1.4574374961011929143375716362e-12
data8 0x87235ACB5E8BB298, 0x0000BFD9 //A17 = -3.8408559294899660346666452560e-12
data8 0xDA417B78FF9F46B4, 0x0000BFDC //A16 = -4.9625621225715971268115023451e-11
data8 0xF075762685484436, 0x00003FDE //A15 = 2.1869603559309150844390066920e-10
data8 0xB989FDB3795165C7, 0x00003FE1 //A14 = 1.3499740992928183247608593000e-09
LOCAL_OBJECT_END(_0p5_to_1_data)
LOCAL_OBJECT_START(_1_to_2_data)
// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
data8 0x8E15015F5B55BEAC, 0x00003FFC //A3 = 1.3875200409423426678618977531e-01
data8 0xBFC6D5A95D0A1B7E //A2H = -1.7839543383544403942764233761e-01
data8 0xBC7499F704C80E02 //A2L = -1.7868888188464394090788198634e-17
data8 0x3FBE723726B824A8 //A1H = 1.1893028922362935961842822508e-01
data8 0x3C6B77F399C2AD27 //A1L = 1.1912589318015368492508652194e-17
data8 0x3FEEEA5557137ADF //A0H = 9.6610514647531064991170524081e-01
data8 0x3C963D0DDD0A762F //A0L = 7.7155271023949055047261953350e-17
data8 0x8FAA405DAD409771, 0x0000BFDB //A25 = -1.6332824616946528652252813763e-11
data8 0x941386F4697976D8, 0x0000BFDD //A24 = -6.7337295147729213955410252613e-11
data8 0xBCBE75234530B404, 0x00003FDF //A23 = 3.4332329029092304943838374908e-10
data8 0xF55E2CE71A00D040, 0x00003FDF //A22 = 4.4632156034175937694868068394e-10
data8 0xA6CADFE489D2671F, 0x0000BFE3 //A21 = -4.8543000253822277507724949798e-09
data8 0xA4C69F11FEAFB3A8, 0x00003FE2 //A20 = 2.3978044150868471771557059958e-09
data8 0xD63441E3BED59703, 0x00003FE6 //A19 = 4.9873285553412397317802071288e-08
data8 0xDFDAED9D3089D732, 0x0000BFE7 //A18 = -1.0424069510877052249228047044e-07
data8 0xB47287FF165756A5, 0x0000BFE9 //A17 = -3.3610945128073834488448164164e-07
data8 0xCDAF2DC0A79A9059, 0x00003FEB //A16 = 1.5324673941628851136481785187e-06
data8 0x9FD6A7B2ECE8EDA9, 0x00003FEA //A15 = 5.9544479989469083598476592569e-07
data8 0xEC6E63BB4507B585, 0x0000BFEE //A14 = -1.4092398243085031882423746824e-05
LOCAL_OBJECT_END(_1_to_2_data)
LOCAL_OBJECT_START(_2_to_3p25_data)
// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
data8 0xCEDBA58E8EE6F055, 0x00003FF7 //A3 = 6.3128050215859026984338771121e-03
data8 0xBF5B60D5E974CBBD //A2H = -1.6710366233609740427984435840e-03
data8 0xBC0E11E2AEC18AF6 //A2L = -2.0376133202996259839305825162e-19
data8 0x3F32408E9BA3327E //A1H = 2.7850610389349567379974059733e-04
data8 0x3BE41010E4B3B224 //A1L = 3.3987633691879253781833531576e-20
data8 0x3FEFFFD1AC4135F9 //A0H = 9.9997790950300136092465663751e-01
data8 0x3C8EEAFA1E97EAE0 //A0L = 5.3633970564750967956196033852e-17
data8 0xBF9C6F2C6D7263C1, 0x00003FF0 //A25 = 4.5683639377039166585098497471e-05
data8 0xCB4167CC4798096D, 0x00003FF0 //A24 = 4.8459885139772945417160731273e-05
data8 0xE1394FECFE972D32, 0x0000BFF2 //A23 = -2.1479022581129892562916533804e-04
data8 0xC7F9E47581FC2A5F, 0x0000BFF2 //A22 = -1.9071211076537531370822343363e-04
data8 0xDD612EDFAA41BEAE, 0x00003FF2 //A21 = 2.1112405918671957390188348542e-04
data8 0x8C166AA4CB2AD8FD, 0x0000BFF4 //A20 = -5.3439165021555312536009227942e-04
data8 0xEFBE33D9F62B68D4, 0x0000BFF2 //A19 = -2.2863672131516067770956697877e-04
data8 0xCCB92F5D91562494, 0x00003FF5 //A18 = 1.5619154280865226092321881421e-03
data8 0x80A5DBE71D4BA0E2, 0x0000BFF6 //A17 = -1.9630109664962540123775799179e-03
data8 0xA0ADEB2D4C41347A, 0x0000BFF4 //A16 = -6.1294315248639348947483422457e-04
data8 0xB1F5D4911B911665, 0x00003FF7 //A15 = 5.4309165882071876864550213817e-03
data8 0xF2F3D8D21E8762E0, 0x0000BFF7 //A14 = -7.4143227286535936033409745884e-03
LOCAL_OBJECT_END(_2_to_3p25_data)
LOCAL_OBJECT_START(_4_to_6p53_data)
// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
data8 0xDF3151BE8652827E, 0x00003FD5 //A3 = 3.9646979666953349095427642209e-13
data8 0xBD1C4A9787DF888B //A2H = -2.5127788450714750484839908889e-14
data8 0xB99B35483E4603FD //A2L = -3.3536613901268985626466020210e-31
data8 0x3CD2DBF507F1A1F3 //A1H = 1.0468963266736687758710258897e-15
data8 0x398A97B60913B4BD //A1L = 1.6388968267515149775818013207e-31
data8 0x3FEFFFFFFFFFFFFF //A0H = 9.9999999999999988897769753748e-01
data8 0x3C99CC25E658129E //A0L = 8.9502895736398715695745861054e-17
data8 0xB367B21294713D39, 0x00003FFB //A25 = 8.7600127403270828432337605471e-02
data8 0xCEE3A423ADEC0F4C, 0x00003FFD //A24 = 4.0408051429309221404807497715e-01
data8 0xC389626CF2D727C0, 0x00003FFE //A23 = 7.6381507072332210580356159947e-01
data8 0xD15A03E082D0A307, 0x00003FFE //A22 = 8.1777977210259904277239787430e-01
data8 0x8FD3DA92675E8E00, 0x00003FFE //A21 = 5.6182638239203638864793584264e-01
data8 0xFD375E6EE167AA58, 0x00003FFC //A20 = 2.4728152801285544751731937424e-01
data8 0x89A9482FADE66AE1, 0x00003FFB //A19 = 6.7217410998398471333985773237e-02
data8 0xC62E1F02606C04DD, 0x00003FF7 //A18 = 6.0479785358923404401184993359e-03
data8 0xEE7BF2BE71CC531C, 0x0000BFF5 //A17 = -1.8194898432032114199803271708e-03
data8 0x8084081981CDC79C, 0x0000BFF5 //A16 = -9.8049734947701208487713246099e-04
data8 0x8975DFB834C118C3, 0x0000BFF0 //A15 = -3.2773123965143773578608926094e-05
data8 0x965DA4A80008B7BC, 0x0000BFEE //A14 = -8.9624997201558650125662820562e-06
LOCAL_OBJECT_END(_4_to_6p53_data)
LOCAL_OBJECT_START(_3p25_to_4_data)
// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
data8 0xB01D29846286CE08, 0x00003FEE //A3 = 1.0497207328743021499800978059e-05
data8 0xBEC10B1488AEB234 //A2H = -2.0317175474986489113480084279e-06
data8 0xBB7F19701B8B74F9 //A2L = -4.1159669348226960337518214996e-22
data8 0x3E910B1488AEB234 //A1H = 2.5396469343733111391850105348e-07
data8 0x3B4F1944906D5D60 //A1L = 5.1448487494628801547474934193e-23
data8 0x3FEFFFFFF7B91176 //A0H = 9.9999998458274208523732795584e-01
data8 0x3C70B2865615DB3F //A0L = 1.4482653192002495179309994964e-17
data8 0xA818D085D56F3021, 0x00003FEC //A25 = 2.5048394770210505593609705765e-06
data8 0xD9C5C509AAE5561F, 0x00003FEC //A24 = 3.2450636894654766492719395406e-06
data8 0x9682D71C549EEB07, 0x0000BFED //A23 = -4.4855801709974050650263470866e-06
data8 0xBC230E1EB6FBF8B9, 0x00003FEA //A22 = 7.0086469577174843181452303996e-07
data8 0xE1432649FF29D4DE, 0x0000BFEA //A21 = -8.3916747195472308725504497231e-07
data8 0xB40CEEBD2803D2F0, 0x0000BFEF //A20 = -2.1463694318102769992677291330e-05
data8 0xEAAB57ABFFA003EB, 0x00003FEF //A19 = 2.7974761309213643228699449426e-05
data8 0xFBFA4D0B893A5BFB, 0x0000BFEE //A18 = -1.5019043571612821858165073446e-05
data8 0xBB6AA248EED3E364, 0x0000BFF0 //A17 = -4.4683584873907316507141131797e-05
data8 0x86C1B3AE3E500ED9, 0x00003FF2 //A16 = 1.2851395412345761361068234880e-04
data8 0xB60729445F0C37B5, 0x0000BFF2 //A15 = -1.7359540313300841352152461287e-04
data8 0xCA389F9E707337B1, 0x00003FF1 //A14 = 9.6426575465763394281615740282e-05
LOCAL_OBJECT_END(_3p25_to_4_data)
//////// "Tail" tables //////////
LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
data8 0x93086CBD21ED3962, 0x00003FCA //A13 = 1.2753071968462837024755878679e-16
data8 0x83CB5045A6D4B419, 0x00003FCF //A12 = 3.6580237062957773626379648530e-15
data8 0x8FCDB723209690EB, 0x0000BFD3 //A11 = -6.3861616307180801527566117146e-14
data8 0xCAA173F680B5D56B, 0x0000BFD7 //A10 = -1.4397775466324880354578008779e-12
data8 0xF0CEA934AD6AC013, 0x00003FDB //A9 = 2.7376616955640415767655526857e-11
data8 0x81C69F9D0B5AB8EE, 0x00003FE0 //A8 = 4.7212187567505249115688961488e-10
data8 0xA8B590298C20A194, 0x0000BFE4 //A7 = -9.8201697105565925460801441797e-09
data8 0x84F3DE72AC964615, 0x0000BFE8 //A6 = -1.2382176987480830706988411266e-07
data8 0xC01A1398868CC4BD, 0x00003FEC //A5 = 2.8625408039722670291121341583e-06
data8 0xCC43247F4410C54A, 0x00003FEF //A4 = 2.4349960762505993017186935493e-05
LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
data8 0x8CEAC59AF361B78A, 0x0000BFD6 //A13 = -5.0063802958258679384986669123e-13
data8 0x9BC67404F348C0CE, 0x00003FDB //A12 = 1.7709590771868743572061278273e-11
data8 0xF4B5D0348AFAAC7A, 0x00003FDB //A11 = 2.7820329729584630464848160970e-11
data8 0x83AB447FF619DA4A, 0x0000BFE2 //A10 = -1.9160363295631539615395477207e-09
data8 0x82115AB487202E7B, 0x00003FE0 //A9 = 4.7318386460142606822119637959e-10
data8 0xB84D5B0AE17054AA, 0x00003FE8 //A8 = 1.7164477188916895004843908951e-07
data8 0xB2E085C1C4AA06E5, 0x0000BFE9 //A7 = -3.3318445266863554512523957574e-07
data8 0xCD3CA2E6C3971666, 0x0000BFEE //A6 = -1.2233070175554502732980949519e-05
data8 0xBA445C53F8DD40E6, 0x00003FF0 //A5 = 4.4409521535330413551781808621e-05
data8 0xAA94D5E68033B764, 0x00003FF4 //A4 = 6.5071635765452563856926608000e-04
LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
LOCAL_OBJECT_START(_0p5_to_1_data_tail)
// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
data8 0x9ED99EDF111CB785, 0x0000BFE4 //A13 = -9.2462916180079278241704711522e-09
data8 0xDEAF7539AE2FB062, 0x0000BFE5 //A12 = -2.5923990465973151101298441139e-08
data8 0xA392D5E5CC9DB1A7, 0x00003FE9 //A11 = 3.0467952847327075747032372101e-07
data8 0xC311A7619B96CA1A, 0x00003FE8 //A10 = 1.8167212632079596881709988649e-07
data8 0x82082E6B6A93F116, 0x0000BFEE //A9 = -7.7505086843257228386931766018e-06
data8 0x96D9997CF326A36D, 0x00003FEE //A8 = 8.9913605625817479172071008270e-06
data8 0x97057D85DCB0ED99, 0x00003FF2 //A7 = 1.4402527482741758767786898553e-04
data8 0xDC23BCB3599C0490, 0x0000BFF3 //A6 = -4.1988296144950673955519083419e-04
data8 0xDA150C4867208A81, 0x0000BFF5 //A5 = -1.6638352864915033417887831090e-03
data8 0x9A4DAF550A2CC29A, 0x00003FF8 //A4 = 9.4179355839141698591817907680e-03
LOCAL_OBJECT_END(_0p5_to_1_data_tail)
LOCAL_OBJECT_START(_1_to_2_data_tail)
// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
data8 0x969EAC5C7B46CAB9, 0x00003FEF //A13 = 1.7955281439310148162059582795e-05
data8 0xA2ED832912E9FCD9, 0x00003FF1 //A12 = 7.7690020847111408916570845775e-05
data8 0x85677C39C48E43E7, 0x0000BFF3 //A11 = -2.5444839340796031538582511806e-04
data8 0xC2DAFA91683DAAE4, 0x0000BFF1 //A10 = -9.2914288456063075386925076097e-05
data8 0xE01C061CBC6A2825, 0x00003FF5 //A9 = 1.7098195515864039518892834211e-03
data8 0x9AD7271CAFD01C78, 0x0000BFF6 //A8 = -2.3626776207372761518718893636e-03
data8 0x9B6B9D30EDD5F4FF, 0x0000BFF7 //A7 = -4.7430532011804570628999212874e-03
data8 0x9E51EB9623F1D446, 0x00003FF9 //A6 = 1.9326171998839772791190405201e-02
data8 0xF391B935C12546DE, 0x0000BFF8 //A5 = -1.4866286152953671441682166195e-02
data8 0xB6AD4AE850DBF526, 0x0000BFFA //A4 = -4.4598858458861014323191919669e-02
LOCAL_OBJECT_END(_1_to_2_data_tail)
LOCAL_OBJECT_START(_2_to_3p25_data_tail)
// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
data8 0x847C24DAC7C7558B, 0x00003FF5 //A13 = 1.0107798565424606512130100541e-03
data8 0xCB6340EAF02C3DF8, 0x00003FF8 //A12 = 1.2413800617425931997420375435e-02
data8 0xB5163D252DBBC107, 0x0000BFF9 //A11 = -2.2105330871844825370020459523e-02
data8 0x82FF9C0B68E331E4, 0x00003FF9 //A10 = 1.5991024756001692140897408128e-02
data8 0xE9519E4A49752E04, 0x00003FF7 //A9 = 7.1203253651891723548763348088e-03
data8 0x8D52F11B7AE846D9, 0x0000BFFA //A8 = -3.4502927613795425888684181521e-02
data8 0xCCC5A3E32BC6FA30, 0x00003FFA //A7 = 4.9993171868423886228679106871e-02
data8 0xC1791AD8284A1919, 0x0000BFFA //A6 = -4.7234635220336795411997070641e-02
data8 0x853DAAA35A8A3C18, 0x00003FFA //A5 = 3.2529512934760303976755163452e-02
data8 0x88E42D8F47FAB60E, 0x0000BFF9 //A4 = -1.6710366233609742619461063050e-02
LOCAL_OBJECT_END(_2_to_3p25_data_tail)
LOCAL_OBJECT_START(_4_to_6p53_data_tail)
// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
data8 0xD8235ABF08B8A6D1, 0x00003FEE //A13 = 1.2882834877224764938429832586e-05
data8 0xAEDF44F9C77844C2, 0x0000BFEC //A12 = -2.6057980393716019511497492890e-06
data8 0xCCD5490956A4FCFD, 0x00003FEA //A11 = 7.6306293047300300284923464089e-07
data8 0xF71AF0126EE26AEA, 0x0000BFE8 //A10 = -2.3013467500738417953513680935e-07
data8 0xE4CE68089858AC20, 0x00003FE6 //A9 = 5.3273112263151109935867439775e-08
data8 0xBD15106FBBAEE593, 0x0000BFE4 //A8 = -1.1006037358336556244645388790e-08
data8 0x8BBF9A5769B6E480, 0x00003FE2 //A7 = 2.0336075804332107927300019116e-09
data8 0xB049D845D105E302, 0x0000BFDF //A6 = -3.2066683399502826067820249320e-10
data8 0xBAC69B3F0DFE5483, 0x00003FDC //A5 = 4.2467901578369360007795282687e-11
data8 0xA29C398F83F8A0D1, 0x0000BFD9 //A4 = -4.6216613698438694005327544047e-12
LOCAL_OBJECT_END(_4_to_6p53_data_tail)
LOCAL_OBJECT_START(_3p25_to_4_data_tail)
// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
data8 0x95BE1BEAD738160F, 0x00003FF2 //A13 = 1.4280568455209843005829620687e-04
data8 0x8108C8FFAC0F0B21, 0x0000BFF4 //A12 = -4.9222685622046459346377033307e-04
data8 0xD72A7FAEE7832BBE, 0x00003FF4 //A11 = 8.2079319302109644436194651098e-04
data8 0x823AB4281CA7BBE7, 0x0000BFF5 //A10 = -9.9357079675971109178261577703e-04
data8 0xFA1232D476048D11, 0x00003FF4 //A9 = 9.5394549599882496825916138915e-04
data8 0xC463D7AF88025FB2, 0x0000BFF4 //A8 = -7.4916843357898101689031755368e-04
data8 0xFEBE32B6B379D072, 0x00003FF3 //A7 = 4.8588363901002111193445057206e-04
data8 0x882829BB68409BF3, 0x0000BFF3 //A6 = -2.5969865184916169002074135516e-04
data8 0xED2F886E29DAAB09, 0x00003FF1 //A5 = 1.1309894347742479284610149994e-04
data8 0xA4C07129436555B2, 0x0000BFF0 //A4 = -3.9279872584973887163830479579e-05
LOCAL_OBJECT_END(_3p25_to_4_data_tail)
LOCAL_OBJECT_START(_0_to_1o8_data)
// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.125
data8 0x3FF20DD750429B6D, 0x3C71AE3A8DDFFEDE //A1H, A1L
data8 0xF8B0DACE42525CC2, 0x0000BFEE //A15
data8 0xFCD02E1BF0EC2C37, 0x00003FF1 //A13
data8 0xE016D968FE473B5E, 0x0000BFF4 //A11
data8 0xAB2DE68711BF5A79, 0x00003FF7 //A9
data8 0xDC16718944518309, 0x0000BFF9 //A7
data8 0xE71790D0215F0C8F, 0x00003FFB //A5
data8 0xC093A3581BCF3612, 0x0000BFFD //A3
LOCAL_OBJECT_END(_0_to_1o8_data)
LOCAL_OBJECT_START(_denorm_data)
data8 0x3FF20DD750429B6D //A1H = 1.1283791670955125585606992900e+00
data8 0x3C71AE3A914FED80 //A1L = 1.5335459613165880745599768129e-17
LOCAL_OBJECT_END(_denorm_data)
.section .text
GLOBAL_LIBM_ENTRY(erfl)
{ .mfi
alloc r32 = ar.pfs, 0, 21, 0, 0
fmerge.se fArgAbsNorm = f1, f8 // normalized x (1.0 <= x < 2.0)
addl rSignBit = 0x20000, r0 // Set sign bit for exponent
}
{ .mlx
addl rDataPtr = @ltoff(erfl_data), gp // Get common data ptr
movl r1p5 = 0x3FF8000000000000 // 1.5 in dbl repres.
};;
{ .mfi
getf.exp rArgExp = f8 // Get arg exponent
fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
// 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
addl rBias = 0xfffc, r0 // Value to subtract from exp
// to get actual interval number
}
{ .mfi
ld8 rDataPtr = [rDataPtr] // Get real common data pointer
fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path)
addl r2to4 = 0x10000, r0 // unbiased exponent
// for [2;4] binary interval
};;
{ .mfi
getf.sig rArgSig = f8 // Get arg significand
fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive?
addl rSaturation = 0xd0e, r0 // First 12 bits of
// saturation value signif.
}
{ .mfi
setf.d f1p5 = r1p5 // 1.5 construction
fma.s1 f2p0 = f1,f1,f1 // 2.0 construction
addl r3p25Sign = 0xd00, r0 // First 12 bits of
// 3.25 value signif.
};;
{ .mfi
addl rTailDataPtr = 0x700, rDataPtr // Pointer to "tail" data
nop.f 0
andcm rArgExp = rArgExp, rSignBit // Remove sign of exp
}
{ .mfb
addl rTiny = 0xf000, r0 // Tiny value for saturation path
nop.f 0
(p6) br.cond.spnt erfl_spec // Branch to zero, denorm & specs
};;
{ .mfi
sub rInterval = rArgExp, rBias // Get actual interval number
nop.f 0
shr.u rArgSig = rArgSig, 52 // Leave only 12 bits of sign.
}
{ .mfi
adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
nop.f 0
cmp.eq p8, p10 = r2to4, rArgExp // If exp is in 2to4 interval?
};;
{ .mfi
(p8) cmp.le p8, p10 = r3p25Sign, rArgSig // If sign. is greater
// than 1.25? (means arg is in [3.25;4] interval)
nop.f 0
shl rOffset = rInterval, 8 // Make offset from
// interval number
}
{ .mfi
cmp.gt p9, p0 = 0x0, rInterval // If interval is less than 0
// (means arg is in [0; 1/8])
nop.f 0
cmp.eq p7, p0 = 0x5, rInterval // If arg is in [4:8] interv.?
};;
{ .mfi
(p8) adds rOffset = 0x200, rOffset // Add additional offset
// if arg is in [3.25;4] (another data set)
fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path)
shl rTailOffset = rInterval, 7 // Make offset to "tail" data
// from interval number
}
{ .mib
setf.exp fTiny = rTiny // Construct "tiny" value
// for saturation path
cmp.ltu p11, p0 = 0x5, rInterval // if arg > 8
(p9) br.cond.spnt _0_to_1o8
};;
{ .mfi
add rAddr1 = rDataPtr, rOffset // Get address for
// interval data
nop.f 0
shl rTailAddOffset = rInterval, 5 // Offset to interval
// "tail" data
}
{ .mib
add rAddr2 = rShiftedDataPtr, rOffset // Get second
// address for interval data
(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
// in [6.53;8] interval
(p11) br.cond.spnt _saturation // Branch to Saturation path
};;
{ .mmi
ldfe fA3 = [rAddr1], 0x90 // Load A3
ldfpd fA2H, fA2L = [rAddr2], 16 // Load A2High, A2Low
add rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset
};;
{ .mmi
ldfe fA20 = [rAddr1], 16 // Load A20
ldfpd fA1H, fA1L = [rAddr2], 16 // Load A1High, A1Low
(p8) adds rTailOffset = 0x140, rTailOffset // Additional offset
// for [3.24;4] interval
};;
{ .mmi
ldfe fA19 = [rAddr1], 16 // Load A19
ldfpd fA0H, fA0L = [rAddr2], 16 // Load A0High, A0Low
add rTailAddr1 = rTailDataPtr, rTailOffset // First tail
// data address
};;
.pred.rel "mutex",p8,p10
{ .mfi
ldfe fA18 = [rAddr1], 16 // Load A18
(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0
// to normalized arg (for [3.24;4] interval)
adds rTailAddr2 = 0x10, rTailAddr1 // First tail
// data address
}
{ .mfi
ldfe fA25 = [rAddr2], 16 // Load A25
(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Add 1.5
// to normalized arg
nop.i 0
};;
{ .mmi
ldfe fA17 = [rAddr1], 16 // Load A17
ldfe fA24 = [rAddr2], 16 // Load A24
nop.i 0
};;
{ .mmi
ldfe fA16 = [rAddr1], 16 // Load A16
ldfe fA23 = [rAddr2], 16 // Load A23
nop.i 0
};;
{ .mmi
ldfe fA15 = [rAddr1], 16 // Load A15
ldfe fA22 = [rAddr2], 16 // Load A22
nop.i 0
};;
{ .mmi
ldfe fA14 = [rAddr1], 16 // Load A14
ldfe fA21 = [rAddr2], 16 // Load A21
nop.i 0
};;
{ .mfi
ldfe fA13 = [rTailAddr1], 32 // Load A13
fms.s1 fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2
nop.i 0
}
{ .mfi
ldfe fA12 = [rTailAddr2], 32 // Load A12
nop.f 0
nop.i 0
};;
{ .mfi
ldfe fA11 = [rTailAddr1], 32 // Load A11
fma.s1 fRes3H = fA3, fArgAbsNorm, fA2H // (A3*x+A2)*x^2
nop.i 0
}
{ .mfi
ldfe fA10 = [rTailAddr2], 32 // Load A10
fma.s1 fTH = fA3, fArgAbsNorm, f0 // (A3*x+A2)*x^2
nop.i 0
};;
{ .mfi
ldfe fA9 = [rTailAddr1], 32 // Load A9
fma.s1 fTT2 = fA1L, fArgAbsNorm, f0 // A1*x+A0
nop.i 0
}
{ .mfi
ldfe fA8 = [rTailAddr2], 32 // Load A8
nop.f 0
nop.i 0
};;
{ .mmi
ldfe fA7 = [rTailAddr1], 32 // Load A7
ldfe fA6 = [rTailAddr2], 32 // Load A6
nop.i 0
};;
{ .mmi
ldfe fA5 = [rTailAddr1], 32 // Load A5
ldfe fA4 = [rTailAddr2], 32 // Load A4
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2
// Low part of x^2 (delta)
nop.i 0
}
{ .mfi
nop.m 0
fms.s1 fArgAbsNorm4 = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fRes3L = fA2H, f1, fRes3H // // (A3*x+A2)*x^2
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fTH2 = fA1H, fArgAbsNorm, fTT2 // A1*x+A0
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA12 = fA13, fArgAbsNorm, fA12 // Polynomial tail
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 fRes3L = fRes3L, f1, fTH // (A3*x+A2)*x^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes1H = fTH2, f1, fA0H // A1*x+A0
nop.i 0
}
{ .mfi
nop.m 0
fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1*x+A0
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA15 = fA16, fArgAbsNorm, fA15 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA17 = fA18, fArgAbsNorm, fA17 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes3L = fRes3L, f1, fA2L // (A3*x+A2)*x^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fTL2 = fTL2, f1, fTT2 // A1*x+A0
nop.i 0
}
{ .mfi
nop.m 0
fms.s1 fRes1L = fA0H, f1, fRes1H // A1*x+A0
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes1L = fRes1L, f1, fTH2 // A1*x+A0
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fTT = fRes3H, fArgAbsNorm2L, fTT // (A3*x+A2)*x^2
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes1L = fRes1L, f1, fTL2 // A1*x+A0
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA15 = fA19, fArgAbsNorm4, fA15 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA4 = fA8, fArgAbsNorm4, fA4 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes2H = fRes3H, fArgAbsNorm2, fTT // (A3*x+A2)*x^2
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes1L = fRes1L, f1, fA0L // A1*x+A0
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of
// polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fResH = fRes2H, f1, fRes1H // High result
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes1L = fRes4, fArgAbsNorm4, fRes1L // A1*x+A0
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes2L = fRes2L, f1, fTT // (A3*x+A2)*x^2
nop.i 0
}
{ .mfi
nop.m 0
fms.s1 fResL = fRes1H, f1, fResH // Low result
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes1L = fRes1L, f1, fRes2L // Low result
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fResL = fResL, f1, fRes2H // Low result
nop.i 0
};;
{ .mfi
nop.m 0
(p15) fneg fResH = fResH // Invert high result if arg is neg.
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fResL = fResL, f1, fRes1L // Low result
nop.i 0
};;
.pred.rel "mutex",p14,p15
{ .mfi
nop.m 0
(p14) fma.s0 f8 = fResH, f1, fResL // Add high and low results
nop.i 0
}
{ .mfb
nop.m 0
(p15) fms.s0 f8 = fResH, f1, fResL // Add high and low results
br.ret.sptk b0 // Main path return
};;
// satiration path ////////////////////////////////////////////////////////////
_saturation:
.pred.rel "mutex",p14,p15
{ .mfi
nop.m 0
(p14) fms.s0 f8 = f1, f1, fTiny // Saturation result r = 1-tiny
nop.i 0
};;
{ .mfb
nop.m 0
(p15) fnma.s0 f8 = f1, f1, fTiny // Saturation result r = tiny-1
br.ret.sptk b0 // Saturation path return
};;
// 0, denormals and special IEEE numbers path /////////////////////////////////
erfl_spec:
{ .mfi
addl rDataPtr = 0xBE0, rDataPtr // Ptr to denormals coeffs
fclass.m p6,p0 = f8, 0x23 // To filter infinities
// 0x23 = @pos|@neg|@inf
nop.i 0
};;
{ .mfi
ldfpd fA1H, fA1L = [rDataPtr] // Load denormals coeffs A1H, A1L
fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
// 0xC7 = @pos|@neg|@zero|@qnan|@snan
nop.i 0
};;
{ .mfb
nop.m 0
(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
(p6) br.ret.spnt b0 // exit for x = INF
};;
{ .mfb
nop.m 0
(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args
// and NaNs for NaNs
(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
};;
{ .mfi
nop.m 0
fnorm.s0 f8 = f8 // Normalize arg
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fRes1H = f8, fA1H, f0 // HighRes
nop.i 0
}
{ .mfi
nop.m 0
fms.s1 fRes1L = f8, fA1L, f0 // LowRes
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fRes1Hd = f8, fA1H, fRes1H // HighRes delta
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes = fRes1L, f1, fRes1Hd // LowRes+HighRes delta
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes = f8, f8, fRes // r=x^2+r
nop.i 0
};;
{ .mfb
nop.m 0
fma.s0 f8 = fRes, f1, fRes1H // res = r+ResHigh
br.ret.sptk b0 // 0, denormals, specials return
};;
// 0 < |x| < 1/8 path /////////////////////////////////////////////////////////
_0_to_1o8:
{ .mmi
adds rAddr1 = 0xB60, rDataPtr // Ptr 1 to coeffs
adds rAddr2 = 0xB80, rDataPtr // Ptr 2 to coeffs
nop.i 0
};;
{ .mmi
ldfpd fA1H, fA1L = [rAddr1], 16 // Load A1High, A1Low
ldfe fA13 = [rAddr2], 16 // Load A13
nop.i 0
};;
{ .mmi
ldfe fA15 = [rAddr1], 48 // Load A15
ldfe fA11 = [rAddr2], 32 // Load A11
nop.i 0
};;
{ .mmi
ldfe fA9 = [rAddr1], 32 // Load A9
ldfe fA7 = [rAddr2], 32 // Load A7
nop.i 0
};;
{ .mmi
ldfe fA5 = [rAddr1] // Load A5
ldfe fA3 = [rAddr2] // Load A3
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fRes1H = f8, fA1H, f0 // x*(A1H+A1L)
nop.i 0
}
{ .mfi
nop.m 0
fms.s1 fRes1L = f8, fA1L, f0 // x*(A1H+A1L)
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // a^4
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fms.s1 fRes1Hd = f8, fA1H, fRes1H // x*(A1H+A1L) delta
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 fArgEight = fArgFour, fArgFour, f0 // a^8
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 f8 = fRes1L, f1, fRes1Hd // x*(A1H+A1L)
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 f8 = fRes, fArgCube, f8 // (Polynomial tail)*x^3
nop.i 0
};;
{ .mfb
nop.m 0
fma.s0 f8 = f8, f1, fRes1H // (Polynomial tail)*x^3 +
// + x*(A1H+A1L)
br.ret.sptk b0 // [0;1/8] interval return
};;
GLOBAL_LIBM_END(erfl)
|