| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 
 | /*
 * Copyright (c) 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <clc/clc.h>
#include "math.h"
#include "tables.h"
#include "sincos_helpers.h"
#define bitalign(hi, lo, shift) \
  ((hi) << (32 - (shift))) | ((lo) >> (shift));
#define bytealign(src0, src1, src2) \
  ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
_CLC_DEF float __clc_sinf_piby4(float x, float y) {
    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
    // = x * f(w)
    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
    // We use a minimax approximation of (f(w) - 1) / w
    // because this produces an expansion in even powers of x.
    const float c1 = -0.1666666666e0f;
    const float c2 = 0.8333331876e-2f;
    const float c3 = -0.198400874e-3f;
    const float c4 = 0.272500015e-5f;
    const float c5 = -2.5050759689e-08f; // 0xb2d72f34
    const float c6 = 1.5896910177e-10f;	 // 0x2f2ec9d3
    float z = x * x;
    float v = z * x;
    float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2);
    float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y));
    return ret;
}
_CLC_DEF float __clc_cosf_piby4(float x, float y) {
    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
    // = f(w)
    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
    // because this produces an expansion in even powers of x.
    const float c1 = 0.416666666e-1f;
    const float c2 = -0.138888876e-2f;
    const float c3 = 0.248006008e-4f;
    const float c4 = -0.2730101334e-6f;
    const float c5 = 2.0875723372e-09f;	 // 0x310f74f6
    const float c6 = -1.1359647598e-11f; // 0xad47d74e
    float z = x * x;
    float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6,  c5), c4), c3), c2), c1);
    // if |x| < 0.3
    float qx = 0.0f;
    int ix = as_int(x) & EXSIGNBIT_SP32;
    //  0.78125 > |x| >= 0.3
    float xby4 = as_float(ix - 0x01000000);
    qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
    // x > 0.78125
    qx = ix > 0x3f480000 ? 0.28125f : qx;
    float hz = mad(z, 0.5f, -qx);
    float a = 1.0f - qx;
    float ret = a - (hz - mad(z, r, -x*y));
    return ret;
}
_CLC_DEF float __clc_tanf_piby4(float x, int regn)
{
    // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
    float r = x * x;
    float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f);
    float b = mad(r,
	          mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f),
	          1.15588821434688393452299f);
    float t = mad(x*r, native_divide(a, b), x);
    float tr = -MATH_RECIP(t);
    return regn & 1 ? tr : t;
}
_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt)
{
    if (HAVE_HW_FMA32()) {
        float ph = a * b;
        *hi = ph;
        *lo = fma(a, b, -ph);
    } else {
        float ah = as_float(as_uint(a) & 0xfffff000U);
        float at = a - ah;
        float ph = a * b;
        float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph))));
        *hi = ph;
        *lo = pt;
    }
}
_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x)
{
    // 72 bits of pi/2
    const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f;
    const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f;
    const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f;
    const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f;
    const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f;
    const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f;
    const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f;
    const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f;
    const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f;
    const float twobypi = 0x1.45f306p-1f;
    float fnpi2 = trunc(mad(x, twobypi, 0.5f));
    // subtract n * pi/2 from x
    float rhead, rtail;
    __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
    float v = x - rhead;
    float rem = v + (((x - v) - rhead) - rtail);
    float rhead2, rtail2;
    __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
    v = rem - rhead2;
    rem = v + (((rem - v) - rhead2) - rtail2);
    float rhead3, rtail3;
    __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
    v = rem - rhead3;
    *hi = v + ((rem - v) - rhead3);
    *lo = -rtail3;
    return fnpi2;
}
_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x)
{
    float fnpi2 = __clc_removePi2S(r, rr, x);
    return (int)fnpi2 & 0x3;
}
#define FULL_MUL(A, B, HI, LO) \
    LO = A * B; \
    HI = mul_hi(A, B)
#define FULL_MAD(A, B, C, HI, LO) \
    LO = ((A) * (B) + (C)); \
    HI = mul_hi(A, B); \
    HI += LO < C
_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
{
    int xe = (int)(as_uint(x) >> 23) - 127;
    uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
    // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
    const uint b6 = 0xA2F9836EU;
    const uint b5 = 0x4E441529U;
    const uint b4 = 0xFC2757D1U;
    const uint b3 = 0xF534DDC0U;
    const uint b2 = 0xDB629599U;
    const uint b1 = 0x3C439041U;
    const uint b0 = 0xFE5163ABU;
    uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
    FULL_MUL(xm, b0, c0, p0);
    FULL_MAD(xm, b1, c0, c1, p1);
    FULL_MAD(xm, b2, c1, c0, p2);
    FULL_MAD(xm, b3, c0, c1, p3);
    FULL_MAD(xm, b4, c1, c0, p4);
    FULL_MAD(xm, b5, c0, c1, p5);
    FULL_MAD(xm, b6, c1, p7, p6);
    uint fbits = 224 + 23 - xe;
    // shift amount to get 2 lsb of integer part at top 2 bits
    //   min: 25 (xe=18) max: 134 (xe=127)
    uint shift = 256U - 2 - fbits;
    // Shift by up to 134/32 = 4 words
    int c = shift > 31;
    p7 = c ? p6 : p7;
    p6 = c ? p5 : p6;
    p5 = c ? p4 : p5;
    p4 = c ? p3 : p4;
    p3 = c ? p2 : p3;
    p2 = c ? p1 : p2;
    p1 = c ? p0 : p1;
    shift -= (-c) & 32;
    c = shift > 31;
    p7 = c ? p6 : p7;
    p6 = c ? p5 : p6;
    p5 = c ? p4 : p5;
    p4 = c ? p3 : p4;
    p3 = c ? p2 : p3;
    p2 = c ? p1 : p2;
    shift -= (-c) & 32;
    c = shift > 31;
    p7 = c ? p6 : p7;
    p6 = c ? p5 : p6;
    p5 = c ? p4 : p5;
    p4 = c ? p3 : p4;
    p3 = c ? p2 : p3;
    shift -= (-c) & 32;
    c = shift > 31;
    p7 = c ? p6 : p7;
    p6 = c ? p5 : p6;
    p5 = c ? p4 : p5;
    p4 = c ? p3 : p4;
    shift -= (-c) & 32;
    // bitalign cannot handle a shift of 32
    c = shift > 0;
    shift = 32 - shift;
    uint t7 = bitalign(p7, p6, shift);
    uint t6 = bitalign(p6, p5, shift);
    uint t5 = bitalign(p5, p4, shift);
    p7 = c ? t7 : p7;
    p6 = c ? t6 : p6;
    p5 = c ? t5 : p5;
    // Get 2 lsb of int part and msb of fraction
    int i = p7 >> 29;
    // Scoot up 2 more bits so only fraction remains
    p7 = bitalign(p7, p6, 30);
    p6 = bitalign(p6, p5, 30);
    p5 = bitalign(p5, p4, 30);
    // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
    uint flip = i & 1 ? 0xffffffffU : 0U;
    uint sign = i & 1 ? 0x80000000U : 0U;
    p7 = p7 ^ flip;
    p6 = p6 ^ flip;
    p5 = p5 ^ flip;
    // Find exponent and shift away leading zeroes and hidden bit
    xe = clz(p7) + 1;
    shift = 32 - xe;
    p7 = bitalign(p7, p6, shift);
    p6 = bitalign(p6, p5, shift);
    // Most significant part of fraction
    float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
    // Shift out bits we captured on q1
    p7 = bitalign(p7, p6, 32-23);
    // Get 24 more bits of fraction in another float, there are not long strings of zeroes here
    int xxe = clz(p7) + 1;
    p7 = bitalign(p7, p6, 32-xxe);
    float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
    // At this point, the fraction q1 + q0 is correct to at least 48 bits
    // Now we need to multiply the fraction by pi/2
    // This loses us about 4 bits
    // pi/2 = C90 FDA A22 168 C23 4C4
    const float pio2h = (float)0xc90fda / 0x1.0p+23f;
    const float pio2hh = (float)0xc90 / 0x1.0p+11f;
    const float pio2ht = (float)0xfda / 0x1.0p+23f;
    const float pio2t = (float)0xa22168 / 0x1.0p+47f;
    float rh, rt;
    if (HAVE_HW_FMA32()) {
        rh = q1 * pio2h;
        rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
    } else {
        float q1h = as_float(as_uint(q1) & 0xfffff000);
        float q1t = q1 - q1h;
        rh = q1 * pio2h;
        rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh))));
        rt = mad(q0, pio2h, mad(q1, pio2t, rt));
    }
    float t = rh + rt;
    rt = rt - (t - rh);
    *r = t;
    *rr = rt;
    return ((i >> 1) + (i & 1)) & 0x3;
}
_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
{
    if (x < 0x1.0p+23f)
        return __clc_argReductionSmallS(r, rr, x);
    else
        return __clc_argReductionLargeS(r, rr, x);
}
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Reduction for medium sized arguments
_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
    // How many pi/2 is x a multiple of?
    const double two_by_pi = 0x1.45f306dc9c883p-1;
    double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
    const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
    const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
    const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
    // Compute product of npi2 with 159 bits of 2/pi
    double p_hh = piby2_h * dnpi2;
    double p_ht = fma(piby2_h, dnpi2, -p_hh);
    double p_mh = piby2_m * dnpi2;
    double p_mt = fma(piby2_m, dnpi2, -p_mh);
    double p_th = piby2_t * dnpi2;
    double p_tt = fma(piby2_t, dnpi2, -p_th);
    // Reduce to 159 bits
    double ph = p_hh;
    double pm = p_ht + p_mh;
    double t = p_mh - (pm - p_ht);
    double pt = p_th + t + p_mt + p_tt;
    t = ph + pm; pm = pm - (t - ph); ph = t;
    t = pm + pt; pt = pt - (t - pm); pm = t;
    // Subtract from x
    t = x + ph;
    double qh = t + pm;
    double qt = pm - (qh - t) + pt;
    *r = qh;
    *rr = qt;
    *regn = (int)(long)dnpi2 & 0x3;
}
// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
// extra precision, and return the result in r, rr.
// Return value "regn" tells how many lots of pi/2 were subtracted
// from x to put it in the range [-pi/4,pi/4], mod 4.
_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
    long ux = as_long(x);
    int e = (int)(ux >> 52) -  1023;
    int i = max(23, (e >> 3) + 17);
    int j = 150 - i;
    int j16 = j & ~0xf;
    double fract_temp;
    // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
    uint4 q0 = USE_TABLE(pibits_tbl, j16);
    uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
    uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
    int k = (j >> 2) & 0x3;
    int4 c = (int4)k == (int4)(0, 1, 2, 3);
    uint u0, u1, u2, u3, u4, u5, u6;
    u0 = c.s1 ? q0.s1 : q0.s0;
    u0 = c.s2 ? q0.s2 : u0;
    u0 = c.s3 ? q0.s3 : u0;
    u1 = c.s1 ? q0.s2 : q0.s1;
    u1 = c.s2 ? q0.s3 : u1;
    u1 = c.s3 ? q1.s0 : u1;
    u2 = c.s1 ? q0.s3 : q0.s2;
    u2 = c.s2 ? q1.s0 : u2;
    u2 = c.s3 ? q1.s1 : u2;
    u3 = c.s1 ? q1.s0 : q0.s3;
    u3 = c.s2 ? q1.s1 : u3;
    u3 = c.s3 ? q1.s2 : u3;
    u4 = c.s1 ? q1.s1 : q1.s0;
    u4 = c.s2 ? q1.s2 : u4;
    u4 = c.s3 ? q1.s3 : u4;
    u5 = c.s1 ? q1.s2 : q1.s1;
    u5 = c.s2 ? q1.s3 : u5;
    u5 = c.s3 ? q2.s0 : u5;
    u6 = c.s1 ? q1.s3 : q1.s2;
    u6 = c.s2 ? q2.s0 : u6;
    u6 = c.s3 ? q2.s1 : u6;
    uint v0 = bytealign(u1, u0, j);
    uint v1 = bytealign(u2, u1, j);
    uint v2 = bytealign(u3, u2, j);
    uint v3 = bytealign(u4, u3, j);
    uint v4 = bytealign(u5, u4, j);
    uint v5 = bytealign(u6, u5, j);
    // Place those 192 bits in 4 48-bit doubles along with correct exponent
    // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
    i = 2 + 8*i;
    x *= i > 1018 ? 0x1.0p-136 : 1.0;
    i -= i > 1018 ? 136 : 0;
    uint ua = (uint)(1023 + 52 - i) << 20;
    double a = as_double((uint2)(0, ua));
    double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
    ua += 0x03000000U;
    a = as_double((uint2)(0, ua));
    double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
    ua += 0x03000000U;
    a = as_double((uint2)(0, ua));
    double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
    ua += 0x03000000U;
    a = as_double((uint2)(0, ua));
    double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
    // Exact multiply
    double f0h = p0 * x;
    double f0l = fma(p0, x, -f0h);
    double f1h = p1 * x;
    double f1l = fma(p1, x, -f1h);
    double f2h = p2 * x;
    double f2l = fma(p2, x, -f2h);
    double f3h = p3 * x;
    double f3l = fma(p3, x, -f3h);
    // Accumulate product into 4 doubles
    double s, t;
    double f3 = f3h + f2h;
    t = f2h - (f3 - f3h);
    s = f3l + t;
    t = t - (s - f3l);
    double f2 = s + f1h;
    t = f1h - (f2 - s) + t;
    s = f2l + t;
    t = t - (s - f2l);
    double f1 = s + f0h;
    t = f0h - (f1 - s) + t;
    s = f1l + t;
    double f0 = s + f0l;
    // Strip off unwanted large integer bits
    f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
    f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
    // Compute least significant integer bits
    t = f3 + f2;
    double di = t - fract(t, &fract_temp);
    i = (float)di;
    // Shift out remaining integer part
    f3 -= di;
    s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
    s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
    f1 += f0;
    // Subtract 1 if fraction is >= 0.5, and update regn
    int g = f3 >= 0.5;
    i += g;
    f3 -= (float)g;
    // Shift up bits
    s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
    // Multiply precise fraction by pi/2 to get radians
    const double p2h = 7074237752028440.0 / 0x1.0p+52;
    const double p2t = 4967757600021510.0 / 0x1.0p+106;
    double rhi = f3 * p2h;
    double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
    *r = rhi + rlo;
    *rr = rlo - (*r - rhi);
    *regn = i & 0x3;
}
_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
    //                      = x * f(w)
    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
    // We use a minimax approximation of (f(w) - 1) / w
    // because this produces an expansion in even powers of x.
    // If xx (the tail of x) is non-zero, we add a correction
    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
    // is an approximation to cos(x)*sin(xx) valid because
    // xx is tiny relative to x.
    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
    //                      = f(w)
    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
    // because this produces an expansion in even powers of x.
    // If xx (the tail of x) is non-zero, we subtract a correction
    // term g(x,xx) = x*xx to the result, where g(x,xx)
    // is an approximation to sin(x)*sin(xx) valid because
    // xx is tiny relative to x.
    const double sc1 = -0.166666666666666646259241729;
    const double sc2 =  0.833333333333095043065222816e-2;
    const double sc3 = -0.19841269836761125688538679e-3;
    const double sc4 =  0.275573161037288022676895908448e-5;
    const double sc5 = -0.25051132068021699772257377197e-7;
    const double sc6 =  0.159181443044859136852668200e-9;
    const double cc1 =  0.41666666666666665390037e-1;
    const double cc2 = -0.13888888888887398280412e-2;
    const double cc3 =  0.248015872987670414957399e-4;
    const double cc4 = -0.275573172723441909470836e-6;
    const double cc5 =  0.208761463822329611076335e-8;
    const double cc6 = -0.113826398067944859590880e-10;
    double x2 = x * x;
    double x3 = x2 * x;
    double r = 0.5 * x2;
    double t = 1.0 - r;
    double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
    double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
                        x2*x2, fma(x, xx, (1.0 - t) - r));
    double2 ret;
    ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
    ret.hi = cp;
    return ret;
}
#endif
 |