File: expm1.cl

package info (click to toggle)
llvm-toolchain-15 1%3A15.0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,554,644 kB
  • sloc: cpp: 5,922,452; ansic: 1,012,136; asm: 674,362; python: 191,568; objc: 73,855; f90: 42,327; lisp: 31,913; pascal: 11,973; javascript: 10,144; sh: 9,421; perl: 7,447; ml: 5,527; awk: 3,523; makefile: 2,520; xml: 885; cs: 573; fortran: 567
file content (142 lines) | stat: -rw-r--r-- 4,606 bytes parent folder | download | duplicates (18)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <clc/clc.h>

#include "math.h"
#include "tables.h"
#include "../clcmacro.h"

/* Refer to the exp routine for the underlying algorithm */

_CLC_OVERLOAD _CLC_DEF float expm1(float x) {
    const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673
    const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184

    const float R_64_BY_LOG2 = 0x1.715476p+6f;     // 64/log2 : 92.332482616893657
    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388

    uint xi = as_uint(x);
    int n = (int)(x * R_64_BY_LOG2);
    float fn = (float)n;

    int j = n & 0x3f;
    int m = n >> 6;

    float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x));

    // Truncated Taylor series
    float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f,  0x1.555556p-3f), 0.5f), r);

    float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
    float2 tv = USE_TABLE(exp_tbl_ep, j);

    float two_to_jby64_h = tv.s0 * m2;
    float two_to_jby64_t = tv.s1 * m2;
    float two_to_jby64 = two_to_jby64_h + two_to_jby64_t;

    z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f);
	//Make subnormals work
    z2 = x == 0.f ? x : z2;
    z2 = x < X_MIN | m < -24 ? -1.0f : z2;
    z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2;
    z2 = isnan(x) ? x : z2;

    return z2;
}

_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, expm1, float)

#ifdef cl_khr_fp64

#include "exp_helper.h"

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

_CLC_OVERLOAD _CLC_DEF double expm1(double x) {
    const double max_expm1_arg = 709.8;
    const double min_expm1_arg = -37.42994775023704;
    const double log_OnePlus_OneByFour = 0.22314355131420976;   //0x3FCC8FF7C79A9A22 = log(1+1/4)
    const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4)
    const double sixtyfour_by_lnof2 = 92.33248261689366;        //0x40571547652b82fe
    const double lnof2_by_64_head = 0.010830424696223417;       //0x3f862e42fefa0000
    const double lnof2_by_64_tail = 2.5728046223276688e-14;     //0x3d1cf79abc9e3b39

    // First, assume log(1-1/4) < x < log(1+1/4) i.e  -0.28768 < x < 0.22314
    double u = as_double(as_ulong(x) & 0xffffffffff000000UL);
    double v = x - u;
    double y = u * u * 0.5;
    double z = v * (x + u) * 0.5;

    double q = fma(x,
	           fma(x,
		       fma(x,
			   fma(x,
			       fma(x,
				   fma(x,
				       fma(x,
					   fma(x,2.4360682937111612e-8, 2.7582184028154370e-7),
					   2.7558212415361945e-6),
				       2.4801576918453420e-5),
				   1.9841269447671544e-4),
			       1.3888888890687830e-3),
			   8.3333333334012270e-3),
		       4.1666666666665560e-2),
		   1.6666666666666632e-1);
    q *= x * x * x;

    double z1g = (u + y) + (q + (v + z));
    double z1 = x + (y + (q + z));
    z1 = y >= 0x1.0p-7 ? z1g : z1;

    // Now assume outside interval around 0
    int n = (int)(x * sixtyfour_by_lnof2);
    int j = n & 0x3f;
    int m = n >> 6;

    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
    double f1 = tv.s0;
    double f2 = tv.s1;
    double f = f1 + f2;

    double dn = -n;
    double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x));

    q = fma(r,
	    fma(r,
		fma(r,
		    fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
		    4.16666666662260795726e-02),
		1.66666666665260878863e-01),
	     5.00000000000000008883e-01);
    q = fma(r*r, q, r);

    double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64);
    double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64);

    // Computations for m > 52, including where result is close to Inf
    ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2))));
    int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1;

    double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64));
    zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024;

    double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm));
    zmg52 = m == 1024 ? zme1024 : zmg52;

    // For m < 53
    double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q)));

    // For m < -7
    double zmln7 = fma(twopm,  f1 + fma(f, q, f2), -1.0);

    z = m < 53 ? zml53 : zmg52;
    z = m < -7 ? zmln7 : z;
    z = x > log_OneMinus_OneByFour & x < log_OnePlus_OneByFour ? z1 : z;
    z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z;
    z = x < min_expm1_arg ? -1.0 : z;

    return z;
}

_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double)

#endif