1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
|
/*========================== begin_copyright_notice ============================
Copyright (C) 2022 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include <cm-cl/math.h>
#include <cm-cl/vector.h>
using namespace cm;
namespace {
// We have to use 32-bit integers when it's possible
constexpr unsigned exp_shift = 52 - 32;
constexpr unsigned exp_mask = 0x7ff;
constexpr unsigned nan_hi = 0x7ff80000;
constexpr unsigned inf_hi = 0x7ff00000;
template <bool NNaN, bool NInf, bool NSZ, int N>
CM_NODEBUG CM_INLINE vector<double, N>
__impl_fsqrt_special(vector<double, N> x) {
vector<uint32_t, N * 2> result = 0;
auto result_lo = result.template select<N, 2>(0);
auto result_hi = result.template select<N, 2>(1);
auto xi = x.template format<uint32_t>();
vector<uint32_t, N> x_lo = xi.template select<N, 2>(0);
vector<uint32_t, N> x_hi = xi.template select<N, 2>(1);
vector<uint32_t, N> exp = (x_hi >> exp_shift) & exp_mask;
vector<uint32_t, N> sign = x_hi & (1u << 31);
mask<N> is_pinf = (x_hi == inf_hi) & (x_lo == 0);
mask<N> is_nan = (is_pinf == 0) & (exp == exp_mask);
if constexpr (!NInf)
result_hi.merge(inf_hi, is_pinf);
if constexpr (!NNaN)
result_hi.merge(nan_hi, is_nan | (sign != 0));
if constexpr (NSZ)
result_hi.merge(vector<uint32_t, N>(0), x == 0.0);
else
result_hi.merge(sign, x == 0.0);
return result.template format<double>();
}
template <int N>
CM_NODEBUG CM_NOINLINE cl_vector<double, N * 3>
__impl_fsqrt_ieee_steps__rte_(cl_vector<double, N> vx) {
vector<double, N> x = vx;
// Should be mapped to math.rsqt
vector<float, N> xf = x;
vector<float, N> y0f = detail::__cm_cl_rsqrt(xf.cl_vector());
vector<double, N> y0 = y0f;
vector<double, N * 3> result;
auto s1 = result.template select<N, 1>(0 * N);
auto h1 = result.template select<N, 1>(1 * N);
auto d1 = result.template select<N, 1>(2 * N);
auto h0 = 0.5 * y0;
auto s0 = x * y0;
auto d = math::mad(s0, -h0, vector<double, N>(0.5));
auto e = math::mad(d, vector<double, N>(1.5), vector<double, N>(1.0));
e *= d;
s1 = math::mad(s0, e, s0);
h1 = math::mad(h0, e, h0);
vector<double, N> vs1 = s1;
d1 = math::mad(vs1, -vs1, x);
return result.cl_vector();
}
template <bool IEEE, bool NNaN, bool NInf, bool NSZ, int N>
CM_NODEBUG CM_INLINE vector<double, N> __impl_fsqrt(vector<double, N> x) {
auto xi = x.template format<uint32_t>();
vector<uint32_t, N> x_hi = xi.template select<N, 2>(1);
vector<uint32_t, N> es = x_hi >> (exp_shift + 1);
vector<double, N> sc0 = 0.0, sc1 = 0.0;
auto sc0_hi = sc0.template format<uint32_t>().template select<N, 2>(1);
auto sc1_hi = sc1.template format<uint32_t>().template select<N, 2>(1);
sc0_hi = (0x3ff + 0x1ff - es) << exp_shift;
sc1_hi = (0x200 + es) << exp_shift;
// prescaling
x *= sc0;
x *= sc0;
vector<double, N> s;
if constexpr (IEEE) {
vector<double, N * 3> result = __impl_fsqrt_ieee_steps__rte_(x.cl_vector());
vector<double, N> s1 = result.template select<N, 1>(0 * N);
vector<double, N> h1 = result.template select<N, 1>(1 * N);
vector<double, N> d1 = result.template select<N, 1>(2 * N);
s = math::mad(h1, d1, s1);
} else { // Fast algorithm, 1ULP
// Should be mapped to math.rsqt
vector<float, N> xf = x;
vector<float, N> y0f = detail::__cm_cl_rsqrt(xf.cl_vector());
vector<double, N> y0 = y0f;
auto h0 = -0.5 * y0;
auto s0 = x * y0;
auto e = math::mad(s0, h0, vector<double, N>(0.5));
auto s1 = math::mad(s0, e, s0);
auto d1 = math::mad(s1, s1, -x);
s = math::mad(h0, d1, s1);
}
// final scaling
s *= sc1;
mask<N> special = (x == 0.0) | (x_hi >= inf_hi);
if (special.any())
s.merge(__impl_fsqrt_special<NNaN, NInf, NSZ>(x), special);
return s;
}
constexpr bool _fast = false;
constexpr bool _ieee = true;
constexpr bool _nnan = true;
constexpr bool _ninf = true;
constexpr bool _nsz = true;
constexpr bool _ = false;
} // namespace
#define __IMPL_FSQRT_SCALAR(ALG, NNAN, NINF, NSZ) \
CM_NODEBUG CM_NOINLINE extern "C" double \
__cm_intrinsic_impl_fsqrt_##ALG##_##NNAN##_##NINF##_##NSZ(double a) { \
vector<double, 1> va = a; \
return __impl_fsqrt<ALG, NNAN, NINF, NSZ>(va)[0]; \
}
#define __IMPL_FSQRT_VECTOR(WIDTH, ALG, NNAN, NINF, NSZ) \
CM_NODEBUG CM_NOINLINE extern "C" cl_vector<double, WIDTH> \
__cm_intrinsic_impl_fsqrt_##ALG##__v##WIDTH##_##NNAN##_##NINF##_##NSZ( \
cl_vector<double, WIDTH> a) { \
vector<double, WIDTH> va{a}; \
auto r = __impl_fsqrt<ALG, NNAN, NINF, NSZ>(va); \
return r.cl_vector(); \
}
#include "emulation_fsqrt_boilerplate.h"
|