1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
|
/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "../include/BiF_Definitions.cl"
#include "../../Headers/spirv.h"
extern __constant int __Native64Bit;
#if defined(cl_khr_fp16)
// Half-precision sqrt_cr: the input is widened to float, evaluated by the
// f32 sqrt_cr overload, and the float result is narrowed back to half.
INLINE
half SPIRV_OVERLOADABLE SPIRV_OCL_BUILTIN(sqrt_cr, _f16, )( half a )
{
    const float widened = (float)a;
    const float root    = SPIRV_OCL_BUILTIN(sqrt_cr, _f32, )(widened);
    return (half)root;
}
// NOTE(review): GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_1ARGS is defined outside
// this file; by its name it presumably emits the vector overloads of the
// half sqrt_cr above -- confirm against the macro definition.
GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_1ARGS( sqrt_cr, half, half, f16 )
#endif // defined(cl_khr_fp16)
// Single-precision sqrt_cr.
//
// When __CRMacros is non-zero the work is handed to FSQRT_IEEE(a).
// Otherwise the result is built in software:
//   * +/-0          -> returned unchanged (sign preserved)
//   * +Inf          -> +Inf;  -Inf -> quiet NaN (0xffc00000)
//   * NaN           -> the same NaN with the quiet bit (0x400000) set
//   * negative      -> quiet NaN, except a negative denormal with
//                      __FlushDenormals set, which yields -0
//   * positive denormal with __FlushDenormals set -> +0
//   * everything else: rsqrt seed refined by a fixed sequence of FMA steps
//     on an operand rescaled into a safe range, then the exponent is put back.
//
// NOTE(review): FLOAT_BIAS, __CRMacros, __FlushDenormals and FSQRT_IEEE are
// defined outside this file; FLOAT_BIAS is presumably the binary32 exponent
// bias (127) -- confirm.
float SPIRV_OVERLOADABLE SPIRV_OCL_BUILTIN(sqrt_cr, _f32, )( float a )
{
    if (__CRMacros)
    {
        return FSQRT_IEEE(a);
    }

    typedef union binary32
    {
        uint u;
        int s;
        float f;
    } binary32;

    binary32 src, res;
    src.f = a;

    // Split the input into its biased exponent, mantissa and sign fields.
    const int  biasedExp = (src.u >> 23) & 0xff;
    const uint mantissa  = src.u & 0x7fffff;
    const uint signBit   = src.u & 0x80000000;

    // +/-0: the result is the input itself.
    if ((biasedExp == 0) & (mantissa == 0))
    {
        res.u = src.u;
        return res.f;
    }

    // All-ones exponent: infinities and NaNs.
    if (biasedExp == 0xff)
    {
        if (mantissa == 0)
        {
            // sqrt(+Inf) = +Inf, sqrt(-Inf) = qNaN
            res.u = signBit ? 0xffc00000 : 0x7f800000;
        }
        else
        {
            // Set the quiet bit so a signalling NaN comes back quiet.
            res.u = src.u | 0x400000;
        }
        return res.f;
    }

    // Negative normals/denormals.
    if (signBit)
    {
        if (__FlushDenormals & (biasedExp == 0))
        {
            // Flushed negative denormal: only the sign survives (-0).
            res.u = signBit;
        }
        else
        {
            // Any other negative value has no real square root: qNaN.
            res.u = 0xffc00000;
        }
        return res.f;
    }

    // Positive denormal that is flushed to +0.
    if (__FlushDenormals & (biasedExp == 0))
    {
        res.u = 0;
        return res.f;
    }

    // Positive normals, and positive denormals that are not flushed.
    const bool denorm   = (biasedExp == 0);
    const bool rescaled = denorm & !__FlushDenormals;

    if (rescaled)
    {
        // Lift the denormal by an even power of two (2^126); the square
        // root is scaled back by 2^-63 at the end.
        src.f = SPIRV_OCL_BUILTIN(ldexp, _f32_i32, )(src.f, 126);
    }
    else
    {
        // Scale a to [1/2, 2): keep the mantissa and the lowest exponent
        // bit, force the remaining exponent bits to 0x3f000000.
        src.u = (src.u & 0x00ffffff) | 0x3f000000;
    }

    const float x     = src.f;
    const float kHalf = 0.5f;   // bit pattern 0x3f000000

    // Initial approximation y0 ~ 1/sqrt(x)
    const float y0 = SPIRV_OCL_BUILTIN(rsqrt, _f32, )(x);

    // Step (1): h0 = y0 / 2
    const float h0 = kHalf * y0;
    // Step (2): s0 = x * y0
    const float s0 = x * y0;
    // Step (3): d0 = 1/2 - s0 * h0
    const float d0 = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(-s0, h0, kHalf);
    // Step (4): h1 = h0 + d0 * h0
    const float h1 = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(d0, h0, h0);
    // Step (5): s1 = s0 + d0 * s0
    const float s1 = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(d0, s0, s0);
    // Step (6): e0 = x - s1 * s1
    const float e0 = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(-s1, s1, x);
    // Step (7): result = s1 + e0 * h1
    res.f = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(e0, h1, s1);

    if (rescaled)
    {
        // Undo the 2^126 pre-scale: sqrt(2^126) = 2^63.
        res.f = SPIRV_OCL_BUILTIN(ldexp, _f32_i32, )(res.f, -126 / 2);
    }
    else
    {
        // Adjust exponent: halve the unbiased input exponent, re-bias it,
        // and splice it over the exponent of the scaled result.
        const int rootExp = ((biasedExp - FLOAT_BIAS) >> 1) + FLOAT_BIAS;
        res.u = (res.u & 0x007fffff) | (rootExp << 23);
    }

    return res.f;
}
#ifdef cl_fp64_basic_ops
// Double-precision sqrt_cr: forwards the argument to the f64 sqrt builtin
// and returns its result unchanged.
INLINE double SPIRV_OVERLOADABLE SPIRV_OCL_BUILTIN(sqrt_cr, _f64, )( double x )
{
    const double root = SPIRV_OCL_BUILTIN(sqrt, _f64, )(x);
    return root;
}
// NOTE(review): GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_1ARGS is defined outside
// this file; by its name it presumably emits the vector overloads of the
// double sqrt_cr above -- confirm against the macro definition.
GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_1ARGS( sqrt_cr, double, double, f64 )
#endif // cl_fp64_basic_ops
// NOTE(review): GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_1ARGS is defined outside
// this file; by its name it presumably emits the vector overloads of the
// float sqrt_cr defined earlier in this file -- confirm against the macro
// definition.
GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_1ARGS( sqrt_cr, float, float, f32 )
|