/*
 * vstore_half / vstorea_half: store float vectors to half-precision memory
 * with software-emulated rounding modes (rte, rtz, rtn, rtp).
 */
#include <clc/clc.h>
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
/* ROUND_VEC<N>(out, in, ROUNDF): apply the scalar rounding function ROUNDF
 * component-wise to an N-element vector, assigning the result to `out`.
 * Power-of-two widths recurse through the .lo/.hi halves; the 3-element
 * variant addresses components .s0/.s1/.s2 individually. */
#define ROUND_VEC1(out, in, ROUNDF) out = ROUNDF(in);
#define ROUND_VEC2(out, in, ROUNDF) \
ROUND_VEC1(out.lo, in.lo, ROUNDF); \
ROUND_VEC1(out.hi, in.hi, ROUNDF);
#define ROUND_VEC3(out, in, ROUNDF) \
ROUND_VEC1(out.s0, in.s0, ROUNDF); \
ROUND_VEC1(out.s1, in.s1, ROUNDF); \
ROUND_VEC1(out.s2, in.s2, ROUNDF);
#define ROUND_VEC4(out, in, ROUNDF) \
ROUND_VEC2(out.lo, in.lo, ROUNDF); \
ROUND_VEC2(out.hi, in.hi, ROUNDF);
#define ROUND_VEC8(out, in, ROUNDF) \
ROUND_VEC4(out.lo, in.lo, ROUNDF); \
ROUND_VEC4(out.hi, in.hi, ROUNDF);
#define ROUND_VEC16(out, in, ROUNDF) \
ROUND_VEC8(out.lo, in.lo, ROUNDF); \
ROUND_VEC8(out.hi, in.hi, ROUNDF);
/* __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, ROUNDF)
 * Emits the vstore_half##SUFFIX and vstorea_half##SUFFIX overloads for a
 * TYPE vector stored to an AS-address-space half pointer.  Each overload
 * first rounds every component with ROUNDF (software emulation of the
 * requested rounding mode), then delegates the actual fp32->fp16
 * conversion and store to vstore(a)_half_##VEC_SIZE, which is only
 * declared here and defined elsewhere in libclc. */
#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, ROUNDF) \
void _CLC_OVERLOAD vstore_half_##VEC_SIZE(TYPE, size_t, AS half *); \
_CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, \
AS half *mem) { \
TYPE rounded_vec; \
ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \
vstore_half_##VEC_SIZE(rounded_vec, offset, mem); \
} \
void _CLC_OVERLOAD vstorea_half_##VEC_SIZE(TYPE, size_t, AS half *); \
_CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, \
AS half *mem) { \
TYPE rounded_vec; \
ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \
vstorea_half_##VEC_SIZE(rounded_vec, offset, mem); \
}
/* Round toward zero: return x truncated to the nearest half-precision
 * representable value in the direction of zero.  The result is a float
 * that converts to fp16 exactly (NaN is passed through unchanged). */
_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
/* Handle nan corner case */
if (isnan(x))
return x;
/* RTZ does not produce Inf for large numbers: finite values beyond the
 * largest half (65504) clamp to it, sign preserved; true infinities fall
 * through and survive the mask below. */
if (fabs(x) > 65504.0f && !isinf(x))
return copysign(65504.0f, x);
/* Unbiased fp32 exponent (8 exponent bits, bias 127). */
const int exp = (as_uint(x) >> 23 & 0xff) - 127;
/* Manage range rounded to +- zero explicitly: below 2^-24 even the
 * smallest half denormal truncates to zero. */
if (exp < -24)
return copysign(0.0f, x);
/* Remove lower 13 bits (fp32 has 23 mantissa bits, fp16 has 10) so the
 * magnitude is rounded down. */
int mask = 0xffffe000;
/* Denormals cannot be flushed, and they use different bit for rounding:
 * each exponent step below the half normal range shifts the rounding
 * point one bit higher, capped at 10 (the fp16 mantissa width). */
if (exp < -14)
mask <<= min(-(exp + 14), 10);
return as_float(as_uint(x) & mask);
}
/* Round toward infinity (away from zero): return x rounded to the nearest
 * half-precision representable value of larger magnitude.  NaN is passed
 * through; magnitudes above the largest half round to +-Inf. */
_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
/* Handle nan corner case */
if (isnan(x))
return x;
const float inf = copysign(INFINITY, x);
uint ux = as_uint(x);
/* Manage +- infinity explicitly: anything above the largest finite half
 * (0x1.ffcp+15 == 65504) rounds away to infinity. */
if (as_float(ux & 0x7fffffff) > 0x1.ffcp+15f) {
return inf;
}
/* Manage +- zero explicitly (zero must not round up to a denormal). */
if ((ux & 0x7fffffff) == 0) {
return copysign(0.0f, x);
}
/* Unbiased fp32 exponent. */
const int exp = (as_uint(x) >> 23 & 0xff) - 127;
/* Manage range rounded to smallest half denormal explicitly: any nonzero
 * magnitude below 2^-24 rounds away to the minimum half denormal. */
if (exp < -24) {
return copysign(0x1.0p-24f, x);
}
/* Set lower 13 bits: bits that fp16 cannot represent. */
int mask = (1 << 13) - 1;
/* Denormals cannot be flushed, and they use different bit for rounding:
 * widen the discarded-bit mask by one per exponent step below the half
 * normal range, capped at 10 extra bits. */
if (exp < -14) {
mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
}
/* Saturate the discarded bits, then step to the next representable value
 * away from zero; exact values (no discarded bits set) stay unchanged. */
const float next = nextafter(as_float(ux | mask), inf);
return ((ux & mask) == 0) ? as_float(ux) : next;
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
}
/* Round to nearest even: decide whether x rounds up (away from zero) or
 * down (toward zero) at half precision using the guard/round/sticky bits,
 * then delegate to __clc_rti or __clc_rtz accordingly.  This is the
 * default fp32->fp16 conversion mode. */
_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
/* Mantisa + implicit bit */
const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
/* Unbiased fp32 exponent. */
const int exp = (as_uint(x) >> 23 & 0xff) - 127;
/* fp32 keeps 13 more mantissa bits than fp16; by default those are the
 * bits discarded by the conversion. */
int shift = 13;
if (exp < -14) {
/* The default assumes lower 13 bits are rounded,
 * but it might be more for denormals.
 * Shifting beyond last == 0b, and qr == 00b is not necessary */
shift += min(-(exp + 14), 15);
}
int mask = (1 << shift) - 1;
/* grs = discarded (guard/round/sticky) bits; last = lowest kept bit,
 * which breaks ties toward even. */
const uint grs = mantissa & mask;
const uint last = mantissa & (1 << shift);
/* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
 * exp > 15 should round to inf. */
bool roundup = (grs > (1 << (shift - 1))) ||
(grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
return roundup ? __clc_rti(x) : __clc_rtz(x);
}
/* __XFUNC: instantiate the full set of rounding-mode variants for one
 * vector size / type / address-space combination.  The unsuffixed name
 * and the explicit _rte suffix both use round-to-nearest-even, the
 * default conversion mode. */
#define __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS) \
__FUNC(SUFFIX, VEC_SIZE, TYPE, AS, __clc_rte) \
__FUNC(SUFFIX##_rtz, VEC_SIZE, TYPE, AS, __clc_rtz) \
__FUNC(SUFFIX##_rtn, VEC_SIZE, TYPE, AS, __clc_rtn) \
__FUNC(SUFFIX##_rtp, VEC_SIZE, TYPE, AS, __clc_rtp) \
__FUNC(SUFFIX##_rte, VEC_SIZE, TYPE, AS, __clc_rte)
/* FUNC is the entry point expanded by vstore_half.inc for each gentype. */
#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS)
/* Drive the generator: gentype.inc re-includes __CLC_BODY once per
 * supported scalar/vector float type, expanding FUNC for each. */
#define __CLC_BODY "vstore_half.inc"
#include <clc/math/gentype.inc>
#undef __CLC_BODY
#undef FUNC
#undef __XFUNC
#undef __FUNC