// Start of atomics16.h
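//
// These functions provide 16-bit atomic operations in terms of the 32-bit
// atomics that this header assumes are already defined by the surrounding
// runtime (atomic_cmpxchg_i32_*, atomic_and_i32_*, ...).  Each 16-bit value
// is updated by locating the naturally aligned 32-bit word that contains it,
// then either running a 32-bit compare-and-swap loop or, for the bitwise
// operations, issuing a single 32-bit atomic with the neighbouring halfword
// masked to the operation's identity.  A hypothetical usage sketch appears
// at the end of this file.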
SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_global(volatile __global int16_t *p,
                                                  int16_t cmp, int16_t val);
SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_shared(volatile __local int16_t *p,
                                                  int16_t cmp, int16_t val);
SCALAR_FUN_ATTR int16_t atomic_add_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_add_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR f16 atomic_fadd_f16_global(volatile __global uint16_t *p, f16 x);
SCALAR_FUN_ATTR f16 atomic_fadd_f16_shared(volatile __local uint16_t *p, f16 x);
SCALAR_FUN_ATTR int16_t atomic_smax_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_smax_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_smin_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_smin_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umax_i16_global(volatile __global uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umax_i16_shared(volatile __local uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umin_i16_global(volatile __global uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umin_i16_shared(volatile __local uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR int16_t atomic_and_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_and_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_or_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_or_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_xor_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_xor_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_global(volatile __global int16_t *p,
                                                  int16_t cmp, int16_t val) {
  // Locate the 16-bit halfword within its naturally aligned 32-bit word.
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  int32_t shifted_val = val << shift;
  int32_t shifted_cmp = cmp << shift;

  // Loop until a 32-bit compare-and-swap succeeds, writing the new halfword
  // while leaving the neighbouring halfword as it was last observed.
  uint32_t old = shifted_cmp;
  uint32_t upd = shifted_val;
  uint32_t got;
  while ((got=atomic_cmpxchg_i32_global(p32, old, upd)) != old) {
    old = got;
    upd = (old & ~mask) | shifted_val;
  }
  return old >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_shared(volatile __local int16_t *p,
                                                  int16_t cmp, int16_t val) {
  // Same scheme as the global variant, but operating on __local memory.
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  int32_t shifted_val = val << shift;
  int32_t shifted_cmp = cmp << shift;

  uint32_t old = shifted_cmp;
  uint32_t upd = shifted_val;
  uint32_t got;
  while ((got=atomic_cmpxchg_i32_shared(p32, old, upd)) != old) {
    old = got;
    upd = (old & ~mask) | shifted_val;
  }
  return old >> shift;
}

// Convenience macro for the arithmetic operations (add, smax, smin, umax,
// umin), which all follow the same compare-and-swap pattern as
// atomic_cmpxchg_i16_* above.  The op argument names the 16-bit scalar
// operation to apply (add16, smax16, ...).
#define DEFINE_16BIT_ATOMIC(name, T, op) \
  SCALAR_FUN_ATTR T \
  atomic_##name##_i16_global(volatile __global T *p, T val) { \
    int offset = ((uintptr_t)p >> 1 & 1); \
    volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3); \
    int shift = offset * 16; \
    int32_t mask = 0xffff << shift; \
    int32_t old = 0; \
    int32_t upd = mask & (op(old >> shift, val) << shift); \
    int32_t saw; \
    while ((saw=atomic_cmpxchg_i32_global(p32, old, upd)) != old) { \
      old = saw; \
      upd = (old & ~mask) | ((op(old >> shift, val)) << shift); \
    } \
    return old >> shift; \
  } \
  SCALAR_FUN_ATTR T \
  atomic_##name##_i16_shared(volatile __local T *p, T val) { \
    int offset = ((uintptr_t)p >> 1 & 1); \
    volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3); \
    int shift = offset * 16; \
    int32_t mask = 0xffff << shift; \
    int32_t old = 0; \
    int32_t upd = mask & ((op(old >> shift, val)) << shift); \
    int32_t saw; \
    while ((saw=atomic_cmpxchg_i32_shared(p32, old, upd)) != old) { \
      old = saw; \
      upd = (old & ~mask) | ((op(old >> shift, val)) << shift); \
    } \
    return old >> shift; \
  }

DEFINE_16BIT_ATOMIC(add, int16_t, add16);
DEFINE_16BIT_ATOMIC(smax, int16_t, smax16);
DEFINE_16BIT_ATOMIC(smin, int16_t, smin16);
DEFINE_16BIT_ATOMIC(umax, uint16_t, umax16);
DEFINE_16BIT_ATOMIC(umin, uint16_t, umin16);
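// The bitwise operations need no compare-and-swap loop: the neighbouring
// halfword can be masked with the operation's identity (all ones for AND,
// all zeroes for OR and XOR), so a single 32-bit atomic leaves it untouched.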
SCALAR_FUN_ATTR int16_t atomic_and_i16_global(volatile __global int16_t *p, int16_t val) {
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  int32_t mask = 0xffff << shift;
  return atomic_and_i32_global(p32, ~mask | (val<<shift)) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_and_i16_shared(volatile __local int16_t *p, int16_t val) {
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  int32_t mask = 0xffff << shift;
  return atomic_and_i32_shared(p32, ~mask | (val<<shift)) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_or_i16_global(volatile __global int16_t *p, int16_t val) {
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_or_i32_global(p32, (uint16_t)val<<shift) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_or_i16_shared(volatile __local int16_t *p, int16_t val) {
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_or_i32_shared(p32, (uint16_t)val<<shift) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_xor_i16_global(volatile __global int16_t *p, int16_t val) {
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_xor_i32_global(p32, (uint16_t)val<<shift) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_xor_i16_shared(volatile __local int16_t *p, int16_t val) {
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_xor_i32_shared(p32, (uint16_t)val<<shift) >> shift;
}

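// Floating-point addition has no bitwise identity, so f16 addition falls back
// to the compare-and-swap loop, converting between the f16 value and its bit
// pattern on every attempt.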
SCALAR_FUN_ATTR f16 atomic_fadd_f16_global(volatile __global uint16_t *p, f16 val) {
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  int32_t old = 0;
  int32_t upd = mask & ((int32_t)fptobits_f16_i16(val) << shift);
  int32_t saw;
  while ((saw=atomic_cmpxchg_i32_global(p32, old, upd)) != old) {
    old = saw;
    upd = (old & ~mask) | (int32_t)fptobits_f16_i16(bitstofp_i16_f16((uint32_t)old >> shift) + val) << shift;
  }
  return bitstofp_i16_f16((uint32_t)old >> shift);
}

SCALAR_FUN_ATTR f16 atomic_fadd_f16_shared(volatile __local uint16_t *p, f16 val) {
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  int32_t old = 0;
  int32_t upd = mask & ((int32_t)fptobits_f16_i16(val) << shift);
  int32_t saw;
  while ((saw=atomic_cmpxchg_i32_shared(p32, old, upd)) != old) {
    old = saw;
    upd = (old & ~mask) | (int32_t)fptobits_f16_i16(bitstofp_i16_f16((uint32_t)old >> shift) + val) << shift;
  }
  return bitstofp_i16_f16((uint32_t)old >> shift);
}

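// A minimal usage sketch (not part of the header proper): a hypothetical
// OpenCL kernel that accumulates a 16-bit histogram with the emulated
// atomics.  It assumes the surrounding runtime typedefs int16_t/int32_t for
// device code and that the 32-bit atomics this header builds on are defined.
//
//   __kernel void hist16(__global int16_t *counts,
//                        __global const int32_t *keys,
//                        int32_t n) {
//     int32_t i = get_global_id(0);
//     if (i < n) {
//       // Emulated 16-bit atomic add on the bucket for this key.
//       atomic_add_i16_global(&counts[keys[i]], (int16_t)1);
//     }
//   }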
// End of atomics16.h