File: atomics16.h

// Start of atomics16.h
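
// 16-bit atomic operations on global and shared/local memory, emulated
// on top of 32-bit atomics: each function locates the 32-bit word that
// contains the addressed halfword and updates only that half. The
// 32-bit primitives used here (atomic_cmpxchg_i32_global and friends)
// are assumed to be defined elsewhere in the RTS.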

SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_global(volatile __global int16_t *p,
                                                  int16_t cmp, int16_t val);
SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_shared(volatile __local int16_t *p,
                                                  int16_t cmp, int16_t val);
SCALAR_FUN_ATTR int16_t atomic_add_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_add_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR f16 atomic_fadd_f16_global(volatile __global uint16_t *p, f16 x);
SCALAR_FUN_ATTR f16 atomic_fadd_f16_shared(volatile __local uint16_t *p, f16 x);
SCALAR_FUN_ATTR int16_t atomic_smax_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_smax_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_smin_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_smin_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umax_i16_global(volatile __global uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umax_i16_shared(volatile __local uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umin_i16_global(volatile __global uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR uint16_t atomic_umin_i16_shared(volatile __local uint16_t *p, uint16_t x);
SCALAR_FUN_ATTR int16_t atomic_and_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_and_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_or_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_or_i16_shared(volatile __local int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_xor_i16_global(volatile __global int16_t *p, int16_t x);
SCALAR_FUN_ATTR int16_t atomic_xor_i16_shared(volatile __local int16_t *p, int16_t x);
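
// For illustration, a host-side C11 sketch of the same halfword-in-word
// CAS technique, using <stdatomic.h> in place of the device primitives
// below. The guard macro and function name are hypothetical and not part
// of this header.
#ifdef ATOMICS16_HOST_EXAMPLE
#include <stdatomic.h>
#include <stdint.h>

// Atomically add x to 16-bit lane `lane` (0 = low, 1 = high) of the
// 32-bit word *w, returning the lane's previous value.
static int16_t host_atomic_add_i16(_Atomic uint32_t *w, int lane, int16_t x) {
  int shift = lane * 16;
  uint32_t mask = (uint32_t)0xffff << shift;
  uint32_t old = atomic_load(w);
  uint32_t upd;
  do {
    // Wrapping 16-bit add of the current lane value and x.
    uint16_t cur = (uint16_t)((uint16_t)(old >> shift) + (uint16_t)x);
    // Keep the neighbouring halfword; replace only our lane.
    upd = (old & ~mask) | ((uint32_t)cur << shift);
    // On failure, atomic_compare_exchange_weak reloads `old` for us.
  } while (!atomic_compare_exchange_weak(w, &old, upd));
  return (int16_t)(old >> shift);
}
#endif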

SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_global(volatile __global int16_t *p,
                                                  int16_t cmp, int16_t val) {
  // Which halfword of the containing 32-bit word is addressed.
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);

  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  // Mask after shifting so that sign extension of negative 16-bit values
  // cannot leak into the neighbouring halfword.
  int32_t shifted_val = ((int32_t)val << shift) & mask;
  int32_t shifted_cmp = ((int32_t)cmp << shift) & mask;

  // The first attempt optimistically assumes the neighbouring halfword
  // is zero.
  uint32_t old = shifted_cmp;
  uint32_t upd = shifted_val;
  uint32_t got;

  while ((got=atomic_cmpxchg_i32_global(p32, old, upd)) != old) {
    if (((int32_t)got & mask) != shifted_cmp) {
      // The addressed halfword does not match cmp: fail without writing.
      return got >> shift;
    }
    old = got;
    upd = (old & ~mask) | shifted_val;
  }

  return old >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_cmpxchg_i16_shared(volatile __local int16_t *p,
                                                  int16_t cmp, int16_t val) {
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);

  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  // As above: mask after shifting to keep sign extension out of the
  // neighbouring halfword.
  int32_t shifted_val = ((int32_t)val << shift) & mask;
  int32_t shifted_cmp = ((int32_t)cmp << shift) & mask;

  uint32_t old = shifted_cmp;
  uint32_t upd = shifted_val;
  uint32_t got;

  while ((got=atomic_cmpxchg_i32_shared(p32, old, upd)) != old) {
    if (((int32_t)got & mask) != shifted_cmp) {
      // The addressed halfword does not match cmp: fail without writing.
      return got >> shift;
    }
    old = got;
    upd = (old & ~mask) | shifted_val;
  }

  return old >> shift;
}

// Convenience macro defining a 16-bit read-modify-write atomic (in global
// and shared variants) as a masked CAS loop on the containing 32-bit word.
#define DEFINE_16BIT_ATOMIC(name, T, op)                                \
  SCALAR_FUN_ATTR T                                                     \
  atomic_##name##_i16_global(volatile __global T *p, T val) {           \
    int offset = ((uintptr_t)p >> 1 & 1);                               \
    volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3); \
    int shift = offset * 16;                                            \
    int32_t mask = 0xffff << shift;                                     \
    /* The first attempt optimistically assumes the word is zero. */    \
    int32_t old = 0;                                                    \
    int32_t upd = mask & (op(old >> shift, val) << shift);              \
    int32_t saw;                                                        \
    while ((saw=atomic_cmpxchg_i32_global(p32, old, upd)) != old) {     \
      old = saw;                                                        \
      /* Mask after shifting so that sign extension of a negative */    \
      /* result cannot leak into the neighbouring halfword. */          \
      upd = (old & ~mask) | (mask & (op(old >> shift, val) << shift));  \
    }                                                                   \
    return old >> shift;                                                \
  }                                                                     \
  SCALAR_FUN_ATTR T                                                     \
  atomic_##name##_i16_shared(volatile __local T *p, T val) {            \
    int offset = ((uintptr_t)p >> 1 & 1);                               \
    volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3); \
    int shift = offset * 16;                                            \
    int32_t mask = 0xffff << shift;                                     \
    int32_t old = 0;                                                    \
    int32_t upd = mask & ((op(old >> shift, val)) << shift);            \
    int32_t saw;                                                        \
    while ((saw=atomic_cmpxchg_i32_shared(p32, old, upd)) != old) {     \
      old = saw;                                                        \
      upd = (old & ~mask) | (mask & ((op(old >> shift, val)) << shift)); \
    }                                                                   \
    return old >> shift;                                                \
  }

DEFINE_16BIT_ATOMIC(add, int16_t, add16);
DEFINE_16BIT_ATOMIC(smax, int16_t, smax16);
DEFINE_16BIT_ATOMIC(smin, int16_t, smin16);
DEFINE_16BIT_ATOMIC(umax, uint16_t, umax16);
DEFINE_16BIT_ATOMIC(umin, uint16_t, umin16);
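
// For example, DEFINE_16BIT_ATOMIC(add, int16_t, add16) defines
// atomic_add_i16_global and atomic_add_i16_shared, matching the
// prototypes above. The operators (add16, smax16, smin16, umax16,
// umin16) are assumed to be the scalar helpers defined elsewhere in
// the Futhark RTS.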

// For and/or/xor no CAS loop is needed: a single 32-bit atomic with an
// identity operand in the neighbouring halfword (all ones for AND, zeros
// for OR and XOR) leaves the neighbour unchanged.
SCALAR_FUN_ATTR int16_t atomic_and_i16_global(volatile __global int16_t *p, int16_t val) {
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  int32_t mask = 0xffff << shift;
  return atomic_and_i32_global(p32, ~mask | (val<<shift)) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_and_i16_shared(volatile __local int16_t *p, int16_t val) {
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  int32_t mask = 0xffff << shift;
  return atomic_and_i32_shared(p32, ~mask | (val<<shift)) >> shift;
}
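
// Example: with shift == 0 the operand passed to the 32-bit AND is
// 0xffff0000 | (val << 0), so the high halfword is AND-ed with all ones
// (left unchanged) while the low halfword is AND-ed with val.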

SCALAR_FUN_ATTR int16_t atomic_or_i16_global(volatile __global int16_t *p, int16_t val) {
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_or_i32_global(p32, (uint16_t)val<<shift) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_or_i16_shared(volatile __local int16_t *p, int16_t val) {
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_or_i32_shared(p32, (uint16_t)val<<shift) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_xor_i16_global(volatile __global int16_t *p, int16_t val) {
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_xor_i32_global(p32, (uint16_t)val<<shift) >> shift;
}

SCALAR_FUN_ATTR int16_t atomic_xor_i16_shared(volatile __local int16_t *p, int16_t val) {
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = ((uintptr_t)p >> 1 & 1) * 16;
  return atomic_xor_i32_shared(p32, (uint16_t)val<<shift) >> shift;
}

// 16-bit float add, implemented as a CAS loop on the bit representation
// of the halfword.
SCALAR_FUN_ATTR f16 atomic_fadd_f16_global(volatile __global uint16_t *p, f16 val) {
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __global int32_t *p32 = (volatile __global int32_t*)((uintptr_t)p & ~0x3);
  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  // The first attempt optimistically assumes the word is zero.
  int32_t old = 0;
  int32_t upd = mask & ((int32_t)fptobits_f16_i16(val) << shift);
  int32_t saw;
  while ((saw=atomic_cmpxchg_i32_global(p32, old, upd)) != old) {
    old = saw;
    // Mask after shifting so the sign bit of the f16 result cannot leak
    // into the neighbouring halfword.
    upd = (old & ~mask) |
      (mask & ((int32_t)fptobits_f16_i16(bitstofp_i16_f16((uint32_t)old >> shift) + val) << shift));
  }
  return bitstofp_i16_f16((uint32_t)old >> shift);
}

SCALAR_FUN_ATTR f16 atomic_fadd_f16_shared(volatile __local uint16_t *p, f16 val) {
  int offset = ((uintptr_t)p >> 1 & 1);
  volatile __local int32_t *p32 = (volatile __local int32_t*)((uintptr_t)p & ~0x3);
  int shift = offset * 16;
  int32_t mask = 0xffff << shift;
  int32_t old = 0;
  int32_t upd = mask & ((int32_t)fptobits_f16_i16(val) << shift);
  int32_t saw;
  while ((saw=atomic_cmpxchg_i32_shared(p32, old, upd)) != old) {
    old = saw;
    // As above: mask the shifted result to protect the neighbouring
    // halfword from sign extension.
    upd = (old & ~mask) |
      (mask & ((int32_t)fptobits_f16_i16(bitstofp_i16_f16((uint32_t)old >> shift) + val) << shift));
  }
  return bitstofp_i16_f16((uint32_t)old >> shift);
}

// End of atomics16.h