File: lal_pre_cuda_hip.h

package info (click to toggle)
lammps 20220106.git7586adbb6a%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 348,064 kB
  • sloc: cpp: 831,421; python: 24,896; xml: 14,949; f90: 10,845; ansic: 7,967; sh: 4,226; perl: 4,064; fortran: 2,424; makefile: 1,501; objc: 238; lisp: 163; csh: 16; awk: 14; tcl: 6
file content (355 lines) | stat: -rw-r--r-- 10,438 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
// **************************************************************************
//                               pre_cuda_hip.h
//                             -------------------
//                           W. Michael Brown (ORNL)
//                           Nitin Dhamankar (Intel)
//
//  Device-side preprocessor definitions for CUDA and HIP builds
//
// __________________________________________________________________________
//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
//    begin                :
//    email                : brownw@ornl.gov
// ***************************************************************************/

//*************************************************************************
//                       Device Configuration Definitions
//                    See lal_preprocessor.h for definitions
//*************************************************************************/

// -------------------------------------------------------------------------
//                           CUDA and HIP DEFINITIONS
// -------------------------------------------------------------------------

#if defined(NV_KERNEL) || defined(USE_HIP)

// -------------------------------------------------------------------------
//                             DEVICE CONFIGURATION
// -------------------------------------------------------------------------


#if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)
#define CONFIG_ID 303
#define SIMD_SIZE 64
#else
#define CONFIG_ID 103
#define SIMD_SIZE 32
#endif

#define MEM_THREADS SIMD_SIZE
#define SHUFFLE_AVAIL 1
#define FAST_MATH 1

#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define THREADS_PER_THREE 2

#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define PPPM_BLOCK_1D 64
#define BLOCK_NBOR_BUILD 128
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128

#define MAX_SHARED_TYPES 11
#define MAX_BIO_SHARED_TYPES 128
#define PPPM_MAX_SPLINE 8

// -------------------------------------------------------------------------
//                          LEGACY DEVICE CONFIGURATION
// -------------------------------------------------------------------------

#ifdef __CUDA_ARCH__

#if (__CUDA_ARCH__ < 200)

#undef CONFIG_ID
#define CONFIG_ID 101
#define MEM_THREADS 16
#undef THREADS_PER_ATOM
#define THREADS_PER_ATOM 1
#undef THREADS_PER_CHARGE
#define THREADS_PER_CHARGE 16
#undef BLOCK_PAIR
#define BLOCK_PAIR 64
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 64
#undef BLOCK_NBOR_BUILD
#define BLOCK_NBOR_BUILD 64
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0

#elseif (__CUDA_ARCH__ < 300)

#undef CONFIG_ID
#define CONFIG_ID 102
#undef BLOCK_PAIR
#define BLOCK_PAIR 128
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 128
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0

#endif

#endif

// -------------------------------------------------------------------------
//                              KERNEL MACROS
// -------------------------------------------------------------------------

#ifdef USE_HIP
#include <hip/hip_runtime.h>
#endif

#define fast_mul(X,Y) (X)*(Y)

#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ < 200)
#define fast_mul __mul24
#endif
#endif

#define EVFLAG 1
#define NOUNROLL
#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+fast_mul(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X fast_mul(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y fast_mul(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define NUM_BLOCKS_X gridDim.x

#define __kernel extern "C" __global__
#ifdef __local
#undef __local
#endif
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__

#define simd_size() SIMD_SIZE

#define simdsync()

#ifdef NV_KERNEL
#if (__CUDACC_VER_MAJOR__ >= 9)
#undef simdsync
#define simdsync() __syncwarp(0xffffffff)
#endif
#endif

#ifdef __HIP_PLATFORM_NVCC__
#undef simdsync()
#define simdsync() __syncwarp(0xffffffff)
#endif

// -------------------------------------------------------------------------
//                         KERNEL MACROS - TEXTURES
// -------------------------------------------------------------------------

#if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)
#define _texture(name, type)  __device__ type* name
#define _texture_2d(name, type)  __device__ type* name
#else
#define _texture(name, type)  texture<type> name
#define _texture_2d(name, type) texture<type,1> name
#endif

#if (__CUDACC_VER_MAJOR__ < 11)
  #ifdef _DOUBLE_DOUBLE
  #define fetch4(ans,i,pos_tex) {                        \
    int4 xy = tex1Dfetch(pos_tex,i*2);                   \
    int4 zt = tex1Dfetch(pos_tex,i*2+1);                 \
    ans.x=__hiloint2double(xy.y, xy.x);                  \
    ans.y=__hiloint2double(xy.w, xy.z);                  \
    ans.z=__hiloint2double(zt.y, zt.x);                  \
    ans.w=__hiloint2double(zt.w, zt.z);                  \
  }
  #define fetch(ans,i,q_tex) {                           \
    int2 qt = tex1Dfetch(q_tex,i);                       \
    ans=__hiloint2double(qt.y, qt.x);                    \
  }
  #else
  #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
  #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
  #endif
#else
  #define fetch4(ans,i,x) ans=x[i]
  #define fetch(ans,i,q) ans=q[i]
  #undef _texture
  #undef _texture_2d
  #define _texture(name, type)
  #define _texture_2d(name, type)
  #define pos_tex x_
  #define quat_tex qif
  #define q_tex q_
  #define vel_tex v_
  #define mu_tex mu_
#endif

#if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)

#undef fetch4
#undef fetch

#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
#define fetch(ans,i,q_tex)    (ans=*(((double *)  q_tex) + i))
#else
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
#define fetch(ans,i,q_tex)    (ans=*(((float *)  q_tex) + i))
#endif

#endif

// -------------------------------------------------------------------------
//                           KERNEL MACROS - MATH
// -------------------------------------------------------------------------

#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
  double x, y, z, w;
};
typedef struct _double4 double4;
#endif

#ifdef _DOUBLE_DOUBLE

#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))

#else

#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#define ucl_exp expf
#define ucl_powr powf

#endif

// -------------------------------------------------------------------------
//                         KERNEL MACROS - SHUFFLE
// -------------------------------------------------------------------------

#if SHUFFLE_AVAIL == 1

#ifndef USE_HIP
#if (__CUDACC_VER_MAJOR__ < 9)
#define CUDA_PRE_NINE
#endif
#endif

#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)

  #ifdef _SINGLE_SINGLE
    #define shfl_down __shfl_down
    #define shfl_xor __shfl_xor
  #else
    ucl_inline double shfl_down(double var, unsigned int delta, int width) {
      int2 tmp;
      tmp.x = __double2hiint(var);
      tmp.y = __double2loint(var);
      tmp.x = __shfl_down(tmp.x,delta,width);
      tmp.y = __shfl_down(tmp.y,delta,width);
      return __hiloint2double(tmp.x,tmp.y);
    }
    ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
      int2 tmp;
      tmp.x = __double2hiint(var);
      tmp.y = __double2loint(var);
      tmp.x = __shfl_xor(tmp.x,lanemask,width);
      tmp.y = __shfl_xor(tmp.y,lanemask,width);
      return __hiloint2double(tmp.x,tmp.y);
    }
  #endif
  #define simd_broadcast_i __shfl
  #define simd_broadcast_f __shfl
  #ifdef _DOUBLE_DOUBLE
    ucl_inline double simd_broadcast_d(double var, unsigned int src,
                                       int width) {
      int2 tmp;
      tmp.x = __double2hiint(var);
      tmp.y = __double2loint(var);
      tmp.x = __shfl(tmp.x,src,width);
      tmp.y = __shfl(tmp.y,src,width);
      return __hiloint2double(tmp.x,tmp.y);
    }
  #endif

#else

  #ifdef _SINGLE_SINGLE
  ucl_inline float shfl_down(float var, unsigned int delta, int width) {
    return __shfl_down_sync(0xffffffff, var, delta, width);
  }
  ucl_inline float shfl_xor(float var, unsigned int lanemask, int width) {
    return __shfl_xor_sync(0xffffffff, var, lanemask, width);
  }
  #else
  ucl_inline double shfl_down(double var, unsigned int delta, int width) {
    int2 tmp;
    tmp.x = __double2hiint(var);
    tmp.y = __double2loint(var);
    tmp.x = __shfl_down_sync(0xffffffff,tmp.x,delta,width);
    tmp.y = __shfl_down_sync(0xffffffff,tmp.y,delta,width);
    return __hiloint2double(tmp.x,tmp.y);
  }
  ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
    int2 tmp;
    tmp.x = __double2hiint(var);
    tmp.y = __double2loint(var);
    tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,lanemask,width);
    tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,lanemask,width);
    return __hiloint2double(tmp.x,tmp.y);
  }
  #endif
  #define simd_broadcast_i(var, src, width) \
    __shfl_sync(0xffffffff, var, src, width)
  #define simd_broadcast_f(var, src, width) \
    __shfl_sync(0xffffffff, var, src, width)
  #ifdef _DOUBLE_DOUBLE
  ucl_inline double simd_broadcast_d(double var, unsigned int src, int width) {
    int2 tmp;
    tmp.x = __double2hiint(var);
    tmp.y = __double2loint(var);
    tmp.x = __shfl_sync(0xffffffff,tmp.x,src,width);
    tmp.y = __shfl_sync(0xffffffff,tmp.y,src,width);
    return __hiloint2double(tmp.x,tmp.y);
  }
  #endif
#endif

#endif

// -------------------------------------------------------------------------
//                            END CUDA / HIP DEFINITIONS
// -------------------------------------------------------------------------

#endif