File: lal_aux_fun1.h

package info (click to toggle)
lammps 20220106.git7586adbb6a%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 348,064 kB
  • sloc: cpp: 831,421; python: 24,896; xml: 14,949; f90: 10,845; ansic: 7,967; sh: 4,226; perl: 4,064; fortran: 2,424; makefile: 1,501; objc: 238; lisp: 163; csh: 16; awk: 14; tcl: 6
file content (560 lines) | stat: -rw-r--r-- 38,359 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
// **************************************************************************
//                               lal_aux_fun1.h
//                             -------------------
//                           W. Michael Brown (ORNL)
//
//  Device code for pair style auxiliary functions
//
// __________________________________________________________________________
//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
//    begin                : Sat Oct 22 2011
//    email                : brownw@ornl.gov
// ***************************************************************************/

#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif

// Derive the per-thread indices from the launch geometry.
//   t_per_atom (in)  group size used to split a block; the mask
//                    (t_per_atom-1) equals tid % t_per_atom only when
//                    t_per_atom is a power of two
//   ii         (out) BLOCK_ID_X*(BLOCK_SIZE_X/t_per_atom) + tid/t_per_atom
//   tid        (out) THREAD_ID_X
//   offset     (out) tid & (t_per_atom-1)
// Expands to three statements (not one expression), so it must not be used
// as the unbraced body of an if/for. t_per_atom is parenthesized at every
// use so that an expression argument expands with the intended precedence.
#define atom_info(t_per_atom, ii, tid, offset)                               \
  tid=THREAD_ID_X;                                                           \
  offset=tid & ((t_per_atom)-1);                                             \
  ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/(t_per_atom))+             \
     tid/(t_per_atom);

// Load the neighbor-list bookkeeping for entry ii of dev_nbor.
//   i          (out) dev_nbor[ii]
//   numj       (out) dev_nbor[ii+nbor_pitch]
//   n_stride   (out) step between successive entries read by one thread
//   nbor_begin (out) first index for this thread (offset already added)
//   nbor_end   (out) end index computed from numj (see the two cases below)
// Two layouts are distinguished by comparing the two pointers:
//   dev_nbor==dev_packed : entries live in dev_nbor itself, starting at
//       ii+2*nbor_pitch+ii*(t_per_atom-1), with n_stride=t_per_atom*nbor_pitch
//       and nbor_end=begin+(numj/t_per_atom)*n_stride+(numj&(t_per_atom-1)).
//   otherwise            : dev_nbor[ii+2*nbor_pitch] holds the start index,
//       n_stride=t_per_atom and nbor_end=start+numj.
//       NOTE(review): that start index presumably addresses dev_packed --
//       confirm against the callers.
// Expands to several statements. All input parameters are parenthesized so
// that expression arguments (e.g. a sum passed as t_per_atom) expand with
// the intended precedence.
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset,  \
                  i, numj, n_stride, nbor_end, nbor_begin)                   \
  i=(dev_nbor)[ii];                                                          \
  nbor_begin=(ii)+(nbor_pitch);                                              \
  numj=(dev_nbor)[nbor_begin];                                               \
  if ((dev_nbor)==(dev_packed)) {                                            \
    nbor_begin+=(nbor_pitch)+fast_mul((ii),(t_per_atom)-1);                  \
    n_stride=fast_mul((t_per_atom),(nbor_pitch));                            \
    nbor_end=nbor_begin+fast_mul(numj/(t_per_atom),n_stride)+                \
             (numj & ((t_per_atom)-1));                                      \
    nbor_begin+=(offset);                                                    \
  } else {                                                                   \
    nbor_begin+=(nbor_pitch);                                                \
    nbor_begin=(dev_nbor)[nbor_begin];                                       \
    nbor_end=nbor_begin+numj;                                                \
    n_stride=(t_per_atom);                                                   \
    nbor_begin+=(offset);                                                    \
  }

// Load neighbor bookkeeping for entry ii when the entries are stored in
// nbor_mem itself (same index arithmetic as the pointer-equal branch of
// nbor_info).
//   i          (out) nbor_mem[ii]
//   numj       (out) nbor_mem[ii+nbor_stride]
//   stride     (out) t_per_atom*nbor_stride
//   nbor_begin (out) ii+2*nbor_stride+ii*(t_per_atom-1)+offset
//   nbor_end   (out) (nbor_begin before adding offset)
//                    +(numj/t_per_atom)*stride+(numj&(t_per_atom-1))
// Expands to several statements. All input parameters are parenthesized so
// that expression arguments expand with the intended precedence.
#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset,           \
                    i, numj, stride, nbor_end, nbor_begin)                   \
    i=(nbor_mem)[ii];                                                        \
    nbor_begin=(ii)+(nbor_stride);                                           \
    numj=(nbor_mem)[nbor_begin];                                             \
    nbor_begin+=(nbor_stride)+(ii)*((t_per_atom)-1);                         \
    stride=fast_mul((t_per_atom),(nbor_stride));                             \
    nbor_end=nbor_begin+fast_mul(numj/(t_per_atom),stride)+                  \
             (numj & ((t_per_atom)-1));                                      \
    nbor_begin+=(offset);

#if (SHUFFLE_AVAIL == 0)

// Tree reduction of one value over a group of `width` threads, staged
// through local[0][] (variant used when SHUFFLE_AVAIL == 0).
//   width  (in)     group size; halved each step, so the full sum is only
//                   formed when it is a power of two
//   local  (in/out) scratch array indexed as local[row][tid]
//   offset (in)     position of this thread inside its group
//   tid    (in)     index of this thread into the scratch rows
//   one    (in/out) contribution on entry; on the offset==0 thread it is
//                   overwritten with the reduced value, other threads keep
//                   their input
// simdsync() is issued before every step. Expands to several statements and
// declares the loop variable `s`. Input parameters are parenthesized so that
// an expression passed as width/offset/tid expands correctly.
#define simd_reduce_add1(width, local, offset, tid, one)                    \
  (local)[0][tid]=one;                                                      \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    simdsync();                                                             \
    if ((offset) < s) (local)[0][tid] += (local)[0][(tid)+s];               \
  }                                                                         \
  if ((offset)==0) one=(local)[0][tid];

// Tree reduction of two values over a group of `width` threads, staged
// through local[0][] and local[1][] (variant used when SHUFFLE_AVAIL == 0).
//   width  (in)     group size; halved each step (power of two expected)
//   local  (in/out) scratch array indexed as local[row][tid]
//   offset (in)     position of this thread inside its group
//   tid    (in)     index of this thread into the scratch rows
//   one, two (in/out) contributions on entry; overwritten with the reduced
//                   values on the offset==0 thread only
// simdsync() is issued before every step. Expands to several statements and
// declares the loop variable `s`. Input parameters are parenthesized so that
// expression arguments expand with the intended precedence.
#define simd_reduce_add2(width, local, offset, tid, one, two)               \
  (local)[0][tid]=one;                                                      \
  (local)[1][tid]=two;                                                      \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    simdsync();                                                             \
    if ((offset) < s) {                                                     \
      (local)[0][tid] += (local)[0][(tid)+s];                               \
      (local)[1][tid] += (local)[1][(tid)+s];                               \
    }                                                                       \
  }                                                                         \
  if ((offset)==0) {                                                        \
    one=(local)[0][tid];                                                    \
    two=(local)[1][tid];                                                    \
  }

// Tree reduction of three values over a group of `width` threads, staged
// through local[0..2][] (variant used when SHUFFLE_AVAIL == 0).
//   width  (in)     group size; halved each step (power of two expected)
//   local  (in/out) scratch array indexed as local[row][tid]
//   offset (in)     position of this thread inside its group
//   tid    (in)     index of this thread into the scratch rows
//   one, two, three (in/out) contributions on entry; overwritten with the
//                   reduced values on the offset==0 thread only
// simdsync() is issued before every step. Expands to several statements and
// declares the loop variable `s`. Input parameters are parenthesized so that
// expression arguments expand with the intended precedence.
#define simd_reduce_add3(width, local, offset, tid, one, two, three)        \
  (local)[0][tid]=one;                                                      \
  (local)[1][tid]=two;                                                      \
  (local)[2][tid]=three;                                                    \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    simdsync();                                                             \
    if ((offset) < s) {                                                     \
      (local)[0][tid] += (local)[0][(tid)+s];                               \
      (local)[1][tid] += (local)[1][(tid)+s];                               \
      (local)[2][tid] += (local)[2][(tid)+s];                               \
    }                                                                       \
  }                                                                         \
  if ((offset)==0) {                                                        \
    one=(local)[0][tid];                                                    \
    two=(local)[1][tid];                                                    \
    three=(local)[2][tid];                                                  \
  }

// Tree reduction of six values over a group of `width` threads, staged
// through local[0..5][] (variant used when SHUFFLE_AVAIL == 0).
//   width  (in)     group size; halved each step (power of two expected)
//   local  (in/out) scratch array indexed as local[row][tid]
//   offset (in)     position of this thread inside its group
//   tid    (in)     index of this thread into the scratch rows
//   one..six (in/out) contributions on entry; overwritten with the reduced
//                   values on the offset==0 thread only
// simdsync() is issued before every step. Expands to several statements and
// declares the loop variable `s`. Input parameters are parenthesized so that
// expression arguments expand with the intended precedence.
#define simd_reduce_add6(width, local, offset, tid, one, two, three,        \
                         four, five, six)                                   \
  (local)[0][tid]=one;                                                      \
  (local)[1][tid]=two;                                                      \
  (local)[2][tid]=three;                                                    \
  (local)[3][tid]=four;                                                     \
  (local)[4][tid]=five;                                                     \
  (local)[5][tid]=six;                                                      \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    simdsync();                                                             \
    if ((offset) < s) {                                                     \
      (local)[0][tid] += (local)[0][(tid)+s];                               \
      (local)[1][tid] += (local)[1][(tid)+s];                               \
      (local)[2][tid] += (local)[2][(tid)+s];                               \
      (local)[3][tid] += (local)[3][(tid)+s];                               \
      (local)[4][tid] += (local)[4][(tid)+s];                               \
      (local)[5][tid] += (local)[5][(tid)+s];                               \
    }                                                                       \
  }                                                                         \
  if ((offset)==0) {                                                        \
    one=(local)[0][tid];                                                    \
    two=(local)[1][tid];                                                    \
    three=(local)[2][tid];                                                  \
    four=(local)[3][tid];                                                   \
    five=(local)[4][tid];                                                   \
    six=(local)[5][tid];                                                    \
  }

// Tree reduction of the first `trip` elements of arr over a group of
// `width` threads, staged through local[0..trip-1][] (variant used when
// SHUFFLE_AVAIL == 0).
//   trip   (in)     number of array elements / scratch rows to reduce
//   width  (in)     group size; halved each step (power of two expected)
//   local  (in/out) scratch array indexed as local[row][tid]
//   offset (in)     position of this thread inside its group
//   tid    (in)     index of this thread into the scratch rows
//   arr    (in/out) contributions on entry; overwritten with the reduced
//                   values on the offset==0 thread only
// simdsync() is issued before every step. Expands to several statements and
// declares the loop variables `s` and `r`. Input parameters are
// parenthesized so that expression arguments expand correctly.
#define simd_reduce_arr(trip, width, local, offset, tid, arr)               \
  for (int r=0; r<(trip); r++)                                              \
    (local)[r][tid]=(arr)[r];                                               \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    simdsync();                                                             \
    if ((offset) < s) {                                                     \
      for (int r=0; r<(trip); r++)                                          \
        (local)[r][tid] += (local)[r][(tid)+s];                             \
    }                                                                       \
  }                                                                         \
  if ((offset)==0) {                                                        \
    for (int r=0; r<(trip); r++)                                            \
      (arr)[r]=(local)[r][tid];                                             \
  }

// Reduce one value over the whole thread block through local[0][].
// Phase 1 halves the active range from BLOCK_SIZE_X/2 down to `width`
// with a __syncthreads() before every step; phase 2 finishes inside the
// first `width` threads with simdsync() before every step.
//   width (in)     size at which the reduction switches to simdsync()
//   local (in/out) scratch array indexed as local[row][tid]
//   tid   (in)     index of this thread within the block
//   one   (in/out) contribution on entry; overwritten with the block total
//                  on thread tid==0 only
// Expands to several statements and declares the loop variable `s`.
// Input parameters are parenthesized so that expression arguments expand
// with the intended precedence.
#define block_reduce_add1(width, local, tid, one)                           \
  (local)[0][tid]=one;                                                      \
  for (unsigned int s=BLOCK_SIZE_X/2; s>(width)/2; s>>=1) {                 \
    __syncthreads();                                                        \
    if ((tid) < s) (local)[0][tid] += (local)[0][(tid)+s];                  \
  }                                                                         \
  if ((tid)<(width)) {                                                      \
    for (unsigned int s=(width)/2; s>0; s>>=1) {                            \
      simdsync();                                                           \
      if ((tid) < s) (local)[0][tid] += (local)[0][(tid)+s];                \
    }                                                                       \
    if ((tid)==0) one=(local)[0][tid];                                      \
  }

// Reduce two values over the whole thread block through local[0][] and
// local[1][]. Phase 1 halves the active range from BLOCK_SIZE_X/2 down to
// `width` with a __syncthreads() before every step; phase 2 finishes
// inside the first `width` threads with simdsync() before every step.
//   width (in)     size at which the reduction switches to simdsync()
//   local (in/out) scratch array indexed as local[row][tid]
//   tid   (in)     index of this thread within the block
//   one, two (in/out) contributions on entry; overwritten with the block
//                  totals on thread tid==0 only
// Expands to several statements and declares the loop variable `s`.
// Input parameters are parenthesized so that expression arguments expand
// with the intended precedence.
#define block_reduce_add2(width, local, tid, one, two)                      \
  (local)[0][tid]=one;                                                      \
  (local)[1][tid]=two;                                                      \
  for (unsigned int s=BLOCK_SIZE_X/2; s>(width)/2; s>>=1) {                 \
    __syncthreads();                                                        \
    if ((tid) < s) {                                                        \
      (local)[0][tid] += (local)[0][(tid)+s];                               \
      (local)[1][tid] += (local)[1][(tid)+s];                               \
    }                                                                       \
  }                                                                         \
  if ((tid)<(width)) {                                                      \
    for (unsigned int s=(width)/2; s>0; s>>=1) {                            \
      simdsync();                                                           \
      if ((tid) < s) {                                                      \
        (local)[0][tid] += (local)[0][(tid)+s];                             \
        (local)[1][tid] += (local)[1][(tid)+s];                             \
      }                                                                     \
    }                                                                       \
    if ((tid)==0) {                                                         \
      one=(local)[0][tid];                                                  \
      two=(local)[1][tid];                                                  \
    }                                                                       \
  }

// Reduce the first `trip` elements of arr over the whole thread block
// through local[0..trip-1][]. Phase 1 halves the active range from
// BLOCK_SIZE_X/2 down to `width` with a __syncthreads() before every step;
// phase 2 finishes inside the first `width` threads with simdsync() before
// every step.
//   trip  (in)     number of array elements / scratch rows to reduce
//   width (in)     size at which the reduction switches to simdsync()
//   local (in/out) scratch array indexed as local[row][tid]
//   tid   (in)     index of this thread within the block
//   arr   (in/out) contributions on entry; overwritten with the block
//                  totals on thread tid==0 only
// Expands to several statements and declares the loop variables `s` and
// `r`. Input parameters are parenthesized so that expression arguments
// expand with the intended precedence.
#define block_reduce_arr(trip, width, local, tid, arr)                      \
  for (int r=0; r<(trip); r++)                                              \
    (local)[r][tid]=(arr)[r];                                               \
  for (unsigned int s=BLOCK_SIZE_X/2; s>(width)/2; s>>=1) {                 \
    __syncthreads();                                                        \
    if ((tid) < s) {                                                        \
      for (int r=0; r<(trip); r++)                                          \
        (local)[r][tid] += (local)[r][(tid)+s];                             \
    }                                                                       \
  }                                                                         \
  if ((tid)<(width)) {                                                      \
    for (unsigned int s=(width)/2; s>0; s>>=1) {                            \
      simdsync();                                                           \
      if ((tid) < s) {                                                      \
        for (int r=0; r<(trip); r++)                                        \
          (local)[r][tid] += (local)[r][(tid)+s];                           \
      }                                                                     \
    }                                                                       \
    if ((tid)==0) {                                                         \
      for (int r=0; r<(trip); r++)                                          \
        (arr)[r]=(local)[r][tid];                                           \
    }                                                                       \
  }

// Scratch-space declarations for the store_answers*() macros when
// SHUFFLE_AVAIL == 0. Each macro declares a __local array named red_acc
// with six rows of acctyp; only the row length differs per kernel family.
// The expansion already ends in ';'.
#define local_allocate_store_pair()    __local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_charge()  __local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_bio()     __local acctyp red_acc[6][BLOCK_BIO_PAIR];
#define local_allocate_store_ellipse() __local acctyp red_acc[6][BLOCK_ELLIPSE];
// The three-body variant uses the BLOCK_ELLIPSE row length as well.
#define local_allocate_store_three()   __local acctyp red_acc[6][BLOCK_ELLIPSE];

// Reduce and store the force, energy and virial accumulated by a pair
// kernel (variant used when SHUFFLE_AVAIL == 0; needs `red_acc` in scope,
// as declared by one of the local_allocate_store_*() macros).
//   f           per-thread force accumulator with members x, y, z
//   energy      per-thread energy accumulator
//   virial      per-thread array of 6 virial accumulators
//   ii, inum    entry index of this thread and bound checked before storing
//   tid, offset thread index and position inside its t_per_atom group
//   t_per_atom  group size; a reduction over the group runs only when > 1
//   eflag/vflag 0 skips the quantity; the value 2 is treated specially
//               NOTE(review): 2 appears to select per-atom tallies and any
//               other non-zero value a per-block total -- confirm with the
//               host code
//   ans         output array, ans[ii] receives f
//   engv        output array for energy/virial, values are scaled by 0.5
// Steps:
//   1. If t_per_atom>1, f.x/f.y/f.z are reduced over the group; when EVFLAG
//      is set and either flag equals 2, energy and virial are reduced over
//      the group as well.
//   2. The offset==0 thread with ii<inum writes ans[ii]=f.
//   3. If EVFLAG and a flag is set:
//      - neither flag equals 2: energy and virial are reduced over the
//        whole block; thread tid==0 writes them starting at
//        engv[BLOCK_ID_X] and advancing by NUM_BLOCKS_X per value.
//      - otherwise: the offset==0 thread with ii<inum writes them starting
//        at engv[ii] and advancing by inum per value.
// Expands to several statements.
#define store_answers(f, energy, virial, ii, inum, tid,                     \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
      if (eflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy);         \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
      }                                                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (EVFLAG && (eflag || vflag)) {                                         \
    int ei=BLOCK_ID_X;                                                      \
    if (eflag!=2 && vflag!=2) {                                             \
      const int ev_stride=NUM_BLOCKS_X;                                     \
      if (eflag) {                                                          \
        simdsync();                                                         \
        block_reduce_add1(simd_size(), red_acc, tid, energy);               \
        if (vflag) __syncthreads();                                         \
        if (tid==0) {                                                       \
          engv[ei]=energy*(acctyp)0.5;                                      \
          ei+=ev_stride;                                                    \
        }                                                                   \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
        if (tid==0) {                                                       \
          for (int r=0; r<6; r++) {                                         \
            engv[ei]=virial[r]*(acctyp)0.5;                                 \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (EVFLAG && eflag) {                                                \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (EVFLAG && vflag) {                                                \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

// Same as store_answers(), with a second energy accumulator e_coul that is
// reduced together with `energy` and stored right after it (variant used
// when SHUFFLE_AVAIL == 0; needs `red_acc` in scope, as declared by one of
// the local_allocate_store_*() macros).
//   f           per-thread force accumulator with members x, y, z
//   energy      first per-thread energy accumulator
//   e_coul      second per-thread energy accumulator
//               NOTE(review): presumably the Coulomb part -- confirm
//   virial      per-thread array of 6 virial accumulators
//   ii, inum    entry index of this thread and bound checked before storing
//   tid, offset thread index and position inside its t_per_atom group
//   t_per_atom  group size; a reduction over the group runs only when > 1
//   eflag/vflag 0 skips the quantity; the value 2 is treated specially
//               NOTE(review): 2 appears to select per-atom tallies and any
//               other non-zero value a per-block total -- confirm with the
//               host code
//   ans         output array, ans[ii] receives f
//   engv        output array for energies/virial, values are scaled by 0.5
// Steps:
//   1. If t_per_atom>1, f.x/f.y/f.z are reduced over the group; when EVFLAG
//      is set and either flag equals 2, energy, e_coul and virial are
//      reduced over the group as well.
//   2. The offset==0 thread with ii<inum writes ans[ii]=f.
//   3. If EVFLAG and a flag is set:
//      - neither flag equals 2: the values are reduced over the whole
//        block; thread tid==0 writes energy, e_coul and the six virial
//        terms starting at engv[BLOCK_ID_X], advancing by NUM_BLOCKS_X.
//      - otherwise: the offset==0 thread with ii<inum writes them starting
//        at engv[ii], advancing by inum.
// Expands to several statements.
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
      if (eflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
      }                                                                     \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (EVFLAG && (eflag || vflag)) {                                         \
    int ei=BLOCK_ID_X;                                                      \
    const int ev_stride=NUM_BLOCKS_X;                                       \
    if (eflag!=2 && vflag!=2) {                                             \
      if (eflag) {                                                          \
        simdsync();                                                         \
        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);       \
        if (vflag) __syncthreads();                                         \
        if (tid==0) {                                                       \
          engv[ei]=energy*(acctyp)0.5;                                      \
          ei+=ev_stride;                                                    \
          engv[ei]=e_coul*(acctyp)0.5;                                      \
          ei+=ev_stride;                                                    \
        }                                                                   \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
        if (tid==0) {                                                       \
          for (int r=0; r<6; r++) {                                         \
            engv[ei]=virial[r]*(acctyp)0.5;                                 \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (EVFLAG && eflag) {                                                \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
        engv[ei]=e_coul*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (EVFLAG && vflag) {                                                \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

#else

// Shuffle-based reduction of one value (variant used when SHUFFLE_AVAIL is
// non-zero). For s = width/2, width/4, ..., 1 each thread adds the value
// returned by shfl_down(one, s, width) to `one`.
//   width (in)     group size; halved each step (power of two expected)
//   one   (in/out) contribution on entry, accumulated in place
// NOTE(review): with the usual shuffle-down semantics only the first lane
// of each group ends up with the full sum -- confirm against shfl_down.
// Expands to a single for statement followed by ';'. `width` is
// parenthesized in the loop bound so that an expression argument expands
// with the intended precedence.
#define simd_reduce_add1(width, one)                                        \
  for (unsigned int s=(width)/2; s>0; s>>=1) one += shfl_down(one, s, width);

// Shuffle-based reduction of two values (variant used when SHUFFLE_AVAIL
// is non-zero). For s = width/2, width/4, ..., 1 each thread adds the
// value returned by shfl_down(x, s, width) to each accumulator x.
//   width    (in)     group size; halved each step (power of two expected)
//   one, two (in/out) contributions on entry, accumulated in place
// Expands to a single for statement. `width` is parenthesized in the loop
// bound so that an expression argument expands with the intended
// precedence.
#define simd_reduce_add2(width, one, two)                                   \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    one += shfl_down(one, s, width);                                        \
    two += shfl_down(two, s, width);                                        \
  }

// Shuffle-based reduction of three values (variant used when
// SHUFFLE_AVAIL is non-zero). For s = width/2, width/4, ..., 1 each thread
// adds the value returned by shfl_down(x, s, width) to each accumulator x.
//   width           (in)     group size; halved each step (power of two
//                            expected)
//   one, two, three (in/out) contributions on entry, accumulated in place
// Expands to a single for statement. `width` is parenthesized in the loop
// bound so that an expression argument expands with the intended
// precedence.
#define simd_reduce_add3(width, one, two, three)                            \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    one += shfl_down(one, s, width);                                        \
    two += shfl_down(two, s, width);                                        \
    three += shfl_down(three, s, width);                                    \
  }

// Shuffle-based reduction of six values (variant used when SHUFFLE_AVAIL
// is non-zero). For s = width/2, width/4, ..., 1 each thread adds the
// value returned by shfl_down(x, s, width) to each accumulator x.
//   width    (in)     group size; halved each step (power of two expected)
//   one..six (in/out) contributions on entry, accumulated in place
// Expands to a single for statement. `width` is parenthesized in the loop
// bound so that an expression argument expands with the intended
// precedence.
#define simd_reduce_add6(width, one, two, three, four, five, six)           \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    one += shfl_down(one, s, width);                                        \
    two += shfl_down(two, s, width);                                        \
    three += shfl_down(three, s, width);                                    \
    four += shfl_down(four, s, width);                                      \
    five += shfl_down(five, s, width);                                      \
    six += shfl_down(six, s, width);                                        \
  }

// Shuffle-based reduction of the first `trip` elements of arr (variant
// used when SHUFFLE_AVAIL is non-zero). For s = width/2, width/4, ..., 1
// each thread adds the value returned by shfl_down(arr[r], s, width) to
// arr[r] for every r in [0, trip).
//   trip  (in)     number of array elements to reduce
//   width (in)     group size; halved each step (power of two expected)
//   arr   (in/out) contributions on entry, accumulated in place
// Expands to a single for statement and declares the loop variables `s`
// and `r`. trip, width and arr are parenthesized where they combine with
// operators so that expression arguments expand correctly.
#define simd_reduce_arr(trip, width, arr)                                   \
  for (unsigned int s=(width)/2; s>0; s>>=1) {                              \
    for (int r=0; r<(trip); r++)                                            \
      (arr)[r] += shfl_down((arr)[r], s, width);                            \
  }

#if (EVFLAG == 1)

// Scratch-space declarations for the shuffle-based store_answers*() macros
// (this section is compiled only when EVFLAG == 1). Each non-empty macro
// declares a __local array named red_acc with one column per SIMD_SIZE
// threads of the block; the row count is 7 or 8 depending on the family.
// The expansion already ends in ';'.
#define local_allocate_store_pair()    __local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_charge()  __local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_bio()     __local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
// The ellipse variant declares nothing in this configuration.
#define local_allocate_store_ellipse()
#define local_allocate_store_three()   __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];

// Reduce and store the force, energy and virial accumulated by a pair
// kernel (shuffle-based variant, defined only when EVFLAG == 1; needs
// `red_acc` in scope, as declared by one of the local_allocate_store_*()
// macros).
//   f           per-thread force accumulator with members x, y, z
//   energy      per-thread energy accumulator
//   virial      per-thread array of 6 virial accumulators
//   ii, inum    entry index of this thread and bound checked before storing
//   tid, offset thread index and position inside its t_per_atom group
//   t_per_atom  group size; a reduction over the group runs only when > 1
//   eflag/vflag 0 skips the quantity; the value 2 is treated specially
//               NOTE(review): 2 appears to select per-atom tallies and any
//               other non-zero value a per-block total -- confirm with the
//               host code
//   ans         output array, ans[ii] receives f
//   engv        output array for energy/virial, values are scaled by 0.5
// Steps:
//   1. If t_per_atom>1, f.x/f.y/f.z are shuffle-reduced over the group;
//      when either flag equals 2, energy and virial are reduced as well.
//   2. The offset==0 thread with ii<inum writes ans[ii]=f.
//   3. If a flag is set and neither equals 2, a block-wide reduction runs:
//      - active_subgs starts at BLOCK_SIZE_X/simd_size(). In each pass the
//        sub-groups with bnum<active_subgs shuffle-reduce their values and
//        the voffset==0 lane stores them in red_acc (row 6 for energy,
//        rows 0..5 for virial) at column bnum.
//      - After __syncthreads(), threads with tid<active_subgs reload one
//        column each, all other threads reset their accumulators to zero,
//        and active_subgs is divided by simd_size().
//      - Finally sub-group 0 (bnum==0) reduces once more and thread tid==0
//        writes the totals starting at engv[BLOCK_ID_X], advancing by
//        NUM_BLOCKS_X per value.
//   4. Otherwise (a flag equals 2) the offset==0 thread with ii<inum writes
//      the values starting at engv[ii], advancing by inum per value.
// Expands to several statements.
#define store_answers(f, energy, virial, ii, inum, tid,                     \
                      t_per_atom, offset, eflag, vflag, ans, engv)          \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
    if (vflag==2 || eflag==2) {                                             \
      if (eflag)                                                            \
        simd_reduce_add1(t_per_atom,energy);                                \
      if (vflag)                                                            \
        simd_reduce_arr(6, t_per_atom,virial);                              \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (eflag || vflag) {                                                     \
    if (eflag!=2 && vflag!=2) {                                             \
      const int vwidth = simd_size();                                       \
      const int voffset = tid & (simd_size() - 1);                          \
      const int bnum = tid/simd_size();                                     \
      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
        if (bnum < active_subgs) {                                          \
          if (eflag) {                                                      \
            simd_reduce_add1(vwidth, energy);                               \
            if (voffset==0) red_acc[6][bnum] = energy;                      \
          }                                                                 \
          if (vflag) {                                                      \
            simd_reduce_arr(6, vwidth, virial);                             \
            if (voffset==0)                                                 \
              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
          }                                                                 \
        }                                                                   \
                                                                            \
        __syncthreads();                                                    \
        if (tid < active_subgs) {                                           \
            if (eflag) energy = red_acc[6][tid];                            \
          if (vflag)                                                        \
            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
        } else {                                                            \
          if (eflag) energy = (acctyp)0;                                    \
          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
        }                                                                   \
      }                                                                     \
                                                                            \
      if (bnum == 0) {                                                      \
        int ei=BLOCK_ID_X;                                                  \
        const int ev_stride=NUM_BLOCKS_X;                                   \
        if (eflag) {                                                        \
          simd_reduce_add1(vwidth, energy);                                 \
          if (tid==0) {                                                     \
            engv[ei]=energy*(acctyp)0.5;                                    \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
        if (vflag) {                                                        \
          simd_reduce_arr(6, vwidth, virial);                               \
          if (tid==0) {                                                     \
            for (int r=0; r<6; r++) {                                       \
              engv[ei]=virial[r]*(acctyp)0.5;                               \
              ei+=ev_stride;                                                \
            }                                                               \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (eflag) {                                                          \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (vflag) {                                                          \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

/* ----------------------------------------------------------------------
   store_answers_q: store the force accumulator for index ii and, when
   eflag and/or vflag are non-zero, the energy, e_coul and virial
   accumulators.

   Arguments
     f            accumulator with .x/.y/.z members; copied to ans[ii]
     energy       first energy accumulator
     e_coul       second energy accumulator, stored right after energy
     virial       array of 6 accumulators
     ii, inum     index handled by this thread; nothing is stored per-ii
                  unless ii<inum
     tid          thread index (compared with counts derived from
                  BLOCK_SIZE_X, so presumably block-local -- confirm)
     t_per_atom   width passed to the first simd_reduce_* calls
     offset       only threads with offset==0 write per-ii results
     eflag,vflag  0 disables the respective output; see the two storage
                  modes below for the meaning of the value 2
     ans, engv    output arrays

   Sequence
     1. If t_per_atom>1, f.x/f.y/f.z are combined with simd_reduce_add3
        over a width of t_per_atom.  Only when eflag==2 or vflag==2 are
        energy/e_coul (simd_reduce_add2) and virial (simd_reduce_arr)
        combined over that same width at this point.
     2. The thread with offset==0 and ii<inum writes ans[ii]=f.
     3. If eflag or vflag is non-zero, one of two storage modes is used:

        a) neither flag equals 2 -- one result per block.
           Threads are split into groups of simd_size() (bnum = group
           number, voffset = position inside the group).  In every pass
           of the loop the first active_subgs groups combine their
           accumulators with a width of simd_size(), and the thread with
           voffset==0 parks the result in red_acc[row][bnum]
           (rows 0-5: virial, row 6: energy, row 7: e_coul).  After
           __syncthreads() the first active_subgs threads reload one
           parked value each and every other thread zeroes its
           accumulators; active_subgs is then divided by simd_size().
           From the second pass on, an extra __syncthreads() precedes
           the writes to red_acc.
           After the loop, group 0 performs one more simd_reduce_* and
           tid 0 writes to engv[BLOCK_ID_X + k*NUM_BLOCKS_X].

        b) at least one flag equals 2 -- one result per ii.
           The thread with offset==0 and ii<inum writes its own values
           to engv[ii + k*inum].

        In both modes k counts the slots actually written, in the order
        energy, e_coul (only if eflag), virial[0..5] (only if vflag),
        and every stored value is multiplied by (acctyp)0.5.

   Requirements visible from the expansion
     - red_acc (at least 8 rows) and the type acctyp must be in scope;
       neither is declared here.  Presumably red_acc comes from
       local_allocate_store_charge() -- TODO confirm.
     - The expansion is a plain sequence of statements (no do/while
       wrapper), so it must not be used as the body of an unbraced
       if/else or loop.
     - NOTE(review): mode (a) calls __syncthreads(), so presumably all
       threads of the block must reach this macro -- confirm at callers.
------------------------------------------------------------------------- */
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
    if (vflag==2 || eflag==2) {                                             \
      if (eflag)                                                            \
        simd_reduce_add2(t_per_atom,energy,e_coul);                         \
      if (vflag)                                                            \
        simd_reduce_arr(6, t_per_atom,virial);                              \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) ans[ii]=f;                                      \
  if (eflag || vflag) {                                                     \
    if (eflag!=2 && vflag!=2) {                                             \
      const int vwidth = simd_size();                                       \
      const int voffset = tid & (simd_size() - 1);                          \
      const int bnum = tid/simd_size();                                     \
      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
        if (bnum < active_subgs) {                                          \
          if (eflag) {                                                      \
            simd_reduce_add2(vwidth, energy, e_coul);                       \
            if (voffset==0) {                                               \
              red_acc[6][bnum] = energy;                                    \
              red_acc[7][bnum] = e_coul;                                    \
            }                                                               \
          }                                                                 \
          if (vflag) {                                                      \
            simd_reduce_arr(6, vwidth, virial);                             \
            if (voffset==0)                                                 \
              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
          }                                                                 \
        }                                                                   \
                                                                            \
        __syncthreads();                                                    \
        if (tid < active_subgs) {                                           \
          if (eflag) {                                                      \
            energy = red_acc[6][tid];                                       \
            e_coul = red_acc[7][tid];                                       \
          }                                                                 \
          if (vflag)                                                        \
            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
        } else {                                                            \
          if (eflag) energy = e_coul = (acctyp)0;                           \
          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
        }                                                                   \
      }                                                                     \
                                                                            \
      if (bnum == 0) {                                                      \
        int ei=BLOCK_ID_X;                                                  \
        const int ev_stride=NUM_BLOCKS_X;                                   \
        if (eflag) {                                                        \
          simd_reduce_add2(vwidth, energy, e_coul);                         \
          if (tid==0) {                                                     \
            engv[ei]=energy*(acctyp)0.5;                                    \
            ei+=ev_stride;                                                  \
            engv[ei]=e_coul*(acctyp)0.5;                                    \
            ei+=ev_stride;                                                  \
          }                                                                 \
        }                                                                   \
        if (vflag) {                                                        \
          simd_reduce_arr(6, vwidth, virial);                               \
          if (tid==0) {                                                     \
            for (int r=0; r<6; r++) {                                       \
              engv[ei]=virial[r]*(acctyp)0.5;                               \
              ei+=ev_stride;                                                \
            }                                                               \
          }                                                                 \
        }                                                                   \
      }                                                                     \
    } else if (offset==0 && ii<inum) {                                      \
      int ei=ii;                                                            \
      if (eflag) {                                                          \
        engv[ei]=energy*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
        engv[ei]=e_coul*(acctyp)0.5;                                        \
        ei+=inum;                                                           \
      }                                                                     \
      if (vflag) {                                                          \
        for (int i=0; i<6; i++) {                                           \
          engv[ei]=virial[i]*(acctyp)0.5;                                   \
          ei+=inum;                                                         \
        }                                                                   \
      }                                                                     \
    }                                                                       \
  }

#else

/* In this #else branch the local_allocate_store_*() macros expand to
   nothing.  The store_answers() and store_answers_q() variants defined
   further down in the same branch never reference red_acc, so they need
   no scratch storage.  Presumably the non-empty definitions in the other
   preprocessor branches declare that array -- TODO confirm; they are
   kept here as empty macros so call sites compile unchanged. */
#define local_allocate_store_pair()
#define local_allocate_store_charge()
#define local_allocate_store_bio()
#define local_allocate_store_ellipse()
#define local_allocate_store_three()

/* Force-only store_answers(): combines f.x/f.y/f.z with simd_reduce_add3
   over a width of t_per_atom (skipped when t_per_atom is 1), then lets
   the thread with offset==0 and ii<inum copy f into ans[ii].
   energy, virial, tid, eflag, vflag and engv are accepted so the call
   signature matches the other variants, but they are never expanded.
   The expansion is a bare statement sequence (no do/while wrapper). */
#define store_answers(f, energy, virial, ii, inum, tid,                     \
                      t_per_atom, offset, eflag, vflag, ans, engv)          \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    ans[ii]=f;                                                              \
  }

/* Force-only store_answers_q(): combines f.x/f.y/f.z with
   simd_reduce_add3 over a width of t_per_atom (skipped when t_per_atom
   is 1), then lets the thread with offset==0 and ii<inum copy f into
   ans[ii].  energy, e_coul, virial, tid, eflag, vflag and engv are
   accepted so the call signature matches the other variants, but they
   are never expanded.
   The expansion is a bare statement sequence (no do/while wrapper). */
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    ans[ii]=f;                                                              \
  }

#endif

#endif