File: lal_base_amoeba.h

package info (click to toggle)
lammps 20250204%2Bdfsg.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 474,368 kB
  • sloc: cpp: 1,060,070; python: 27,785; ansic: 8,956; f90: 7,254; sh: 6,044; perl: 4,171; fortran: 2,442; xml: 1,714; makefile: 1,352; objc: 238; lisp: 188; yacc: 58; csh: 16; awk: 14; tcl: 6; javascript: 2
file content (328 lines) | stat: -rw-r--r-- 12,749 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
/***************************************************************************
                                base_amoeba.h
                             -------------------
                        Trung Dac Nguyen (Northwestern)

  Base class for pair styles needing per-particle data for position,
  charge, and type.

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : trung.nguyen@northwestern.edu
 ***************************************************************************/

#ifndef LAL_BASE_AMOEBA_H
#define LAL_BASE_AMOEBA_H

#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"

#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

//#define ASYNC_DEVICE_COPY

#if 0
#if !defined(USE_OPENCL) && !defined(USE_HIP)
// temporary workaround for int2 also defined in cufft
#ifdef int2
#undef int2
#endif
#include "cufft.h"
#endif
#endif

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class BaseAmoeba {
 public:
  BaseAmoeba();
  virtual ~BaseAmoeba();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param k_name name for the kernel for force calculation
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15, const double cell_size,
                  const double gpu_split, FILE *screen, const void *pair_program,
                  const char *kname_multipole, const char *kname_udirect2b,
                  const char *kname_umutual2b, const char *kname_polar,
                  const char *kname_fphi_uind, const char *kname_fphi_mpole,
                  const char *kname_short_nbor, const char* kname_special15);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead(const int add_kernels=0);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param inum number of local particles (answer storage)
    * \param nall total number of local+ghost particles (atom storage)
    * \param success set to false if insufficient memory
    * \note textures are rebound only when the atom storage is reallocated **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success)) {
      pos_tex.bind_float(atom->x,4);
      q_tex.bind_float(atom->q,1);
    }
    ans->resize(inum,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors current maximum number of neighbors
    * \param success set to false if insufficient memory
    * \note use this overload when all neighboring is performed on device **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param max_nbors current maximum number of neighbors
    * \param success set to false if insufficient memory
    * \note host_inum is 0 if the host is performing neighboring
    * \note inum+host_inum=total number local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  /** \note timers are only accumulated when device timing is enabled **/
  inline void acc_timers() {
    if (device->time_device()) {
      nbor->acc_timers(screen);
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
    }
  }

  /// Zero timers
  inline void zero_timers() {
    time_pair.zero();
    atom->zero_timers();
    ans->zero_timers();
  }

  /// Copy neighbor list from host
  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
                    int **firstneigh, bool &success);

  /// Build neighbor list on device
  /** \note also handles the AMOEBA 1-5 special neighbors (nspecial15/special15) **/
  int build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
                       double *sublo, double *subhi, tagint *tag, int **nspecial,
                       tagint **special, int *nspecial15, tagint **special15,
                       bool &success);

  /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed
  /** Returns the (device-built) neighbor list pointer via ilist/numj/host_start;
    * derived pair styles call this before the per-term compute_* entry points. **/
  virtual int** precompute(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double **host_uind,
                double **host_uinp, double *host_pval, double *sublo, double *subhi,
                tagint *tag, int **nspecial, tagint **special,
                int *nspecial15, tagint **special15,
                const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **&ilist, int **&numj, const double cpu_time, bool &success,
                double *charge, double *boxlo, double *prd);

  /// Compute multipole real-space with device neighboring
  /** \param tep_ptr on return, host pointer to the per-atom torque field (tep) **/
  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double *host_pval,
                double *sublo, double *subhi, tagint *tag,
                int **nspecial, tagint **special, int *nspecial15, tagint **special15,
                const bool eflag, const bool vflag, const bool eatom, const bool vatom,
                int &host_start, int **ilist, int **numj, const double cpu_time,
                bool &success, const double aewald, const double felec,
                const double off2_mpole, double *charge, double *boxlo,
                double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                double **host_uind, double **host_uinp, double *host_pval,
                const double aewald, const double off2_polar, void **fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                double **host_uind, double **host_uinp, double *host_pval,
                const double aewald, const double off2_polar, void **fieldp_ptr);

  /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
  /** The *_out arguments give the owned+ghost extents of the FFT grid brick
    * on this process (one lo/hi pair per dimension). **/
  virtual void precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
                                 const int nylo_out, const int nyhi_out,
                                 const int nxlo_out, const int nxhi_out);
  /// Interpolate the induced potential from the grid
  virtual void compute_fphi_uind(double ****host_grid_brick,
                                 void **host_fdip_phi1, void **host_fdip_phi2,
                                 void **host_fdip_sum_phi);

  /// Interpolate the multipolar potential from the grid
  virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
                                  const double felec);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                double **host_uind, double **host_uinp, double *host_pval,
                const bool eflag, const bool vflag,
                const bool eatom, const bool vatom,
                const double aewald, const double felec, const double off2_polar,
                void **tep_ptr);

  /// Copy field and fieldp from device to host after umutual2b
  /** Returns the host-side buffer through fieldp_ptr; field and fieldp are
    * stored back to back in _fieldp, hence the 6-per-atom copy size. **/
  virtual void update_fieldp(void **fieldp_ptr) {
    *fieldp_ptr=_fieldp.host.begin();
     // _fieldp store both arrays, one after another
    _fieldp.update_host(_max_fieldp_size*6,false);
  }

  /// setup a plan for FFT, where size is the number of elements

  void setup_fft(const int size, const int element_type=0);

  /// compute forward/backward FFT on the device

  void compute_fft1d(void* in, void* out, const int numel, const int mode);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  Balance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;

  /// Packed per-atom AMOEBA inputs (filled by cast_extra_data;
  /// presumably amtype/amgroup/rpole/uind/uinp slices — confirm in the .cpp)
  UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;

  /// cast host arrays into a single array for atom->extra
  void cast_extra_data(int* amtype, int* amgroup, double** rpole,
    double** uind, double** uinp, double* pval=nullptr);

  /// Per-atom arrays: torque field (tep) and direct/mutual fields (fieldp)
  UCL_Vector<acctyp,acctyp> _tep, _fieldp;
  /// Current allocation sizes for the per-atom arrays above
  int _nmax, _max_tep_size, _max_fieldp_size;

  /// B-spline interpolation order for the kspace grid
  int _bsorder;
  /// Per-atom B-spline coefficients, one vector per grid dimension
  UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
  /// Per-atom grid stencil origin indices
  UCL_Vector<int,int> _igrid;
  /// FFT grid brick (complex values) on the device
  UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
  /// Interpolated dipole/multipole potentials returned to the host
  UCL_Vector<acctyp,acctyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
  int _max_thetai_size;
  /// Owned+ghost grid brick extents on this process (set in precompute_kspace)
  int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
  /// Grid dimensions derived from the extents above
  int _ngridx, _ngridy, _ngridz, _num_grid_points;

  int _end_command_queue;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  Answer<numtyp,acctyp> *ans;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  Neighbor *nbor;
  /// Device storage for 1-5 special neighbor counts
  UCL_D_Vec<int> dev_nspecial15;
  /// Device storage for special neighbors
  UCL_D_Vec<tagint> dev_special15, dev_special15_t;

  /// Append the 1-5 special neighbors to the device neighbor lists
  int add_onefive_neighbors();

  /// Device storage for the cutoff-trimmed (short) neighbor list
  UCL_D_Vec<int> dev_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
  UCL_Kernel k_fphi_uind, k_fphi_mpole;
  UCL_Kernel k_special15, k_short_nbor;
  inline int block_size() { return _block_size; }
  inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {}

  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;
  UCL_Texture q_tex;

 protected:
  bool _compiled;
  int _block_size, _block_bio_size, _threads_per_atom;
  int _extra_fields;
  double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
  double _gpu_overhead, _driver_overhead;
  // true once the short neighbor list usable by the polar kernels is built
  bool short_nbor_polar_avail;
  UCL_D_Vec<int> *_nbor_data;

  // Ewald parameter and electrostatic conversion factor (set per compute call)
  numtyp _aewald,_felec;
  // squared cutoffs for the individual AMOEBA interaction terms
  numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;

  int _eflag, _vflag;

  /// Compile the device kernels listed by name from the pair program source
  void compile_kernels(UCL_Device &dev, const void *pair_string,
     const char *kname_multipole, const char *kname_udirect2b,
     const char *kname_umutual2b, const char *kname_polar,
     const char *kname_fphi_uind, const char *kname_fphi_mpole,
     const char *kname_short_nbor, const char* kname_special15);

  // Kernel launchers implemented by the derived (precision-specific) class;
  // fphi_uind/fphi_mpole have default implementations in the base class.
  virtual int multipole_real(const int eflag, const int vflag) = 0;
  virtual int udirect2b(const int eflag, const int vflag) = 0;
  virtual int umutual2b(const int eflag, const int vflag) = 0;
  virtual int fphi_uind();
  virtual int fphi_mpole();
  virtual int polar_real(const int eflag, const int vflag) = 0;

#if 0
  #if !defined(USE_OPENCL) && !defined(USE_HIP)
  cufftHandle plan;
  #endif
#endif
  // true once setup_fft has created a device FFT plan
  bool fft_plan_created;
};

}

#endif