File: lal_base_ellipsoid.h

package info (click to toggle)
lammps 20220106.git7586adbb6a%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 348,064 kB
  • sloc: cpp: 831,421; python: 24,896; xml: 14,949; f90: 10,845; ansic: 7,967; sh: 4,226; perl: 4,064; fortran: 2,424; makefile: 1,501; objc: 238; lisp: 163; csh: 16; awk: 14; tcl: 6
file content (284 lines) | stat: -rw-r--r-- 10,022 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
/***************************************************************************
                               base_ellipsoid.h
                             -------------------
                            W. Michael Brown (ORNL)

  Base class for acceleration of ellipsoid potentials

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                : Thu May 5 2011
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef LAL_BASE_ELLIPSOID_H
#define LAL_BASE_ELLIPSOID_H

#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"

#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class BaseEllipsoid {
 public:
  BaseEllipsoid();
  virtual ~BaseEllipsoid();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
    * \param k_name name for the kernel for force calculation
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init_base(const int nlocal, const int nall, const int max_nbors,
                const int maxspecial, const double cell_size,
                const double gpu_split, FILE *screen, const int ntypes,
                int **h_form, const void *ellipsoid_program,
                const void *lj_program, const char *k_name,
                const bool ellipsoid_sphere=false);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead();

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int nall, bool &success) {
    if (atom->resize(nall, success)) {
      neigh_tex.bind_float(atom->x,4);
      pos_tex.bind_float(atom->x,4);
      quat_tex.bind_float(atom->quat,4);
      lj_pos_tex.bind_float(atom->x,4);
      lj_quat_tex.bind_float(atom->quat,4);
    }
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param current maximum number of neighbors
    * \param olist_size size of list of particles from CPU neighboring
    * \note host_inum is 0 if the host is performing neighboring
    * \note if GPU is neighboring nlocal+host_inum=total number local particles
    * \note if CPU is neighboring olist_size=total number of local particles
    * \note if GPU is neighboring olist_size=0 **/
  inline void resize_local(const int nlocal, const int host_inum,
                           const int max_nbors, const int olist_size,
                           bool &success) {
    ans->resize(nlocal, success);
    if (_multiple_forms) ans->force.zero();

    if (olist_size>host_olist_size) {
      if (host_olist_size) delete []host_olist;
      host_olist_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
      host_olist = new int[host_olist_size];
    }

    nbor->resize(nlocal,host_inum,max_nbors,success);
    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
    if (bytes>_max_bytes)
      _max_bytes=bytes;
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_base();

  /// Output any timing information
  void output_times();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_base() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (device->time_device()) {
      nbor->acc_timers(screen);
      time_nbor1.add_to_total();
      time_ellipsoid.add_to_total();
      if (_multiple_forms) {
        time_nbor2.add_to_total();
        time_ellipsoid2.add_to_total();
        if (_ellipsoid_sphere) {
          time_nbor3.add_to_total();
          time_ellipsoid3.add_to_total();
        }
        time_lj.add_to_total();
      }
      atom->acc_timers();
      ans->acc_timers();
    }
  }

  /// Zero timers
  inline void zero_timers() {
    time_nbor1.zero();
    time_ellipsoid.zero();
    if (_multiple_forms) {
      time_nbor2.zero();
      time_ellipsoid2.zero();
      if (_ellipsoid_sphere) {
        time_nbor3.zero();
        time_ellipsoid3.zero();
      }
      time_lj.zero();
    }
    atom->zero_timers();
    ans->zero_timers();
  }

  /// Pack neighbors to limit thread divergence for lj-lj and ellipse
  void pack_nbors(const int GX, const int BX, const int start, const int inum,
                  const int form_low, const int form_high,
                  const bool shared_types, int ntypes);

  /// Copy neighbor list from host
  void reset_nbors(const int nall, const int inum, const int osize, int *ilist,
                   int *numj, int *type, int **firstneigh, bool &success);

  /// Build neighbor list on device
  void build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
                       double *sublo, double *subhi, tagint *tag, int **nspecial,
                       tagint **special, bool &success);

  /// Pair loop with host neighboring
  int* compute(const int f_ago, const int inum_full, const int nall,
               double **host_x, int *host_type, int *ilist, int *numj,
               int **firstneigh, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
               const double cpu_time, bool &success, double **quat);

  /// Pair loop with device neighboring
  int** compute(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, double *sublo,
                double *subhi, tagint *tag, int **nspecial,
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
                double **host_quat);

  /// Build neighbor list on accelerator
  void build_nbor_list(const int inum, const int host_inum, const int nall,
                       double **host_x, int *host_type, double *sublo,
                       double *subhi, bool &success);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_nbor1, time_ellipsoid, time_nbor2, time_ellipsoid2, time_lj;
  UCL_Timer time_nbor3, time_ellipsoid3;

  /// Host device load balancer
  Balance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;

  // --------------------------- TYPE DATA --------------------------

  /// cut_form.x = cutsq, cut_form.y = form
  UCL_D_Vec<numtyp2> cut_form;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  Answer<numtyp,acctyp> *ans;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  Neighbor *nbor;
  /// ilist with particles sorted by type
  int *host_olist;
  int host_olist_size;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *nbor_program, *ellipsoid_program, *lj_program;
  UCL_Program *ellipsoid_program_noev, *lj_program_noev;
  UCL_Kernel k_nbor_fast, k_nbor;
  UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid;
  UCL_Kernel k_lj_fast, k_lj;
  UCL_Kernel k_ellipsoid_noev, k_ellipsoid_sphere_noev;
  UCL_Kernel k_sphere_ellipsoid_noev, k_lj_fast_noev;
  UCL_Kernel *k_elps_sel, *k_elps_sphere_sel, *k_sphere_elps_sel, *k_lj_sel;
  inline int block_size() { return _block_size; }
  inline void set_kernel(const int eflag, const int vflag) {
    #if defined(LAL_OCL_EV_JIT)
    if (_multiple_forms == false) {
      if (eflag || vflag) k_elps_sel = &k_ellipsoid;
      else k_elps_sel = &k_ellipsoid_noev;
    } else {
      if (eflag || vflag) {
        k_elps_sel = &k_ellipsoid;
        k_elps_sphere_sel = &k_ellipsoid_sphere;
        k_sphere_elps_sel = &k_sphere_ellipsoid;
        k_lj_sel = &k_lj_fast;
      } else {
        k_elps_sel = &k_ellipsoid_noev;
        k_elps_sphere_sel = &k_ellipsoid_sphere_noev;
        k_sphere_elps_sel = &k_sphere_ellipsoid_noev;
        k_lj_sel = &k_lj_fast_noev;
      }
    }
    #endif
  }


  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;

 protected:
  bool _compiled, _ellipsoid_sphere;
  int _block_size, _threads_per_atom;
  double  _max_bytes, _max_an_bytes;
  double _gpu_overhead, _driver_overhead;

  // True if we want to use fast GB-sphere or sphere-sphere calculations
  bool _multiple_forms;
  int **_host_form;
  int _last_ellipse, _max_last_ellipse;

  void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
                       const void *lj_string, const char *kname,const bool e_s);

  virtual int loop(const int eflag, const int vflag) = 0;
};

}

#endif