File: lal_base_three.h

package info (click to toggle)
lammps 20220106.git7586adbb6a%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 348,064 kB
  • sloc: cpp: 831,421; python: 24,896; xml: 14,949; f90: 10,845; ansic: 7,967; sh: 4,226; perl: 4,064; fortran: 2,424; makefile: 1,501; objc: 238; lisp: 163; csh: 16; awk: 14; tcl: 6
file content (241 lines) | stat: -rw-r--r-- 8,339 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/***************************************************************************
                                base_three.h
                             -------------------
                            W. Michael Brown (ORNL)

  Base class for 3-body potentials

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                : Tue April 2, 2013
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef LAL_BASE_THREE_H
#define LAL_BASE_THREE_H

#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"

#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

//#define THREE_CONCURRENT

namespace LAMMPS_AL {

/// Base class for 3-body potentials in the LAMMPS Accelerator Library.
/** Declares the device/atom/answer/neighbor handles, the kernel objects and
  * the host-side entry points shared by 3-body pair styles. Concrete styles
  * derive from this class and implement loop().
  *
  * \tparam numtyp type forwarded to Device, Balance, Atom and Answer
  * \tparam acctyp type forwarded to Device, Balance, Atom and Answer
  * \note presumably numtyp is the compute precision and acctyp the
  *       accumulation precision - confirm against lal_precision.h **/
template <class numtyp, class acctyp>
class BaseThree {
 public:
  BaseThree();
  virtual ~BaseThree();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param nlocal NOTE(review): presumably the number of local atoms - confirm
    * \param nall NOTE(review): presumably local + ghost atoms - confirm
    * \param max_nbors initial number of rows in the neighbor matrix
    * \param maxspecial NOTE(review): presumably the maximum number of special
    *                   neighbors per atom - confirm
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param screen LAMMPS pointer for screen output
    * \param pair_program source/binary handed to the kernel compiler
    *                     (see compile_kernels())
    * \param k_two name for the kernel for 2-body force calculation
    * \param k_three_center name for a kernel for 3-body force calculation
    * \param k_three_end name for a kernel for 3-body force calculation
    *        (NOTE(review): the center/end split presumably refers to the
    *        position of the atom in the triplet - confirm in the .cpp file)
    * \param k_short_nbor optional kernel name (defaults to nullptr)
    * \param onetype optional setting, defaults to -1
    * \param onetype3 optional setting, defaults to -1
    * \param spq optional setting, defaults to 0
    * \param tpa_override optional setting, defaults to 0
    *        (NOTE(review): presumably overrides threads-per-atom - confirm)
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card
    * - -10 if invalid thread_per_atom setting **/
  int init_three(const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size,
                 const double gpu_split, FILE *screen,
                 const void *pair_program, const char *k_two,
                 const char *k_three_center, const char *k_three_end,
                 const char *k_short_nbor=nullptr, const int onetype=-1,
                 const int onetype3=-1, const int spq=0,
                 const int tpa_override=0);

  /// Estimate the overhead for GPU context changes and CPU driver
  /** \param add_kernels optional count, defaults to 0
    *        (NOTE(review): presumably extra kernels to include in the
    *        estimate - confirm in the implementation) **/
  void estimate_gpu_overhead(const int add_kernels=0);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param inum size passed to the answer storage resize
    * \param nall size passed to the atom storage resize
    * \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    // Re-bind the position texture whenever atom->resize() reports true
    // (presumably meaning the position storage was reallocated - confirm).
    if (atom->resize(nall, success))
      pos_tex.bind_float(atom->x,4);
    ans->resize(inum,success);
    #ifdef THREE_CONCURRENT
    // The second answer object only exists in THREE_CONCURRENT builds.
    ans2->resize(inum,success);
    #endif
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors maximum number of neighbors
    * \param success set to false if insufficient memory
    * \note original note: olist_size=total number of local particles
    *       (olist_size is not a parameter of this overload; the note
    *       presumably describes a Neighbor-internal quantity - confirm) **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param max_nbors current maximum number of neighbors
    * \param success set to false if insufficient memory
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  /** Does nothing unless device->time_device() is true. **/
  inline void acc_timers() {
    if (device->time_device()) {
      nbor->acc_timers(screen);
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
      #ifdef THREE_CONCURRENT
      ans2->acc_timers();
      #endif
    }
  }

  /// Zero timers
  /** Zeros the pair, atom and answer timers; the neighbor timers are not
    * touched by this routine. **/
  inline void zero_timers() {
    time_pair.zero();
    atom->zero_timers();
    ans->zero_timers();
    #ifdef THREE_CONCURRENT
    ans2->zero_timers();
    #endif
  }

  /// Copy neighbor list from host
  /** \param success NOTE(review): presumably set to false on failure, as in
    *        the resize routines - confirm in the implementation **/
  int * reset_nbors(const int nall, const int inum, const int nlist, int *ilist,
                    int *numj, int **firstneigh, bool &success);

  /// Build neighbor list on device
  void build_nbor_list(const int inum, const int host_inum, const int nall,
                       double **host_x, int *host_type, double *sublo,
                       double *subhi, tagint *tag, int **nspecial,
                       tagint **special, bool &success);

  /// Pair loop with host neighboring
  /** Takes the host-built neighbor list (ilist, numj, firstneigh) as input. **/
  void compute(const int f_ago, const int inum_full, const int nall,
               const int nlist, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success);

  /// Pair loop with device neighboring
  /** Unlike the host-neighboring overload, ilist and numj are passed as
    * pointers-to-pointers here (NOTE(review): presumably outputs filled by
    * the library, and the int** return presumably exposes the neighbor
    * data - confirm in the implementation). **/
  int ** compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial, tagint **special,
                 const bool eflag, const bool vflag, const bool eatom,
                 const bool vatom, int &host_start, int **ilist,
                 int **numj, const double cpu_time, bool &success);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  Balance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  /// Force/energy answer storage
  Answer<numtyp,acctyp> *ans;
  #ifdef THREE_CONCURRENT
  /// Second answer storage, only present in THREE_CONCURRENT builds
  Answer<numtyp,acctyp> *ans2;
  #endif

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  Neighbor *nbor;

  /// Kernel associated with the k_short_nbor name given to init_three()
  /// (NOTE(review): presumably builds a short neighbor list - confirm)
  UCL_Kernel k_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program, *pair_program_noev;
  UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
  UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev;
  /// Currently selected kernels; see set_kernel()
  UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel;

  /// Value of the protected _block_pair setting
  inline int block_pair() const { return _block_pair; }
  /// Value of the protected _block_size setting
  inline int block_size() const { return _block_size; }

  /// Point the kernel selectors at the regular or the "_noev" kernels
  /** Only has an effect when LAL_OCL_EV_JIT is defined: the regular kernels
    * are selected if either flag is nonzero, the "_noev" kernels otherwise.
    * Without LAL_OCL_EV_JIT this is a no-op and the selectors are left
    * unchanged.
    * \param eflag nonzero selects the regular kernels
    * \param vflag nonzero selects the regular kernels **/
  inline void set_kernel(const int eflag, const int vflag) {
    #if defined(LAL_OCL_EV_JIT)
    if (eflag || vflag) {
      k_sel = &k_pair;
      k_3center_sel = &k_three_center;
      k_3end_sel = &k_three_end;
    } else {
      k_sel = &k_pair_noev;
      k_3center_sel = &k_three_center_noev;
      k_3end_sel = &k_three_end_noev;
    }
    #endif
  }


  // --------------------------- TEXTURES -----------------------------
  /// Texture bound to atom->x in resize_atom()
  UCL_Texture pos_tex;

 protected:
  bool _compiled;
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
  int _gpu_nbor, _onetype, _onetype3, _spq;
  double _max_bytes, _max_an_bytes;
  int _ainum, _nall;
  double _gpu_overhead, _driver_overhead;

  /// Compile the device kernels for this pair style
  /** The kernel-name and onetype/onetype3/spq arguments mirror those of
    * init_three(). **/
  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *two, const char *three_center,
                       const char *three_end, const char* short_nbor,
                       const int onetype, const int onetype3,
                       const int spq);

  /// Style-specific force loop; must be implemented by derived classes
  virtual int loop(const int eflag, const int vflag, const int evatom,
                   bool &success) = 0;
};

}

#endif