1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
|
/***************************************************************************
base_three.h
-------------------
W. Michael Brown (ORNL)
Base class for 3-body potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Tue April 2, 2013
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_THREE_H
#define LAL_BASE_THREE_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
//#define THREE_CONCURRENT
namespace LAMMPS_AL {
/// Base class for 3-body potentials
/** Groups the device, atom, answer and neighbor handles together with the
  * two-body and three-body kernels used by a pair style. Derived classes
  * must implement loop().
  *
  * \tparam numtyp first type parameter forwarded to Device, Balance, Atom
  *                and Answer
  * \tparam acctyp second type parameter forwarded to the same templates **/
template <class numtyp, class acctyp>
class BaseThree {
 public:
  BaseThree();
  virtual ~BaseThree();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param nlocal number of local particles (NOTE(review): inferred from
    *               the name; confirm against the implementation)
    * \param nall number of particles including ghosts (NOTE(review):
    *             inferred from the name; confirm)
    * \param max_nbors initial number of rows in the neighbor matrix
    * \param maxspecial NOTE(review): not described in this header; confirm
    *                   meaning in the implementation
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param screen stream used for screen output
    * \param pair_program program source/binary from which the kernels are
    *                     built (NOTE(review): exact format is not visible
    *                     here)
    * \param k_two name for the kernel for 2-body force calculation
    * \param k_three_center name for a kernel for 3-body force calculation
    *                       (the "center" variant)
    * \param k_three_end name for a kernel for 3-body force calculation
    *                    (the "end" variant)
    * \param k_short_nbor optional kernel name; defaults to nullptr
    * \param onetype defaults to -1 (NOTE(review): semantics not visible in
    *                this header)
    * \param onetype3 defaults to -1 (NOTE(review): semantics not visible in
    *                 this header)
    * \param spq defaults to 0 (NOTE(review): semantics not visible in this
    *            header)
    * \param tpa_override defaults to 0 (NOTE(review): presumably overrides
    *                     the threads-per-atom setting; confirm)
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card
    * - -10 if invalid thread_per_atom setting **/
  int init_three(const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size,
                 const double gpu_split, FILE *screen,
                 const void *pair_program, const char *k_two,
                 const char *k_three_center, const char *k_three_end,
                 const char *k_short_nbor=nullptr, const int onetype=-1,
                 const int onetype3=-1, const int spq=0,
                 const int tpa_override=0);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead(const int add_kernels=0);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** When atom->resize() returns true, the position texture is bound to
    * atom->x again.
    * \param inum size passed to the answer storage resize
    * \param nall size passed to the atom storage resize
    * \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success))
      pos_tex.bind_float(atom->x,4);
    ans->resize(inum,success);
    #ifdef THREE_CONCURRENT
    ans2->resize(inum,success);
    #endif
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors maximum number of neighbors
    * \param success set to false if insufficient memory
    * \note olist_size=total number of local particles **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to
    *                  host
    * \param max_nbors current maximum number of neighbors
    * \param success forwarded to nbor->resize()
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  /** Does nothing unless device->time_device() is true. **/
  inline void acc_timers() {
    if (device->time_device()) {
      nbor->acc_timers(screen);
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
      #ifdef THREE_CONCURRENT
      ans2->acc_timers();
      #endif
    }
  }

  /// Zero timers
  /** Resets the pair timer and the atom and answer timers. **/
  inline void zero_timers() {
    time_pair.zero();
    atom->zero_timers();
    ans->zero_timers();
    #ifdef THREE_CONCURRENT
    ans2->zero_timers();
    #endif
  }

  /// Copy neighbor list from host
  int * reset_nbors(const int nall, const int inum, const int nlist, int *ilist,
                    int *numj, int **firstneigh, bool &success);

  /// Build neighbor list on device
  void build_nbor_list(const int inum, const int host_inum, const int nall,
                       double **host_x, int *host_type, double *sublo,
                       double *subhi, tagint *tag, int **nspecial,
                       tagint **special, bool &success);

  /// Pair loop with host neighboring
  void compute(const int f_ago, const int inum_full, const int nall,
               const int nlist, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success);

  /// Pair loop with device neighboring
  int ** compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial, tagint **special,
                 const bool eflag, const bool vflag, const bool eatom,
                 const bool vatom, int &host_start, int **ilist,
                 int **numj, const double cpu_time, bool &success);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  Balance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  /// Answer storage resized and timed alongside the atom data
  Answer<numtyp,acctyp> *ans;
  #ifdef THREE_CONCURRENT
  /// Second answer storage, present only when THREE_CONCURRENT is defined
  Answer<numtyp,acctyp> *ans2;
  #endif

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  Neighbor *nbor;
  /// Kernel object for the short neighbor list
  /// (NOTE(review): usage is not visible in this header)
  UCL_Kernel k_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------

  /// Compiled programs; the _noev program pairs with the *_noev kernels
  UCL_Program *pair_program, *pair_program_noev;
  /// Two-body and three-body kernels
  UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
  /// Variants selected by set_kernel() when neither eflag nor vflag is set
  UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev;
  /// Currently selected kernels (see set_kernel())
  UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel;

  /// Returns the stored pair block size (_block_pair)
  inline int block_pair() const { return _block_pair; }
  /// Returns the stored block size (_block_size)
  inline int block_size() const { return _block_size; }

  /// Select which kernels k_sel, k_3center_sel and k_3end_sel point to
  /** Only has an effect when LAL_OCL_EV_JIT is defined; otherwise the
    * selection pointers are left unchanged.
    * \param eflag nonzero selects the standard kernels
    * \param vflag nonzero selects the standard kernels
    * \note when both flags are zero the *_noev kernels are selected **/
  inline void set_kernel(const int eflag, const int vflag) {
    #if defined(LAL_OCL_EV_JIT)
    if (eflag || vflag) {
      k_sel = &k_pair;
      k_3center_sel = &k_three_center;
      k_3end_sel = &k_three_end;
    } else {
      k_sel = &k_pair_noev;
      k_3center_sel = &k_three_center_noev;
      k_3end_sel = &k_three_end_noev;
    }
    #endif
  }

  // --------------------------- TEXTURES -----------------------------

  /// Texture bound to atom->x in resize_atom()
  UCL_Texture pos_tex;

 protected:
  // NOTE(review): the members below are set outside this header; their
  // exact semantics should be confirmed against the implementation file.
  bool _compiled;
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
  int _gpu_nbor, _onetype, _onetype3, _spq;
  double _max_bytes, _max_an_bytes;
  int _ainum, _nall;
  double _gpu_overhead, _driver_overhead;

  /// Build the kernels named by the arguments from pair_string on dev
  /** The trailing arguments mirror the kernel-name, onetype, onetype3 and
    * spq arguments of init_three(). **/
  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *two, const char *three_center,
                       const char *three_end, const char* short_nbor,
                       const int onetype, const int onetype3,
                       const int spq);

  /// Pair-style specific kernel launch; must be provided by derived classes
  virtual int loop(const int eflag, const int vflag, const int evatom,
                   bool &success) = 0;
};
}
#endif
|