1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
|
/***************************************************************************
device.h
-------------------
W. Michael Brown (ORNL)
Class for management of the device where the computations are performed
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_DEVICE_H
#define LAL_DEVICE_H
#include "lal_atom.h"
#include "lal_answer.h"
#include "lal_neighbor.h"
#include "lal_pppm.h"
#include "mpi.h"
#include <sstream>
#include <cstdio>
#include <string>
#include <queue>
namespace LAMMPS_AL {
// Forward declaration of the PPPM class template. Device (below) stores
// pointers to two PPPM instantiations (pppm_single / pppm_double).
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPM;
template <class numtyp, class acctyp>
/// Management of the device where the computations are performed
/** Holds the device handle, the MPI communicators/ranks used with it, the
  * shared atom storage, the queue of pending Answer objects and the kernel
  * launch parameters exposed through the accessors below.
  *
  * NOTE(review): the class keeps raw pointers (gpu, dev_program) and declares
  * a destructor, but no copy operations. Ownership is not visible from this
  * header; if the destructor releases these pointers, copying a Device would
  * be unsafe - confirm against the implementation file. **/
class Device {
 public:
  Device();
  ~Device();

  /// Initialize the device for use by this process
  /** Sets up a per-device MPI communicator for load balancing and initializes
    * the device (ngpu starting at first_gpu_id) that this proc will be using
    * Returns:
    * -  0 if successful
    * - -2 if GPU not found
    * - -4 if GPU library not compiled for GPU
    * - -6 if GPU could not be initialized for use
    * - -7 if accelerator sharing is not currently allowed on system
    * - -11 if config_string has the wrong number of parameters **/
  int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                  const int first_gpu_id, const int gpu_mode,
                  const double particle_split, const int t_per_atom,
                  const double user_cell_size, char *config_string,
                  const int ocl_platform, char *device_type_flags,
                  const int block_pair);

  /// Initialize the device for Atom storage
  /** \param charge True if charges need to be stored
    * \param rot True if quaternions need to be stored
    * \param nlocal Total number of local particles to allocate memory for
    * \param nall Total number of local+ghost particles
    * \param maxspecial Maximum number of special bonded atoms per atom
    * \param vel True if velocities need to be stored
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
           const int nlocal, const int nall, const int maxspecial,
           const bool vel=false);

  /// Initialize the device for Atom storage only
  /** \param nlocal Total number of local particles to allocate memory for
    * \param nall Total number of local+ghost particles
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);

  /// Initialize the neighbor list storage
  /** \param nbor Neighbor object to set up (implementation not in this header)
    * \param nlocal Total number of local particles to allocate memory for
    * \param host_nlocal Initial number of host particles to allocate memory for
    * \param nall Total number of local+ghost particles
    * \param maxspecial Maximum number of special bonded atoms per atom
    * \param gpu_host 0 if host will not perform force calculations,
    *                 1 if gpu_nbor is true, and host needs a half nbor list,
    *                 2 if gpu_nbor is true, and host needs a full nbor list
    * \param max_nbors Initial number of rows in the neighbor matrix
    * \param cutoff cutoff+skin
    * \param pre_cut True if cutoff test will be performed in separate kernel
    *                than the force kernel
    * \param threads_per_atom value to be used by the neighbor list only
    * \param ilist_map true if ilist mapping data structures used (3-body)
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init_nbor(Neighbor *nbor, const int nlocal,
                const int host_nlocal, const int nall,
                const int maxspecial, const int gpu_host,
                const int max_nbors, const double cutoff,
                const bool pre_cut, const int threads_per_atom,
                const bool ilist_map = false);

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
                    const int first_gpu, const int last_gpu);

  /// Perform charge assignment asynchronously for PPPM (float grid variant)
  /** NOTE(review): presumably registers pppm as pppm_single for use by
    * precompute(); the definition is not in this header - confirm. **/
  void set_single_precompute(PPPM<numtyp,acctyp,
                             float,_lgpu_float4> *pppm);

  /// Perform charge assignment asynchronously for PPPM (double grid variant)
  /** NOTE(review): presumably registers pppm as pppm_double for use by
    * precompute(); the definition is not in this header - confirm. **/
  void set_double_precompute(PPPM<numtyp,acctyp,
                             double,_lgpu_double4> *pppm);

  /// Estimate the overhead from GPU calls from multiple procs
  /** \param kernel_calls Number of kernel calls/timestep for timing estimated
    *                     overhead
    * \param gpu_overhead Estimated gpu overhead per timestep (sec)
    * \param gpu_driver_overhead Estimated overhead from driver per timestep
    *                            (s) **/
  void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
                             double &gpu_driver_overhead);

  /// Returns true if double precision is supported on card
  /** Forwards to gpu->double_precision(). **/
  inline bool double_precision() const { return gpu->double_precision(); }

  /// Output a message with timing information
  void output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
                    Neighbor &nbor, const double avg_split,
                    const double max_bytes, const double gpu_overhead,
                    const double driver_overhead,
                    const int threads_per_atom, FILE *screen);

  /// Output a message with timing information for kspace
  void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
                           UCL_Timer & time_map, UCL_Timer & time_rho,
                           UCL_Timer &time_interp,
                           Answer<numtyp,acctyp> &ans,
                           const double max_bytes, const double cpu_time,
                           const double cpu_idle_time, FILE *screen);

  /// Clear all memory on host and device associated with atom and nbor data
  void clear();

  /// Clear all memory on host and device
  void clear_device();

  /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
  /** The pointer is appended to the queue that fix_gpu() drains. **/
  inline void add_ans_object(Answer<numtyp,acctyp> *ans)
    { ans_queue.push(ans); }

  /// Add "answers" (force,energies,etc.) into LAMMPS structures
  /** Sets error_flag to 0 and calls atom.data_unavail(). If any answer
    * objects are queued, the host timer is stopped and every queued object is
    * asked for its answers (get_answers) and then removed from the queue.
    *
    * \return Sum of the values returned by get_answers() for all queued
    *         objects, or 0.0 when the queue was empty **/
  inline double fix_gpu(double **f, double **tor, double *eatom, double **vatom,
                        double *virial, double &ecoul, int &error_flag) {
    error_flag=0;
    atom.data_unavail();

    // Nothing queued: the host timer is intentionally left untouched
    if (ans_queue.empty())
      return 0.0;

    stop_host_timer();
    double evdw=0.0;
    while (!ans_queue.empty()) {
      evdw += ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul,
                                             error_flag);
      ans_queue.pop();
    }
    return evdw;
  }

  /// Start timer on host
  /** Records the current MPI_Wtime() in _cpu_full and marks the timer as
    * started. **/
  inline void start_host_timer()
    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }

  /// Stop timer on host
  /** Only has an effect if the timer was started; _cpu_full then changes
    * meaning from "start timestamp" to "elapsed time". **/
  inline void stop_host_timer() {
    if (_host_timer_started) {
      _cpu_full=MPI_Wtime()-_cpu_full;
      _host_timer_started=false;
    }
  }

  /// Return host time
  /** This is the elapsed time only after stop_host_timer(); while the timer
    * is running the stored value is the start timestamp. **/
  inline double host_time() const { return _cpu_full; }

  /// Return host memory usage in bytes
  double host_memory_usage() const;

  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }
  /// My rank within all processes
  inline int world_me() const { return _world_me; }
  /// Total number of processes
  inline int world_size() const { return _world_size; }
  /// MPI Barrier for world
  inline void world_barrier() { MPI_Barrier(_comm_world); }
  /// Return the replica MPI communicator
  inline MPI_Comm & replica() { return _comm_replica; }
  /// My rank within replica communicator
  inline int replica_me() const { return _replica_me; }
  /// Number of procs in replica communicator
  inline int replica_size() const { return _replica_size; }
  /// Return the per-GPU MPI communicator
  inline MPI_Comm & gpu_comm() { return _comm_gpu; }
  /// Return my rank in the device communicator
  inline int gpu_rank() const { return _gpu_rank; }
  /// MPI Barrier for gpu
  inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
  /// Return the 'mode' for acceleration: GPU_FORCE, GPU_NEIGH or GPU_HYB_NEIGH
  inline int gpu_mode() const { return _gpu_mode; }
  /// Index of first device used by a node
  inline int first_device() const { return _first_device; }
  /// Index of last device used by a node
  inline int last_device() const { return _last_device; }
  /// Particle split defined in fix
  inline double particle_split() const { return _particle_split; }
  /// Return the initialization count for the device
  inline int init_count() const { return _init_count; }
  /// True if device is being timed
  inline bool time_device() const { return _time_device; }

  /// Accelerator device configuration id
  inline int config_id() const { return _config_id; }
  /// Number of threads executing concurrently on same multiproc
  inline int simd_size() const { return _simd_size; }
  /// Return the number of threads accessing memory simultaneously
  inline int num_mem_threads() const { return _num_mem_threads; }
  /// 1 if horizontal vector operations enabled, 0 otherwise
  inline int shuffle_avail() const { return _shuffle_avail; }
  /// For OpenCL, 0 if fast-math options disabled, 1 enabled
  inline int fast_math() const { return _fast_math; }

  /// Return the number of threads per atom for pair styles
  inline int threads_per_atom() const { return _threads_per_atom; }
  /// Return the number of threads per atom for pair styles using charge
  inline int threads_per_charge() const { return _threads_per_charge; }
  /// Return the number of threads per atom for 3-body pair styles
  inline int threads_per_three() const { return _threads_per_three; }

  /// Return the min of the pair block size or the device max block size
  inline int pair_block_size() const { return _block_pair; }
  /// Return the block size for "bio" pair styles
  inline int block_bio_pair() const { return _block_bio_pair; }
  /// Return the block size for "ellipse" pair styles
  inline int block_ellipse() const { return _block_ellipse; }
  /// Return the block size for PPPM kernels
  inline int pppm_block() const { return _pppm_block; }
  /// Return the block size for neighbor build kernel
  inline int block_nbor_build() const { return _block_nbor_build; }
  /// Return the block size for neighbor binning
  inline int block_cell_2d() const { return _block_cell_2d; }
  /// Return the block size for atom mapping for neighbor builds
  inline int block_cell_id() const { return _block_cell_id; }

  /// Return the maximum number of atom types that can be used with shared mem
  inline int max_shared_types() const { return _max_shared_types; }
  /// Return the maximum number of atom types for shared mem with "bio" styles
  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
  /// Return the maximum order for PPPM splines
  inline int pppm_max_spline() const { return _pppm_max_spline; }

  /// Architecture gpu code compiled for (returns 0 for OpenCL)
  inline double ptx_arch() const { return _ptx_arch; }

  /// Override the value reported by simd_size()
  inline void set_simd_size(int simd_sz) { _simd_size = simd_sz; }

  // -------------------------- DEVICE DATA -------------------------

  /// Geryon Device
  UCL_Device *gpu;

  /// Acceleration modes (see gpu_mode())
  enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> atom;

  // --------------------------- NBOR SHARED KERNELS ----------------

  /// Shared kernels for neighbor lists
  NeighborShared _neighbor_shared;

  // ------------------------ LONG RANGE DATA -----------------------

  /// Selects the PPPM object used by precompute():
  /// 1 -> pppm_single, 2 -> pppm_double, any other value -> no precompute
  int _long_range_precompute;
  /// PPPM object with float grid type (used when _long_range_precompute==1)
  PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
  /// PPPM object with double grid type (used when _long_range_precompute==2)
  PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm_double;

  /// Precomputations for long range charge assignment (asynchronously)
  /** Forwards all arguments unchanged to pppm_single->precompute() or
    * pppm_double->precompute() depending on _long_range_precompute; does
    * nothing for any other value of that flag. **/
  inline void precompute(const int ago, const int nlocal, const int nall,
                         double **host_x, int *host_type, bool &success,
                         double *charge, double *boxlo, double *prd) {
    if (_long_range_precompute==1)
      pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
                              boxlo,prd);
    else if (_long_range_precompute==2)
      pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
                              boxlo,prd);
  }

  /// Return a copy of the stored OpenCL compile string
  inline std::string compile_string() const { return _ocl_compile_string; }
  /// Compile string variant named "nofast" (defined outside this header)
  std::string compile_string_nofast();
  /// Return a copy of the stored OpenCL configuration name
  inline std::string ocl_config_name() const { return _ocl_config_name; }

  /// Convert a value to a string via stream insertion with precision 2
  template <class t>
  inline std::string toa(const t& in) const {
    std::ostringstream o;
    o.precision(2);
    o << in;
    return o.str();
  }

 private:
  /// Answer objects registered via add_ans_object() and drained by fix_gpu()
  std::queue<Answer<numtyp,acctyp> *> ans_queue;
  int _init_count;
  bool _device_init, _host_timer_started, _time_device;
  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
      _replica_size;
  int _gpu_mode, _first_device, _last_device, _platform_id;
  double _particle_split;
  /// Start timestamp while the host timer runs, elapsed time once stopped
  double _cpu_full;
  double _ptx_arch;
  double _user_cell_size; // -1 if the cutoff is used

  int _config_id, _simd_size, _num_mem_threads, _shuffle_avail, _fast_math;
  int _threads_per_atom, _threads_per_charge, _threads_per_three;
  int _block_pair, _block_bio_pair, _block_ellipse;
  int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
  int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;

  UCL_Program *dev_program;
  UCL_Kernel k_zero, k_info;
  bool _compiled;
  int compile_kernels();

  int _data_in_estimate, _data_out_estimate;

  std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string;
  int set_ocl_params(std::string, const std::string &);
};
}
#endif
|