1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
|
/***************************************************************************
answer.h
-------------------
W. Michael Brown (ORNL)
Class for data management of forces, torques, energies, and virials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_ANSWER_H
#define LAL_ANSWER_H
#include <cmath>
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
#include "lal_precision.h"
namespace LAMMPS_AL {
/// Data management for forces, torques, energies, and virials
/** Owns the force and energy/virial buffers (which have a device side, see
  * resize()), an error-flag buffer, and the timer used for answer copies.
  * Also tracks wall-clock time spent waiting on the device and time spent
  * moving results into LAMMPS arrays. **/
template <class numtyp, class acctyp>
class Answer {
 public:
  Answer();

  /// Releases everything through clear()
  ~Answer() { clear(); }

  /// Number of local atoms currently stored
  inline int inum() const { return _inum; }

  /// Set the number of local atoms used by subsequent copy operations
  inline void inum(const int n) { _inum=n; }

  /// Largest number of atoms that fits in the current allocation
  inline int max_inum() const { return _max_local; }

  /// Number of fields used for energy and virial
  /** \param mode 1 selects _ev_fields; every other value selects _e_fields **/
  inline int ev_fields(const int mode) const {
    if (mode == 1)
      return _ev_fields;
    return _e_fields;
  }

  /// Per-atom memory usage of this class
  int bytes_per_atom() const;

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param rot True if atom storage needs quaternions **/
  bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);

  /// Make sure device storage can hold inum atoms, reallocating if needed
  /** Always records inum as the current atom count. When inum exceeds the
    * current capacity, the capacity becomes inum*1.10 (truncated to int) and
    * the force and engv vectors are resized to match.
    * \param success In/out flag; it is only ever cleared here. If it is
    *        already false on entry no resize is attempted. Note that the
    *        stored capacity is updated even when a resize fails or is
    *        skipped. **/
  inline void resize(const int inum, bool &success) {
    _inum=inum;
    if (inum<=_max_local)
      return;

    // Over-allocate by 10% of the requested atom count
    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);

    if (success)
      success=(force.resize(_max_local*_ans_fields)==UCL_SUCCESS);
    if (success)
      success=(engv.resize(_max_local*_ev_fields)==UCL_SUCCESS);

    _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
  }

  /// If already initialized by another LAMMPS style, add fields as necessary
  /** \param rot True if atom storage needs quaternions **/
  bool add_fields(const bool charge, const bool rot);

  /// Free all memory on host and device
  void clear();

  /// Total host memory used by this class, in bytes
  double host_memory_usage() const;

  /// Add the most recent copy time to the timer's running total
  inline void acc_timers() { time_answer.add_to_total(); }

  /// Zero the copy timer
  inline void zero_timers() { time_answer.zero(); }

  /// Total time for host/device data transfer (total seconds of time_answer)
  inline double transfer_time() { return time_answer.total_seconds(); }

  /// Total time spent on data cast/pack
  inline double cast_time() { return _time_cast; }

  /// Number of bytes used on the device
  inline double gpu_bytes() { return _gpu_bytes; }

  // -------------------------COPY FROM GPU -------------------------------

  /// Copy answers from device into read buffer asynchronously
  void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
                    const bool vf_atom, const int red_blocks);

  /// Copy answers from device into read buffer asynchronously
  /** Overload that additionally takes an atom index list (ilist) **/
  void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
                    const bool vf_atom, int *ilist, const int red_blocks);

  /// Copy energy and virial data into LAMMPS memory
  double energy_virial(double *eatom, double **vatom, double *virial);

  /// Copy energy and virial data into LAMMPS memory
  /** Overload that also takes an ecoul in/out argument **/
  double energy_virial(double *eatom, double **vatom, double *virial,
                       double &ecoul);

  /// Add forces and torques from the GPU into a LAMMPS pointer
  void get_answers(double **f, double **tor);

  /// Wait for the answer copy, then unpack energies, virials, and forces
  /** The wall-clock time spent inside time_answer.sync_stop() is added to
    * the CPU idle time; the time spent in the error check, energy_virial(),
    * and get_answers(f,tor) is added to the cast time.
    * \param error_flag_in Overwritten with error_flag[0] only when that
    *        value is non-zero; otherwise left untouched
    * \return The value returned by energy_virial() **/
  inline double get_answers(double **f, double **tor, double *eatom,
                            double **vatom, double *virial, double &ecoul,
                            int &error_flag_in) {
    // Time spent blocked until the copy timer can be stopped
    const double idle_begin=MPI_Wtime();
    time_answer.sync_stop();
    _time_cpu_idle+=MPI_Wtime()-idle_begin;

    // Time spent moving results into the caller's arrays
    const double cast_begin=MPI_Wtime();
    if (error_flag[0])
      error_flag_in=error_flag[0];
    const double evdw=energy_virial(eatom,vatom,virial,ecoul);
    get_answers(f,tor);
    _time_cast+=MPI_Wtime()-cast_begin;

    return evdw;
  }

  /// Time the CPU was idle waiting for the GPU
  inline double cpu_idle_time() { return _time_cpu_idle; }

  /// Change the command queue used for copies and timers
  void cq(const int cq_index);

  // ------------------------------ DATA ----------------------------------

  /// Force and possibly torque
  UCL_Vector<acctyp,acctyp> force;
  /// Energy and virial per-atom storage
  UCL_Vector<acctyp,acctyp> engv;
  /// Error flag
  UCL_Vector<int,int> error_flag;

  /// Device timers
  UCL_Timer time_answer;

  /// Geryon device
  UCL_Device *dev;

 private:
  bool alloc(const int inum);

  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
  // _max_local: atom capacity of the current allocation; _inum: atoms in use
  int _max_local, _inum, _e_fields, _ev_fields, _ans_fields, _ev_stride;
  int *_ilist;
  // Accumulated wall-clock seconds measured with MPI_Wtime()
  double _time_cast, _time_cpu_idle;

  // Sum of the device row_bytes() of engv and force, refreshed in resize()
  double _gpu_bytes;

  bool _newton;
};
}
#endif
|