File: lal_device.h

/***************************************************************************
                                  device.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for management of the device where the computations are performed

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef LAL_DEVICE_H
#define LAL_DEVICE_H

#include "lal_atom.h"
#include "lal_answer.h"
#include "lal_neighbor.h"
#include "lal_pppm.h"
#include "mpi.h"
#include <sstream>
#include <cstdio>
#include <string>
#include <queue>

namespace LAMMPS_AL {

template <class numtyp, class acctyp,
          class grdtyp, class grdtyp4> class PPPM;

template <class numtyp, class acctyp>
class Device {
 public:
  Device();
  ~Device();

  /// Initialize the device for use by this process
  /** Sets up a per-device MPI communicator for load balancing and initializes
    * the ngpu device(s), starting at first_gpu_id, that this proc will use.
    * Returns:
    * -  0 if successful
    * - -2 if GPU not found
    * - -4 if the GPU library was not compiled for GPU
    * - -6 if GPU could not be initialized for use
    * - -7 if accelerator sharing is not currently allowed on the system
    * - -11 if config_string has the wrong number of parameters **/
  int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                  const int first_gpu_id, const int gpu_mode,
                  const double particle_split, const int t_per_atom,
                  const double user_cell_size, char *config_string,
                  const int ocl_platform, char *device_type_flags,
                  const int block_pair);
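
  /* Hedged usage sketch (not taken from the LAMMPS sources; all literal
   * arguments below are illustrative assumptions):
   *
   *   LAMMPS_AL::Device<float,float> device;
   *   char cfg[1] = "";                     // empty config_string
   *   int err = device.init_device(MPI_COMM_WORLD, MPI_COMM_WORLD,
   *                                1, 0,    // ngpu=1, first_gpu_id=0
   *                                LAMMPS_AL::Device<float,float>::GPU_NEIGH,
   *                                1.0,     // particle_split: all on GPU
   *                                0,       // t_per_atom: library default
   *                                -1.0,    // user_cell_size: use cutoff
   *                                cfg, 0,  // OpenCL platform 0
   *                                nullptr, // device_type_flags unused
   *                                -1);     // block_pair: default (assumed)
   *   if (err != 0) { } // handle the negative codes documented above
   */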

  /// Initialize the device for Atom storage
  /** \param charge True if charges need to be stored
    * \param rot True if quaternions need to be stored
    * \param nlocal Total number of local particles to allocate memory for
    * \param nall Total number of local+ghost particles
    * \param maxspecial Maximum number of special bonded atoms per atom
    * \param vel True if velocities need to be stored
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
           const int nlocal, const int nall, const int maxspecial,
           const bool vel=false);
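
  /* Hedged sketch: a pair style would typically call init() once after
   * init_device() succeeds; the names below are illustrative.
   *
   *   Answer<float,float> ans;
   *   int err = device.init(ans,
   *                         true,   // charge: store per-atom charges
   *                         false,  // rot: no quaternions needed
   *                         nlocal, nall, maxspecial);
   */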

  /// Initialize the device for Atom storage only
  /** \param nlocal Total number of local particles to allocate memory for
    * \param nall Total number of local+ghost particles
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);

  /// Initialize the neighbor list storage
  /** \param charge True if charges need to be stored
    * \param rot True if quaternions need to be stored
    * \param nlocal Total number of local particles to allocate memory for
    * \param host_nlocal Initial number of host particles to allocate memory for
    * \param nall Total number of local+ghost particles
    * \param maxspecial Maximum number of special bonded atoms per atom
    * \param gpu_host 0 if host will not perform force calculations,
    *                 1 if gpu_nbor is true, and host needs a half nbor list,
    *                 2 if gpu_nbor is true, and host needs a full nbor list
    * \param max_nbors Initial number of rows in the neighbor matrix
    * \param cutoff cutoff+skin
    * \param pre_cut True if the cutoff test will be performed in a separate
    *                kernel from the force kernel
    * \param threads_per_atom value to be used by the neighbor list only
    * \param ilist_map True if ilist mapping data structures are used (3-body)
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init_nbor(Neighbor *nbor, const int nlocal,
                const int host_nlocal, const int nall,
                const int maxspecial, const int gpu_host,
                const int max_nbors, const double cutoff,
                const bool pre_cut, const int threads_per_atom,
                const bool ilist_map = false);
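
  /* Hedged sketch (illustrative values): allocate neighbor storage for a
   * pair style whose cutoff+skin is cut_skin.
   *
   *   Neighbor nbor;
   *   int err = device.init_nbor(&nbor, nlocal,
   *                              0,        // host_nlocal: no host split
   *                              nall, maxspecial,
   *                              0,        // gpu_host: host does no forces
   *                              300,      // max_nbors: initial rows (assumed)
   *                              cut_skin,
   *                              false,    // pre_cut: single kernel
   *                              device.threads_per_atom());
   */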

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
                    const int first_gpu, const int last_gpu);

  /// Set the single-precision PPPM object for asynchronous charge assignment
  void set_single_precompute(PPPM<numtyp,acctyp,
                             float,_lgpu_float4> *pppm);

  /// Set the double-precision PPPM object for asynchronous charge assignment
  void set_double_precompute(PPPM<numtyp,acctyp,
                             double,_lgpu_double4> *pppm);

  /// Estimate the overhead of GPU calls from multiple procs
  /** \param kernel_calls Number of kernel calls per timestep used to time
    *                     the estimated overhead
    * \param gpu_overhead Estimated GPU overhead per timestep (s)
    * \param gpu_driver_overhead Estimated driver overhead per timestep (s) **/
  void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
                             double &gpu_driver_overhead);

  /// Returns true if double precision is supported on the card
  inline bool double_precision() { return gpu->double_precision(); }

  /// Output a message with timing information
  void output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
                    Neighbor &nbor, const double avg_split,
                    const double max_bytes, const double gpu_overhead,
                    const double driver_overhead,
                    const int threads_per_atom, FILE *screen);

  /// Output a message with k-space timing information
  void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
                           UCL_Timer & time_map, UCL_Timer & time_rho,
                           UCL_Timer &time_interp,
                           Answer<numtyp,acctyp> &ans,
                           const double max_bytes, const double cpu_time,
                           const double cpu_idle_time, FILE *screen);

  /// Clear all memory on host and device associated with atom and nbor data
  void clear();

  /// Clear all memory on host and device
  void clear_device();

  /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
  inline void add_ans_object(Answer<numtyp,acctyp> *ans)
    { ans_queue.push(ans); }

  /// Add "answers" (force,energies,etc.) into LAMMPS structures
  inline double fix_gpu(double **f, double **tor, double *eatom, double **vatom,
                        double *virial, double &ecoul, int &error_flag) {
    error_flag=0;
    atom.data_unavail();
    if (ans_queue.empty()==false) {
      stop_host_timer();
      double evdw=0.0;
      while (ans_queue.empty()==false) {
        evdw += ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul,error_flag);
        ans_queue.pop();
      }
      return evdw;
    }
    return 0.0;
  }
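
  /* Hedged per-timestep sketch: each accelerated style enqueues its Answer
   * object after launching its kernels, then one fix_gpu() call drains the
   * queue into the LAMMPS arrays (pointer names are illustrative):
   *
   *   device.add_ans_object(&ans);
   *   int err_flag;
   *   double evdwl = device.fix_gpu(f, tor, eatom, vatom, virial,
   *                                 ecoul, err_flag);
   */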

  /// Start timer on host
  inline void start_host_timer()
    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }

  /// Stop timer on host
  inline void stop_host_timer() {
    if (_host_timer_started) {
      _cpu_full=MPI_Wtime()-_cpu_full;
      _host_timer_started=false;
    }
  }

  /// Return host time
  inline double host_time() { return _cpu_full; }
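
  /* Hedged sketch of the intended timing pattern; note that fix_gpu()
   * above already stops the timer when answers are pending.
   *
   *   device.start_host_timer();
   *   // ... host-side work for this step ...
   *   device.stop_host_timer();
   *   double cpu_seconds = device.host_time();
   */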

  /// Return host memory usage in bytes
  double host_memory_usage() const;

  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }
  /// My rank within all processes
  inline int world_me() const { return _world_me; }
  /// Total number of processes
  inline int world_size() const { return _world_size; }
  /// MPI Barrier for world
  inline void world_barrier() { MPI_Barrier(_comm_world); }
  /// Return the replica MPI communicator
  inline MPI_Comm & replica() { return _comm_replica; }
  /// My rank within replica communicator
  inline int replica_me() const { return _replica_me; }
  /// Number of procs in replica communicator
  inline int replica_size() const { return _replica_size; }
  /// Return the per-GPU MPI communicator
  inline MPI_Comm & gpu_comm() { return _comm_gpu; }
  /// Return my rank in the device communicator
  inline int gpu_rank() const { return _gpu_rank; }
  /// MPI Barrier for gpu
  inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
  /// Return the 'mode' for acceleration: GPU_FORCE, GPU_NEIGH or GPU_HYB_NEIGH
  inline int gpu_mode() const { return _gpu_mode; }
  /// Index of first device used by a node
  inline int first_device() const { return _first_device; }
  /// Index of last device used by a node
  inline int last_device() const { return _last_device; }
  /// Particle split defined in fix
  inline double particle_split() const { return _particle_split; }
  /// Return the initialization count for the device
  inline int init_count() const { return _init_count; }
  /// True if device is being timed
  inline bool time_device() const { return _time_device; }
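
  /* Hedged sketch: the accessors above support rank-dependent work among
   * the procs sharing one accelerator (illustrative only):
   *
   *   if (device.gpu_rank() == 0) {
   *     // one proc per device performs shared setup
   *   }
   *   device.gpu_barrier();  // sync all procs sharing this device
   */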

  /// Accelerator device configuration id
  inline int config_id() const { return _config_id; }
  /// Number of threads executing concurrently on same multiproc
  inline int simd_size() const { return _simd_size; }
  /// Return the number of threads accessing memory simultaneously
  inline int num_mem_threads() const { return _num_mem_threads; }
  /// 1 if horizontal vector operations enabled, 0 otherwise
  inline int shuffle_avail() const { return _shuffle_avail; }
  /// For OpenCL, 0 if fast-math options disabled, 1 enabled
  inline int fast_math() const { return _fast_math; }

  /// Return the number of threads per atom for pair styles
  inline int threads_per_atom() const { return _threads_per_atom; }
  /// Return the number of threads per atom for pair styles using charge
  inline int threads_per_charge() const { return _threads_per_charge; }
  /// Return the number of threads per atom for 3-body pair styles
  inline int threads_per_three() const { return _threads_per_three; }

  /// Return the smaller of the pair block size and the device max block size
  inline int pair_block_size() const { return _block_pair; }
  /// Return the block size for "bio" pair styles
  inline int block_bio_pair() const { return _block_bio_pair; }
  /// Return the block size for "ellipse" pair styles
  inline int block_ellipse() const { return _block_ellipse; }
  /// Return the block size for PPPM kernels
  inline int pppm_block() const { return _pppm_block; }
  /// Return the block size for neighbor build kernel
  inline int block_nbor_build() const { return _block_nbor_build; }
  /// Return the block size for neighbor binning
  inline int block_cell_2d() const { return _block_cell_2d; }
  /// Return the block size for atom mapping for neighbor builds
  inline int block_cell_id() const { return _block_cell_id; }

  /// Return the maximum number of atom types that can be used with shared mem
  inline int max_shared_types() const { return _max_shared_types; }
  /// Return the maximum number of atom types for shared mem with "bio" styles
  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
  /// Return the maximum order for PPPM splines
  inline int pppm_max_spline() const { return _pppm_max_spline; }

  /// Architecture the GPU code was compiled for (returns 0 for OpenCL)
  inline double ptx_arch() const { return _ptx_arch; }
  inline void set_simd_size(int simd_sz) { _simd_size = simd_sz; }

  // -------------------------- DEVICE DATA -------------------------

  /// Geryon Device
  UCL_Device *gpu;

  enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};
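
  // Hedged note on the modes above: GPU_FORCE offloads force computation
  // with neighbor lists built on the host, GPU_NEIGH also builds neighbor
  // lists on the device, and GPU_HYB_NEIGH is assumed to split neighbor
  // building between host and device.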

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> atom;

  // --------------------------- NBOR SHARED KERNELS ----------------

  /// Shared kernels for neighbor lists
  NeighborShared _neighbor_shared;

  // ------------------------ LONG RANGE DATA -----------------------

  // Long Range Data
  int _long_range_precompute;
  PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
  PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
  /// Perform precomputation for long-range charge assignment asynchronously
  inline void precompute(const int ago, const int nlocal, const int nall,
                         double **host_x, int *host_type, bool &success,
                         double *charge, double *boxlo, double *prd) {
    if (_long_range_precompute==1)
      pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
                              boxlo,prd);
    else if (_long_range_precompute==2)
      pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
                              boxlo,prd);
  }
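
  /* Hedged note: set_single_precompute()/set_double_precompute() above are
   * expected to arm the branch taken here (1 = single, 2 = double). A PPPM
   * style might use (illustrative names):
   *
   *   device.set_single_precompute(pppm_single_ptr);
   *   device.precompute(ago, nlocal, nall, x, type, success, q, boxlo, prd);
   */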

  inline std::string compile_string() { return _ocl_compile_string; }
  std::string compile_string_nofast();
  inline std::string ocl_config_name() { return _ocl_config_name; }

  template <class t>
  inline std::string toa(const t& in) {
    std::ostringstream o;
    o.precision(2);
    o << in;
    return o.str();
  }
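
  /* Hedged example: toa() formats any streamable value with two significant
   * digits, e.g. toa(3.14159) yields "3.1" and toa(42) yields "42". */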

 private:
  std::queue<Answer<numtyp,acctyp> *> ans_queue;
  int _init_count;
  bool _device_init, _host_timer_started, _time_device;
  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
      _replica_size;
  int _gpu_mode, _first_device, _last_device, _platform_id;
  double _particle_split;
  double _cpu_full;
  double _ptx_arch;
  double _user_cell_size; // -1 if the cutoff is used

  int _config_id, _simd_size, _num_mem_threads, _shuffle_avail, _fast_math;
  int _threads_per_atom, _threads_per_charge, _threads_per_three;
  int _block_pair, _block_bio_pair, _block_ellipse;
  int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
  int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;

  UCL_Program *dev_program;
  UCL_Kernel k_zero, k_info;
  bool _compiled;
  int compile_kernels();

  int _data_in_estimate, _data_out_estimate;

  std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string;
  int set_ocl_params(std::string, const std::string &);
};

}

#endif