File: lal_atom.h

/***************************************************************************
                                   atom.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for particle data management

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H

#include <cmath>
#include "mpi.h"

#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#ifndef LAL_NO_OCL_EV_JIT
#define LAL_OCL_EV_JIT
#endif
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#elif defined(USE_HIP)
#include "geryon/hip_timer.h"
#include "geryon/hip_mat.h"
#include "geryon/hip_kernel.h"
using namespace ucl_hip;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif

#ifdef USE_CUDPP
#include "cudpp.h"
#endif

#include "lal_precision.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class Atom {
 public:
  Atom();
  ~Atom() { clear(); }

  /// Maximum number of atoms that can be stored with current allocation
  inline int max_atoms() const { return _max_atoms; }
  /// Current number of local+ghost atoms stored
  inline int nall() const { return _nall; }

  /// Set number of local+ghost atoms for future copy operations
  inline void nall(const int n) { _nall=n; }

  /// Memory usage per atom in this class
  int bytes_per_atom() const;

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool init(const int nall, const bool charge, const bool rot,
            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
            const bool vel=false);
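
  // Usage sketch (illustrative; the variable names are assumptions, not
  // taken from this file). A style that stores charges but neither
  // quaternions nor velocities, with neighbor builds on the host:
  //
  //   Atom<numtyp,acctyp> atom;
  //   bool ok = atom.init(nlocal+nghost, /*charge=*/true, /*rot=*/false,
  //                       device, /*gpu_nbor=*/0);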

  /// Check if we have enough device storage and realloc if not
  /** Returns true if resized with any call during this timestep **/
  inline bool resize(const int nall, bool &success) {
    _nall=nall;
    if (nall>_max_atoms) {
      clear_resize();
      success = success && alloc(nall);
      _resized=true;
    }
    return _resized;
  }
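
  // Typical per-timestep pattern (a hedged sketch): callers check the
  // success flag before touching the device buffers.
  //
  //   bool success = true;
  //   atom.resize(nlocal + nghost, success);
  //   if (!success) { /* device allocation failed; fall back or abort */ }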

  /// If already initialized by another LAMMPS style, add fields as necessary
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
                  const bool bonds, const bool vel=false);

  /// Returns true if GPU is using charges
  bool charge() { return _charge; }

  /// Returns true if GPU is using quaternions
  bool quaternion() { return _rot; }

  /// Returns true if GPU is using velocities
  bool velocity() { return _vel; }

  /// Only free matrices of length inum or nall for resizing
  void clear_resize();

  /// Free all memory on host and device
  void clear();

  /// Return the total amount of host memory used by class in bytes
  double host_memory_usage() const;

  /// Sort arrays for neighbor list calculation on device
  void sort_neighbor(const int num_atoms);

  /// Add copy times to timers
  inline void acc_timers() {
    time_pos.add_to_total();
    if (_charge)
      time_q.add_to_total();
    if (_rot)
      time_quat.add_to_total();
    if (_vel)
      time_vel.add_to_total();
  }

  /// Zero the copy timers
  inline void zero_timers() {
    time_pos.zero();
    if (_charge)
      time_q.zero();
    if (_rot)
      time_quat.zero();
    if (_vel)
      time_vel.zero();
  }

  /// Return the total time for host/device data transfer
  /** Zeros the total so that the atom times are only included once **/
  inline double transfer_time() {
    double total=time_pos.total_seconds();
    time_pos.zero_total();
    if (_charge) {
      total+=time_q.total_seconds();
      time_q.zero_total();
    }
    if (_rot) {
      total+=time_quat.total_seconds();
      time_quat.zero_total();
    }
    if (_vel) {
      total+=time_vel.total_seconds();
      time_vel.zero_total();
    }

    return total+_time_transfer/1000.0;
  }

  /// Return the total time for data cast/pack
  /** Zeros the time so that atom times are only included once **/
  inline double cast_time()
    { double t=_time_cast; _time_cast=0.0; return t; }
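
  // Profiling sketch (illustrative): a driver can fold these totals into
  // its own counters once per output interval; both calls zero their
  // internal accumulators so the times are only counted once.
  //
  //   double copy_s = atom.transfer_time();  // host<->device copies (s)
  //   double cast_s = atom.cast_time();      // host-side cast/pack (s)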

  /// Pack LAMMPS atom type constants into matrix and copy to device
  template <class dev_typ, class t1>
  inline void type_pack1(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii]=static_cast<numtyp>(one[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view_offset(0,buffer,m_size*m_size);
    ucl_copy(dev_v,view,false);
  }
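
  // Example (a minimal sketch; host_cutsq, max_shared_types, dev_cutsq,
  // and host_write are assumed names): flatten an ntypes x ntypes host
  // table into a device vector padded to m_size x m_size, m_size >= ntypes.
  //
  //   atom.type_pack1(ntypes, max_shared_types, dev_cutsq, host_write,
  //                   host_cutsq);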

  /// Pack LAMMPS atom type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void type_pack2(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*2]=static_cast<numtyp>(one[i][j]);
        buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view_offset(0,buffer,m_size*m_size);
    ucl_copy(dev_v,view,false);
  }
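
  // Note (applies to both type_pack2 overloads): dev_typ is expected to
  // be a 2-component device type (e.g. the numtyp2 types from
  // lal_precision.h), since the two host tables are interleaved pairwise
  // in the buffer before the copy.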

  /// Pack LAMMPS atom type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void type_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
                         UCL_H_Vec<numtyp> &buffer, t1 ***one, t2 ***two) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        for (int k=0; k<n; k++) {
          buffer[ii*2]=static_cast<numtyp>(one[i][j][k]);
          buffer[ii*2+1]=static_cast<numtyp>(two[i][j][k]);
          ii++;
        }
      }
    }
    UCL_H_Vec<dev_typ> view;
    view.view_offset(0,buffer,n*n*n);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view_offset(0,buffer,m_size*m_size);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3, class t4>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three, t4 **four) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view_offset(0,buffer,m_size*m_size);
    ucl_copy(dev_v,view,false);
  }
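
  // Example (a hedged sketch modeled on typical pair-style usage; the
  // host_lj* names are assumptions): interleave four ntypes x ntypes
  // coefficient tables into one vector of 4-component device values.
  //
  //   atom.type_pack4(ntypes, max_shared_types, dev_lj, host_write,
  //                   host_lj1, host_lj2, host_lj3, host_lj4);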

  /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
                         UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
    for (int i=0; i<n; i++) {
      buffer[i*2]=static_cast<numtyp>(one[i][i]);
      buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
    }
    UCL_H_Vec<dev_typ> view;
    view.view_offset(0,buffer,n);
    ucl_copy(dev_v,view,false);
  }

  // -------------------------COPY TO GPU ----------------------------------

  /// Signal that we need to transfer atom data for next timestep
  inline void data_unavail()
    { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }

  typedef struct { double x,y,z; } vec3d;
  typedef struct { numtyp x,y,z,w; } vec4d_t;

  /// Cast positions and types to write buffer
  inline void cast_x_data(double **host_ptr, const int *host_type) {
    if (_x_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
      #else
      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
      vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
      #if (LAL_USE_OMP == 1)
      #pragma omp parallel for schedule(static)
      #endif
      for (int i=0; i<_nall; i++) {
        xp[i].x=host_p[i].x;
        xp[i].y=host_p[i].y;
        xp[i].z=host_p[i].z;
        xp[i].w=host_type[i];
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }
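
  // Layout note (sketch): each atom occupies 4 consecutive values in x
  // (position in .x/.y/.z, LAMMPS type cast into .w), so a device kernel
  // can fetch both with a single 4-wide load, e.g. (assumed kernel-side
  // names):
  //
  //   numtyp4 ix = x_[i];
  //   int itype = ix.w;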

  /// Copy positions and types to device asynchronously
  /** Copies nall() elements **/
  inline void add_x_data(double **host_ptr, int *host_type) {
    time_pos.start();
    if (_x_avail==false) {
      #ifdef GPU_CAST
      x_cast.update_device(_nall*3,true);
      type_cast.update_device(_nall,true);
      int block_size=64;
      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
      k_cast_x.set_size(GX,block_size);
      k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
      #else
      x.update_device(_nall*4,true);
      #endif
      _x_avail=true;
    }
    time_pos.stop();
  }

  /// Calls cast_x_data and add_x_data and times the routines
  inline void cast_copy_x(double **host_ptr, int *host_type) {
    cast_x_data(host_ptr,host_type);
    add_x_data(host_ptr,host_type);
  }

  /// Cast charges to write buffer
  template<class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
    if (_q_avail==false) {
      double t=MPI_Wtime();
      // If double precision, still memcpy for async transfers
      if (_host_view) {
        q.host.view((numtyp*)host_ptr,_nall,*dev);
        q.device.view(q.host);
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
      else
        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
        #pragma omp parallel for simd schedule(static)
        #elif (LAL_USE_OMP_SIMD == 1)
        #pragma omp simd
        #endif
        for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
  }
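
  // Note: when the device addresses host memory directly (_host_view is
  // true), this routine only establishes views over the LAMMPS array
  // rather than copying or casting it.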

  /// Copy charges to device asynchronously
  inline void add_q_data() {
    time_q.start();
    if (_q_avail==false) {
      q.update_device(_nall,true);
      _q_avail=true;
    }
    time_q.stop();
  }

  /// Cast quaternions to write buffer
  template<class cpytyp>
  inline void cast_quat_data(cpytyp *host_ptr) {
    if (_quat_avail==false) {
      double t=MPI_Wtime();
      if (_host_view) {
        quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
        quat.device.view(quat.host);
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
      else
        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
        #pragma omp parallel for simd schedule(static)
        #elif (LAL_USE_OMP_SIMD == 1)
        #pragma omp simd
        #endif
        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy quaternions to device
  /** Copies nall()*4 elements **/
  inline void add_quat_data() {
    time_quat.start();
    if (_quat_avail==false) {
      quat.update_device(_nall*4,true);
      _quat_avail=true;
    }
    time_quat.stop();
  }

  /// Cast velocities and tags to write buffer
  inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
    if (_v_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(v_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(tag_cast.host.begin(),host_tag,_nall*sizeof(tagint));
      #else
      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
      vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
      #if (LAL_USE_OMP == 1)
      #pragma omp parallel for schedule(static)
      #endif
      for (int i=0; i<_nall; i++) {
        vp[i].x=host_p[i].x;
        vp[i].y=host_p[i].y;
        vp[i].z=host_p[i].z;
        vp[i].w=host_tag[i];
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy velocities and tags to device asynchronously
  /** Copies nall() elements **/
  inline void add_v_data(double **host_ptr, tagint *host_tag) {
    time_vel.start();
    if (_v_avail==false) {
      #ifdef GPU_CAST
      v_cast.update_device(_nall*3,true);
      tag_cast.update_device(_nall,true);
      int block_size=64;
      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
      k_cast_x.set_size(GX,block_size);
      k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
      #else
      v.update_device(_nall*4,true);
      #endif
      _v_avail=true;
    }
    time_vel.stop();
  }

  /// Calls cast_v_data and add_v_data and times the routines
  inline void cast_copy_v(double **host_ptr, tagint *host_tag) {
    cast_v_data(host_ptr,host_tag);
    add_v_data(host_ptr,host_tag);
  }
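
  // Per-timestep data-flow sketch (illustrative; host_x, host_q, host_v,
  // host_type, and host_tag are assumed driver-side arrays):
  //
  //   atom.data_unavail();                  // mark device copies stale
  //   bool success = true;
  //   atom.resize(nlocal + nghost, success);
  //   atom.cast_copy_x(host_x, host_type);  // positions + types
  //   if (atom.charge()) { atom.cast_q_data(host_q); atom.add_q_data(); }
  //   if (atom.velocity()) atom.cast_copy_v(host_v, host_tag);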

  /// Add in casting time from additional data (seconds)
  inline void add_cast_time(double t) { _time_cast+=t; }

  /// Add in transfer time from additional data (ms)
  inline void add_transfer_time(double t) { _time_transfer+=t; }

  /// Return number of bytes used on device
  inline double max_gpu_bytes()
    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }

  /// Returns true if the device is addressing memory on the host
  inline bool host_view() { return _host_view; }

  // ------------------------------ DATA ----------------------------------

  /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
  UCL_Vector<numtyp,numtyp> x;
  /// Charges
  UCL_Vector<numtyp,numtyp> q;
  /// Quaternions
  UCL_Vector<numtyp,numtyp> quat;
  /// Velocities
  UCL_Vector<numtyp,numtyp> v;

  #ifdef GPU_CAST
  /// Double-precision staging buffers used when casting on the device
  UCL_Vector<double,double> x_cast;
  UCL_Vector<int,int> type_cast;
  UCL_Vector<double,double> v_cast;
  UCL_Vector<tagint,tagint> tag_cast;
  #endif

  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<unsigned> dev_cell_id;
  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<int> dev_particle_id;

  /// Atom tag information for device nbor builds
  UCL_D_Vec<tagint> dev_tag;

  /// Cell list identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_cell_id;
  /// Cell list identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_particle_id;

  /// Device timers
  UCL_Timer time_pos, time_q, time_quat, time_vel;

  /// Geryon device
  UCL_Device *dev;

 private:
  #ifdef GPU_CAST
  UCL_Program *atom_program;
  UCL_Kernel k_cast_x;
  void compile_kernels(UCL_Device &dev);
  #endif

  bool _compiled;

  // True if data has been copied to device already
  bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;

  bool alloc(const int nall);

  bool _allocated, _rot, _charge, _bonds, _vel, _other;
  int _max_atoms, _nall, _gpu_nbor;
  bool _host_view;
  double _time_cast, _time_transfer;

  double _max_gpu_bytes;

  #ifdef USE_CUDPP
  CUDPPConfiguration sort_config;
  CUDPPHandle sort_plan;
  #endif

  #ifdef USE_HIP_DEVICE_SORT
  unsigned* sort_out_keys = nullptr;
  int* sort_out_values = nullptr;
  void* sort_temp_storage = nullptr;
  size_t sort_temp_storage_size = 0;
  size_t sort_out_size = 0;
  #endif
};

} // namespace LAMMPS_AL

#endif // PAIR_GPU_ATOM_H