File: lal_neighbor.h

package info (click to toggle)
lammps 20220106.git7586adbb6a%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 348,064 kB
  • sloc: cpp: 831,421; python: 24,896; xml: 14,949; f90: 10,845; ansic: 7,967; sh: 4,226; perl: 4,064; fortran: 2,424; makefile: 1,501; objc: 238; lisp: 163; csh: 16; awk: 14; tcl: 6
file content (305 lines) | stat: -rw-r--r-- 11,377 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
/***************************************************************************
                                  neighbor.h
                             -------------------
                            Nitin Dhamankar (Intel)
                            W. Michael Brown (ORNL)
                              Peng Wang (Nvidia)

  Class for handling neighbor lists

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov, penwang@nvidia.com
 ***************************************************************************/

#ifndef LAL_NEIGHBOR_H
#define LAL_NEIGHBOR_H

#include "lal_atom.h"
#include "lal_neighbor_shared.h"
#include <sstream>

#define IJ_SIZE 131072

#if !defined(USE_OPENCL) && !defined(USE_HIP)
#ifndef LAL_USE_OLD_NEIGHBOR
// Issue with incorrect results with CUDA >= 11.2
#if (CUDA_VERSION > 11019)
#define LAL_USE_OLD_NEIGHBOR
#endif
#endif
#endif

#if defined(USE_HIP)
#define LAL_USE_OLD_NEIGHBOR
#endif

namespace LAMMPS_AL {

class Neighbor {
 public:
  Neighbor() : _allocated(false), _use_packing(false), _old_max_nbors(0), _ncells(0) {}
  ~Neighbor() { clear(); }

  /// Determine whether neighbor unpacking should be used
  /** If false, twice as much memory is reserved to allow unpacking neighbors by
    * atom for coalesced access. **/
  void packing(const bool use_packing) { _use_packing=use_packing; }

  /// Clear any old data and setup for new LAMMPS run
  /** \param inum Initial number of particles whose neighbors stored on device
    * \param host_inum Initial number of particles whose nbors copied to host
    * \param max_nbors Factor (in percentage) applied to density calculated max
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device
    * \param gpu_host 0 if host will not perform force calculations,
    *                 1 if gpu_nbor is true, and host needs a half nbor list,
    *                 2 if gpu_nbor is true, and host needs a full nbor list
    * \param pre_cut True if cutoff test will be performed in separate kernel
    *                than the force kernel
    * \param threads_per_atom Number of threads used per atom for force
    *                         calculation
    * \param compile_flags Flags for JIT compiling
    * \param ilist_map true if ilist mapping data structures used (3-body) **/
  bool init(NeighborShared *shared, const int inum, const int host_inum,
            const int max_nbors, const int maxspecial, UCL_Device &dev,
            const int gpu_nbor, const int gpu_host, const bool pre_cut,
            const int block_cell_2d, const int block_cell_id,
            const int block_nbor_build, const int threads_per_atom,
            const int simd_size, const bool time_device,
            const std::string &compile_flags, const bool ilist_map);

  /// Set the cutoff+skin
  inline void set_cutoff(const double cutoff) {
    _cutoff=cutoff;

    #ifndef LAL_USE_OLD_NEIGHBOR
    _cell_size=_shared->cell_size();
    _auto_cell_size=_shared->auto_cell_size();
    const int cells_in_cutoff=static_cast<int>(ceil(_cutoff/_cell_size));
    if (cells_in_cutoff > 2) _cell_size=_cutoff*0.5;
    _old_ncellx = _old_ncelly = _old_ncellz = -1;
    #else
    _cell_size=cutoff;
    _auto_cell_size=false;
    #endif
  }

  /// Get the cutoff+skin
  inline double cutoff() { return _cutoff; }

  /// Check if there is enough memory for neighbor data and realloc if not
  /** \param inum Number of particles whose nbors will be stored on device
    * \param max_nbor Current max number of neighbors for a particle
    * \param success False if insufficient memory **/
  inline void resize(const int inum, int max_nbor, bool &success) {
    if (max_nbor == 0) max_nbor = 1;
    if (inum>_max_atoms || max_nbor>_max_nbors) {
      _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
      if (max_nbor>_max_nbors)
        _max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
      alloc(success);
    }
  }

  /// Check if there is enough memory for neighbor data and realloc if not
  /** \param inum Number of particles whose nbors will be stored on device
    * \param host_inum Number of particles whose nbors will be copied to host
    * \param max_nbor Current max number of neighbors for a particle
    * \param success False if insufficient memory **/
  inline void resize(const int inum, const int host_inum, int max_nbor,
                     bool &success) {
    if (max_nbor == 0) max_nbor = 1;
    if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) {
      _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
      _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
      if (max_nbor>_max_nbors)
        _max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
      alloc(success);
    }
  }

  inline void acc_timers(FILE *) {
    if (_nbor_time_avail) {
      if (_time_device) {
        time_nbor.add_to_total();
        if (_use_packing==false) time_kernel.add_to_total();
        if (_gpu_nbor==2) {
          time_hybrid1.add_to_total();
          time_hybrid2.add_to_total();
        }
        if (_maxspecial>0)
          time_transpose.add_to_total();
        _nbor_time_avail=false;
      }
    }
  }

  /// Free all memory on host and device
  void clear();

  /// Bytes per atom used on device
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by class
  double host_memory_usage() const;

  /// Returns the type of neighboring:
  /** - 0 if neighboring will be performed on host
    * - 1 if neighboring will be performed on device
    * - 2 if binning on host and neighboring on device **/
  inline int gpu_nbor() const { return _gpu_nbor; }

  /// Make a copy of unpacked nbor lists in the packed storage area (for gb)
  inline void copy_unpacked(const int inum, const int maxj)
    { ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); }

  /// Copy neighbor list from host (first time or from a rebuild)
  void get_host(const int inum, int *ilist, int *numj,
                int **firstneigh, const int block_size);

  /// Copy neighbor list from host for 3-body (first time or from a rebuild)
  void get_host3(const int inum, const int nlist, int *ilist, int *numj,
                 int **firstneigh, const int block_size);

  /// Return the stride in elements for each nbor row
  inline int nbor_pitch() const { return _nbor_pitch; }

  /// Return the maximum number of atoms that can currently be stored
  inline int max_atoms() const { return _max_atoms; }

  /// Return the maximum number of nbors for a particle based on current alloc
  inline int max_nbors() const { return _max_nbors; }

  /// Return the time spent binning on the CPU for hybrid neighbor builds
  inline double bin_time() const { return _bin_time; }

  /// Loop through neighbor count array and return maximum nbors for a particle
  inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
    int mn=0;
    for (int i=0; i<inum; i++)
      mn=std::max(mn,numj[ilist[i]]);
    return mn;
  }

  /// Build nbor list on the device
  template <class numtyp, class acctyp>
  void build_nbor_list(double **x, const int inum, const int host_inum,
                       const int nall, Atom<numtyp,acctyp> &atom,
                       double *sublo, double *subhi, tagint *tag,
                       int **nspecial, tagint **special, bool &success,
                       int &max_nbors, UCL_Vector<int,int> &error_flag);

  /// Return the number of bytes used on device
  inline double gpu_bytes() {
    double res = _gpu_bytes + _c_bytes + _cell_bytes;
    if (_gpu_nbor==0)
      res += 2*IJ_SIZE*sizeof(int);

    return res;
  }

  // ------------------------------- Data -------------------------------

  /// Device neighbor matrix
  /** - 1st row is i (index into atom data)
    * - 2nd row is numj (number of neighbors)
    * - 3rd row is starting location in packed nbors
    * - Remaining rows are the neighbors arranged for coalesced access **/
  UCL_D_Vec<int> dev_nbor;
  /// Starting location in packed neighbors used only by unpack kernel
  UCL_D_Vec<int> dev_packed_begin;
  /// Packed storage for neighbor lists copied from host
  UCL_D_Vec<int> dev_packed;
  /// Host buffer for copying neighbor lists
  UCL_H_Vec<int> host_packed;
  /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
  UCL_H_Vec<int> host_acc;
  /// Storage for accessing atom indices from the neighbor list (3-body)
  UCL_Vector<int,int> three_ilist;

  // ----------------- Data for GPU Neighbor Calculation ---------------

  /// Host/Device storage for device calculated neighbor lists
  /** - 1st row is numj
    * - Remaining rows are by atom, columns are nbors **/
  UCL_Vector<int,int> nbor_host;
  UCL_D_Vec<int> dev_numj_host;
  UCL_H_Vec<int> host_ilist;
  UCL_H_Vec<int*> host_jlist;
  /// Device storage for special neighbor counts
  UCL_D_Vec<int> dev_nspecial;
  /// Device storage for special neighbors
  UCL_D_Vec<tagint> dev_special, dev_special_t;
  /// Host/Device storage for number of particles per cell
  UCL_Vector<int,int> cell_counts;
  #ifndef LAL_USE_OLD_NEIGHBOR
  /// Host/Device storage for number of subgroups per cell
  UCL_Vector<int,int> cell_subgroup_counts;
  /// Host/Device storage for subgroup to cell mapping
  UCL_Vector<int,int> subgroup2cell;
  #endif
  int *cell_iter;

  /// Device timers
  UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;

  /// Effective SIMD width of neighbor build kernel
  inline int simd_size() { return _simd_size; }

  template <class t>
    inline std::string toa(const t& in) {
    std::ostringstream o;
    o.precision(2);
    o << in;
    return o.str();
  }

 private:
  NeighborShared *_shared;
  UCL_Device *dev;
  bool _allocated, _use_packing, _nbor_time_avail, _time_device;
  int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
  int _old_max_nbors;
  bool _gpu_host, _alloc_packed, _ilist_map, _auto_cell_size;
  double _cutoff, _bin_time, _max_neighbor_factor, _cell_size;
  enum UCL_MEMOPT _packed_permissions;

  double _gpu_bytes, _c_bytes, _cell_bytes;
  void alloc(bool &success);

  int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
  int _ncells, _threads_per_atom, _total_atoms;

  template <class numtyp, class acctyp>
  inline void resize_max_neighbors(int maxn, bool &success);

  // For viewing host arrays for data copy operations
  UCL_H_Vec<int> _host_offset;
  UCL_D_Vec<int> _nbor_offset, _acc_view, _numj_view;

  #ifndef LAL_USE_OLD_NEIGHBOR
  UCL_H_Vec<int> _host_bin_stencil;
  UCL_Const _bin_stencil;
  int _old_ncellx, _old_ncelly, _old_ncellz;
  #endif

  int _simd_size;
  inline void set_nbor_block_size(const int mn) {
    #ifdef LAL_USE_OLD_NEIGHBOR
    int desired=mn/(2*_simd_size);
    desired*=_simd_size;
    if (desired<_simd_size) desired=_simd_size;
    else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
    _block_nbor_build=desired;
    #endif
  }
};

}

#endif