File: lal_base_three.cpp

package info (click to toggle)
lammps 0~20161109.git9806da6-7
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 248,460 kB
  • ctags: 80,635
  • sloc: cpp: 701,185; python: 33,420; fortran: 26,434; ansic: 11,340; sh: 6,108; perl: 4,104; makefile: 2,891; xml: 2,590; f90: 1,690; objc: 238; lisp: 169; tcl: 61; csh: 16; awk: 14
file content (366 lines) | stat: -rw-r--r-- 11,919 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
/***************************************************************************
                               base_three.cpp
                             -------------------
                            W. Michael Brown (ORNL)

  Base class for pair styles with per-particle data for position and type

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                : Tue April 2, 2013
    email                : brownw@ornl.gov
 ***************************************************************************/

#include "lal_base_three.h"
using namespace LAMMPS_AL;
#define BaseThreeT BaseThree<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> global_device;

template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0)  {
  device=&global_device;
  ans=new Answer<numtyp,acctyp>();
  nbor=new Neighbor();
  #ifdef THREE_CONCURRENT
  ans2=new Answer<numtyp,acctyp>();
  #endif
}

template <class numtyp, class acctyp>
BaseThreeT::~BaseThree() {
  delete ans;
  delete nbor;
  #ifdef THREE_CONCURRENT
  delete ans2;
  #endif
}

template <class numtyp, class acctyp>
int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
  int b=device->atom.bytes_per_atom()+ans->bytes_per_atom()+
         nbor->bytes_per_atom(max_nbors);
  #ifdef THREE_CONCURRENT
  b+=ans2->bytes_per_atom();
  #endif
  return b;
}

template <class numtyp, class acctyp>
int BaseThreeT::init_three(const int nlocal, const int nall,
                           const int max_nbors, const int maxspecial,
                           const double cell_size, const double gpu_split,
                           FILE *_screen, const void *pair_program,
                           const char *k_two, const char *k_three_center,
                           const char *k_three_end) {
  screen=_screen;

  int gpu_nbor=0;
  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=1;
  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;
  _gpu_nbor=gpu_nbor;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
  if (host_nlocal>0)
    _gpu_host=1;

  _threads_per_atom=device->threads_per_atom();
  if (_threads_per_atom>1 && gpu_nbor==0) {
    nbor->packing(true);
    _nbor_data=&(nbor->dev_packed);
  } else
    _nbor_data=&(nbor->dev_nbor);
  if (_threads_per_atom*_threads_per_atom>device->warp_size())
    return -10;

  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
                           maxspecial,_gpu_host,max_nbors,cell_size,false,
                           _threads_per_atom);
  if (success!=0)
    return success;

  ucl_device=device->gpu;
  atom=&device->atom;

  #ifdef THREE_CONCURRENT
  _end_command_queue=ucl_device->num_queues();
  ucl_device->push_command_queue();
  if (!ans2->init(ans->max_inum(),false,false,*(device->gpu)))
    return -3;
  ans2->cq(_end_command_queue);
  #endif

  _block_pair=device->pair_block_size();
  _block_size=device->block_ellipse();
  compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_nbor,gpu_split);

  // Initialize timers for the selected GPU
  time_pair.init(*ucl_device);
  time_pair.zero();

  pos_tex.bind_float(atom->x,4);

  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  #ifdef THREE_CONCURRENT
  _max_an_bytes+=ans2->gpu_bytes();
  #endif

  return 0;
}

template <class numtyp, class acctyp>
void BaseThreeT::estimate_gpu_overhead() {
  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}

template <class numtyp, class acctyp>
void BaseThreeT::clear_atomic() {
  // Output any timing information
  acc_timers();
  double avg_split=hd_balancer.all_avg_split();
  _gpu_overhead*=hd_balancer.timestep();
  _driver_overhead*=hd_balancer.timestep();
  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);

  if (_compiled) {
    k_three_center.clear();
    k_three_end.clear();
    k_three_end_vatom.clear();
    k_pair.clear();
    delete pair_program;
    _compiled=false;
  }

  time_pair.clear();
  hd_balancer.clear();

  nbor->clear();
  ans->clear();
  #ifdef THREE_CONCURRENT
  ans2->clear();
  assert(ucl_device->num_queues()==_end_command_queue+1);
  // ucl_device will clean up the command queue in its destructor
//  ucl_device->pop_command_queue();
  #endif
  device->clear();
}

// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
                              int *ilist, int *numj, int **firstneigh,
                              bool &success) {
  success=true;

  int mn=nbor->max_nbor_loop(nlist,numj,ilist);
  resize_atom(inum,nall,success);
  resize_local(nall,mn,success);
  if (!success)
    return NULL;

  // originally the requirement that nall == nlist was enforced
  // to allow direct indexing neighbors of neighbors after re-arrangement
//  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());

  // now the requirement is removed, allowing to work within pair hybrid
  nbor->get_host(nlist,ilist,numj,firstneigh,block_size());

  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  #ifdef THREE_CONCURRENT
  bytes+=ans2->gpu_bytes();
  #endif
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;

  return ilist;
}

// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
                                       const int nall, double **host_x,
                                       int *host_type, double *sublo,
                                       double *subhi, tagint *tag,
                                       int **nspecial, tagint **special,
                                       bool &success) {
  success=true;
  resize_atom(inum,nall,success);
  resize_local(nall,host_inum,nbor->max_nbors(),success);
  if (!success)
    return 0;
  atom->cast_copy_x(host_x,host_type);

  int mn;
  nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
                        nspecial, special, success, mn);

  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  #ifdef THREE_CONCURRENT
  bytes+=ans2->gpu_bytes();
  #endif
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;
  return mn;
}

// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
                         const int nlist, double **host_x, int *host_type,
                         int *ilist, int *numj, int **firstneigh,
                         const bool eflag, const bool vflag, const bool eatom,
                         const bool vatom, int &host_start,
                         const double cpu_time, bool &success) {
  acc_timers();
  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    resize_atom(0,nall,success);
    zero_timers();
    return;
  }

  int ago=hd_balancer.ago_first(f_ago);
  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
  ans->inum(inum);
  #ifdef THREE_CONCURRENT
  ans2->inum(inum);
  #endif
  host_start=inum;

  if (ago==0) {
    reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
    if (!success)
      return;
  }

  atom->cast_x_data(host_x,host_type);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);

  int evatom=0;
  if (eatom || vatom)
    evatom=1;
  #ifdef THREE_CONCURRENT
  ucl_device->sync();
  #endif
  loop(eflag,vflag,evatom);
  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
  device->add_ans_object(ans);
  #ifdef THREE_CONCURRENT
  ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
  device->add_ans_object(ans2);
  #endif
  hd_balancer.stop_timer();
}

// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full,
                                 const int nall, double **host_x, int *host_type,
                                 double *sublo, double *subhi, tagint *tag,
                                 int **nspecial, tagint **special, const bool eflag,
                                 const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
                                 int **ilist, int **jnum,
                                 const double cpu_time, bool &success) {
  acc_timers();
  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    resize_atom(0,nall,success);
    zero_timers();
    return NULL;
  }

  hd_balancer.balance(cpu_time);
  int inum=hd_balancer.get_gpu_count(ago,inum_full);
  ans->inum(inum);
  #ifdef THREE_CONCURRENT
  ans2->inum(inum);
  #endif
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                    sublo, subhi, tag, nspecial, special, success);
    if (!success)
      return NULL;
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
  *ilist=nbor->host_ilist.begin();
  *jnum=nbor->host_acc.begin();

  int evatom=0;
  if (eatom || vatom)
    evatom=1;
  #ifdef THREE_CONCURRENT
  ucl_device->sync();
  #endif
  loop(eflag,vflag,evatom);
  ans->copy_answers(eflag,vflag,eatom,vatom);
  device->add_ans_object(ans);
  #ifdef THREE_CONCURRENT
  ans2->copy_answers(eflag,vflag,eatom,vatom);
  device->add_ans_object(ans2);
  #endif
  hd_balancer.stop_timer();

  return nbor->host_jlist.begin()-host_start;
}

template <class numtyp, class acctyp>
double BaseThreeT::host_memory_usage_atomic() const {
  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
         4*sizeof(numtyp)+sizeof(BaseThree<numtyp,acctyp>);
}

template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                 const char *ktwo, const char *kthree_center,
                                 const char *kthree_end) {
  if (_compiled)
    return;

  std::string vatom_name=std::string(kthree_end)+"_vatom";

  pair_program=new UCL_Program(dev);
  pair_program->load_string(pair_str,device->compile_string().c_str());
  k_three_center.set_function(*pair_program,kthree_center);
  k_three_end.set_function(*pair_program,kthree_end);
  k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
  k_pair.set_function(*pair_program,ktwo);
  pos_tex.get_texture(*pair_program,"pos_tex");

  #ifdef THREE_CONCURRENT
  k_three_end.cq(ucl_device->cq(_end_command_queue));
  k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
  #endif

  _compiled=true;
}

template class BaseThree<PRECISION,ACC_PRECISION>;