File: lal_balance.h

/***************************************************************************
                                  balance.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for host-device load balancing

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef LAL_BALANCE_H
#define LAL_BALANCE_H

#include "lal_device.h"
#include <cmath>

#define _HD_BALANCE_EVERY 25    // timesteps between device-time measurements
#define _HD_BALANCE_WEIGHT 0.5  // averaging weight (not referenced in this header)
#define _HD_BALANCE_GAP 1.10    // bias factor applied to the computed device split

namespace LAMMPS_AL {

/// Host/device load balancer
template<class numtyp, class acctyp>
class Balance {
 public:
  inline Balance() : _init_done(false), _measure_this_step(false) {}
  inline ~Balance() { clear(); }

  /// Clear any old data and setup for new LAMMPS run
  inline void init(Device<numtyp, acctyp> *gpu, const int gpu_nbor,
                   const double split);

  /// Clear all host and device data
  inline void clear() {
    if (_init_done) {
      _device_time.clear();
      _measure_this_step=false;
      _init_done=false;
    }
  }

  /// Return the timestep since initialization
  inline int timestep() { return _timestep; }

  /// Get a count of the number of particles host will handle for initial alloc
  inline int first_host_count(const int nlocal, const double gpu_split,
                              const int gpu_nbor) const {
    int host_nlocal=0;
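    // With the neighbor list built on the device and a fixed split, reserve
    // the host's share of particles up front; for dynamic balancing
    // (gpu_split<0) assume the host starts with 5% of the particles.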
    if (gpu_nbor>0 && gpu_split!=1.0) {
      if (gpu_split>0)
        host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
      else
        host_nlocal=static_cast<int>(ceil(0.05*nlocal));
    }
    return host_nlocal;
  }

  /// Return the number of particles the device will handle this timestep
  inline int get_gpu_count(const int ago, const int inum_full);

  /// Return the average fraction of particles handled by device on all procs
  inline double all_avg_split() {
    if (_load_balance) {
      double _all_avg_split=0.0;
      MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
                 _device->replica());
      _all_avg_split/=_device->replica_size();
      return _all_avg_split/_avg_count;
    } else
      return _actual_split;
  }

  /// If CPU neighboring, allow the device fraction to increase on 2nd timestep
  inline int ago_first(int ago) const
    { if (_avg_count==1 && _actual_split<_desired_split) ago=0; return ago; }

  /// Start the timer for asynchronous device execution
  inline void start_timer() {
    if (_measure_this_step) {
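      // Sync the device queue and the ranks sharing the device so the host
      // and device timers start from a common point before the asynchronous
      // kernel launches.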
      _device->gpu->sync();
      _device->gpu_barrier();
      _device->start_host_timer();
      _device_time.start();
      _device->gpu->sync();
      _device->gpu_barrier();
    }
  }

  /// Stop the timer for asynchronous device execution
  inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }

  /// Calculate the new host/device split based on the cpu and device times
  /** \note Only does calculation every _HD_BALANCE_EVERY timesteps
            (and first 10) **/
  inline void balance(const double cpu_time);

  /// Calls balance() and then get_gpu_count()
  inline int balance(const int ago,const int inum_full,const double cpu_time) {
    balance(cpu_time);
    return get_gpu_count(ago,inum_full);
  }

 private:
  Device<numtyp,acctyp> *_device;
  UCL_Timer _device_time;
  bool _init_done;
  int _gpu_nbor;

  bool _load_balance;
  double _actual_split, _avg_split, _desired_split, _max_split;
  int _avg_count;

  bool _measure_this_step;
  int _inum, _inum_full, _timestep;
};
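
// Illustrative call sequence from an accelerated compute routine (a hedged
// sketch; the variable names below are hypothetical and not part of this
// header):
//
//   Balance<numtyp,acctyp> hd_balancer;
//   hd_balancer.init(device_ptr,gpu_nbor,split);            // once per run
//   ...
//   int inum=hd_balancer.balance(ago,inum_full,cpu_time);   // each timestep
//   hd_balancer.start_timer();
//   // ... enqueue device kernels for the first inum particles ...
//   hd_balancer.stop_timer();
//   // ... compute the remaining inum_full-inum particles on the host ...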

#define BalanceT Balance<numtyp,acctyp>

template <class numtyp, class acctyp>
void BalanceT::init(Device<numtyp, acctyp> *gpu,
                           const int gpu_nbor, const double split) {
  clear();
  _gpu_nbor=gpu_nbor;
  _init_done=true;

  _device=gpu;
  _device_time.init(*gpu->gpu);

  if (split<0.0) {
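    // A negative split requests dynamic host/device balancing; start with
    // 90% of the particles on the device and adjust from timings.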
    _load_balance=true;
    _desired_split=0.90;
  } else {
    _load_balance=false;
    _desired_split=split;
  }
  _actual_split=_desired_split;
  _avg_split=0.0;
  _avg_count=0;
  _timestep=0;
}

template <class numtyp, class acctyp>
int BalanceT::get_gpu_count(const int ago, const int inum_full) {
  _measure_this_step=false;
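  // With dynamic balancing, time the device on every one of the first few
  // steps and afterwards only every _HD_BALANCE_EVERY timesteps; a neighbor
  // rebuild (ago==0) picks up the most recently computed split.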
  if (_load_balance) {
    if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
      _measure_this_step=true;
      _inum_full=inum_full;
    }
    if (ago==0) {
      _actual_split=_desired_split;
      _max_split=_desired_split;
    }
  }
  _inum=static_cast<int>(floor(_actual_split*inum_full));
  if (_inum==0) _inum++;
  _timestep++;
  return _inum;
}

template <class numtyp, class acctyp>
void BalanceT::balance(const double cpu_time) {
  if (_measure_this_step) {
    _measure_this_step=false;
    double gpu_time=_device_time.seconds();

    double max_gpu_time;
    MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
                  _device->gpu_comm());

    if (_inum_full==_inum) {
      _desired_split=1.0;
      return;
    }

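    // Estimate the per-atom cost of the host's share of the last compute and
    // the host time spent on other work, then the number of atoms the host
    // could have handled while the device kernels were running.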
    double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
    double cpu_other_time=_device->host_time()-cpu_time;
    int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
                                   cpu_time_per_atom);

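    // Convert that estimate to a device fraction, bias it slightly toward
    // the device by _HD_BALANCE_GAP, and clamp to [0,1].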
    double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
    _desired_split=split*_HD_BALANCE_GAP;
    if (_desired_split>1.0)
      _desired_split=1.0;
    if (_desired_split<0.0)
      _desired_split=0.0;

    if (_gpu_nbor==0) {
      if (_desired_split<_max_split)
        _actual_split=_desired_split;
      else
        _actual_split=_max_split;
    }
  }
  _avg_split+=_desired_split;
  _avg_count++;
}

} // namespace LAMMPS_AL

#endif