/***************************************************************************
                                 balance.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for host-device load balancing

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/
#ifndef LAL_BALANCE_H
#define LAL_BALANCE_H

#include "lal_device.h"
#include <cmath>
#include <mpi.h>   // MPI_Reduce / MPI_Allreduce are called directly below
#define _HD_BALANCE_EVERY 25
#define _HD_BALANCE_WEIGHT 0.5
#define _HD_BALANCE_GAP 1.10
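// A reading of the tuning knobs above, based on how they are used below:
//   _HD_BALANCE_EVERY  - re-measure the host/device split every this many
//                        timesteps (after the initial measured steps)
//   _HD_BALANCE_WEIGHT - blending weight for split updates; defined here but
//                        not referenced in this header
//   _HD_BALANCE_GAP    - bias factor applied to a measured split to favor
//                        the device slightly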
namespace LAMMPS_AL {
/// Host/device load balancer
template<class numtyp, class acctyp>
class Balance {
public:
inline Balance() : _init_done(false), _measure_this_step(false) {}
inline ~Balance() { clear(); }
/// Clear any old data and set up for a new LAMMPS run
inline void init(Device<numtyp, acctyp> *gpu, const int gpu_nbor,
const double split);
/// Clear all host and device data
inline void clear() {
if (_init_done) {
_device_time.clear();
_measure_this_step=false;
_init_done=false;
}
}
/// Return the number of timesteps since initialization
inline int timestep() { return _timestep; }
/// Get a count of the particles the host will handle for the initial allocation
inline int first_host_count(const int nlocal, const double gpu_split,
const int gpu_nbor) const {
int host_nlocal=0;
if (gpu_nbor>0 && gpu_split!=1.0) {
if (gpu_split>0)
host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
else
host_nlocal=static_cast<int>(ceil(0.05*nlocal));
}
return host_nlocal;
}
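  // Example with hypothetical numbers: with neighbor builds on the device
  // (gpu_nbor>0), nlocal=1000 and gpu_split=0.9 gives
  // host_nlocal=ceil(0.1*1000)=100; with a dynamic split (gpu_split<0),
  // 5% of the particles are reserved for the host.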
/// Return the number of particles the device will handle this timestep
inline int get_gpu_count(const int ago, const int inum_full);
/// Return the average fraction of particles handled by the device, averaged
/// over all procs (the reduction targets rank 0 of the replica communicator,
/// so the result is meaningful on that rank)
inline double all_avg_split() {
  if (_load_balance) {
    double _all_avg_split=0.0;
    MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
               _device->replica());
    _all_avg_split/=_device->replica_size();
    return _all_avg_split/_avg_count;
  } else
    return _actual_split;
}
/// If CPU neighboring, allow the device fraction to increase on the 2nd timestep
inline int ago_first(int ago) const {
  if (_avg_count==1 && _actual_split<_desired_split) ago=0;
  return ago;
}
/// Start the timer for asynchronous device execution
inline void start_timer() {
if (_measure_this_step) {
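    // Drain queued device work and synchronize the procs sharing the device
    // so host and device timers start from a common point; the second
    // sync/barrier below appears to bracket the timer starts the same way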
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
_device_time.start();
_device->gpu->sync();
_device->gpu_barrier();
}
}
/// Stop the timer for asynchronous device execution
inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }
/// Calculate the new host/device split based on the CPU and device times
/** \note Only does the calculation every _HD_BALANCE_EVERY timesteps
    (and during the first 10 timesteps) **/
inline void balance(const double cpu_time);
/// Calls balance() and then get_gpu_count()
inline int balance(const int ago,const int inum_full,const double cpu_time) {
balance(cpu_time);
return get_gpu_count(ago,inum_full);
}
private:
  Device<numtyp,acctyp> *_device;   // device manager (timers, communicators)
  UCL_Timer _device_time;           // timer for device-side work
  bool _init_done;
  int _gpu_nbor;                    // nonzero if neighbor lists built on device
  bool _load_balance;               // true when the split is chosen dynamically
  double _actual_split, _avg_split, _desired_split, _max_split;
  int _avg_count;                   // number of accumulated split samples
  bool _measure_this_step;          // true when this step's times are measured
  int _inum, _inum_full, _timestep;
};
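// A minimal usage sketch (hypothetical driver code; the Balance calls follow
// this header, everything else is illustrative):
//
//   Balance<numtyp,acctyp> balance;
//   balance.init(device_ptr, gpu_nbor, split);         // split<0 -> dynamic
//   // each timestep:
//   int inum = balance.get_gpu_count(ago, inum_full);  // device particle count
//   balance.start_timer();
//   //   ... launch device kernels on the first inum particles ...
//   balance.stop_timer();
//   //   ... compute the remaining inum_full-inum particles on the host ...
//   balance.balance(cpu_time);                         // update the split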
#define BalanceT Balance<numtyp,acctyp>
template <class numtyp, class acctyp>
void BalanceT::init(Device<numtyp, acctyp> *gpu,
const int gpu_nbor, const double split) {
clear();
_gpu_nbor=gpu_nbor;
_init_done=true;
_device=gpu;
_device_time.init(*gpu->gpu);
  if (split<0.0) {
    // a negative split requests dynamic balancing; start with 90% on the device
    _load_balance=true;
    _desired_split=0.90;
} else {
_load_balance=false;
_desired_split=split;
}
_actual_split=_desired_split;
_avg_split=0.0;
_avg_count=0;
_timestep=0;
}
template <class numtyp, class acctyp>
int BalanceT::get_gpu_count(const int ago, const int inum_full) {
  _measure_this_step=false;
  if (_load_balance) {
    // measure timings on the first steps of a run, then periodically
    if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
      _measure_this_step=true;
      _inum_full=inum_full;
    }
    // a reneighbor (ago==0) is the point where a new split takes effect
    if (ago==0) {
      _actual_split=_desired_split;
      _max_split=_desired_split;
    }
  }
  // the device gets the first _inum particles; always give it at least one
  _inum=static_cast<int>(floor(_actual_split*inum_full));
  if (_inum==0) _inum++;
_timestep++;
return _inum;
}
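// Example with hypothetical numbers: with _actual_split=0.9 and
// inum_full=10000, the device is assigned floor(0.9*10000)=9000 particles
// and the host the remaining 1000.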
template <class numtyp, class acctyp>
void BalanceT::balance(const double cpu_time) {
if (_measure_this_step) {
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double max_gpu_time;
MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
    // if the device already handles every particle, keep the full split
    if (_inum_full==_inum) {
      _desired_split=1.0;
      return;
    }
    // host cost per particle for this step's host share
    double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
    // host time not spent on per-particle force work
    double cpu_other_time=_device->host_time()-cpu_time;
    // particles the host could finish while waiting on the device
    int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
                                   cpu_time_per_atom);
    // new split, biased toward the device by _HD_BALANCE_GAP
    double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
    _desired_split=split*_HD_BALANCE_GAP;
if (_desired_split>1.0)
_desired_split=1.0;
if (_desired_split<0.0)
_desired_split=0.0;
    // with host neighboring, apply the new split immediately, capped at the
    // fraction that was in effect at the last neighbor list rebuild
    if (_gpu_nbor==0) {
      if (_desired_split<_max_split)
        _actual_split=_desired_split;
      else
        _actual_split=_max_split;
    }
}
_avg_split+=_desired_split;
_avg_count++;
}
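// Worked example with hypothetical numbers: suppose inum_full=10000,
// inum=9000, cpu_time=0.02 s for the 1000 host particles (2e-5 s/atom),
// max_gpu_time=0.03 s and cpu_other_time=0.004 s. Then
// host_inum=(0.03-0.004)/2e-5=1300, split=(10000-1300)/10000=0.87, and
// _desired_split=0.87*1.10=0.957.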
} // namespace LAMMPS_AL
#endif