// ---------------------------------------------------------------------
//
// Copyright (C) 2017 - 2018 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE at
// the top level of the deal.II distribution.
//
// ---------------------------------------------------------------------
#include <deal.II/base/process_grid.h>
#ifdef DEAL_II_WITH_SCALAPACK
#include <deal.II/lac/scalapack.templates.h>
DEAL_II_NAMESPACE_OPEN
namespace
{
/**
* Internal function to determine the dimensions of the process grid based on
* the total number of cores, the matrix dimensions and the matrix block sizes.
*
* Amesos heuristics:
* https://github.com/trilinos/Trilinos/blob/master/packages/amesos/src/Amesos_Scalapack.cpp#L142-L166
*
* Elemental default grid: El::Grid::Grid(mpi_comm,...)
* https://github.com/elemental/Elemental/blob/master/src/core/Grid.cpp#L67-L91
*/
inline
std::pair<int,int> compute_processor_grid_sizes(MPI_Comm mpi_comm,
const unsigned int m, const unsigned int n,
const unsigned int block_size_m, const unsigned int block_size_n)
{
// Few notes from the ScaLAPACK user guide:
// It is possible to predict the best grid shape given the number of processes available:
// Pr x Pc <= P
// This, however, depends on the task to be done.
// LU, QR and QL factorizations perform better for "flat" process grids (Pr < Pc).
// For large N, Pc = 2*Pr is a good choice, whereas for small N, one should choose a small Pr.
// Square or near-square grids are more optimal for Cholesky factorization.
// LQ and RQ factorizations take advantage of "tall" grids (Pr > Pc).
// Below we always try to create 2D processor grids:
int n_processes;
MPI_Comm_size(mpi_comm, &n_processes);
// Get the maximum number of cores we can occupy for a rectangular dense matrix
// with rectangular blocks when every core owns only a single block:
const int n_processes_heuristic = int(std::ceil((1.*m)/block_size_m))*
int(std::ceil((1.*n)/block_size_n));
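// For instance, a 320x320 matrix with 32x32 blocks consists of
// ceil(320/32) * ceil(320/32) = 10 * 10 = 100 blocks, so at most
// 100 cores can each own at least one block.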
const int Np = std::min(n_processes_heuristic, n_processes);
// Now we need to split Np into Pr x Pc. Assume we know the shape/ratio
// Pc =: ratio * Pr,
// therefore
// Np = Pr * Pc = Pc * Pc / ratio
// and hence Pc = sqrt(ratio * Np).
// For square matrices the ratio equals 1.
const double ratio = double(n)/m;
int Pc = std::floor(std::sqrt(ratio * Np));
// One could round up Pc to the nearest number that divides Np without remainder:
// while ( Np % Pc != 0 )
//   ++Pc;
// but this affects the grid shape dramatically, e.g. with 10 cores a 3x3 grid becomes 2x5.
// limit our estimate to be in [2, Np]
int n_process_columns = std::min (Np, std::max(2, Pc));
// finally, get the rows:
int n_process_rows = Np / n_process_columns;
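// Note that the integer division above may leave n_process_rows*n_process_columns
// strictly smaller than Np; the processes that do not fit into the grid are the
// ones flagged as inactive in the ProcessGrid constructor below.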
Assert (n_process_columns >=1 && n_process_rows >=1 && n_processes >= n_process_rows*n_process_columns,
ExcMessage("error in process grid: "+
std::to_string(n_process_rows)+"x"+
std::to_string(n_process_columns)+
"="+
std::to_string(n_process_rows*n_process_columns)+
" out of " +
std::to_string(n_processes)));
// For example, for a 320x320 matrix with 32x32 blocks and 16 cores:
// Pc = 1.0 * Pr => 4x4 grid
// Pc = 0.5 * Pr => 8x2 grid
// Pc = 2.0 * Pr => 3x5 grid
return std::make_pair(n_process_rows,n_process_columns);
}
}
namespace Utilities
{
namespace MPI
{
ProcessGrid::ProcessGrid(MPI_Comm mpi_comm,
const std::pair<unsigned int,unsigned int> &grid_dimensions)
:
mpi_communicator(mpi_comm),
this_mpi_process(Utilities::MPI::this_mpi_process(mpi_communicator)),
n_mpi_processes(Utilities::MPI::n_mpi_processes(mpi_communicator)),
n_process_rows(grid_dimensions.first),
n_process_columns(grid_dimensions.second)
{
Assert (grid_dimensions.first > 0,
ExcMessage("Number of process grid rows has to be positive."));
Assert (grid_dimensions.second > 0,
ExcMessage("Number of process grid columns has to be positive."));
Assert (grid_dimensions.first*grid_dimensions.second <= n_mpi_processes,
ExcMessage("Size of process grid is larger than number of available MPI processes."));
// Ordering of the processor grid; row-major is used here:
const bool column_major = false;
// Initialize the BLACS context handle from the provided MPI communicator:
blacs_context = Csys2blacs_handle(mpi_communicator);
const char *order = ( column_major ? "Col" : "Row" );
// Note that blacs_context can be modified below. Thus Cblacs2sys_handle
// may not return the same MPI communicator.
Cblacs_gridinit(&blacs_context, order, n_process_rows, n_process_columns);
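// Cblacs_gridinit uses the first n_process_rows*n_process_columns ranks of the
// communicator for the grid (in the ordering requested above); the remaining
// ranks do not become part of the grid.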
// BLACS may modify the grid size on processes which are not used
// in the grid, so pass copies of the sizes below:
int procrows_ = n_process_rows;
int proccols_ = n_process_columns;
Cblacs_gridinfo( blacs_context, &procrows_, &proccols_, &this_process_row, &this_process_column );
// If this MPI core is not on the grid, flag it as inactive and
// skip all jobs
// Note that a different condition is used in FORTRAN code here
// https://stackoverflow.com/questions/18516915/calling-blacs-with-more-processes-than-used
if (this_process_row < 0 || this_process_column < 0)
mpi_process_is_active = false;
else
mpi_process_is_active = true;
// Create an auxiliary communicator which contains the root and all inactive cores.
// Assume that inactive cores start with id=n_process_rows*n_process_columns
const unsigned int n_active_mpi_processes = n_process_rows*n_process_columns;
Assert (mpi_process_is_active ||
this_mpi_process >= n_active_mpi_processes,
ExcInternalError());
std::vector<int> inactive_with_root_ranks;
inactive_with_root_ranks.push_back(0);
for (unsigned int i = n_active_mpi_processes; i < n_mpi_processes; ++i)
inactive_with_root_ranks.push_back(i);
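// For example, with 10 MPI processes and a 3x3 grid the ranks gathered here
// are {0, 9}: the active root plus the single rank left out of the grid.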
// Get the group of processes in mpi_communicator
int ierr = 0;
MPI_Group all_group;
ierr = MPI_Comm_group(mpi_communicator, &all_group);
AssertThrowMPI(ierr);
// Construct the group containing all ranks we need:
MPI_Group inactive_with_root_group;
const int n = inactive_with_root_ranks.size();
ierr = MPI_Group_incl(all_group,
n, inactive_with_root_ranks.data(),
&inactive_with_root_group);
AssertThrowMPI(ierr);
// Create the communicator based on the group
// Note that on most cores the communicator will be MPI_COMM_NULL.
// FIXME: switch to MPI_Comm_create_group for MPI-3 so that only processes within the to-be subgroup call this
ierr = MPI_Comm_create(mpi_communicator, inactive_with_root_group,
&mpi_communicator_inactive_with_root);
AssertThrowMPI(ierr);
ierr = MPI_Group_free(&all_group);
AssertThrowMPI(ierr);
ierr = MPI_Group_free(&inactive_with_root_group);
AssertThrowMPI(ierr);
// Double-check that the process with rank 0 in the subgroup is active:
#ifdef DEBUG
if (mpi_communicator_inactive_with_root != MPI_COMM_NULL &&
Utilities::MPI::this_mpi_process(mpi_communicator_inactive_with_root) == 0)
Assert (mpi_process_is_active, ExcInternalError());
#endif
}
ProcessGrid::ProcessGrid(MPI_Comm mpi_comm,
const unsigned int n_rows_matrix,
const unsigned int n_columns_matrix,
const unsigned int row_block_size,
const unsigned int column_block_size)
:
ProcessGrid(mpi_comm,
compute_processor_grid_sizes(mpi_comm, n_rows_matrix, n_columns_matrix,
row_block_size, column_block_size) )
{}
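// A minimal usage sketch (hypothetical, not part of this file): a grid sized
// for a given matrix is typically created once, wrapped in a shared pointer,
// and handed to deal.II's ScaLAPACK wrappers such as ScaLAPACKMatrix from
// <deal.II/lac/scalapack.h>:
//
//   const unsigned int size = 320, block_size = 32;
//   const auto grid = std::make_shared<Utilities::MPI::ProcessGrid>(
//     MPI_COMM_WORLD, size, size, block_size, block_size);
//   ScaLAPACKMatrix<double> matrix(size, size, grid, block_size, block_size);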
ProcessGrid::ProcessGrid(MPI_Comm mpi_comm,
const unsigned int n_rows,
const unsigned int n_columns)
:
ProcessGrid(mpi_comm,
std::make_pair(n_rows,n_columns))
{}
ProcessGrid::~ProcessGrid()
{
if (mpi_process_is_active)
Cblacs_gridexit(blacs_context);
if (mpi_communicator_inactive_with_root != MPI_COMM_NULL)
MPI_Comm_free(&mpi_communicator_inactive_with_root);
}
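// send_to_inactive() is used by the ScaLAPACK wrappers after a quantity (for
// instance a norm) has been computed by the active grid processes only: the
// (active) root broadcasts the value over mpi_communicator_inactive_with_root
// so that ranks which do not participate in the BLACS grid end up with the
// same result. Schematically (illustrative only):
//
//   double value = 0.;
//   // ... active processes compute value via ScaLAPACK ...
//   grid.send_to_inactive(&value, 1); // called on all ranks
//   // value is now consistent on every rank of the original communicator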
template <typename NumberType>
void ProcessGrid::send_to_inactive(NumberType *value, const int count) const
{
Assert (count>0, ExcInternalError());
if (mpi_communicator_inactive_with_root != MPI_COMM_NULL)
{
const int ierr =
MPI_Bcast(value,count,
Utilities::MPI::internal::mpi_type_id (value),
0/*from root*/,
mpi_communicator_inactive_with_root);
AssertThrowMPI(ierr);
}
}
}
}
// instantiations
template void Utilities::MPI::ProcessGrid::send_to_inactive<double>(double *, const int) const;
template void Utilities::MPI::ProcessGrid::send_to_inactive<float>(float *, const int) const;
template void Utilities::MPI::ProcessGrid::send_to_inactive<int>(int *, const int) const;
DEAL_II_NAMESPACE_CLOSE
#endif // DEAL_II_WITH_SCALAPACK