// Copyright (C) 2021-2022 Garth N. Wells and Chris N. Richardson
//
// This file is part of DOLFINx (https://www.fenicsproject.org)
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#pragma once
#include "SparsityPattern.h"
#include "Vector.h"
#include "matrix_csr_impl.h"
#include <algorithm>
#include <dolfinx/common/IndexMap.h>
#include <dolfinx/common/MPI.h>
#include <dolfinx/graph/AdjacencyList.h>
#include <mpi.h>
#include <numeric>
#include <span>
#include <utility>
#include <vector>
namespace dolfinx::la
{
/// @brief Modes for representing block structured matrices.
enum class BlockMode : int
{
compact = 0, ///< Each entry in the sparsity pattern of the matrix refers to a
///< block of data of size (bs[0], bs[1]).
expanded = 1 ///< The sparsity pattern is expanded by (bs[0], bs[1]), and each
///< matrix entry refers to one data item, i.e. the resulting
///< matrix has a block size of (1, 1).
};
/// @brief Distributed sparse matrix using compressed sparse row storage.
///
/// @warning The class is experimental and subject to change.
///
/// The matrix storage format is compressed sparse row, and is
/// partitioned by row across MPI ranks. The matrix can be assembled
/// into using the usual DOLFINx assembly routines. Matrix internal data
/// can be accessed for interfacing with other code.
///
/// @tparam Scalar Scalar type of matrix entries
/// @tparam Container Container type for storing matrix entries
/// @tparam ColContainer Column index container type
/// @tparam RowPtrContainer Row pointer container type
template <typename Scalar, typename Container = std::vector<Scalar>,
typename ColContainer = std::vector<std::int32_t>,
typename RowPtrContainer = std::vector<std::int64_t>>
class MatrixCSR
{
static_assert(std::is_same_v<typename Container::value_type, Scalar>);
static_assert(std::is_integral_v<typename ColContainer::value_type>);
static_assert(std::is_integral_v<typename RowPtrContainer::value_type>);
template <typename, typename, typename, typename>
friend class MatrixCSR;
public:
/// Scalar type
using value_type = Scalar;
/// Matrix entries container type
using container_type = Container;
/// Column index container type
using column_container_type = ColContainer;
/// Row pointer container type
using rowptr_container_type = RowPtrContainer;
/// @brief Insertion functor for setting values in a matrix. It is
/// typically used in finite element assembly functions.
///
/// Create a function to set values in a MatrixCSR. The function
/// signature is `int mat_set_fn(std::span<const std::int32_t rows,
/// std::span<const std::int32_t cols, std::span<const value_type>
/// data)`. The rows and columns use process local indexing, and the
/// given rows and columns must pre-exist in the sparsity pattern of
/// the matrix. Insertion into "ghost" rows (in the ghost region of
/// the row `IndexMap`) is permitted, so long as there are correct
/// entries in the sparsity pattern.
///
/// @note Using rows or columns which are not in the sparsity will
/// result in undefined behaviour (or an assert failure in Debug
/// mode).
///
/// @note Matrix block size may be (1, 1) or (BS0, BS1)
/// @note Data block size may be (1, 1) or (BS0, BS1)
///
/// @tparam BS0 Row block size of data for insertion
/// @tparam BS1 Column block size of data for insertion
///
/// @return Function for inserting values into `A`
template <int BS0 = 1, int BS1 = 1>
auto mat_set_values()
{
if ((BS0 != _bs[0] and BS0 > 1 and _bs[0] > 1)
or (BS1 != _bs[1] and BS1 > 1 and _bs[1] > 1))
{
throw std::runtime_error(
"Cannot insert blocks of different size than matrix block size");
}
return [&](std::span<const std::int32_t> rows,
std::span<const std::int32_t> cols,
std::span<const value_type> data) -> int
{
this->set<BS0, BS1>(data, rows, cols);
return 0;
};
}
/// @brief Insertion functor for adding values to a matrix. It is
/// typically used in finite element assembly functions.
///
/// Create a function to add values to a MatrixCSR. The function
/// signature is `int mat_add_fn(std::span<const std::int32_t rows,
/// std::span<const std::int32_t cols, std::span<const value_type>
/// data)`. The rows and columns use process local indexing, and the
/// given rows and columns must pre-exist in the sparsity pattern of
/// the matrix. Insertion into "ghost" rows (in the ghost region of
/// the row `IndexMap`) is permitted, so long as there are correct
/// entries in the sparsity pattern.
///
/// @note Using rows or columns which are not in the sparsity will
/// result in undefined behaviour (or an assert failure in Debug
/// mode).
///
/// @note Matrix block size may be (1, 1) or (BS0, BS1)
/// @note Data block size may be (1, 1) or (BS0, BS1)
///
/// @tparam BS0 Row block size of data for insertion
/// @tparam BS1 Column block size of data for insertion
///
/// @return Function for inserting values into `A`
template <int BS0 = 1, int BS1 = 1>
auto mat_add_values()
{
if ((BS0 != _bs[0] and BS0 > 1 and _bs[0] > 1)
or (BS1 != _bs[1] and BS1 > 1 and _bs[1] > 1))
{
throw std::runtime_error(
"Cannot insert blocks of different size than matrix block size");
}
return [&](std::span<const std::int32_t> rows,
std::span<const std::int32_t> cols,
std::span<const value_type> data) -> int
{
this->add<BS0, BS1>(data, rows, cols);
return 0;
};
}
/// @brief Create a distributed matrix.
///
/// The structure of the matrix depends entirely on the input
/// `SparsityPattern`, which must be finalized. The matrix storage is
/// distributed Compressed Sparse Row: the matrix is distributed by
/// row across processes, and on each process, there is a list of
/// column indices and matrix entries for each row stored. This
/// exactly matches the layout of the `SparsityPattern`. There is some
/// overlap of matrix rows between processes to allow for independent
/// Finite Element assembly, after which, the ghost rows should be
/// sent to the row owning processes by calling `scatter_rev()`.
///
/// @note The block size of the matrix is given by the block size of
/// the input `SparsityPattern`.
///
/// @param[in] p The sparsity pattern which describes the parallel
/// distribution and the non-zero structure.
/// @param[in] mode Block mode. When the block size is greater than
/// one, the storage can be "compact" where each matrix entry refers
/// to a block of data (stored row major), or "expanded" where each
/// matrix entry is individual. In the "expanded" case, the sparsity
/// is expanded for every entry in the block, and the block size of
/// the matrix is set to `(1, 1)`.
MatrixCSR(const SparsityPattern& p, BlockMode mode = BlockMode::compact);
/// Move constructor
/// @todo Check handling of MPI_Request
MatrixCSR(MatrixCSR&& A) = default;
/// Copy constructor
/// @todo Check handling of MPI_Request
MatrixCSR(const MatrixCSR& A) = default;
/// @brief Copy-convert matrix, possibly using to different container
/// types.
///
/// Examples of use for this constructor include copying a matrix to a
/// different value type, or copying the matrix to a GPU.
///
/// @tparam Scalar0 Scalar type of the matrix being copied.
/// @tparam Container0 Data container type of the matrix being copied.
/// @tparam ColContainer0 Column container type of the matrix being
/// copied.
/// @tparam RowPtrContainer0 Row pointer container type of the matrix
/// being copied.
/// @param A Matrix to copy. All storage (data, columns, row pointers,
/// off-diagonal offsets) and the precomputed scatter data are copied
/// element-wise into this matrix's container types.
template <typename Scalar0, typename Container0, typename ColContainer0,
typename RowPtrContainer0>
explicit MatrixCSR(
const MatrixCSR<Scalar0, Container0, ColContainer0, RowPtrContainer0>& A)
: _index_maps(A._index_maps), _block_mode(A.block_mode()),
_bs(A.block_size()), _data(A._data.begin(), A._data.end()),
_cols(A.cols().begin(), A.cols().end()),
_row_ptr(A.row_ptr().begin(), A.row_ptr().end()),
_off_diagonal_offset(A.off_diag_offset().begin(),
A.off_diag_offset().end()),
// Any in-flight communication on A is not copied: start with a null
// request on the shared neighbourhood communicator
_comm(A.comm()), _request(MPI_REQUEST_NULL), _unpack_pos(A._unpack_pos),
_val_send_disp(A._val_send_disp), _val_recv_disp(A._val_recv_disp),
_ghost_row_to_rank(A._ghost_row_to_rank)
{
}
/// @deprecated Use `std::ranges::fill(A.values(), x)` instead.
/// @brief Assign a single value to every stored (non-zero) entry,
/// including entries in ghost rows.
/// @param[in] x Value assigned to all stored matrix entries.
[[deprecated("Use std::ranges::fill(A.values(), v) instead.")]]
void set(value_type x)
{
  std::fill(_data.begin(), _data.end(), x);
}
/// @brief Set values in the matrix.
///
/// @note Only entries included in the sparsity pattern used to
/// initialize the matrix can be set.
/// @note All indices are local to the calling MPI rank and entries
/// cannot be set in ghost rows.
/// @note This should be called after `scatter_rev`. Using before
/// `scatter_rev` will set the values correctly, but incoming values
/// may get added to them during a subsequent reverse scatter
/// operation.
/// @tparam BS0 Data row block size
/// @tparam BS1 Data column block size
/// @param[in] x The `m` by `n` dense block of values (row-major) to
/// set in the matrix
/// @param[in] rows The row indices of `x`
/// @param[in] cols The column indices of `x`
template <int BS0, int BS1>
void set(std::span<const value_type> x, std::span<const std::int32_t> rows,
std::span<const std::int32_t> cols)
{
// Insertion kernel: overwrite the stored entry with the new value
auto set_fn = [](value_type& y, const value_type& x) { y = x; };
// Upper bound on valid row indices (owned + ghost rows), used by the
// insertion routines for bounds checking
std::int32_t num_rows
= _index_maps[0]->size_local() + _index_maps[0]->num_ghosts();
assert(x.size() == rows.size() * cols.size() * BS0 * BS1);
if (_bs[0] == BS0 and _bs[1] == BS1)
{
// Data block shape matches the matrix block shape: insert directly
impl::insert_csr<BS0, BS1>(_data, _cols, _row_ptr, x, rows, cols, set_fn,
num_rows);
}
else if (_bs[0] == 1 and _bs[1] == 1)
{
// Set blocked data in a regular CSR matrix (_bs[0]=1, _bs[1]=1)
// with correct sparsity
impl::insert_blocked_csr<BS0, BS1>(_data, _cols, _row_ptr, x, rows, cols,
set_fn, num_rows);
}
else
{
assert(BS0 == 1 and BS1 == 1);
// Set non-blocked data in a blocked CSR matrix (BS0=1, BS1=1)
impl::insert_nonblocked_csr(_data, _cols, _row_ptr, x, rows, cols, set_fn,
num_rows, _bs[0], _bs[1]);
}
}
/// @brief Accumulate values in the matrix
/// @note Only entries included in the sparsity pattern used to
/// initialize the matrix can be accumulated into.
/// @note All indices are local to the calling MPI rank and entries
/// may go into ghost rows.
/// @note Use `scatter_rev` after all entries have been added to send
/// ghost rows to owners. Adding more entries after `scatter_rev` is
/// allowed, but another call to `scatter_rev` will then be required.
///
/// @tparam BS0 Row block size of data
/// @tparam BS1 Column block size of data
/// @param[in] x The `m` by `n` dense block of values (row-major) to
/// add to the matrix
/// @param[in] rows The row indices of `x`
/// @param[in] cols The column indices of `x`
template <int BS0 = 1, int BS1 = 1>
void add(std::span<const value_type> x, std::span<const std::int32_t> rows,
std::span<const std::int32_t> cols)
{
// Insertion kernel: accumulate into the stored entry
auto add_fn = [](value_type& y, const value_type& x) { y += x; };
assert(x.size() == rows.size() * cols.size() * BS0 * BS1);
// Note: _row_ptr.size() (= number of rows + 1) is passed as an upper
// bound on valid row indices; ghost rows may be written to here
if (_bs[0] == BS0 and _bs[1] == BS1)
{
impl::insert_csr<BS0, BS1>(_data, _cols, _row_ptr, x, rows, cols, add_fn,
_row_ptr.size());
}
else if (_bs[0] == 1 and _bs[1] == 1)
{
// Add blocked data to a regular CSR matrix (_bs[0]=1, _bs[1]=1)
impl::insert_blocked_csr<BS0, BS1>(_data, _cols, _row_ptr, x, rows, cols,
add_fn, _row_ptr.size());
}
else
{
assert(BS0 == 1 and BS1 == 1);
// Add non-blocked data to a blocked CSR matrix (BS0=1, BS1=1)
impl::insert_nonblocked_csr(_data, _cols, _row_ptr, x, rows, cols, add_fn,
_row_ptr.size(), _bs[0], _bs[1]);
}
}
/// Number of rows owned by the calling rank (excludes ghost rows)
std::int32_t num_owned_rows() const { return _index_maps[0]->size_local(); }
/// Number of rows stored on the calling rank, including ghost rows
std::int32_t num_all_rows() const { return _row_ptr.size() - 1; }
/// @brief Copy to a dense matrix.
/// @note This function is typically used for debugging and not used
/// in production.
/// @note Ghost rows are also returned, and these can be truncated
/// manually by using num_owned_rows() if required.
/// @note If the block size is greater than 1, the entries are
/// expanded.
/// @return Dense copy of the part of the matrix on the calling rank.
/// Storage is row-major, with shape
/// `(num_all_rows() * bs[0]) x (size_global(1) * bs[1])`.
std::vector<value_type> to_dense() const
{
  const std::size_t nrows = num_all_rows();
  const std::size_t ncols = _index_maps[1]->size_global();
  std::vector<value_type> A(nrows * ncols * _bs[0] * _bs[1], 0.0);
  for (std::size_t r = 0; r < nrows; ++r)
  {
    for (auto j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j)
    {
      // Map the (block) column index to a global index once per
      // non-zero, rather than once per entry of the block
      std::array<std::int32_t, 1> local_col{_cols[j]};
      std::array<std::int64_t, 1> global_col{0};
      _index_maps[1]->local_to_global(local_col, global_col);
      for (int i0 = 0; i0 < _bs[0]; ++i0)
      {
        for (int i1 = 0; i1 < _bs[1]; ++i1)
        {
          // Expanded (dense) row index is r*_bs[0] + i0 and column is
          // global_col*_bs[1] + i1. The dense row length is
          // ncols*_bs[1]. (Fixed: row/column block sizes were
          // previously swapped in the flat index, which gave a wrong
          // layout whenever _bs[0] != _bs[1].)
          A[(r * _bs[0] + i0) * ncols * _bs[1] + global_col[0] * _bs[1] + i1]
              = _data[j * _bs[0] * _bs[1] + i0 * _bs[1] + i1];
        }
      }
    }
  }

  return A;
}
/// @brief Transfer ghost row data to the owning ranks accumulating
/// received values on the owned rows, and zeroing any existing data
/// in ghost rows.
///
/// This process is analogous to `scatter_rev` for `Vector` except
/// that the values are always accumulated on the owning process.
///
/// Convenience wrapper: equivalent to `scatter_rev_begin()`
/// immediately followed by `scatter_rev_end()` (blocking).
void scatter_rev()
{
scatter_rev_begin();
scatter_rev_end();
}
/// @brief Begin transfer of ghost row data to owning ranks, where it
/// will be accumulated into existing owned rows.
///
/// @note Calls to this function must be followed by
/// MatrixCSR::scatter_rev_end(). Between the two calls matrix values
/// must not be changed.
///
/// @note This function does not change the matrix data. Data update
/// only occurs with `scatter_rev_end()`.
void scatter_rev_begin()
{
const std::int32_t local_size0 = _index_maps[0]->size_local();
const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts();
// Number of scalar values per matrix block
const int bs2 = _bs[0] * _bs[1];
// For each ghost row, pack and send values to send to neighborhood
std::vector<int> insert_pos = _val_send_disp;
_ghost_value_data.resize(_val_send_disp.back());
for (int i = 0; i < num_ghosts0; ++i)
{
int rank = _ghost_row_to_rank[i];
// Get position in send buffer to place data to send to this
// neighbour
std::int32_t val_pos = insert_pos[rank];
// Ghost rows are stored after the local_size0 owned rows; copy the
// whole ghost row of values into the send buffer
std::copy(std::next(_data.data(), _row_ptr[local_size0 + i] * bs2),
std::next(_data.data(), _row_ptr[local_size0 + i + 1] * bs2),
std::next(_ghost_value_data.begin(), val_pos));
insert_pos[rank]
+= bs2 * (_row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i]);
}
_ghost_value_data_in.resize(_val_recv_disp.back());
// Compute data sizes for send and receive from displacements
std::vector<int> val_send_count(_val_send_disp.size() - 1);
std::adjacent_difference(std::next(_val_send_disp.begin()),
_val_send_disp.end(), val_send_count.begin());
std::vector<int> val_recv_count(_val_recv_disp.size() - 1);
std::adjacent_difference(std::next(_val_recv_disp.begin()),
_val_recv_disp.end(), val_recv_count.begin());
// Start non-blocking exchange on the ghost->owner neighbourhood
// communicator; completion is waited on in scatter_rev_end()
int status = MPI_Ineighbor_alltoallv(
_ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(),
dolfinx::MPI::mpi_t<value_type>, _ghost_value_data_in.data(),
val_recv_count.data(), _val_recv_disp.data(),
dolfinx::MPI::mpi_t<value_type>, _comm.comm(), &_request);
dolfinx::MPI::check_error(_comm.comm(), status);
}
/// @brief End transfer of ghost row data to owning ranks.
/// @note Must be preceded by MatrixCSR::scatter_rev_begin().
/// @note Matrix data received from other processes will be
/// accumulated into locally owned rows, and ghost rows will be
/// zeroed.
void scatter_rev_end()
{
// Wait for the non-blocking exchange started in scatter_rev_begin()
int status = MPI_Wait(&_request, MPI_STATUS_IGNORE);
dolfinx::MPI::check_error(_comm.comm(), status);
// Send buffer is no longer needed; release its memory
_ghost_value_data.clear();
_ghost_value_data.shrink_to_fit();
// Add to local rows. _unpack_pos[i] is the destination position in
// _data, in units of blocks (see constructor).
int bs2 = _bs[0] * _bs[1];
assert(_ghost_value_data_in.size() == _unpack_pos.size() * bs2);
for (std::size_t i = 0; i < _unpack_pos.size(); ++i)
for (int j = 0; j < bs2; ++j)
_data[_unpack_pos[i] * bs2 + j] += _ghost_value_data_in[i * bs2 + j];
_ghost_value_data_in.clear();
_ghost_value_data_in.shrink_to_fit();
// Set ghost row data to zero (ghost rows are stored after the owned
// rows)
std::int32_t local_size0 = _index_maps[0]->size_local();
std::fill(std::next(_data.begin(), _row_ptr[local_size0] * bs2),
_data.end(), 0);
}
/// @brief Compute the Frobenius norm squared across all processes.
///
/// Only the owned rows on each rank contribute to the result; ghost
/// rows are excluded.
///
/// @note MPI Collective
double squared_norm() const
{
  const std::size_t num_owned_rows = _index_maps[0]->size_local();
  assert(num_owned_rows < _row_ptr.size());
  const int bs2 = _bs[0] * _bs[1];

  // Sum |a_ij|^2 over the owned part of the local data
  const std::int64_t num_owned_values = _row_ptr[num_owned_rows] * bs2;
  double norm_sq_local = 0;
  for (std::int64_t i = 0; i < num_owned_values; ++i)
    norm_sq_local += std::norm(_data[i]);

  // Combine the per-rank contributions
  double norm_sq = 0;
  MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM,
                _comm.comm());
  return norm_sq;
}
/// @brief Compute the product `y += Ax`.
///
/// The vectors `x` and `y` must have parallel layouts that are
/// compatible with `A`. In detail, `x` must have the same `IndexMap` as
/// the matrix columns, `A.index_map(1)` and `y` must have the same owned
/// indices as the matrix rows in `A.index_map(0)`. Only owned entries of `y`
/// are updated, so any ghost entries of `y` are not affected.
///
/// @note Defined out-of-class below. Performs a forward scatter on
/// `x`, so it is collective over the communicator.
///
/// @param[in] x Vector to apply `A` to.
/// @param[in,out] y Vector to accumulate the result into.
void mult(Vector<value_type>& x, Vector<value_type>& y);
/// @brief Get MPI communicator that matrix is defined on.
MPI_Comm comm() const { return _comm.comm(); }
/// @brief Index map for the row or column space.
///
/// The row index map contains ghost entries for rows which may be
/// inserted into and the column index map contains all local and
/// ghost columns that may exist in the owned rows.
///
/// @param dim Dimension: row (0) or column (1). Throws for any other
/// value (checked access).
/// @return Row (0) or column (1) index map.
std::shared_ptr<const common::IndexMap> index_map(int dim) const
{
return _index_maps.at(dim);
}
/// @brief Get local data values.
/// @note Includes ghost values.
container_type& values() { return _data; }
/// @brief Get local values (const version).
/// @note Includes ghost values
const container_type& values() const { return _data; }
/// @brief Get local row pointers.
/// @note Includes pointers to ghost rows
const rowptr_container_type& row_ptr() const { return _row_ptr; }
/// Get local column indices
/// @note Includes columns in ghost rows
const column_container_type& cols() const { return _cols; }
/// @brief Get the start of off-diagonal (unowned columns) on each
/// row, allowing the matrix to be split (virtually) into two parts.
///
/// Operations (such as matrix-vector multiply) between the owned
/// parts of the matrix and vector can then be performed separately
/// from operations on the unowned parts.
///
/// @note Includes ghost rows, which should be 'truncated' by the
/// caller if not required.
const rowptr_container_type& off_diag_offset() const
{
return _off_diagonal_offset;
}
/// @brief Get block sizes.
/// @return Block sizes for rows (0) and columns (1).
std::array<int, 2> block_size() const { return _bs; }
/// @brief Get 'block mode'.
BlockMode block_mode() const { return _block_mode; }
private:
// Parallel distribution of the rows (0) and columns (1)
std::array<std::shared_ptr<const common::IndexMap>, 2> _index_maps;
// Block mode (compact or expanded)
BlockMode _block_mode;
// Block sizes for rows (0) and columns (1). Both are 1 in expanded
// mode.
std::array<int, 2> _bs;
// Matrix data (CSR): values, column indices and row pointers. Ghost
// rows are stored after the owned rows.
container_type _data;
column_container_type _cols;
rowptr_container_type _row_ptr;
// Start of off-diagonal (unowned columns) on each row
rowptr_container_type _off_diagonal_offset;
// Communicator with neighborhood (ghost->owner communicator for rows)
dolfinx::MPI::Comm _comm;
// -- Precomputed data for scatter_rev/update
// Request in non-blocking communication (scatter_rev_begin/end)
MPI_Request _request;
// Position in _data, in units of blocks, at which each received
// value block is accumulated (see scatter_rev_end)
std::vector<std::size_t> _unpack_pos;
// Displacements for alltoall for each neighbor when sending and
// receiving (in scalar values, i.e. scaled by the block size squared)
std::vector<int> _val_send_disp, _val_recv_disp;
// Ownership of each row, by neighbor (for the neighbourhood defined
// on _comm)
std::vector<int> _ghost_row_to_rank;
// Temporary stores for data during non-blocking communication
container_type _ghost_value_data;
container_type _ghost_value_data_in;
};
//-----------------------------------------------------------------------------
// Construct a MatrixCSR from a finalised SparsityPattern. Copies the
// pattern's CSR graph, optionally expands it for block size > 1, and
// precomputes all communication data needed by scatter_rev().
template <class U, class V, class W, class X>
MatrixCSR<U, V, W, X>::MatrixCSR(const SparsityPattern& p, BlockMode mode)
: _index_maps({p.index_map(0),
std::make_shared<common::IndexMap>(p.column_index_map())}),
_block_mode(mode), _bs({p.block_size(0), p.block_size(1)}),
_data(p.num_nonzeros() * _bs[0] * _bs[1], 0),
_cols(p.graph().first.begin(), p.graph().first.end()),
_row_ptr(p.graph().second.begin(), p.graph().second.end()),
_comm(MPI_COMM_NULL)
{
if (_block_mode == BlockMode::expanded)
{
// Rebuild IndexMaps: each index of the blocked map becomes _bs[i]
// consecutive indices in the expanded map
for (int i = 0; i < 2; ++i)
{
auto im = _index_maps[i];
std::int32_t size_local = im->size_local() * _bs[i];
std::span ghost_i = im->ghosts();
std::vector<std::int64_t> ghosts;
const std::vector<int> ghost_owner_i(im->owners().begin(),
im->owners().end());
std::vector<int> src_rank;
for (std::size_t j = 0; j < ghost_i.size(); ++j)
{
for (int k = 0; k < _bs[i]; ++k)
{
ghosts.push_back(ghost_i[j] * _bs[i] + k);
src_rank.push_back(ghost_owner_i[j]);
}
}
std::array<std::vector<int>, 2> src_dest0
= {std::vector(_index_maps[i]->src().begin(),
_index_maps[i]->src().end()),
std::vector(_index_maps[i]->dest().begin(),
_index_maps[i]->dest().end())};
_index_maps[i] = std::make_shared<common::IndexMap>(
_index_maps[i]->comm(), size_local, src_dest0, ghosts, src_rank);
}
// Convert sparsity pattern and set _bs to 1: each blocked row
// becomes _bs[0] rows and each blocked column _bs[1] columns
column_container_type new_cols;
new_cols.reserve(_data.size());
rowptr_container_type new_row_ptr{0};
new_row_ptr.reserve(_row_ptr.size() * _bs[0]);
std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offsets();
for (std::size_t i = 0; i < _row_ptr.size() - 1; ++i)
{
// Repeat row _bs[0] times
for (int q0 = 0; q0 < _bs[0]; ++q0)
{
_off_diagonal_offset.push_back(new_row_ptr.back()
+ num_diag_nnz[i] * _bs[1]);
for (auto j = _row_ptr[i]; j < _row_ptr[i + 1]; ++j)
{
for (int q1 = 0; q1 < _bs[1]; ++q1)
new_cols.push_back(_cols[j] * _bs[1] + q1);
}
new_row_ptr.push_back(new_cols.size());
}
}
_cols = new_cols;
_row_ptr = new_row_ptr;
_bs[0] = 1;
_bs[1] = 1;
}
else
{
// Compute off-diagonal offset for each row (compact): pattern
// offsets are relative to the row start, so add the row pointer
std::span<const std::int32_t> num_diag_nnz = p.off_diagonal_offsets();
_off_diagonal_offset.reserve(num_diag_nnz.size());
std::ranges::transform(num_diag_nnz, _row_ptr,
std::back_inserter(_off_diagonal_offset),
std::plus{});
}
// Some short-hand
std::array local_size
= {_index_maps[0]->size_local(), _index_maps[1]->size_local()};
std::array local_range
= {_index_maps[0]->local_range(), _index_maps[1]->local_range()};
std::span ghosts1 = _index_maps[1]->ghosts();
std::span ghosts0 = _index_maps[0]->ghosts();
std::span src_ranks = _index_maps[0]->src();
std::span dest_ranks = _index_maps[0]->dest();
// Create neighbourhood communicator (owner <- ghost)
MPI_Comm comm;
MPI_Dist_graph_create_adjacent(_index_maps[0]->comm(), dest_ranks.size(),
dest_ranks.data(), MPI_UNWEIGHTED,
src_ranks.size(), src_ranks.data(),
MPI_UNWEIGHTED, MPI_INFO_NULL, false, &comm);
_comm = dolfinx::MPI::Comm(comm, false);
// Build map from ghost row index position to owning (neighborhood)
// rank. src_ranks is sorted, so a binary search gives the
// neighbourhood rank position.
_ghost_row_to_rank.reserve(_index_maps[0]->owners().size());
for (int r : _index_maps[0]->owners())
{
auto it = std::ranges::lower_bound(src_ranks, r);
assert(it != src_ranks.end() and *it == r);
std::size_t pos = std::distance(src_ranks.begin(), it);
_ghost_row_to_rank.push_back(pos);
}
// Compute size of data to send to each neighbor (number of non-zeros
// in each ghost row)
std::vector<std::int32_t> data_per_proc(src_ranks.size(), 0);
for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
{
assert(_ghost_row_to_rank[i] < (int)data_per_proc.size());
std::size_t pos = local_size[0] + i;
data_per_proc[_ghost_row_to_rank[i]] += _row_ptr[pos + 1] - _row_ptr[pos];
}
// Compute send displacements
_val_send_disp.resize(src_ranks.size() + 1, 0);
std::partial_sum(data_per_proc.begin(), data_per_proc.end(),
std::next(_val_send_disp.begin()));
// For each ghost row, pack and send indices to neighborhood. Each
// non-zero is sent as a (global row, global column) pair.
std::vector<std::int64_t> ghost_index_data(2 * _val_send_disp.back());
{
std::vector<int> insert_pos = _val_send_disp;
for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i)
{
int rank = _ghost_row_to_rank[i];
std::int32_t row_id = local_size[0] + i;
for (int j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j)
{
// Get position in send buffer
std::int32_t idx_pos = 2 * insert_pos[rank];
// Pack send data (row, col) as global indices
ghost_index_data[idx_pos] = ghosts0[i];
if (std::int32_t col_local = _cols[j]; col_local < local_size[1])
ghost_index_data[idx_pos + 1] = col_local + local_range[1][0];
else
ghost_index_data[idx_pos + 1] = ghosts1[col_local - local_size[1]];
insert_pos[rank] += 1;
}
}
}
// Communicate data with neighborhood
std::vector<std::int64_t> ghost_index_array;
std::vector<int> recv_disp;
{
// Sizes are doubled because each non-zero sends an index pair
std::vector<int> send_sizes;
std::ranges::transform(data_per_proc, std::back_inserter(send_sizes),
[](auto x) { return 2 * x; });
std::vector<int> recv_sizes(dest_ranks.size());
send_sizes.reserve(1);
recv_sizes.reserve(1);
MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, recv_sizes.data(), 1,
MPI_INT, _comm.comm());
// Build send/recv displacement
std::vector<int> send_disp{0};
std::partial_sum(send_sizes.begin(), send_sizes.end(),
std::back_inserter(send_disp));
recv_disp = {0};
std::partial_sum(recv_sizes.begin(), recv_sizes.end(),
std::back_inserter(recv_disp));
ghost_index_array.resize(recv_disp.back());
MPI_Neighbor_alltoallv(ghost_index_data.data(), send_sizes.data(),
send_disp.data(), MPI_INT64_T,
ghost_index_array.data(), recv_sizes.data(),
recv_disp.data(), MPI_INT64_T, _comm.comm());
}
// Store receive displacements for future use, when transferring
// data values. Convert from index-pair counts (hence /2) to scalar
// value counts (hence *bs2).
_val_recv_disp.resize(recv_disp.size());
int bs2 = _bs[0] * _bs[1];
std::ranges::transform(recv_disp, _val_recv_disp.begin(),
[&bs2](auto d) { return bs2 * d / 2; });
std::ranges::transform(_val_send_disp, _val_send_disp.begin(),
[&bs2](auto d) { return d * bs2; });
// Global-to-local map for ghost columns
std::vector<std::pair<std::int64_t, std::int32_t>> global_to_local;
global_to_local.reserve(ghosts1.size());
for (std::int64_t idx : ghosts1)
global_to_local.push_back({idx, global_to_local.size() + local_size[1]});
std::ranges::sort(global_to_local);
// Compute location in which data for each index should be stored
// when received
for (std::size_t i = 0; i < ghost_index_array.size(); i += 2)
{
// Row must be on this process
std::int32_t local_row = ghost_index_array[i] - local_range[0][0];
assert(local_row >= 0 and local_row < local_size[0]);
// Column may be owned or unowned
std::int32_t local_col = ghost_index_array[i + 1] - local_range[1][0];
if (local_col < 0 or local_col >= local_size[1])
{
auto it = std::ranges::lower_bound(
global_to_local, std::pair(ghost_index_array[i + 1], -1),
[](auto a, auto b) { return a.first < b.first; });
assert(it != global_to_local.end()
and it->first == ghost_index_array[i + 1]);
local_col = it->second;
}
auto cit0 = std::next(_cols.begin(), _row_ptr[local_row]);
auto cit1 = std::next(_cols.begin(), _row_ptr[local_row + 1]);
// Find position of column index and insert data. Columns within a
// row are assumed sorted here (binary search) — consistent with
// the SparsityPattern graph.
auto cit = std::lower_bound(cit0, cit1, local_col);
assert(cit != cit1);
assert(*cit == local_col);
std::size_t d = std::distance(_cols.begin(), cit);
_unpack_pos.push_back(d);
}
_unpack_pos.shrink_to_fit();
}
//-----------------------------------------------------------------------------
// The matrix A is distributed across P processes by blocks of rows:
// A = | A_0 |
// | A_1 |
// | ... |
// | A_P-1 |
//
// Each submatrix A_i is owned by a single process "i" and can be further
// decomposed into diagonal (Ai[0]) and off diagonal (Ai[1]) blocks:
// Ai = |Ai[0] Ai[1]|
//
// If A is square, the diagonal block Ai[0] is also square and contains
// only owned columns and rows. The block Ai[1] contains ghost columns
// (unowned dofs).
// Likewise, a local vector x can be decomposed into owned and ghost blocks:
// xi = | x[0] |
// | x[1] |
//
// So the product y = Ax can be computed into two separate steps:
// y[0] = |Ai[0] Ai[1]| | x[0] | = Ai[0] x[0] + Ai[1] x[1]
// | x[1] |
//
/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors
/// x,y. Overlaps the forward (ghost) update of x with the SpMV over
/// the owned (diagonal) part of A, then finishes with the
/// off-diagonal part once ghost values of x have arrived.
/// @note Collective: performs a forward scatter on `x`.
template <typename Scalar, typename V, typename W, typename X>
void MatrixCSR<Scalar, V, W, X>::mult(la::Vector<Scalar>& x,
la::Vector<Scalar>& y)
{
// start communication (update ghosts)
x.scatter_fwd_begin();
std::int32_t nrowslocal = num_owned_rows();
// NOTE(review): the spans below assume rowptr_container_type stores
// std::int64_t — confirm for non-default container types
std::span<const std::int64_t> Arow_ptr(row_ptr().data(), nrowslocal + 1);
std::span<const std::int32_t> Acols(cols().data(), Arow_ptr[nrowslocal]);
std::span<const std::int64_t> Aoff_diag_offset(off_diag_offset().data(),
nrowslocal);
std::span<const Scalar> Avalues(values().data(), Arow_ptr[nrowslocal]);
std::span<const Scalar> _x = x.array();
std::span<Scalar> _y = y.array();
std::span<const std::int64_t> Arow_begin(Arow_ptr.data(), nrowslocal);
std::span<const std::int64_t> Arow_end(Arow_ptr.data() + 1, nrowslocal);
// First stage: spmv - diagonal
// yi[0] += Ai[0] * xi[0]
// (block size 1 uses a compile-time specialised kernel)
if (_bs[1] == 1)
{
impl::spmv<Scalar, 1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
_bs[0], 1);
}
else
{
impl::spmv<Scalar, -1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
_bs[0], _bs[1]);
}
// finalize ghost update
x.scatter_fwd_end();
// Second stage: spmv - off-diagonal
// yi[0] += Ai[1] * xi[1]
if (_bs[1] == 1)
{
impl::spmv<Scalar, 1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
_bs[0], 1);
}
else
{
impl::spmv<Scalar, -1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
_bs[0], _bs[1]);
}
}
} // namespace dolfinx::la