File: aligned_vector.h

package info (click to toggle)
deal.ii 9.7.1-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 326,024 kB
  • sloc: cpp: 440,899; ansic: 77,337; python: 3,307; perl: 1,041; sh: 1,022; xml: 252; makefile: 97; javascript: 14
file content (2239 lines) | stat: -rw-r--r-- 81,051 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
// ------------------------------------------------------------------------
//
// SPDX-License-Identifier: LGPL-2.1-or-later
// Copyright (C) 2014 - 2025 by the deal.II authors
//
// This file is part of the deal.II library.
//
// Part of the source code is dual licensed under Apache-2.0 WITH
// LLVM-exception OR LGPL-2.1-or-later. Detailed license information
// governing the source code and code contributions can be found in
// LICENSE.md and CONTRIBUTING.md at the top level directory of deal.II.
//
// ------------------------------------------------------------------------


#ifndef dealii_aligned_vector_h
#define dealii_aligned_vector_h

#include <deal.II/base/config.h>

#include <deal.II/base/exceptions.h>
#include <deal.II/base/memory_consumption.h>
#include <deal.II/base/mpi.h>
#include <deal.II/base/parallel.h>
#include <deal.II/base/utilities.h>

// boost::serialization::make_array used to be in array.hpp, but was
// moved to a different file in BOOST 1.64
#include <boost/version.hpp>
#if BOOST_VERSION >= 106400
#  include <boost/serialization/array_wrapper.hpp>
#else
#  include <boost/serialization/array.hpp>
#endif
#include <boost/serialization/split_member.hpp>

#include <cstring>
#include <memory>
#include <type_traits>



DEAL_II_NAMESPACE_OPEN


/**
 * This is a replacement class for std::vector to be used in combination with
 * VectorizedArray and derived data types. It allocates memory aligned to
 * addresses of a vectorized data type (in order to avoid segmentation faults
 * when a variable of type VectorizedArray which the compiler assumes to be
 * aligned to certain memory addresses does not actually follow these rules).
 * This could also be achieved by providing std::vector with a user-defined
 * allocator. On the other hand, writing our own small vector class lets us
 * implement parallel copy and move operations with TBB, insert deal.II-style
 * assertions, and cut some unnecessary functionality. Note that this vector
 * is a bit more memory-consuming than std::vector because of alignment, so it
 * is recommended to only use this vector on long vectors.
 */
template <class T>
class AlignedVector
{
public:
  /**
   * Declare standard types used in all containers. These types parallel those
   * in the <tt>C++</tt> standard libraries <tt>vector<...></tt> class.
   */
  using value_type      = T;
  using pointer         = value_type *;
  using const_pointer   = const value_type *;
  using iterator        = value_type *;
  using const_iterator  = const value_type *;
  using reference       = value_type &;
  using const_reference = const value_type &;
  using size_type       = std::size_t;

  /**
   * Empty constructor. Sets the vector size to zero.
   */
  AlignedVector();

  /**
   * Range constructor.
   *
   * @note Unlike std::vector, this constructor uses random-access iterators so
   * that the copy may be parallelized.
   *
   * @dealiiOperationIsMultithreaded
   */
  template <
    typename RandomAccessIterator,
    typename = std::enable_if_t<std::is_convertible_v<
      typename std::iterator_traits<RandomAccessIterator>::iterator_category,
      std::random_access_iterator_tag>>>
  AlignedVector(RandomAccessIterator begin, RandomAccessIterator end);

  /**
   * Set the vector size to the given size and initializes all elements with
   * T().
   *
   * @dealiiOperationIsMultithreaded
   */
  explicit AlignedVector(const size_type size, const T &init = T());

  /**
   * Destructor.
   */
  ~AlignedVector() = default;

  /**
   * Copy constructor.
   *
   * @dealiiOperationIsMultithreaded
   */
  AlignedVector(const AlignedVector<T> &vec);

  /**
   * Move constructor. Create a new aligned vector by stealing the contents of
   * @p vec.
   */
  AlignedVector(AlignedVector<T> &&vec) noexcept;

  /**
   * Assignment to the input vector @p vec.
   *
   * @dealiiOperationIsMultithreaded
   */
  AlignedVector &
  operator=(const AlignedVector<T> &vec);

  /**
   * Move assignment operator.
   */
  AlignedVector &
  operator=(AlignedVector<T> &&vec) noexcept;

  /**
   * Change the size of the vector. If the new size is larger than the previous
   * size, then new elements will be added to the end of the vector; these
   * elements will remain uninitialized (i.e., left in an undefined state) if
   * `std::is_trivially_default_constructible_v<T>` is `true`, and will be
   * default initialized if that type trait is `false`. See
   * [here](https://en.cppreference.com/w/cpp/types/is_default_constructible)
   * for a precise definition of `std::is_trivially_default_constructible`.
   *
   * If the new size is less than the previous size, then the `new_size`th and
   * all subsequent elements will be destroyed.
   *
   * As a consequence of the outline above, the "_fast" suffix of this function
   * refers to the fact that for trivially default constructible types `T`, this
   * function omits the initialization of new elements.
   *
   * @note This method can only be invoked for classes @p T that define a
   * default constructor, @p T(). Otherwise, compilation will fail.
   */
  void
  resize_fast(const size_type new_size);

  /**
   * Change the size of the vector. It keeps old elements previously
   * available, and initializes each newly added element to a
   * default-constructed object of type @p T.
   *
   * If the new vector size is shorter than the old one, the memory is
   * not immediately released unless the new size is zero; however,
   * the size of the current object is of course set to the requested
   * value. The destructors of released elements are also called.
   *
   * @dealiiOperationIsMultithreaded
   */
  void
  resize(const size_type new_size);

  /**
   * Change the size of the vector. It keeps old elements previously
   * available, and initializes each newly added element with the
   * provided initializer.
   *
   * If the new vector size is shorter than the old one, the memory is
   * not immediately released unless the new size is zero; however,
   * the size of the current object is of course set to the requested
   * value.
   *
   * @note This method can only be invoked for classes that define the copy
   * assignment operator. Otherwise, compilation will fail.
   *
   * @dealiiOperationIsMultithreaded
   */
  void
  resize(const size_type new_size, const T &init);

  /**
   * Reserve memory space for @p new_allocated_size elements.
   *
   * If the argument @p new_allocated_size is less than the current number of stored
   * elements (as indicated by calling size()), then this function does not
   * do anything at all. Except if the argument @p new_allocated_size is set
   * to zero, then all previously allocated memory is released (this operation
   * then being equivalent to directly calling the clear() function).
   *
   * In order to avoid too frequent reallocation (which involves copy of the
   * data), this function doubles the amount of memory occupied when the given
   * size is larger than the previously allocated size.
   *
   * Note that this function only changes the amount of elements the object
   * *can* store, but not the number of elements it *actually* stores. As
   * a consequence, no constructors or destructors of newly created objects
   * are run, though the existing elements may be moved to a new location (which
   * involves running the move constructor at the new location and the
   * destructor at the old location).
   */
  void
  reserve(const size_type new_allocated_size);

  /**
   * Releases the memory allocated but not used.
   */
  void
  shrink_to_fit();

  /**
   * Releases all previously allocated memory and leaves the vector in a state
   * equivalent to the state after the default constructor has been called.
   */
  void
  clear();

  /**
   * Inserts an element at the end of the vector, increasing the vector size
   * by one. Note that the allocated size will double whenever the previous
   * space is not enough to hold the new element.
   */
  void
  push_back(const T in_data);

  /**
   * Return the last element of the vector (read and write access).
   */
  reference
  back();

  /**
   * Return the last element of the vector (read-only access).
   */
  const_reference
  back() const;

  /**
   * Inserts several elements at the end of the vector given by a range of
   * elements.
   */
  template <typename ForwardIterator>
  void
  insert_back(ForwardIterator begin, ForwardIterator end);

  /**
   * Insert the range specified by @p begin and @p end after the element @p position.
   *
   * @note Unlike std::vector, this function uses random-access iterators so
   * that the copy may be parallelized.
   *
   * @dealiiOperationIsMultithreaded
   */
  template <
    typename RandomAccessIterator,
    typename = std::enable_if_t<std::is_convertible_v<
      typename std::iterator_traits<RandomAccessIterator>::iterator_category,
      std::random_access_iterator_tag>>>
  iterator
  insert(const_iterator       position,
         RandomAccessIterator begin,
         RandomAccessIterator end);

  /**
   * Fills the vector with size() copies of a default constructed object.
   *
   * @note Unlike the other fill() function, this method can also be
   * invoked for classes that do not define a copy assignment
   * operator.
   *
   * @dealiiOperationIsMultithreaded
   */
  void
  fill();

  /**
   * Fills the vector with size() copies of the given input.
   *
   * @note This method can only be invoked for classes that define the copy
   * assignment operator. Otherwise, compilation will fail.
   *
   * @dealiiOperationIsMultithreaded
   */
  void
  fill(const T &element);

  /**
   * This function replicates the state found on the process indicated by
   * @p root_process across all processes of the MPI communicator. The current
   * state found on any of the processes other than @p root_process is lost
   * in this process. One can imagine this operation to act like a call to
   * Utilities::MPI::broadcast() from the root process to all other processes,
   * though in practice the function may try to move the data into shared
   * memory regions on each of the machines that host MPI processes and
   * let all MPI processes on this machine then access this shared memory
   * region instead of keeping their own copy.
   *
   * The intent of this function is to quickly exchange large arrays from
   * one process to others, rather than having to compute or create it on
   * all processes. This is specifically the case for data loaded from
   * disk -- say, large data tables -- that are more easily dealt with by
   * reading once and then distributing across all processes in an MPI
   * universe, than letting each process read the data from disk itself.
   * Specifically, the use of shared memory regions allows for replicating
   * the data only once per multicore machine in the MPI universe, rather
   * than replicating data once for each MPI process. This results in
   * large memory savings if the data is large on today's machines that
   * can easily house several dozen MPI processes per shared memory
   * space. This use case is outlined in the TableBase class documentation
   * as the current function is called from
   * TableBase::replicate_across_communicator(). Indeed, the primary rationale
   * for this function is to enable sharing data tables based on TableBase
   * across MPI processes.
   *
   * This function does not imply a model of keeping data on different processes
   * in sync, as LinearAlgebra::distributed::Vector and other vector classes do
   * where there exists a notion of certain elements of the vector owned by each
   * process and possibly ghost elements that are mirrored from its owning
   * process to other processes. Rather, the elements of the current object are
   * simply copied to the other processes, and it is useful to think of this
   * operation as creating a set of `const` AlignedVector objects on all
   * processes that should not be changed any more after the replication
   * operation, as this is the only way to ensure that the vectors remain the
   * same on all processes. This is particularly true because of the use of
   * shared memory regions where any modification of a vector element on one MPI
   * process may also result in a modification of elements visible on other
   * processes, assuming they are located within one shared memory node.
   *
   * @note The use of shared memory between MPI processes requires
   *   that the detected MPI installation supports the necessary operations.
   *   This is the case for MPI 3.0 and higher.
   *
   * @note This function is not cheap. It needs to create sub-communicators
   *   of the provided @p communicator object, which is generally an expensive
   *   operation. Likewise, the generation of shared memory spaces is not
   *   a cheap operation. As a consequence, this function primarily makes
   *   sense when the goal is to share large read-only data tables among
   *   processes; examples are data tables that are loaded at start-up
   *   time and then used over the course of the run time of the program.
   *   In such cases, the start-up cost of running this function can be
   *   amortized over time, and the potential memory savings from not having to
   *   store the table on each process may be substantial on machines with
   *   large core counts on which many MPI processes run on the same machine.
   *
   * @note This function only makes sense if the data type `T` is
   *   "self-contained", i.e., all of its information is stored in its
   *   member variables, and if none of the member variables are pointers
   *   to other parts of the memory. This is because if a type `T` does
   *   have pointers to other parts of memory, then moving `T` into
   *   a shared memory space does not result in the other processes having
   *   access to data that the object points to with its member variable
   *   pointers: These continue to live only on one process, and are
   *   typically in memory areas not accessible to the other processes.
   *   As a consequence, the usual use case for this function is to share
   *   arrays of simple objects such as `double`s or `int`s.
   *
   * @note After calling this function, objects on different MPI processes
   *   share a common state. That means that certain operations become
   *   "collective", i.e., they must be called on all participating
   *   processors at the same time. In particular, you can no longer call
   *   resize(), reserve(), or clear() on one MPI process -- you have to do
   *   so on all processes at the same time, because they have to communicate
   *   for these operations. If you do not do so, you will likely get
   *   a deadlock that may be difficult to debug. By extension, this rule of
   *   only collectively resizing extends to this function itself: You can
   *   not call it twice in a row because that implies that first all but the
   *   `root_process` throw away their data, which is not a collective
   *   operation. Generally, these restrictions on what can and can not be
   *   done hint at the correctness of the comments above: You should treat
   *   an AlignedVector on which the current function has been called as
   *   `const`, on which no further operations can be performed until
   *   the destructor is called.
   */
  void
  replicate_across_communicator(const MPI_Comm     communicator,
                                const unsigned int root_process);

  /**
   * Swaps the given vector with the calling vector.
   */
  void
  swap(AlignedVector<T> &vec) noexcept;

  /**
   * Return whether the vector is empty, i.e., its size is zero.
   */
  bool
  empty() const;

  /**
   * Return the size of the vector.
   */
  size_type
  size() const;

  /**
   * Return the capacity of the vector, i.e., the size this vector can hold
   * without reallocation. Note that capacity() >= size().
   */
  size_type
  capacity() const;

  /**
   * Read-write access to entry @p index in the vector.
   */
  reference
  operator[](const size_type index);

  /**
   * Read-only access to entry @p index in the vector.
   */
  const_reference
  operator[](const size_type index) const;

  /**
   * Return a pointer to the underlying data buffer.
   */
  pointer
  data();

  /**
   * Return a const pointer to the underlying data buffer.
   */
  const_pointer
  data() const;

  /**
   * Return a read and write pointer to the beginning of the data array.
   */
  iterator
  begin();

  /**
   * Return a read and write pointer to the end of the data array.
   */
  iterator
  end();

  /**
   * Return a read-only pointer to the beginning of the data array.
   */
  const_iterator
  begin() const;

  /**
   * Return a read-only pointer to the end of the data array.
   */
  const_iterator
  end() const;

  /**
   * Return the memory consumption of the allocated memory in this class. If
   * the underlying type @p T allocates memory by itself, this memory is not
   * counted.
   */
  size_type
  memory_consumption() const;

  /**
   * Write the data of this object to a stream for the purpose of
   * serialization using the [BOOST serialization
   * library](https://www.boost.org/doc/libs/1_74_0/libs/serialization/doc/index.html).
   */
  template <class Archive>
  void
  save(Archive &ar, const unsigned int version) const;

  /**
   * Read the data of this object from a stream for the purpose of
   * serialization using the [BOOST serialization
   * library](https://www.boost.org/doc/libs/1_74_0/libs/serialization/doc/index.html).
   */
  template <class Archive>
  void
  load(Archive &ar, const unsigned int version);

#ifdef DOXYGEN
  /**
   * Write and read the data of this object from a stream for the purpose
   * of serialization using the [BOOST serialization
   * library](https://www.boost.org/doc/libs/1_74_0/libs/serialization/doc/index.html).
   */
  template <class Archive>
  void
  serialize(Archive &archive, const unsigned int version);
#else
  // This macro defines the serialize() method that is compatible with
  // the templated save() and load() method that have been implemented.
  BOOST_SERIALIZATION_SPLIT_MEMBER()
#endif

  /**
   * Exception message for changing the vector after a call to
   * replicate_across_communicator().
   *
   * @ingroup Exceptions
   */
  DeclExceptionMsg(ExcAlignedVectorChangeAfterReplication,
                   "Changing the vector after a call to "
                   "replicate_across_communicator() is not allowed.");

private:
  /**
   * Make a new allocation, move the data from the old memory region to the new
   * region, and release the old memory.
   */
  void
  allocate_and_move(const std::size_t old_size,
                    const std::size_t new_size,
                    const std::size_t new_allocated_size);

  /**
   * A class that is used as the "deleter" for a `std::unique_ptr` object that
   * AlignedVector uses to store the memory used for the elements.
   *
   * There are two ways the AlignedVector class can handle memory:
   * - Allocation via `new[]` in reserve() where we call `posix_memalign()`
   *   to obtain a chunk of memory and then do placement-`new` to initialize
   *   memory. If this is what we have done, then we need to call the
   *   destructors of the currently active elements by hand, and then
   *   call `std::free()` to return memory. In order to call the destructors
   *   of currently used elements, the deleter object needs to have access
   *   to the owning `AlignedVector` object to know which of the allocated
   *   elements are currently actually used.
   * - We have called `replicate_across_communicator()`, in which case the
   *   elements have been moved into a memory "window" managed by MPI.
   *   In that case, one process (the root process of an MPI communicator
   *   that ties together all processes on one node) needs to call the
   *   destructor of all elements in this shared memory space, and then
   *   all processes on that communicator need to first destroy the
   *   MPI window object, and then the MPI communicator for the shared
   *   memory node's processes. In this approach, we need to store the
   *   following data: A pointer to the owning AlignedVector object to know
   *   which elements need to be destroyed, copies of the MPI window
   *   and communicator objects, and a couple of ancillary pieces of data.
   *
   * A common idiom towards using `std::unique_ptr` with complex de-allocation
   * semantics is to use `std::unique_ptr<T, std::function<void (T*)>`
   * and then use a lambda function to initialize the `std::function`
   * deleter object. This approach is used in numerous other places in
   * deal.II, but this has two downsides:
   * - `std::function` is relatively memory-hungry. It takes 24 bytes by
   *   itself, but then also needs to allocate memory dynamically, for
   *   example to store the capture object of a lambda function. This ends
   *   up to be quite a lot of memory given that we frequently use small
   *   Vector objects (which build on AlignedVector).
   * - More importantly, this breaks move operations. In a move constructor
   *   or move assignment of AlignedVector, we want to steal the memory pointed
   *   to, but we then need to install a new deleter object because the deleter
   *   needs to know about the owning object to determine which elements to
   *   call the destructor for. The problem is that we can't use the old
   *   deleter (it is a lambda function that still points to the previous
   *   owner that we are moving *from*) but we also don't know what deleter
   *   to install otherwise -- we don't know the innards of the lambda function
   *   that was previously installed; in fact, we don't even know whether it
   *   was for regular or MPI shared-memory data management.
   *
   * The way out of this is to write a custom deleter. It stores a pointer to
   * the owning AlignedVector object. It then also stores a `std::unique_ptr`
   * to an object of a class derived from a base class that implements the
   * concrete action necessary to facilitate the "deleter" action. Based on
   * the arguments given to the constructor of the Deleter class, the
   * constructor then either allocates a "regular" or an MPI-based action
   * object, and the action is facilitated by an overloaded `virtual` function.
   *
   * In the end, this solves both of our problems:
   * - The deleter object only requires 16 bytes (two pointers) plus whatever
   *   dynamic memory is necessary to store the "action" objects.
   * - In move operations, the only thing that needs to be done is to tell
   *   the deleter about the change of owning AlignedVector object, which we
   *   can achieve via the reset_owning_object() function.
   *
   * This scheme can be further optimized in the following way: In the most
   * common case, memory is allocated via `posix_memalign()` and needs to
   * be destroyed via `std::free()`. Rather than derive an action class for
   * this common case, dynamically allocate an object for this case, and
   * call it, we can just special case this situation: If the pointer to the
   * action object is `nullptr`, then we just execute the default action.
   * This means that for the most common case, we do not need any dynamic
   * memory allocation at all, and in that case the deleter object truly
   * only uses 16 bytes: One pointer to the owning AlignedVector object,
   * and a pointer to the action object that happens to be a `nullptr`.
   * Only in the case of the MPI shared memory data management does the
   * action pointer point somewhere, but this case is expensive anyway and so
   * the extra dynamic memory allocation does little harm.
   *
   * (One could in that case go even further: There is really only one
   * possible non-default action left, namely the MPI-based destruction of
   * the shared-memory window. Instead of having the Deleter::DeleterActionBase
   * class below, and the derived Deleter::MPISharedMemDeleterAction class, both
   * with `virtual` functions, we could just get rid of the base class and make
   * the member functions of the derived class non-`virtual`. We would then
   * either store a pointer to an MPISharedMemDeleterAction object if the
   * non-default action is requested, or a `nullptr` for the default action.
   * Avoiding `virtual` functions would make these objects smaller by a few
   * bytes, and the call to the `delete_array()` function marginally faster.
   * That said, given that the Deleter::MPISharedMemDeleterAction object already
   * stores all sorts of stuff and its execution is not cheap, these additional
   * optimizations are probably not worth it. Instead, we kept the class
   * hierarchy so that one could define other non-standard actions in the future
   * through other derived classes.)
   */
  class Deleter
  {
  public:
    /**
     * Constructor. When this constructor is called, it installs an
     * action that corresponds to "regular" memory allocation that
     * needs to be handled by using `std::free()`. Because that default
     * action is encoded by a `nullptr` action object, this constructor
     * performs no dynamic memory allocation itself.
     */
    Deleter(AlignedVector<T> *owning_object);

#ifdef DEAL_II_WITH_MPI
    /**
     * Constructor. When this constructor is called, it installs an
     * action that corresponds to MPI-based shared memory allocation that
     * needs to be handled by letting MPI de-allocate the shared memory
     * window, plus destroying the MPI communicator, and doing other
     * clean-up work. This variant dynamically allocates the action object.
     */
    Deleter(AlignedVector<T> *owning_object,
            const bool        is_shmem_root,
            T                *aligned_shmem_pointer,
            MPI_Comm          shmem_group_communicator,
            MPI_Win           shmem_window);
#endif

    /**
     * The operator called by `std::unique_ptr` to destroy the data it
     * is storing. This function dispatches to the different actions that
     * this class implements. It may be called with a `nullptr` argument,
     * in which case the default action has nothing to do.
     */
    void
    operator()(T *ptr);

    /**
     * Reset the pointer to the owning AlignedVector object. This function
     * is used in move operations when the pointer to the data is transferred
     * from one AlignedVector object -- i.e., the pointer itself remains
     * unchanged, but the deleter object needs to be updated to know who
     * the new owner now is.
     */
    void
    reset_owning_object(const AlignedVector<T> *new_aligned_vector_ptr);

  private:
    /**
     * Base class for the action necessary to de-allocate memory.
     */
    class DeleterActionBase
    {
    public:
      /**
       * Destructor, made `virtual` to allow for derived classes.
       */
      virtual ~DeleterActionBase() = default;

      /**
       * The function that implements the action of de-allocating memory.
       * It receives as arguments a pointer to the owning AlignedVector object
       * as well as a pointer to the memory being de-allocated.
       */
      virtual void
      delete_array(const AlignedVector<T> *owning_aligned_vector, T *ptr) = 0;
    };

#ifdef DEAL_II_WITH_MPI

    /**
     * A class that implements the deleter action for MPI shared-memory
     * allocated data.
     */
    class MPISharedMemDeleterAction : public DeleterActionBase
    {
    public:
      /**
       * Constructor. Store the various pieces of information necessary to
       * identify the MPI window in which the data resides.
       */
      MPISharedMemDeleterAction(const bool is_shmem_root,
                                T         *aligned_shmem_pointer,
                                MPI_Comm   shmem_group_communicator,
                                MPI_Win    shmem_window);

      /**
       * The function that implements the action of de-allocating memory.
       * It receives as arguments a pointer to the owning AlignedVector object
       * as well as a pointer to the memory being de-allocated.
       */
      virtual void
      delete_array(const AlignedVector<T> *aligned_vector, T *ptr) override;

    private:
      /**
       * Variables necessary to identify the MPI shared-memory window plus
       * all ancillary information to destroy this window.
       */
      const bool is_shmem_root;
      T         *aligned_shmem_pointer;
      MPI_Comm   shmem_group_communicator;
      MPI_Win    shmem_window;
    };
#endif

    /**
     * A pointer to the object that facilitates the actual action of
     * destroying the memory. A `nullptr` here encodes the default
     * `std::free()`-based action.
     */
    std::unique_ptr<DeleterActionBase> deleter_action_object;

    /**
     * A (non-owned) pointer to the surrounding AlignedVector object that owns
     * the memory this deleter object is responsible for deleting. It is
     * consulted for `used_elements_end` when the live elements have to be
     * destroyed.
     */
    const AlignedVector<T> *owning_aligned_vector;
  };

  /**
   * Pointer to actual data array, using the custom deleter class above.
   * The deleter knows whether the memory came from posix_memalign() or
   * from an MPI shared-memory window.
   */
  std::unique_ptr<T[], Deleter> elements;

  /**
   * Pointer to one past the last valid (i.e., constructed) value.
   */
  T *used_elements_end;

  /**
   * Pointer to the end of the allocated memory, i.e., the end of the
   * vector's capacity.
   */
  T *allocated_elements_end;

  /**
   * Flag indicating if replicate_across_communicator() has been called.
   * Operations that would re-allocate (e.g., shrink_to_fit() or insert())
   * assert that this flag is false.
   */
  bool replicated_across_communicator;
};


// ------------------------------- inline functions --------------------------

/**
 * This namespace defines the copy and set functions used in AlignedVector.
 * These functions operate in parallel when there are enough elements in the
 * vector.
 */
namespace internal
{
  /**
   * Given a half-open range of source objects, construct copies of them at
   * a destination address via placement new.
   *
   * All copy operations are funneled through the single virtual
   * apply_to_subrange() function of the ParallelForInteger base class,
   * instead of instantiating a separate parallel-for body for every call
   * site; this keeps the amount of generated code small (the cost of the
   * virtual call is negligible in this context). Large ranges are split
   * across tasks via apply_parallel().
   *
   * @relatesalso AlignedVector
   */
  template <typename RandomAccessIterator, typename T>
  class AlignedVectorCopyConstruct
    : private dealii::parallel::ParallelForInteger
  {
    // Below this number of elements, running in parallel is not worth the
    // scheduling overhead.
    static const std::size_t minimum_parallel_grain_size =
      160000 / sizeof(T) + 1;

  public:
    /**
     * Constructor. Copy-constructs the elements of the half-open range
     * [@p source_begin, @p source_end) into the array starting at
     * @p destination, in parallel if the range is large enough.
     */
    AlignedVectorCopyConstruct(RandomAccessIterator source_begin,
                               RandomAccessIterator source_end,
                               T *const             destination)
      : source_(source_begin)
      , destination_(destination)
    {
      Assert(source_end >= source_begin, ExcInternalError());
      Assert(source_end == source_begin || destination != nullptr,
             ExcInternalError());

      const std::size_t n_elements = source_end - source_begin;
      if (n_elements >= minimum_parallel_grain_size)
        apply_parallel(0, n_elements, minimum_parallel_grain_size);
      else
        // Qualified call: invoke our own implementation directly rather
        // than going through the virtual function table.
        AlignedVectorCopyConstruct::apply_to_subrange(0, n_elements);
    }

    /**
     * Copy-construct the elements of the subrange [@p begin, @p end).
     */
    virtual void
    apply_to_subrange(const std::size_t begin,
                      const std::size_t end) const override
    {
      if (begin == end)
        return;

      // Trivially copyable types reachable through plain pointers can be
      // copied with a single memcpy(); everything else needs the copy
      // constructor invoked through placement new.
      if constexpr (std::is_trivially_copyable_v<T> &&
                    (std::is_same_v<T *, RandomAccessIterator> ||
                     std::is_same_v<const T *, RandomAccessIterator>))
        std::memcpy(destination_ + begin,
                    source_ + begin,
                    (end - begin) * sizeof(T));
      else
        for (std::size_t i = begin; i != end; ++i)
          new (&destination_[i]) T(source_[i]);
    }

  private:
    RandomAccessIterator source_;
    T *const             destination_;
  };


  /**
   * Like AlignedVectorCopyConstruct, but build the objects at the
   * destination with the move constructor of `T`. The source objects are
   * left alive (in their moved-from state); destroying them remains the
   * responsibility of the caller.
   *
   * @relatesalso AlignedVector
   */
  template <typename RandomAccessIterator, typename T>
  class AlignedVectorMoveConstruct
    : private dealii::parallel::ParallelForInteger
  {
    // Below this number of elements, running in parallel is not worth the
    // scheduling overhead.
    static const std::size_t minimum_parallel_grain_size =
      160000 / sizeof(T) + 1;

  public:
    /**
     * Constructor. Move-constructs the elements of the half-open range
     * [@p source_begin, @p source_end) into the array starting at
     * @p destination, in parallel if the range is large enough.
     */
    AlignedVectorMoveConstruct(RandomAccessIterator source_begin,
                               RandomAccessIterator source_end,
                               T *const             destination)
      : source_(source_begin)
      , destination_(destination)
    {
      Assert(source_end >= source_begin, ExcInternalError());
      Assert(source_end == source_begin || destination != nullptr,
             ExcInternalError());

      const std::size_t n_elements = source_end - source_begin;
      if (n_elements >= minimum_parallel_grain_size)
        apply_parallel(0, n_elements, minimum_parallel_grain_size);
      else
        // Qualified call: invoke our own implementation directly rather
        // than going through the virtual function table.
        AlignedVectorMoveConstruct::apply_to_subrange(0, n_elements);
    }

    /**
     * Move-construct the elements of the subrange [@p begin, @p end).
     */
    virtual void
    apply_to_subrange(const std::size_t begin,
                      const std::size_t end) const override
    {
      if (begin == end)
        return;

      // For trivially copyable types behind plain pointers, a move is the
      // same as a bitwise copy, so use memcpy(). Otherwise call the move
      // constructor element by element via placement new; the source
      // objects remain alive and will be destroyed elsewhere.
      if constexpr (std::is_trivially_copyable_v<T> &&
                    (std::is_same_v<T *, RandomAccessIterator> ||
                     std::is_same_v<const T *, RandomAccessIterator>))
        std::memcpy(destination_ + begin,
                    source_ + begin,
                    (end - begin) * sizeof(T));
      else
        for (std::size_t i = begin; i != end; ++i)
          new (&destination_[i]) T(std::move(source_[i]));
    }

  private:
    RandomAccessIterator source_;
    T *const             destination_;
  };


  /**
   * A class that fills a range of memory locations with copies of a given
   * value, either by calling the placement-new copy constructor (if
   * `initialize_memory == true`) or by using the copy assignment operator
   * (if `initialize_memory == false`). The latter is only appropriate if
   * the memory locations already contain live objects.
   *
   * As an optimization, if `T` is trivially default constructible and the
   * given element has an all-zero bit representation, the memory is filled
   * with std::memset() instead of element-wise operations.
   *
   * @tparam initialize_memory Determines whether the set command should
   * initialize memory (with a call to the copy constructor) or rather use
   * the copy assignment operator. A compile-time selection is necessary
   * since some classes might define only one of those two operations.
   *
   * @relatesalso AlignedVector
   */
  template <typename T, bool initialize_memory>
  class AlignedVectorInitialize : private dealii::parallel::ParallelForInteger
  {
    // Below this number of elements, running in parallel is not worth the
    // scheduling overhead.
    static const std::size_t minimum_parallel_grain_size =
      160000 / sizeof(T) + 1;

  public:
    /**
     * Constructor. Issues a parallel call if there are sufficiently many
     * elements, otherwise works in serial. Fills the @p size memory
     * locations starting at @p destination with copies of @p element.
     */
    AlignedVectorInitialize(const std::size_t size,
                            const T          &element,
                            T *const          destination)
      : element_(element)
      , destination_(destination)
      , trivial_element(false)
    {
      if (size == 0)
        return;
      Assert(destination != nullptr, ExcInternalError());

      // Detect whether 'element' is bitwise zero so the subranges can be
      // filled with std::memset(). Do not use memcmp() for long double
      // because on some systems it does not completely fill its memory and
      // may lead to false positives in e.g. valgrind.
      if constexpr (std::is_trivially_default_constructible_v<T> == true &&
                    std::is_same_v<T, long double> == false)
        {
          const unsigned char zero[sizeof(T)] = {};
          if (std::memcmp(zero, &element, sizeof(T)) == 0)
            trivial_element = true;
        }
      if (size < minimum_parallel_grain_size)
        AlignedVectorInitialize::apply_to_subrange(0, size);
      else
        apply_parallel(0, size, minimum_parallel_grain_size);
    }

    /**
     * This sets elements on a subrange given by two integers.
     */
    virtual void
    apply_to_subrange(const std::size_t begin,
                      const std::size_t end) const override
    {
      // Only use memset() with types whose default constructors don't do
      // anything.
      if constexpr (std::is_trivially_default_constructible_v<T> == true)
        if (trivial_element)
          {
            std::memset(destination_ + begin, 0, (end - begin) * sizeof(T));
            return;
          }

      // 'if constexpr' ensures that only the operation actually selected
      // (copy construction or copy assignment) is instantiated, so types
      // that provide just one of the two still work.
      if constexpr (initialize_memory)
        for (std::size_t i = begin; i < end; ++i)
          new (&destination_[i]) T(element_);
      else
        for (std::size_t i = begin; i < end; ++i)
          destination_[i] = element_;
    }

  private:
    const T   &element_;
    mutable T *destination_;
    bool       trivial_element;
  };



  /**
   * Like AlignedVectorInitialize, but use default-constructed objects
   * as initializers.
   *
   * @tparam initialize_memory Sets whether memory should be initialized
   * (default construction via placement new) or whether already-live
   * objects should be overwritten via assignment. A compile-time selection
   * is necessary since some classes might define only one of those two
   * operations.
   *
   * @relatesalso AlignedVector
   */
  template <typename T, bool initialize_memory>
  class AlignedVectorDefaultInitialize
    : private dealii::parallel::ParallelForInteger
  {
    // Below this number of elements, running in parallel is not worth the
    // scheduling overhead.
    static const std::size_t minimum_parallel_grain_size =
      160000 / sizeof(T) + 1;

  public:
    /**
     * Constructor. Issues a parallel call if there are sufficiently many
     * elements, otherwise works in serial. Default-initializes the
     * @p size memory locations starting at @p destination.
     */
    AlignedVectorDefaultInitialize(const std::size_t size, T *const destination)
      : destination_(destination)
    {
      if (size == 0)
        return;
      Assert(destination != nullptr, ExcInternalError());

      if (size < minimum_parallel_grain_size)
        AlignedVectorDefaultInitialize::apply_to_subrange(0, size);
      else
        apply_parallel(0, size, minimum_parallel_grain_size);
    }

    /**
     * This initializes elements on a subrange given by two integers.
     */
    virtual void
    apply_to_subrange(const std::size_t begin,
                      const std::size_t end) const override
    {
      // Only use memset() with types whose default constructors don't do
      // anything. Otherwise, 'if constexpr' instantiates exactly one of
      // default construction or assignment, so types that provide only one
      // of the two operations still work.
      if constexpr (std::is_trivially_default_constructible_v<T> == true)
        std::memset(destination_ + begin, 0, (end - begin) * sizeof(T));
      else if constexpr (initialize_memory)
        for (std::size_t i = begin; i < end; ++i)
          new (&destination_[i]) T;
      else
        for (std::size_t i = begin; i < end; ++i)
          destination_[i] = std::move(T());
    }

  private:
    mutable T *destination_;
  };

} // end of namespace internal


#ifndef DOXYGEN



// Deleter for the common case: a nullptr action object encodes the default
// "destroy the elements, then release the memory with std::free()" action,
// so no dynamic allocation is needed for the deleter itself.
template <typename T>
inline AlignedVector<T>::Deleter::Deleter(AlignedVector<T> *owning_object)
  : deleter_action_object(nullptr) // encode default action by using a nullptr
  , owning_aligned_vector(owning_object)
{}


#  ifdef DEAL_II_WITH_MPI

// Deleter for MPI shared-memory allocations: install an action object that
// knows how to destroy the elements, free the MPI shared-memory window, and
// release the group communicator.
template <typename T>
inline AlignedVector<T>::Deleter::Deleter(AlignedVector<T> *owning_object,
                                          const bool        is_shmem_root,
                                          T       *aligned_shmem_pointer,
                                          MPI_Comm shmem_group_communicator,
                                          MPI_Win  shmem_window)
  : deleter_action_object(
      std::make_unique<MPISharedMemDeleterAction>(is_shmem_root,
                                                  aligned_shmem_pointer,
                                                  shmem_group_communicator,
                                                  shmem_window))
  , owning_aligned_vector(owning_object)
{}
#  endif


// Destroy the array managed by the owning AlignedVector's unique_ptr.
template <typename T>
inline void
AlignedVector<T>::Deleter::operator()(T *ptr)
{
  // A non-null action object means that a special (e.g., MPI shared-memory)
  // de-allocation strategy was installed; delegate to it.
  if (deleter_action_object != nullptr)
    {
      deleter_action_object->delete_array(owning_aligned_vector, ptr);
      return;
    }

  // Default action: nothing to release for a null pointer; otherwise
  // destroy the live elements in reverse order of construction and hand
  // the raw memory back to std::free().
  if (ptr == nullptr)
    return;

  Assert(owning_aligned_vector->used_elements_end != nullptr,
         ExcInternalError());

  if (std::is_trivially_destructible_v<T> == false)
    for (T *p = owning_aligned_vector->used_elements_end - 1; p >= ptr; --p)
      p->~T();

  std::free(ptr);
}



// Called from move operations: the data pointer has been handed to a new
// AlignedVector, so from now on the deleter must consult that object's
// 'used_elements_end' when destroying elements.
template <typename T>
inline void
AlignedVector<T>::Deleter::reset_owning_object(
  const AlignedVector<T> *new_aligned_vector_ptr)
{
  owning_aligned_vector = new_aligned_vector_ptr;
}


#  ifdef DEAL_II_WITH_MPI

// Store all information needed later to destroy the elements, free the MPI
// shared-memory window, and release the group communicator.
template <typename T>
inline AlignedVector<T>::Deleter::MPISharedMemDeleterAction::
  MPISharedMemDeleterAction(const bool is_shmem_root,
                            T         *aligned_shmem_pointer,
                            MPI_Comm   shmem_group_communicator,
                            MPI_Win    shmem_window)
  : is_shmem_root(is_shmem_root)
  , aligned_shmem_pointer(aligned_shmem_pointer)
  , shmem_group_communicator(shmem_group_communicator)
  , shmem_window(shmem_window)
{}



// De-allocate an MPI shared-memory backed array: run the destructors on the
// rank that owns the segment, then free the window and the communicator on
// all ranks.
template <typename T>
inline void
AlignedVector<T>::Deleter::MPISharedMemDeleterAction::delete_array(
  const AlignedVector<T> *aligned_vector,
  T                      *ptr)
{
  (void)ptr;
  // It would be nice to assert that aligned_vector->elements.get() equals ptr,
  // but it is not guaranteed to work: clang, for example, sets elements.get()
  // to nullptr and then calls the deleter on a previously made copy. Hence we
  // must assume here that elements.get() (which is managed by the unique_ptr)
  // may be nullptr at this point.
  //
  // used_elements_end is a member variable of AlignedVector (i.e., we control
  // it, not unique_ptr) so it is still set to its correct value.

  // Only the shared-memory root runs the destructors; the memory belongs to
  // the MPI window and is given back by MPI_Win_free() below.
  if (is_shmem_root)
    if (std::is_trivially_destructible_v<T> == false)
      for (T *p = aligned_vector->used_elements_end - 1; p >= ptr; --p)
        p->~T();

  int ierr;
  ierr = MPI_Win_free(&shmem_window);
  AssertThrowMPI(ierr);

  Utilities::MPI::free_communicator(shmem_group_communicator);
}

#  endif


// Default constructor: an empty vector that owns no memory.
template <class T>
inline AlignedVector<T>::AlignedVector()
  : elements(nullptr, Deleter(this))
  , used_elements_end(nullptr)
  , allocated_elements_end(nullptr)
  , replicated_across_communicator(false)
{}



// Range constructor: allocate exactly (end - begin) elements and
// copy-construct them from the given range. (Note that 'end - begin'
// effectively requires a random access iterator.)
template <class T>
template <typename RandomAccessIterator, typename>
inline AlignedVector<T>::AlignedVector(RandomAccessIterator begin,
                                       RandomAccessIterator end)
  : elements(nullptr, Deleter(this))
  , used_elements_end(nullptr)
  , allocated_elements_end(nullptr)
  , replicated_across_communicator(false)
{
  // Allocate (without constructing) the memory, mark all of it as used,
  // and then copy-construct the elements in place.
  allocate_and_move(0u, end - begin, end - begin);
  used_elements_end = allocated_elements_end;
  dealii::internal::AlignedVectorCopyConstruct<RandomAccessIterator, T>(begin,
                                                                        end,
                                                                        data());
}


// Constructor taking a size and an initializer: delegate the allocation and
// the copy-initialization of the elements to resize().
template <class T>
inline AlignedVector<T>::AlignedVector(const size_type size, const T &init)
  : elements(nullptr, Deleter(this))
  , used_elements_end(nullptr)
  , allocated_elements_end(nullptr)
  , replicated_across_communicator(false)
{
  if (size > 0)
    resize(size, init);
}



// Copy constructor: allocate exactly vec.size() elements and copy-construct
// them from vec's data. (The replication flag is not copied; the new object
// starts out as a regular, non-replicated vector.)
template <class T>
inline AlignedVector<T>::AlignedVector(const AlignedVector<T> &vec)
  : elements(nullptr, Deleter(this))
  , used_elements_end(nullptr)
  , allocated_elements_end(nullptr)
  , replicated_across_communicator(false)
{
  // copy the data from vec: reserve() leaves the memory uninitialized, so
  // mark it as used and then copy-construct the elements in place
  reserve(vec.size());
  used_elements_end = allocated_elements_end;
  internal::AlignedVectorCopyConstruct<T *, T>(vec.elements.get(),
                                               vec.used_elements_end,
                                               elements.get());
}



// Move constructor: start from an empty vector and steal vec's contents
// by forwarding to the move assignment operator.
template <class T>
inline AlignedVector<T>::AlignedVector(AlignedVector<T> &&vec) noexcept
  : AlignedVector<T>()
{
  // forward to the move operator
  *this = std::move(vec);
}



// Copy assignment: discard the current contents and copy-construct the
// elements from 'vec' into freshly reserved (uninitialized) memory.
template <class T>
inline AlignedVector<T> &
AlignedVector<T>::operator=(const AlignedVector<T> &vec)
{
  // Guard against self-assignment: the resize(0) below releases the very
  // memory we would subsequently try to copy from, which would leave the
  // vector claiming 'new_size' never-constructed elements as live.
  if (this == &vec)
    return *this;

  const size_type new_size = vec.used_elements_end - vec.elements.get();

  // First throw away everything and re-allocate memory but leave that
  // memory uninitialized for now:
  resize(0);
  reserve(new_size);

  // Then copy the elements over by using the copy constructor on these
  // elements:
  internal::AlignedVectorCopyConstruct<T *, T>(vec.elements.get(),
                                               vec.used_elements_end,
                                               elements.get());

  // Finally adjust the pointer to the end of the elements that are used:
  used_elements_end = elements.get() + new_size;

  return *this;
}



// Move assignment: discard the current contents and take over vec's
// allocation, leaving vec empty.
template <class T>
inline AlignedVector<T> &
AlignedVector<T>::operator=(AlignedVector<T> &&vec) noexcept
{
  // Guard against self-move: the clear() below would otherwise destroy the
  // data before we get a chance to steal it.
  if (this == &vec)
    return *this;

  clear();

  // Move the actual data in the 'elements' object. One problem is that this
  // also moves the deleter object, but the deleter object
  // references 'this' (i.e., the 'this' pointer of the *moved-from*
  // object). The way this is implemented is that we have to move the
  // deleter as well, and then reset the pointer inside the deleter
  // that references the outer object.
  elements = std::move(vec.elements);
  elements.get_deleter().reset_owning_object(this);

  // Then also steal the other pointers and clear them in the original object:
  used_elements_end      = vec.used_elements_end;
  allocated_elements_end = vec.allocated_elements_end;

  vec.used_elements_end      = nullptr;
  vec.allocated_elements_end = nullptr;

  return *this;
}



// Resize without initializing newly added elements of trivially
// constructible types (their values are left undefined).
template <class T>
inline void
AlignedVector<T>::resize_fast(const size_type new_size)
{
  const size_type old_size = size();

  if (new_size == 0)
    {
      clear();
      return;
    }

  if (new_size == old_size)
    return; // nothing to do

  if (new_size < old_size)
    {
      // Shrink: destroy the trailing elements if T requires it, walking
      // backwards so elements go away in reverse order of construction.
      if (std::is_trivially_destructible_v<T> == false)
        for (T *p = used_elements_end - 1; p >= elements.get() + new_size; --p)
          p->~T();
      used_elements_end = elements.get() + new_size;
      return;
    }

  // Grow: allocate more space and claim it as used.
  reserve(new_size);
  used_elements_end = elements.get() + new_size;

  // Leave the new array entries as-is (with undefined values) unless T's
  // default constructor is nontrivial (i.e., it is not a no-op).
  if (std::is_trivially_default_constructible_v<T> == false)
    dealii::internal::AlignedVectorDefaultInitialize<T, true>(
      new_size - old_size, elements.get() + old_size);
}



// Resize, default-initializing any newly added elements.
template <class T>
inline void
AlignedVector<T>::resize(const size_type new_size)
{
  const size_type old_size = size();

  if (new_size == 0)
    {
      clear();
      return;
    }

  if (new_size == old_size)
    return; // nothing to do

  if (new_size < old_size)
    {
      // Shrink: destroy the trailing elements if T requires it, walking
      // backwards so elements go away in reverse order of construction.
      if (std::is_trivially_destructible_v<T> == false)
        for (T *p = used_elements_end - 1; p >= elements.get() + new_size; --p)
          p->~T();
      used_elements_end = elements.get() + new_size;
      return;
    }

  // Grow: allocate more space, claim it as used, and default-initialize
  // the new elements.
  reserve(new_size);
  used_elements_end = elements.get() + new_size;

  dealii::internal::AlignedVectorDefaultInitialize<T, true>(
    new_size - old_size, elements.get() + old_size);
}



// Resize, copy-initializing any newly added elements from 'init'.
template <class T>
inline void
AlignedVector<T>::resize(const size_type new_size, const T &init)
{
  const size_type old_size = size();

  if (new_size == 0)
    {
      clear();
      return;
    }

  if (new_size == old_size)
    return; // nothing to do

  if (new_size < old_size)
    {
      // Shrink: destroy the trailing elements if T requires it, walking
      // backwards so elements go away in reverse order of construction.
      if (std::is_trivially_destructible_v<T> == false)
        for (T *p = used_elements_end - 1; p >= elements.get() + new_size; --p)
          p->~T();
      used_elements_end = elements.get() + new_size;
      return;
    }

  // Grow: allocate more space, claim it as used, and copy-construct the
  // new elements from 'init'.
  reserve(new_size);
  used_elements_end = elements.get() + new_size;

  dealii::internal::AlignedVectorInitialize<T, true>(
    new_size - old_size, init, elements.get() + old_size);
}



// Replace the current allocation by a fresh, 64-byte aligned one holding
// 'new_size' elements, move-constructing the first 'old_size' existing
// elements into it. On return, size() == old_size and
// capacity() == new_size.
//
// 'new_allocated_size' is only used to decide whether any moving needs to
// happen at all (callers pass 0 for an initially empty vector) -- the
// actual allocation size is 'new_size'. NOTE(review): the asymmetry between
// 'new_size' and 'new_allocated_size' is subtle; confirm against callers
// (reserve(), shrink_to_fit(), the range constructor) before changing it.
template <class T>
inline void
AlignedVector<T>::allocate_and_move(const std::size_t old_size,
                                    const std::size_t new_size,
                                    const std::size_t new_allocated_size)
{
  // allocate and align along 64-byte boundaries (this is enough for all
  // levels of vectorization currently supported by deal.II)
  T *new_data_ptr;
  Utilities::System::posix_memalign(reinterpret_cast<void **>(&new_data_ptr),
                                    64,
                                    new_size * sizeof(T));

  // Now create a deleter that encodes what should happen when the object is
  // released: We need to destroy the objects that are currently alive (in
  // reverse order, and then release the memory. Note that we catch the
  // 'this' pointer because the number of elements currently alive might
  // change over time.
  Deleter deleter(this);

  // copy whatever elements we need to retain
  if (new_allocated_size > 0)
    dealii::internal::AlignedVectorMoveConstruct<T *, T>(
      elements.get(), elements.get() + old_size, new_data_ptr);

  // Now reset all the member variables of the current object
  // based on the allocation above. Assigning to a std::unique_ptr
  // object also releases the previously pointed to memory.
  //
  // Note that at the time of releasing the old memory, 'used_elements_end'
  // still points to its previous value, and this is important for the
  // deleter object of the previously allocated array (see how it loops over
  // the to-be-destroyed elements at the Deleter::DefaultDeleterAction
  // class).
  elements               = decltype(elements)(new_data_ptr, std::move(deleter));
  used_elements_end      = elements.get() + old_size;
  allocated_elements_end = elements.get() + new_size;
}



// Make sure at least 'new_allocated_size' elements fit without another
// re-allocation; never reduces the capacity (except to zero).
template <class T>
inline void
AlignedVector<T>::reserve(const size_type new_allocated_size)
{
  const size_type old_size           = used_elements_end - elements.get();
  const size_type old_allocated_size = allocated_elements_end - elements.get();

  if (new_allocated_size > old_allocated_size)
    {
      // Growing one element at a time would re-allocate over and over
      // again, so grow at least by a factor of two.
      const size_type desired_size =
        std::max(new_allocated_size, 2 * old_allocated_size);
      allocate_and_move(old_size, desired_size, new_allocated_size);
    }
  else if (new_allocated_size == 0)
    clear();
  // else: the current allocation is already large enough -- nothing to do
}



// Give back any excess capacity by re-allocating to exactly size()
// elements. Not allowed after replicate_across_communicator().
template <class T>
inline void
AlignedVector<T>::shrink_to_fit()
{
  if constexpr (running_in_debug_mode())
    {
      Assert(replicated_across_communicator == false,
             ExcAlignedVectorChangeAfterReplication());
    }

  const size_type n_used      = used_elements_end - elements.get();
  const size_type n_allocated = allocated_elements_end - elements.get();

  // Re-allocate only if there is actually slack to give back.
  if (n_allocated > n_used)
    allocate_and_move(n_used, n_used, n_used);
}



// Destroy all elements and release the memory, leaving an empty vector.
template <class T>
inline void
AlignedVector<T>::clear()
{
  // Just release the memory (which also calls the destructor of the elements),
  // and then set the auxiliary pointers to invalid values.
  //
  // Note that at the time of releasing the old memory, 'used_elements_end'
  // still points to its previous value, and this is important for the
  // deleter object of the previously allocated array (see how it loops over
  // the to-be-destroyed elements a few lines above).
  //
  // NOTE(review): 'replicated_across_communicator' is not reset here --
  // confirm that this is intended.
  elements.reset();
  used_elements_end      = nullptr;
  allocated_elements_end = nullptr;
}



// Append a copy of 'in_data', growing the allocation if necessary.
template <class T>
inline void
AlignedVector<T>::push_back(const T in_data)
{
  Assert(used_elements_end <= allocated_elements_end, ExcInternalError());

  // Out of capacity? At least double it, with a floor of 16 slots so that
  // small vectors do not re-allocate on every insertion.
  if (used_elements_end == allocated_elements_end)
    reserve(std::max(2 * capacity(), static_cast<size_type>(16)));

  // Copy-construct the new element in place and advance the end marker.
  new (used_elements_end) T(in_data);
  ++used_elements_end;
}



// Return a reference to the last element; the vector must not be empty.
template <class T>
inline typename AlignedVector<T>::reference
AlignedVector<T>::back()
{
  AssertIndexRange(0, size());
  return *(used_elements_end - 1);
}



// Return a const reference to the last element; the vector must not be
// empty.
template <class T>
inline typename AlignedVector<T>::const_reference
AlignedVector<T>::back() const
{
  AssertIndexRange(0, size());
  return *(used_elements_end - 1);
}



// Append copies of the elements in [begin, end). (Note that 'end - begin'
// effectively requires a random access iterator despite the template
// parameter's name.)
template <class T>
template <typename ForwardIterator>
inline void
AlignedVector<T>::insert_back(ForwardIterator begin, ForwardIterator end)
{
  // Make room for all new elements up front, then copy-construct them one
  // by one at the end of the array.
  reserve(size() + (end - begin));
  while (begin != end)
    {
      new (used_elements_end) T(*begin);
      ++begin;
      ++used_elements_end;
    }
}



// Insert the half-open range [begin, end) before 'position' and return an
// iterator to the first inserted element (or 'position' itself if the range
// is empty). A new array of exactly the final size is allocated, i.e., no
// growth factor is applied here.
template <class T>
template <typename RandomAccessIterator, typename>
inline typename AlignedVector<T>::iterator
AlignedVector<T>::insert(const_iterator       position,
                         RandomAccessIterator begin,
                         RandomAccessIterator end)
{
  Assert(replicated_across_communicator == false,
         ExcAlignedVectorChangeAfterReplication());
  Assert(this->begin() <= position && position <= this->end(),
         ExcMessage("The position iterator is not valid."));
  const auto offset = position - this->begin();

  const size_type old_size   = size();
  const size_type range_size = end - begin;
  const size_type new_size   = old_size + range_size;
  if (range_size != 0)
    {
      // This is similar to allocate_and_move(), except that we need to move
      // whatever was before position and whatever is after it into two
      // different places
      T *new_data_ptr = nullptr;
      Utilities::System::posix_memalign(
        reinterpret_cast<void **>(&new_data_ptr), 64, new_size * sizeof(T));

      // Correctly handle the case where the range is inside the present array
      // by creating a temporary.
      AlignedVector<T> temporary(begin, end);
      // Move the prefix [0, offset), the new elements, and the suffix
      // [offset, old_size) into their final places in the new array.
      dealii::internal::AlignedVectorMoveConstruct<T *, T>(
        elements.get(), elements.get() + offset, new_data_ptr);
      dealii::internal::AlignedVectorMoveConstruct<T *, T>(
        temporary.begin(), temporary.end(), new_data_ptr + offset);
      dealii::internal::AlignedVectorMoveConstruct<T *, T>(
        elements.get() + offset,
        elements.get() + old_size,
        new_data_ptr + offset + range_size);

      // Adopt the new array. Assigning to 'elements' also releases the old
      // allocation; at that moment 'used_elements_end' still holds its old
      // value, which the old deleter uses to destroy the moved-from
      // elements.
      Deleter deleter(this);
      elements          = decltype(elements)(new_data_ptr, std::move(deleter));
      used_elements_end = elements.get() + new_size;
      allocated_elements_end = elements.get() + new_size;
    }
  return this->begin() + offset;
}



template <class T>
inline void
AlignedVector<T>::fill()
{
  // Overwrite all currently used elements with default-initialized objects;
  // the 'false' template argument selects the non-parallel code path.
  const size_type n_elements = size();
  dealii::internal::AlignedVectorDefaultInitialize<T, false>(n_elements,
                                                             elements.get());
}



template <class T>
inline void
AlignedVector<T>::fill(const T &value)
{
  // Overwrite all currently used elements with copies of 'value'; the
  // 'false' template argument selects the non-parallel code path.
  const size_type n_elements = size();
  dealii::internal::AlignedVectorInitialize<T, false>(n_elements,
                                                      value,
                                                      elements.get());
}



template <class T>
inline void
AlignedVector<T>::replicate_across_communicator(const MPI_Comm     communicator,
                                                const unsigned int root_process)
{
#  ifdef DEAL_II_WITH_MPI

  // Let the root process broadcast its size. If it is zero, then all
  // processes just clear() their memory and reset themselves to a non-shared
  // empty object -- there is no point to run through complicated MPI
  // calls if the end result is an empty array. Otherwise, we continue on.
  const size_type new_size =
    Utilities::MPI::broadcast(communicator, size(), root_process);
  if (new_size == 0)
    {
      clear();
      return;
    }


  // **** Step 0 ****
  // All but the root process no longer need their data, so release the memory
  // used to store the previous elements.
  if (Utilities::MPI::this_mpi_process(communicator) != root_process)
    {
      elements.reset();
      used_elements_end      = nullptr;
      allocated_elements_end = nullptr;
    }

  // **** Step 1 ****
  // Create communicators for each group of processes that can use
  // shared memory areas. Within each of these groups, we don't care about
  // which rank each of the old processes gets except that we would like to
  // make sure that the (global) root process will have rank=0 within
  // its own sub-communicator. We can do that through the third argument of
  // MPI_Comm_split_type (the "key") which is an integer meant to indicate the
  // order of processes within the split communicators, and we should set it to
  // zero for the root processes and one for all others -- which means that
  // for all of these other processes, MPI can choose whatever order it
  // wants because they have the same key (MPI then documents that these ties
  // will be broken according to these processes' rank in the old group).
  //
  // At least that's the theory. In practice, the MPI implementation where
  // this function was developed on does not seem to do that. (Bug report
  // is here: https://github.com/open-mpi/ompi/issues/8854)
  // We work around this by letting MPI_Comm_split_type choose whatever
  // rank it wants, and then reshuffle with MPI_Comm_split in a second
  // step -- not elegant, nor efficient, but seems to work:
  MPI_Comm shmem_group_communicator;
  {
    MPI_Comm shmem_group_communicator_temp;
    int      ierr = MPI_Comm_split_type(communicator,
                                   MPI_COMM_TYPE_SHARED,
                                   /* key */ 0,
                                   MPI_INFO_NULL,
                                   &shmem_group_communicator_temp);
    AssertThrowMPI(ierr);

    const int key =
      (Utilities::MPI::this_mpi_process(communicator) == root_process ? 0 : 1);
    ierr = MPI_Comm_split(shmem_group_communicator_temp,
                          /* color */ 0,
                          key,
                          &shmem_group_communicator);
    AssertThrowMPI(ierr);

    // Verify the explanation from above
    if (Utilities::MPI::this_mpi_process(communicator) == root_process)
      Assert(Utilities::MPI::this_mpi_process(shmem_group_communicator) == 0,
             ExcInternalError());

    // And get rid of the temporary communicator
    Utilities::MPI::free_communicator(shmem_group_communicator_temp);
  }
  const bool is_shmem_root =
    Utilities::MPI::this_mpi_process(shmem_group_communicator) == 0;

  // **** Step 2 ****
  // We then have to send the state of the current object from the
  // root process to one exemplar in each shmem group. To this end,
  // we create another subcommunicator that includes the ranks zero
  // of all shmem groups, and because of the trick above, we know
  // that this also includes the original root process.
  //
  // There are different ways of creating a "shmem_roots_communicator".
  // The conceptually easiest way is to create an MPI_Group that only
  // includes the shmem roots and then create a communicator from this
  // via MPI_Comm_create or MPI_Comm_create_group. The problem
  // with this is that we would have to exchange among all processes
  // which ones are shmem roots and which are not. This is awkward.
  //
  // A simpler way is to use MPI_Comm_split that uses "colors" to
  // indicate which sub-communicator each process wants to be in.
  // We use color=0 to indicate the group of shmem roots, and color=1
  // for all other processes -- the latter will simply not ever do
  // anything among themselves with the communicator so created.
  //
  // Using MPI_Comm_split has the additional benefit that, just as above,
  // we can choose where each rank will end up in shmem_roots_communicator.
  // We again set key=0 for the original root_process, and key=1 for all other
  // ranks; then, the global root becomes rank=0 on the
  // shmem_roots_communicator. We don't care how the other processes are
  // ordered.
  MPI_Comm shmem_roots_communicator;
  {
    const int key =
      (Utilities::MPI::this_mpi_process(communicator) == root_process ? 0 : 1);

    const int ierr = MPI_Comm_split(communicator,
                                    /*color=*/
                                    (is_shmem_root ? 0 : 1),
                                    key,
                                    &shmem_roots_communicator);
    AssertThrowMPI(ierr);

    // Again verify the explanation from above
    if (Utilities::MPI::this_mpi_process(communicator) == root_process)
      Assert(Utilities::MPI::this_mpi_process(shmem_roots_communicator) == 0,
             ExcInternalError());
  }

  const unsigned int shmem_roots_root_rank = 0;
  const bool         is_shmem_roots_root =
    (is_shmem_root && (Utilities::MPI::this_mpi_process(
                         shmem_roots_communicator) == shmem_roots_root_rank));

  // Now let the original root_process broadcast the current object to all
  // shmem roots. Because of the 'key' choice above (and as verified by the
  // assertion a few lines up), we know that the original root process -- the
  // one that has all of the data -- is rank 0 on shmem_roots_communicator.
  if (is_shmem_root)
    {
      if (std::is_trivially_copyable_v<T> == true)
        {
          // The data is trivially copyable, i.e., we can copy things directly
          // without having to go through the serialization/deserialization
          // machinery of Utilities::MPI::broadcast.
          //
          // In that case, first tell all of the other shmem roots how many
          // elements we will have to deal with, and let them resize their
          // (non-shared) arrays. (Note that this 'new_size' shadows the
          // variable of the same name at the top of this function; both hold
          // the same value, namely the root process's size().)
          const size_type new_size =
            Utilities::MPI::broadcast(shmem_roots_communicator,
                                      size(),
                                      shmem_roots_root_rank);
          if (is_shmem_roots_root == false)
            resize(new_size);

          // Then directly copy from the root process into these buffers
          int ierr = MPI_Bcast(elements.get(),
                               sizeof(T) * new_size,
                               MPI_CHAR,
                               shmem_roots_root_rank,
                               shmem_roots_communicator);
          AssertThrowMPI(ierr);
        }
      else
        {
          // The objects to be sent around are not "trivial", and so we have
          // to go through the serialization/deserialization machinery. On all
          // but the sending process, overwrite the current state with the
          // vector just broadcast.
          //
          // On the root rank, this would lead to resetting the 'entries'
          // pointer, which would trigger the deleter which would lead to a
          // deadlock. So we just send the result of the broadcast() call to
          // nirvana on the root process and keep our current state.
          if (Utilities::MPI::this_mpi_process(shmem_roots_communicator) == 0)
            std::ignore = Utilities::MPI::broadcast(shmem_roots_communicator,
                                                    *this,
                                                    shmem_roots_root_rank);
          else
            *this = Utilities::MPI::broadcast(shmem_roots_communicator,
                                              *this,
                                              shmem_roots_root_rank);
        }
    }

  // We no longer need the shmem roots communicator, so get rid of it
  Utilities::MPI::free_communicator(shmem_roots_communicator);


  // **** Step 3 ****
  // At this point, all shmem groups have one shmem root process that has
  // a copy of the data. This is the point where each shmem group should
  // establish a shmem area to put the data into. As mentioned above,
  // we know that the shmem roots are rank 0 in their respective
  // shmem_group_communicator.
  //
  // The process for all of this works as follows: While all processes in
  // the shmem group participate in the generation of the shmem memory window,
  // only the shmem root actually allocates any memory -- the rest just
  // allocate zero bytes of their own. We allocate space for exactly
  // size() elements (computed on the shmem_root that already has the data)
  // and add however many bytes are necessary so that we know that we can align
  // things to 64-byte boundaries. The worst case happens if the memory system
  // gives us a pointer to an address one byte past a desired alignment
  // boundary, and in that case aligning the memory will require us to waste the
  // first (align_by-1) bytes. So we have to ask for
  //   size() * sizeof(T) + (align_by - 1)
  // bytes.
  //
  // Before MPI 4.0, there was no way to specify that we want memory aligned to
  // a certain number of bytes. This is going to come back to bite us further
  // down below when we try to get a properly aligned pointer to our memory
  // region, see the commentary there. Starting with MPI 4.0, one can set a
  // flag in an MPI_Info structure that requests a desired alignment, so we do
  // this for forward compatibility; MPI implementations ignore flags they don't
  // know anything about, and so setting this flag is backward compatible also
  // to older MPI versions.
  MPI_Win        shmem_window;
  void          *base_ptr;
  const MPI_Aint align_by = 64;
  const MPI_Aint alloc_size =
    Utilities::MPI::broadcast(shmem_group_communicator,
                              (size() * sizeof(T) + (align_by - 1)),
                              0);

  {
    int ierr;

    MPI_Info mpi_info;
    ierr = MPI_Info_create(&mpi_info);
    AssertThrowMPI(ierr);
    ierr = MPI_Info_set(mpi_info,
                        "mpi_minimum_memory_alignment",
                        std::to_string(align_by).c_str());
    AssertThrowMPI(ierr);
    ierr = MPI_Win_allocate_shared((is_shmem_root ? alloc_size : 0),
                                   /* disp_unit = */ 1,
                                   mpi_info,
                                   shmem_group_communicator,
                                   &base_ptr,
                                   &shmem_window);
    AssertThrowMPI(ierr);

    ierr = MPI_Info_free(&mpi_info);
    AssertThrowMPI(ierr);
  }


  // **** Step 4 ****
  // The next step is to teach all non-shmem root processes what the pointer to
  // the array is that the shmem-root created. MPI has a nifty way for this
  // given that only a single process actually allocated memory in the window:
  // When calling MPI_Win_shared_query, the MPI documentation says that
  // "When rank is MPI_PROC_NULL, the pointer, disp_unit, and size returned are
  // the pointer, disp_unit, and size of the memory segment belonging the lowest
  // rank that specified size > 0. If all processes in the group attached to the
  // window specified size = 0, then the call returns size = 0 and a baseptr as
  // if MPI_ALLOC_MEM was called with size = 0."
  //
  // This will allow us to obtain the pointer to the shmem root's memory area,
  // which is the only one we care about. (None of the other processes have
  // even allocated any memory.)
  //
  // We don't need to do this on the shmem root process: This process has
  // already gotten its base_ptr correctly set above, and we can determine the
  // array size by just calling size().
  if (is_shmem_root == false)
    {
      int       disp_unit;
      MPI_Aint  alloc_size; // not actually used
      const int ierr = MPI_Win_shared_query(
        shmem_window, MPI_PROC_NULL, &alloc_size, &disp_unit, &base_ptr);
      AssertThrowMPI(ierr);

      // Make sure we actually got a pointer, and check that the disp_unit is
      // equal to 1 (as set above)
      Assert(base_ptr != nullptr, ExcInternalError());
      Assert(disp_unit == 1, ExcInternalError());
    }


  // **** Step 5 ****
  // Now that all processes know the address of the space that is visible to
  // everyone, we need to figure out whether it is properly aligned and if not,
  // find the next aligned address.
  //
  // std::align does that, but it also modifies its last two arguments. The
  // documentation of that function at
  // https://en.cppreference.com/w/cpp/memory/align is not entirely clear, but I
  // *think* that the following should do given that we do not use base_ptr and
  // available_space any further after the call to std::align.
  std::size_t available_space       = alloc_size;
  void       *base_ptr_backup       = base_ptr;
  T          *aligned_shmem_pointer = static_cast<T *>(
    std::align(align_by, new_size * sizeof(T), base_ptr, available_space));
  Assert(aligned_shmem_pointer != nullptr, ExcInternalError());

  // There is one step to guard against. It is *conceivable* that the base_ptr
  // we have previously obtained from MPI_Win_shared_query is mapped so
  // awkwardly into the different MPI processes' memory spaces that it is
  // aligned in one memory space, but not another. In that case, different
  // processes would align base_ptr differently, and adjust available_space
  // differently. We can check that by making sure that the max (or min) over
  // all processes is equal to every process's value. If that's not the case,
  // then the whole idea of aligning above is wrong and we need to rethink what
  // it means to align data in a shared memory space.
  //
  // One might be tempted to think that this is not how MPI implementations
  // actually arrange things. Alas, when developing this functionality in 2021,
  // this is really how at least OpenMPI ends up doing things. (This is with an
  // OpenMPI implementation of MPI 3.1, so it does not support the flag we set
  // in the MPI_Info structure above when allocating the memory window.) Indeed,
  // when running this code on three processes, one ends up with base_ptr values
  // of
  //     base_ptr=0x7f0842f02108
  //     base_ptr=0x7fc0a47881d0
  //     base_ptr=0x7f64872db108
  // which, most annoyingly, are aligned to 8 and 16 byte boundaries -- so there
  // is no common offset std::align could find that leads to a 64-byte
  // aligned memory address in all three memory spaces. That's a tremendous
  // nuisance and there is really nothing we can do about this other than just
  // fall back on the (unaligned) base_ptr in that case.
  if (Utilities::MPI::min(available_space, shmem_group_communicator) !=
      Utilities::MPI::max(available_space, shmem_group_communicator))
    aligned_shmem_pointer = static_cast<T *>(base_ptr_backup);


  // **** Step 6 ****
  // If this is the shmem root process, we need to copy the data into the
  // shared memory space.
  if (is_shmem_root)
    {
      if (std::is_trivially_copyable_v<T> == true)
        std::memcpy(aligned_shmem_pointer, elements.get(), sizeof(T) * size());
      else
        for (std::size_t i = 0; i < size(); ++i)
          new (&aligned_shmem_pointer[i]) T(std::move(elements[i]));
    }

  // Make sure that the shared memory host has copied the data before we try to
  // access it.
  const int ierr = MPI_Barrier(shmem_group_communicator);
  AssertThrowMPI(ierr);

  // **** Step 7 ****
  // Finally, we need to set the pointers of this object to what we just
  // learned. This also releases all memory that may have been in use
  // previously.
  //
  // The part that is a bit tricky is how to write the deleter of this
  // shared memory object. When we want to get rid of it, we need to
  // also release the MPI_Win object along with the shmem_group_communicator
  // object. That's because as long as we use the shared memory, we still need
  // to hold on to the MPI_Win object, and the MPI_Win object is based on the
  // communicator. (The former is definitely true, the latter is not quite clear
  // from the MPI documentation, but seems reasonable.) So we need to have a
  // deleter for the pointer that ensures that upon release of the memory, we
  // not only call the destructor of these memory elements (but only once, on
  // the shmem root!) but also destroy the MPI_Win and the communicator. All of
  // that is encapsulated in the following call where the deleter makes copies
  // of the arguments in the lambda capture.
  elements = decltype(elements)(aligned_shmem_pointer,
                                Deleter(this,
                                        is_shmem_root,
                                        aligned_shmem_pointer,
                                        shmem_group_communicator,
                                        shmem_window));

  // We then also have to set the other two pointers that define the state of
  // the current object. Note that the new buffer size is exactly as large as
  // necessary, i.e., can store size() elements, regardless of the number of
  // allocated elements in the original objects.
  used_elements_end      = elements.get() + new_size;
  allocated_elements_end = used_elements_end;

  // **** Consistency check ****
  // At this point, each process should have a copy of the data.
  // Verify this in some sort of round-about way
  if constexpr (running_in_debug_mode())
    {
      replicated_across_communicator      = true;
      const std::vector<char> packed_data = Utilities::pack(*this);
      const int               hash =
        std::accumulate(packed_data.begin(), packed_data.end(), int(0));
      Assert(Utilities::MPI::max(hash, communicator) == hash,
             ExcInternalError());
    }

#  else
  // No MPI -> nothing to replicate
  (void)communicator;
  (void)root_process;
#  endif
}



template <class T>
inline void
AlignedVector<T>::swap(AlignedVector<T> &vec) noexcept
{
  // Swap the data in the 'elements' objects. Then also make sure that
  // their respective deleter objects point to the right place: each Deleter
  // stores a pointer back to its owning AlignedVector, and after the swap
  // those back-pointers would otherwise refer to the wrong object.
  std::swap(elements, vec.elements);
  elements.get_deleter().reset_owning_object(this);
  vec.elements.get_deleter().reset_owning_object(&vec);

  // Now also swap the remaining members.
  std::swap(used_elements_end, vec.used_elements_end);
  std::swap(allocated_elements_end, vec.allocated_elements_end);
}



template <class T>
inline bool
AlignedVector<T>::empty() const
{
  // The vector is empty exactly when no elements are currently in use.
  return size() == 0;
}



template <class T>
inline typename AlignedVector<T>::size_type
AlignedVector<T>::size() const
{
  // Number of elements in use: the distance between the start of the array
  // and the one-past-the-last used element.
  return static_cast<size_type>(used_elements_end - elements.get());
}



template <class T>
inline typename AlignedVector<T>::size_type
AlignedVector<T>::capacity() const
{
  // Number of elements the current allocation can hold without reallocating.
  return static_cast<size_type>(allocated_elements_end - elements.get());
}



template <class T>
inline typename AlignedVector<T>::reference
AlignedVector<T>::operator[](const size_type index)
{
  // Bounds check in debug mode only.
  AssertIndexRange(index, size());
  return *(elements.get() + index);
}



template <class T>
inline typename AlignedVector<T>::const_reference
AlignedVector<T>::operator[](const size_type index) const
{
  // Bounds check in debug mode only.
  AssertIndexRange(index, size());
  return *(elements.get() + index);
}



template <typename T>
inline typename AlignedVector<T>::pointer
AlignedVector<T>::data()
{
  // Raw pointer to the beginning of the (aligned) storage.
  pointer const ptr = elements.get();
  return ptr;
}



template <typename T>
inline typename AlignedVector<T>::const_pointer
AlignedVector<T>::data() const
{
  // Raw pointer to the beginning of the (aligned) storage.
  const_pointer const ptr = elements.get();
  return ptr;
}



template <class T>
inline typename AlignedVector<T>::iterator
AlignedVector<T>::begin()
{
  // Iterator to the first element (equals end() for an empty vector).
  const iterator first = elements.get();
  return first;
}



template <class T>
inline typename AlignedVector<T>::iterator
AlignedVector<T>::end()
{
  // One-past-the-last used element.
  const iterator past_the_end = used_elements_end;
  return past_the_end;
}



template <class T>
inline typename AlignedVector<T>::const_iterator
AlignedVector<T>::begin() const
{
  // Iterator to the first element (equals end() for an empty vector).
  const const_iterator first = elements.get();
  return first;
}



template <class T>
inline typename AlignedVector<T>::const_iterator
AlignedVector<T>::end() const
{
  // One-past-the-last used element.
  const const_iterator past_the_end = used_elements_end;
  return past_the_end;
}



template <class T>
template <class Archive>
inline void
AlignedVector<T>::save(Archive &ar, const unsigned int) const
{
  // Store the number of used elements first, followed by the raw element
  // range -- but only if there is anything to store.
  const size_type vec_size = size();
  ar &vec_size;
  if (vec_size > 0)
    ar &boost::serialization::make_array(elements.get(), vec_size);
}



template <class T>
template <class Archive>
inline void
AlignedVector<T>::load(Archive &ar, const unsigned int)
{
  // Read back the element count written by save().
  size_type vec_size = 0;
  ar       &vec_size;

  // NOTE(review): if vec_size == 0, any previously stored elements are left
  // untouched rather than cleared -- presumably this function is only meant
  // to be called on freshly constructed objects; confirm against callers.
  if (vec_size > 0)
    {
      // Ensure the allocation is large enough, deserialize directly into the
      // array, and adjust the used-range marker.
      //
      // NOTE(review): reserve() only allocates raw memory, so make_array()
      // deserializes into storage beyond the current size() without first
      // constructing elements there. This looks like it assumes T can be
      // deserialized into raw storage -- confirm for non-trivial T.
      reserve(vec_size);
      ar &boost::serialization::make_array(elements.get(), vec_size);
      used_elements_end = elements.get() + vec_size;
    }
}



template <class T>
inline typename AlignedVector<T>::size_type
AlignedVector<T>::memory_consumption() const
{
  // Start with the footprint of the object itself, then add the dynamic
  // memory of every constructed element, and finally account for the
  // reserved-but-unused tail of the allocation.
  size_type memory = sizeof(*this);
  for (const T *element = elements.get(); element != used_elements_end;
       ++element)
    memory += dealii::MemoryConsumption::memory_consumption(*element);
  memory += sizeof(T) * (allocated_elements_end - used_elements_end);
  return memory;
}


#endif // ifndef DOXYGEN


/**
 * Relational operator == for AlignedVector: two vectors compare equal if
 * they have the same number of elements and all corresponding elements
 * compare equal.
 *
 * @relatesalso AlignedVector
 */
template <class T>
bool
operator==(const AlignedVector<T> &lhs, const AlignedVector<T> &rhs)
{
  if (lhs.size() != rhs.size())
    return false;

  // Walk both ranges in lockstep; the size check above guarantees the
  // right-hand iterator stays in range.
  typename AlignedVector<T>::const_iterator lit = lhs.begin();
  typename AlignedVector<T>::const_iterator rit = rhs.begin();
  while (lit != lhs.end())
    {
      if (*lit != *rit)
        return false;
      ++lit;
      ++rit;
    }
  return true;
}



/**
 * Relational operator != for AlignedVector, defined as the negation of
 * operator==.
 *
 * @relatesalso AlignedVector
 */
template <class T>
bool
operator!=(const AlignedVector<T> &lhs, const AlignedVector<T> &rhs)
{
  return !(lhs == rhs);
}


DEAL_II_NAMESPACE_CLOSE

#endif