1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project
#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <Kokkos_Timer.hpp>
#include <cstdio>
#include <cstdlib>
// DualView helps you manage data and computations that take place on
// two different memory spaces. Examples include CUDA device memory
// and (CPU) host memory (currently implemented), or Intel Knights
// Landing MCDRAM and DRAM (not yet implemented). For example, if you
// have ported only some parts of you application to run in CUDA,
// DualView can help manage moving data between the parts of your
// application that work best with CUDA, and the parts that work
// better on the CPU.
//
// A DualView takes the same template parameters as a View, but
// contains two Views: One that lives in the DualView's memory space,
// and one that lives in that memory space's host mirror space. If
// both memory spaces are the same, then the two Views just alias one
// another. This means that you can use DualView all the time, even
// when not running in a memory space like CUDA. DualView's
// operations to help you manage memory take almost no time in that
// case. This makes your code even more performance portable.
using view_type = Kokkos::DualView<double*>;
using idx_type = Kokkos::DualView<int**>;
template <class ExecutionSpace>
struct localsum {
// If the functor has a public 'execution_space' alias, that defines
// the functor's execution space (where it runs in parallel). This
// overrides Kokkos' default execution space.
using execution_space = ExecutionSpace;
using memory_space = std::conditional_t<
std::is_same_v<ExecutionSpace, Kokkos::DefaultExecutionSpace>,
idx_type::memory_space, idx_type::host_mirror_space>;
// Get the view types on the particular device for which the functor
// is instantiated.
//
// "const_data_type" is an alias in View (and DualView) which is
// the const version of the first template parameter of the View.
// For example, the const_data_type version of double** is const
// double**.
Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space>
idx;
// "data_type" is an alias in ViewTraits (and DualView) which is the
// array version of the value(s) stored in the View.
Kokkos::View<view_type::data_type, view_type::array_layout, memory_space>
dest;
Kokkos::View<view_type::const_data_type, view_type::array_layout,
memory_space, Kokkos::MemoryRandomAccess>
src;
// Constructor takes DualViews, synchronizes them to the device,
// then marks them as modified on the device.
localsum(idx_type dv_idx, view_type dv_dest, view_type dv_src) {
// Extract the view on the correct Device (i.e., the correct
// memory space) from the DualView. DualView has a template
// method, view(), which is templated on the memory space. If the
// DualView has a View from that memory space, view() returns the
// View in that space.
idx = dv_idx.view<memory_space>();
dest = dv_dest.template view<memory_space>();
src = dv_src.template view<memory_space>();
// Synchronize the DualView to the correct Device.
//
// DualView's sync() method is templated on a memory space, and
// synchronizes the DualView in a one-way fashion to that memory
// space. "Synchronizing" means copying, from the other memory
// space to the Device memory space. sync() does _nothing_ if the
// Views on the two memory spaces are in sync. DualView
// determines this by the user manually marking one side or the
// other as modified; see the modify() call below.
dv_idx.sync<memory_space>();
dv_dest.template sync<memory_space>();
dv_src.template sync<memory_space>();
// Mark dest as modified on Device.
dv_dest.template modify<memory_space>();
}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
double tmp = 0.0;
for (int j = 0; j < (int)idx.extent(1); ++j) {
const double val = src(idx(i, j));
tmp += val * val + 0.5 * (idx.extent(0) * val - idx.extent(1) * val);
}
dest(i) += tmp;
}
};
class ParticleType {
public:
double q;
double m;
double q_over_m;
KOKKOS_INLINE_FUNCTION
ParticleType(double q_ = -1, double m_ = 1) : q(q_), m(m_), q_over_m(q / m) {}
protected:
};
using ParticleTypes = Kokkos::DualView<ParticleType[10]>;
int main(int narg, char* arg[]) {
Kokkos::initialize(narg, arg);
// If View is non-trivial constructible type then add braces so it is out of
// scope before Kokkos::finalize() call
{
ParticleTypes test("Test");
Kokkos::fence();
test.view_host()(0) = ParticleType(-1e4, 1);
Kokkos::fence();
int size = 1000000;
// Create DualViews. This will allocate on both the device and its
// host_mirror_device.
idx_type idx("Idx", size, 64);
view_type dest("Dest", size);
view_type src("Src", size);
srand(134231);
// Get a reference to the host view of idx directly (equivalent to
// idx.view<idx_type::host_mirror_space>() )
idx_type::t_host h_idx = idx.view_host();
using size_type = view_type::size_type;
for (int i = 0; i < size; ++i) {
for (size_type j = 0; j < static_cast<size_type>(h_idx.extent(1)); ++j) {
h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size;
}
}
// Mark idx as modified on the host_mirror_space so that a
// sync to the device will actually move data. The sync happens in
// the functor's constructor.
idx.modify<idx_type::host_mirror_space>();
// Run on the device. This will cause a sync of idx to the device,
// since it was marked as modified on the host.
Kokkos::Timer timer;
Kokkos::parallel_for(size,
localsum<view_type::execution_space>(idx, dest, src));
Kokkos::fence();
double sec1_dev = timer.seconds();
timer.reset();
Kokkos::parallel_for(size,
localsum<view_type::execution_space>(idx, dest, src));
Kokkos::fence();
double sec2_dev = timer.seconds();
// Run on the host's default execution space (could be the same as device).
// This will cause a sync back to the host of dest. Note that if the Device
// is CUDA, the data layout will not be optimal on host, so performance is
// lower than what it would be for a pure host compilation.
timer.reset();
Kokkos::parallel_for(
size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
Kokkos::fence();
double sec1_host = timer.seconds();
timer.reset();
Kokkos::parallel_for(
size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
Kokkos::fence();
double sec2_host = timer.seconds();
printf("Device Time with Sync: %f without Sync: %f \n", sec1_dev, sec2_dev);
printf("Host Time with Sync: %f without Sync: %f \n", sec1_host,
sec2_host);
}
Kokkos::finalize();
}
|