1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
#include <cstdio>
// These two View types are both 2-D arrays of double. However, they
// have different layouts in memory. left_type has "layout left,"
// which means "column major," the same as in Fortran, the BLAS, or
// LAPACK. right_type has "layout right," which means "row major,"
// the same as in C, C++, or Java.
using left_type = Kokkos::View<double**, Kokkos::LayoutLeft>;
using right_type = Kokkos::View<double**, Kokkos::LayoutRight>;
// This is a one-dimensional View, so the layout matters less.
// However, it still has a layout! Since its layout is not specified
// explicitly in the type, its layout is a function of the memory
// space. For example, the default Cuda layout is LayoutLeft, and the
// default Host layout is LayoutRight.
using view_type = Kokkos::View<double*>;
// parallel_for functor that fills the given View with some data. It
// expects to access the View by rows in parallel: each call i of
// operator() accesses a row.
template <class ViewType>
struct init_view {
ViewType a;
init_view(ViewType a_) : a(a_) {}
using size_type = typename ViewType::size_type;
KOKKOS_INLINE_FUNCTION
void operator()(const typename ViewType::size_type i) const {
// On CPUs this loop could be vectorized so j should do stride 1
// access on a for optimal performance. I.e. a should be LayoutRight.
// On GPUs threads should do coalesced loads and stores. That means
// that i should be the stride one access for optimal performance.
for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
a(i, j) = 1.0 * a.extent(0) * i + 1.0 * j;
}
}
};
// Compute a contraction of v1 and v2 into a:
//
// a(i) := sum_j (v1(i,j) * v2(j,i))
//
// Since the functor is templated on the ViewTypes itself it doesn't matter what
// there layouts are. That means you can use different layouts on different
// architectures.
template <class ViewType1, class ViewType2>
struct contraction {
view_type a;
typename ViewType1::const_type v1;
typename ViewType2::const_type v2;
contraction(view_type a_, ViewType1 v1_, ViewType2 v2_)
: a(a_), v1(v1_), v2(v2_) {}
using size_type = typename view_type::size_type;
// As with the initialization functor the performance of this operator
// depends on the architecture and the chosen data layouts.
// On CPUs optimal would be to vectorize the inner loop, so j should be the
// stride 1 access. That means v1 should be LayoutRight and v2 LayoutLeft.
// In order to get coalesced access on GPUs where i corresponds closely to
// the thread Index, i must be the stride 1 dimension. That means v1 should be
// LayoutLeft and v2 LayoutRight.
KOKKOS_INLINE_FUNCTION
void operator()(const view_type::size_type i) const {
for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
a(i) = v1(i, j) * v2(j, i);
}
}
};
// Compute a dot product. This is used for result verification.
struct dot {
view_type a;
dot(view_type a_) : a(a_) {}
using value_type = double; // Specify type for reduction target, lsum
KOKKOS_INLINE_FUNCTION
void operator()(const view_type::size_type i, double& lsum) const {
lsum += a(i) * a(i);
}
};
int main(int narg, char* arg[]) {
// When initializing Kokkos, you may pass in command-line arguments,
// just like with MPI_Init(). Kokkos reserves the right to remove
// arguments from the list that start with '--kokkos-'.
Kokkos::initialize(narg, arg);
{
int size = 10000;
view_type a("A", size);
// Define two views with LayoutLeft and LayoutRight.
left_type l("L", size, 10000);
right_type r("R", size, 10000);
// Initialize the data in the views.
Kokkos::parallel_for(size, init_view<left_type>(l));
Kokkos::parallel_for(size, init_view<right_type>(r));
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutLeft view for v1 and a LayoutRight view for v2. This should be
// fast on GPUs and slow on CPUs
Kokkos::Timer time1;
Kokkos::parallel_for(size, contraction<left_type, right_type>(a, l, r));
Kokkos::fence();
double sec1 = time1.seconds();
double sum1 = 0;
Kokkos::parallel_reduce(size, dot(a), sum1);
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutRight view for v1 and a LayoutLeft view for v2. This should be
// fast on CPUs and slow on GPUs
Kokkos::Timer time2;
Kokkos::parallel_for(size, contraction<right_type, left_type>(a, r, l));
Kokkos::fence();
double sec2 = time2.seconds();
double sum2 = 0;
Kokkos::parallel_reduce(size, dot(a), sum2);
// Kokkos' reductions are deterministic.
// The results should always be equal.
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n", sec1,
sec2, sum2 == sum1);
}
Kokkos::finalize();
}
|