1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project
// @Kokkos_Feature_Level_Required:12
// Unit test for hierarchical parallelism
// Create concurrent work hierarchically and verify if
// contributions of paticipating processing units corresponds to expected value
// Use a scratch pad memory for each team
#include <gtest/gtest.h>
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_EXPERIMENTAL_CXX20_MODULES
import kokkos.core;
#else
#include <Kokkos_Core.hpp>
#endif
namespace Test {
template <class ExecSpace>
struct ThreadScratch {
using policy_t = Kokkos::TeamPolicy<ExecSpace>;
using team_t = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
using data_t = Kokkos::View<size_t **, ExecSpace>;
using scratch_t = Kokkos::View<size_t *, ExecSpace,
Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
int sX, sY;
data_t v;
const int scratch_level = 1;
KOKKOS_FUNCTION
void operator()(const team_t &team) const {
// Allocate and use scratch pad memory
scratch_t v_S(team.thread_scratch(scratch_level), sY);
int n = team.league_rank();
for (int i = 0; i < sY; ++i) v_S(i) = 0;
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, sX), [&](const int m) {
// FIXME_SYCL This deadlocks in the subgroup_barrier when running on CUDA
// devices.
#ifdef KOKKOS_ENABLE_SYCL
for (int k = 0; k < sY; ++k) v_S(k) += sX * sY * n + sY * m + k;
#else
Kokkos::parallel_for(
Kokkos::ThreadVectorRange(team, sY),
[&](const int k) { v_S(k) += sX * sY * n + sY * m + k; });
#endif
});
team.team_barrier();
for (int i = 0; i < sY; ++i) {
v(n, team.team_rank()) += v_S(i);
}
}
void run(const int pN, const int sX_, const int sY_) {
sX = sX_;
sY = sY_;
int scratchSize = scratch_t::shmem_size(sY);
// So this works with deprecated code enabled:
policy_t policy =
policy_t(pN, Kokkos::AUTO, 1)
.set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize));
int max_team_size = policy.team_size_max(*this, Kokkos::ParallelForTag());
ASSERT_GT(max_team_size, 0);
v = data_t("Matrix", pN, max_team_size);
Kokkos::parallel_for(
"Test12a_ThreadScratch",
policy_t(pN, max_team_size)
.set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize)),
*this);
Kokkos::fence();
auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v);
size_t check = 0;
const size_t s = static_cast<size_t>(pN) * sX * sY;
for (int n = 0; n < pN; ++n)
for (int m = 0; m < max_team_size; ++m) {
check += v_H(n, m);
}
ASSERT_EQ(s * (s - 1) / 2, check);
}
};
KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH()
TEST(TEST_CATEGORY, IncrTest_12a_ThreadScratch) {
#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC
GTEST_SKIP() << "skipping since scratch memory is not yet implemented in the "
"OpenACC backend";
#endif
ThreadScratch<TEST_EXECSPACE> test;
// FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to
// pass in the Release and RelWithDebInfo builds. Does not need the team_size
// to be a multiple of 32 for the Debug builds.
#ifdef KOKKOS_ENABLE_OPENMPTARGET
test.run(1, 32, 9);
test.run(2, 64, 22);
test.run(14, 128, 321);
#else
test.run(1, 55, 9);
test.run(2, 4, 22);
test.run(14, 277, 321);
#endif
}
KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP()
} // namespace Test
|