1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_EXPERIMENTAL_CXX20_MODULES
import kokkos.core;
import kokkos.random;
#else
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#endif
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
#include "PerfTest_Category.hpp"
#include <utility>
namespace Test {
template <class Scalar>
std::pair<double, Scalar> custom_reduction_test(int N, int R) {
Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
Kokkos::View<Scalar*> a("A", N);
Kokkos::fill_random(a, rand_pool, 1.0);
Scalar max;
int team_size = 32;
if (team_size > Kokkos::DefaultExecutionSpace().concurrency())
team_size = Kokkos::DefaultExecutionSpace().concurrency();
// Warm up
Kokkos::parallel_reduce(
Kokkos::TeamPolicy<>(N / 1024, team_size),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
Scalar& lmax) {
Scalar team_max = Scalar(0);
for (int rr = 0; rr < R; rr++) {
int i = team.league_rank();
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange(team, 32),
[&](const int& j, Scalar& thread_max) {
Scalar t_max = Scalar(0);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(team, 32),
[&](const int& k, Scalar& max_) {
const Scalar val = a((i * 32 + j) * 32 + k);
if (val > max_) max_ = val;
if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
},
Kokkos::Max<Scalar>(t_max));
if (t_max > thread_max) thread_max = t_max;
},
Kokkos::Max<Scalar>(team_max));
}
if (team_max > lmax) lmax = team_max;
},
Kokkos::Max<Scalar>(max));
// Timing
Kokkos::Timer timer;
Kokkos::parallel_reduce(
Kokkos::TeamPolicy<>(N / 1024, team_size),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
Scalar& lmax) {
Scalar team_max = Scalar(0);
for (int rr = 0; rr < R; rr++) {
int i = team.league_rank();
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange(team, 32),
[&](const int& j, Scalar& thread_max) {
Scalar t_max = Scalar(0);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(team, 32),
[&](const int& k, Scalar& max_) {
const Scalar val = a((i * 32 + j) * 32 + k);
if (val > max_) max_ = val;
if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
},
Kokkos::Max<Scalar>(t_max));
if (t_max > thread_max) thread_max = t_max;
},
Kokkos::Max<Scalar>(team_max));
}
if (team_max > lmax) lmax = team_max;
},
Kokkos::Max<Scalar>(max));
return std::make_pair(timer.seconds(), max);
}
/// Resolves the problem size N for a benchmark run.
///
/// When more than one command-line argument is available, the argument at
/// index 1 is parsed with std::stoi and returned; otherwise the benchmark's
/// range(0) value is used.
int get_N(benchmark::State& state) {
  const bool has_cli_override = Test::command_line_num_args() > 1;
  if (!has_cli_override) {
    return static_cast<int>(state.range(0));
  }
  return std::stoi(Test::command_line_arg(1));
}
/// Resolves the repetition count R for a benchmark run.
///
/// When more than two command-line arguments are available, the argument at
/// index 2 is parsed with std::stoi and returned; otherwise the benchmark's
/// range(1) value is used.
int get_R(benchmark::State& state) {
  const bool has_cli_override = Test::command_line_num_args() > 2;
  if (!has_cli_override) {
    return static_cast<int>(state.range(1));
  }
  return std::stoi(Test::command_line_arg(2));
}
/// Google Benchmark driver for custom_reduction_test.
///
/// Reads N and R through get_N / get_R, runs the reduction once per benchmark
/// iteration, reports the kernel's own timer reading as the manual iteration
/// time, and publishes two counters: a "GB/s" figure of merit and the reduced
/// maximum ("Max").
///
/// @tparam Scalar  element type that is reduced and used to size the data
/// @param state    benchmark state supplying the ranges and receiving results
template <class Scalar>
static void CustomReduction(benchmark::State& state) {
  const size_t N = get_N(state);
  const size_t R = get_R(state);

  for (auto _ : state) {
    // Run the kernel for the Scalar this benchmark was instantiated with, so
    // that the measured work matches the sizeof(Scalar) used below.
    const auto results =
        custom_reduction_test<Scalar>(static_cast<int>(N), static_cast<int>(R));

    // data processed in gigabytes (per second of the timed launch)
    const double data_processed =
        N * R * sizeof(Scalar) / results.first / 1'000'000'000;

    state.SetIterationTime(results.first);
    // NOTE(review): data_processed is already divided by the iteration time
    // and the counter is additionally flagged as an iteration-invariant rate;
    // confirm the reported unit is the intended one.
    state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
        data_processed, benchmark::Counter::kIsIterationInvariantRate);
    state.counters["Max"] = benchmark::Counter(results.second);
  }
}
// Register the double instantiation of CustomReduction. The two benchmark
// arguments are named "N" and "R" and default to 100'000 and 1'000.
// Manual time is used because the benchmark body reports its own measured
// duration through SetIterationTime.
BENCHMARK(CustomReduction<double>)
->ArgNames({"N", "R"})
->Args({100'000, 1'000})
->UseManualTime();
} // namespace Test
|