File: PerfTest_CustomReduction.cpp

package info (click to toggle)
kokkos 5.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 15,140 kB
  • sloc: cpp: 225,293; sh: 1,250; python: 78; makefile: 16; fortran: 4; ansic: 2
file content (124 lines) | stat: -rw-r--r-- 4,253 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_EXPERIMENTAL_CXX20_MODULES
import kokkos.core;
import kokkos.random;
#else
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#endif
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"
#include "PerfTest_Category.hpp"
#include <utility>

namespace Test {
template <class Scalar>
std::pair<double, Scalar> custom_reduction_test(int N, int R) {
  Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
  Kokkos::View<Scalar*> a("A", N);
  Kokkos::fill_random(a, rand_pool, 1.0);

  Scalar max;

  int team_size = 32;
  if (team_size > Kokkos::DefaultExecutionSpace().concurrency())
    team_size = Kokkos::DefaultExecutionSpace().concurrency();
  // Warm up
  Kokkos::parallel_reduce(
      Kokkos::TeamPolicy<>(N / 1024, team_size),
      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
                    Scalar& lmax) {
        Scalar team_max = Scalar(0);
        for (int rr = 0; rr < R; rr++) {
          int i = team.league_rank();
          Kokkos::parallel_reduce(
              Kokkos::TeamThreadRange(team, 32),
              [&](const int& j, Scalar& thread_max) {
                Scalar t_max = Scalar(0);
                Kokkos::parallel_reduce(
                    Kokkos::ThreadVectorRange(team, 32),
                    [&](const int& k, Scalar& max_) {
                      const Scalar val = a((i * 32 + j) * 32 + k);
                      if (val > max_) max_ = val;
                      if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
                    },
                    Kokkos::Max<Scalar>(t_max));
                if (t_max > thread_max) thread_max = t_max;
              },
              Kokkos::Max<Scalar>(team_max));
        }
        if (team_max > lmax) lmax = team_max;
      },
      Kokkos::Max<Scalar>(max));

  // Timing
  Kokkos::Timer timer;
  Kokkos::parallel_reduce(
      Kokkos::TeamPolicy<>(N / 1024, team_size),
      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
                    Scalar& lmax) {
        Scalar team_max = Scalar(0);
        for (int rr = 0; rr < R; rr++) {
          int i = team.league_rank();
          Kokkos::parallel_reduce(
              Kokkos::TeamThreadRange(team, 32),
              [&](const int& j, Scalar& thread_max) {
                Scalar t_max = Scalar(0);
                Kokkos::parallel_reduce(
                    Kokkos::ThreadVectorRange(team, 32),
                    [&](const int& k, Scalar& max_) {
                      const Scalar val = a((i * 32 + j) * 32 + k);
                      if (val > max_) max_ = val;
                      if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
                    },
                    Kokkos::Max<Scalar>(t_max));
                if (t_max > thread_max) thread_max = t_max;
              },
              Kokkos::Max<Scalar>(team_max));
        }
        if (team_max > lmax) lmax = team_max;
      },
      Kokkos::Max<Scalar>(max));

  return std::make_pair(timer.seconds(), max);
}

// Problem size N for the benchmark.
//
// When more than one command-line argument is available (per
// Test::command_line_num_args()), argument 1 is parsed with std::stoi and
// takes precedence; otherwise the benchmark's own range(0) is used.
int get_N(benchmark::State& state) {
  const bool has_override = Test::command_line_num_args() > 1;
  if (!has_override) {
    return static_cast<int>(state.range(0));
  }
  return std::stoi(Test::command_line_arg(1));
}

// Repeat count R for the benchmark.
//
// When more than two command-line arguments are available (per
// Test::command_line_num_args()), argument 2 is parsed with std::stoi and
// takes precedence; otherwise the benchmark's own range(1) is used.
int get_R(benchmark::State& state) {
  const bool has_override = Test::command_line_num_args() > 2;
  if (!has_override) {
    return static_cast<int>(state.range(1));
  }
  return std::stoi(Test::command_line_arg(2));
}

// Google Benchmark entry point for the custom (Kokkos::Max) reduction test.
//
// Reads N and R via get_N()/get_R(), runs custom_reduction_test<Scalar> once
// per benchmark iteration, reports the kernel's own timing as the manual
// iteration time, and publishes two counters: the "GB/s" figure of merit and
// the maximum value found ("Max").
template <class Scalar>
static void CustomReduction(benchmark::State& state) {
  // Kept as int: that is what get_N()/get_R() return and what
  // custom_reduction_test() takes, so no signed/unsigned round trip occurs.
  const int N = get_N(state);
  const int R = get_R(state);

  for (auto _ : state) {
    // Instantiate the test with the benchmark's Scalar so the measured kernel
    // matches the sizeof(Scalar) used for the byte count below.
    const auto results = custom_reduction_test<Scalar>(N, R);

    // data processed in gigabytes (per iteration). The division by elapsed
    // time is left to the counter: kIsIterationInvariantRate multiplies the
    // value by the iteration count and divides it by the total (manual) time.
    // Computed in double so the product cannot overflow an integer type.
    const double data_processed =
        static_cast<double>(N) * R * sizeof(Scalar) / 1'000'000'000;

    state.SetIterationTime(results.first);
    state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
        data_processed, benchmark::Counter::kIsIterationInvariantRate);
    state.counters["Max"] = benchmark::Counter(results.second);
  }
}

// Register the double-precision variant with default arguments N = 100'000 and
// R = 1'000. UseManualTime() makes the benchmark rely on the time passed to
// state.SetIterationTime() inside CustomReduction.
BENCHMARK(CustomReduction<double>)
    ->ArgNames({"N", "R"})
    ->Args({100'000, 1'000})
    ->UseManualTime();

}  // namespace Test