File: test_inclusive_scan.cpp

package info (click to toggle)
kokkos 4.7.01-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 16,636 kB
  • sloc: cpp: 223,676; sh: 2,446; makefile: 2,437; python: 91; fortran: 4; ansic: 2
file content (196 lines) | stat: -rw-r--r-- 6,067 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <cstddef>
#include <cstdint>
#include <tuple>
#include <type_traits>

#include <benchmark/benchmark.h>

#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_EXPERIMENTAL_CXX20_MODULES
import kokkos.std_algorithms;
#else
#include <Kokkos_StdAlgorithms.hpp>
#endif
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
// FIXME: Benchmark_Context.hpp should be moved to a common location
#include "../../core/perf_test/Benchmark_Context.hpp"

namespace {

namespace KE = Kokkos::Experimental;

using ExecSpace     = Kokkos::DefaultExecutionSpace;
using HostExecSpace = Kokkos::DefaultHostExecutionSpace;

// Tag type used to request the inclusive scan variant that relies on the
// implicit, sum-based binary operation. It is only declared here (no
// definition is provided in this file); the code below uses it solely as a
// template argument that is compared with std::is_same_v.
template <class ValueType>
struct ImpSumBinOp;

// Binary functor that adds its two arguments.
// The call operator is annotated with KOKKOS_FUNCTION and returns the result
// of `a + b` converted to ValueType.
template <class ValueType>
struct SumFunctor {
  KOKKOS_FUNCTION
  ValueType operator()(const ValueType& a, const ValueType& b) const {
    ValueType total = a + b;
    return total;
  }
};

// Binary functor that selects the larger of its two arguments.
// Returns `a` only when `a > b` holds; in every other case (including when
// the comparison is false because the values are equal) `b` is returned.
template <class ValueType>
struct MaxFunctor {
  KOKKOS_FUNCTION
  ValueType operator()(const ValueType& a, const ValueType& b) const {
    return (a > b) ? a : b;
  }
};

// Helper to obtain the last element of a rank-1 view.
//
// The element is copied out of the view into a local scalar with
// Kokkos::deep_copy on a single-element subview.
//
// v: view to read from.
// Returns the value stored at index v.extent(0) - 1, or a value-initialized
// T when the view is empty (the index computation would otherwise wrap
// around and address an element outside the view).
template <class T>
T obtain_last_elem(const Kokkos::View<T*, ExecSpace>& v) {
  T last_element{};

  const std::size_t num_elems = v.extent(0);
  if (num_elems == 0) {
    return last_element;
  }

  Kokkos::deep_copy(last_element, Kokkos::subview(v, num_elems - 1));
  return last_element;
}

// Helper to allocate input and output views
template <class T>
auto prepare_views(const std::size_t kProbSize) {
  Kokkos::View<T*, ExecSpace> in{"input", kProbSize};
  Kokkos::View<T*, ExecSpace> out{"output", kProbSize};

  auto h_in = Kokkos::create_mirror_view(in);

  for (std::size_t i = 0; i < kProbSize; ++i) {
    h_in(i) = i;
  }

  Kokkos::deep_copy(in, h_in);

  return std::make_tuple(in, out, h_in);
}

// Perform the scan with a serial reference implementation and return its
// last element.
//
// Only the final value of the inclusive scan is needed by the caller, so a
// running accumulator is carried through the loop instead of allocating a
// full output view of the same length as the input.
//
// h_in:         rank-1 view that is indexed element by element from this
//               (host) function.
// scan_functor: binary operation; it is invoked as
//               scan_functor(current input element, running result).
// Returns the last element of the inclusive scan of h_in, or a
// value-initialized T when h_in is empty.
//
// FIXME: We have a GCC 8.4.0 based check in our ORNL Jenkins CI.
// std::inclusive_scan is available only from GCC 9.3. Since GCC 9.1, the
// std::inclusive_scan overload that takes an execution policy is available.
// However, there is an error with the <execution> header before GCC 10.1.
// Hence the hand-written loop below.
template <class T, class ViewType, class ScanFunctor = SumFunctor<T>>
T ref_scan(const ViewType& h_in, ScanFunctor scan_functor = ScanFunctor()) {
  const std::size_t view_size = h_in.extent(0);

  // Nothing to scan: avoid reading element 0 of an empty view.
  if (view_size == 0) {
    return T{};
  }

  T running = h_in(0);

  for (std::size_t i = 1; i < view_size; ++i) {
    running = scan_functor(h_in(i), running);
  }

  return running;
}

// Runs and times one Kokkos inclusive scan from `in` into `out`.
//
// When ScanFunctor is the ImpSumBinOp tag, the overload of
// KE::inclusive_scan without a binary operation is called; otherwise a
// default-constructed ScanFunctor<T> is passed as the binary operation.
// Note: the functor must be compatible with the element type of the input
// and output views.
//
// The execution space is fenced before the timer starts and again before the
// elapsed time is read. After timing, the last element of `out` is compared
// against res_check.
//
// Returns a tuple (elapsed seconds, whether the last element equals
// res_check).
template <class T, template <class> class ScanFunctor = ImpSumBinOp>
auto inclusive_scan(const Kokkos::View<T*, ExecSpace>& in,
                    const Kokkos::View<T*, ExecSpace>& out, T res_check) {
  constexpr bool use_implicit_sum =
      std::is_same_v<ScanFunctor<T>, ImpSumBinOp<T>>;

  const ExecSpace exec{};

  exec.fence();
  Kokkos::Timer stopwatch;

  if constexpr (use_implicit_sum) {
    KE::inclusive_scan("Default scan", exec, KE::cbegin(in), KE::cend(in),
                       KE::begin(out));
  } else {
    KE::inclusive_scan("Scan using a functor", exec, KE::cbegin(in),
                       KE::cend(in), KE::begin(out), ScanFunctor<T>());
  }

  exec.fence();
  const double elapsed = stopwatch.seconds();

  // Validation happens after the timed region.
  const bool matches = (obtain_last_elem(out) == res_check);

  return std::make_tuple(elapsed, matches);
}

// Benchmark: Inclusive Scan with default binary operation (sum)
// or user provided functor.
//
// The problem size is taken from state.range(0). A reference result is
// computed once up front with ref_scan (using the default SumFunctor when
// ScanFunctor is the ImpSumBinOp tag). Every benchmark iteration then runs
// the timed Kokkos scan, forwards the measured time to
// KokkosBenchmark::report_results, and validates the result.
//
// The "Passed" counter is 1 only if every iteration executed so far matched
// the reference result, so a failure in an earlier iteration is not hidden
// by a later successful one.
template <class T, template <class> class ScanFunctor = ImpSumBinOp>
void BM_inclusive_scan(benchmark::State& state) {
  const auto kProbSize = static_cast<std::size_t>(state.range(0));

  auto [in, out, h_in] = prepare_views<T>(kProbSize);

  T res_check{};

  // ImpSumBinOp is only a tag and cannot be instantiated, so the reference
  // scan falls back to its default functor in that case.
  if constexpr (std::is_same_v<ScanFunctor<T>, ImpSumBinOp<T>>) {
    res_check = ref_scan<T>(h_in);
  } else {
    res_check = ref_scan<T>(h_in, ScanFunctor<T>());
  }

  bool all_passed = true;

  for (auto _ : state) {
    // inclusive_scan dispatches on ScanFunctor itself, so the tag can be
    // forwarded unchanged.
    const auto [time_scan, passed] =
        inclusive_scan<T, ScanFunctor>(in, out, res_check);

    KokkosBenchmark::report_results(state, in, 2, time_scan);

    all_passed               = all_passed && passed;
    state.counters["Passed"] = all_passed;
  }
}

// Problem size handed to every registered benchmark via ->Arg(PROB_SIZE);
// BM_inclusive_scan reads it back through state.range(0) and uses it as the
// length of the input and output views.
constexpr std::size_t PROB_SIZE = 100'000'000;

}  // anonymous namespace

// FIXME: Add logic to pass a minimum warm-up time. The value should be
// configurable by the user, e.g. via the environment variable
// BENCHMARK_MIN_WARMUP_TIME.

// Benchmark registrations. Each one runs with the single argument PROB_SIZE
// and uses manual timing (UseManualTime); the per-iteration time is measured
// inside inclusive_scan and handed to KokkosBenchmark::report_results
// (presumably that helper forwards it to the benchmark state — confirm in
// Benchmark_Context.hpp).

// Scan with the implicit (default) sum operation.
BENCHMARK(BM_inclusive_scan<std::uint64_t>)->Arg(PROB_SIZE)->UseManualTime();
BENCHMARK(BM_inclusive_scan<std::int64_t>)->Arg(PROB_SIZE)->UseManualTime();
BENCHMARK(BM_inclusive_scan<double>)->Arg(PROB_SIZE)->UseManualTime();
// Scan with an explicitly supplied sum functor.
BENCHMARK(BM_inclusive_scan<std::uint64_t, SumFunctor>)
    ->Arg(PROB_SIZE)
    ->UseManualTime();
BENCHMARK(BM_inclusive_scan<std::int64_t, SumFunctor>)
    ->Arg(PROB_SIZE)
    ->UseManualTime();
BENCHMARK(BM_inclusive_scan<double, SumFunctor>)
    ->Arg(PROB_SIZE)
    ->UseManualTime();
// Scan with the max functor.
BENCHMARK(BM_inclusive_scan<std::uint64_t, MaxFunctor>)
    ->Arg(PROB_SIZE)
    ->UseManualTime();
BENCHMARK(BM_inclusive_scan<std::int64_t, MaxFunctor>)
    ->Arg(PROB_SIZE)
    ->UseManualTime();
BENCHMARK(BM_inclusive_scan<double, MaxFunctor>)
    ->Arg(PROB_SIZE)
    ->UseManualTime();