File: TestSharedSpace.cpp

package info (click to toggle)
kokkos 4.7.01-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 16,636 kB
  • sloc: cpp: 223,676; sh: 2,446; makefile: 2,437; python: 91; fortran: 4; ansic: 2
file content (239 lines) | stat: -rw-r--r-- 8,797 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>

#if defined(_WIN32)
#include <windows.h>
unsigned getBytesPerPage() {
  SYSTEM_INFO si;
  GetSystemInfo(&si);
  return si.dwPageSize;
}

#else  // unix/posix system
#include <unistd.h>
// Returns the page size in bytes as reported by sysconf(_SC_PAGESIZE).
// sysconf signals failure by returning -1; converting that straight to
// unsigned would produce a huge bogus page size, so any non-positive result
// falls back to 4096 bytes instead.
unsigned getBytesPerPage() {
  const long pageSize = sysconf(_SC_PAGESIZE);
  return pageSize > 0 ? static_cast<unsigned>(pageSize) : 4096u;
}
#endif

#include <algorithm>
#include <numeric>
#include <iostream>

namespace {
// Writes one line per entry of `tr` to `out`, preceded by a header line with
// the number of entries. Each line shows the entry's position and its value in
// clock cycles; entries strictly greater than `threshold` are additionally
// tagged with "Migration assumed.". With the default threshold (the largest
// uint64_t value) no entry can be tagged.
void printTimings(std::ostream& out, std::vector<uint64_t> const& tr,
                  uint64_t threshold = (std::numeric_limits<uint64_t>::max)()) {
  out << "TimingResult contains " << tr.size() << " results:\n";

  for (decltype(tr.size()) loop = 0; loop < tr.size(); ++loop) {
    const uint64_t cycles = tr[loop];
    out << "Duration of loop " << loop << " is " << cycles
        << " clock cycles. ";
    if (cycles > threshold) {
      out << "Migration assumed.";
    }
    out << "\n";
  }
}

// Returns the arithmetic mean of `results`. The sum and the division are both
// carried out in T, so for integral T the result is truncated toward zero.
// An empty vector yields T{} instead of dividing by zero.
template <typename T>
T computeMean(std::vector<T> const& results) {
  if (results.empty()) return T{};

  // Convert the element count to T before dividing: dividing a signed sum by
  // the unsigned size() would first convert a negative sum to a huge unsigned
  // value and produce a wrong mean.
  return std::accumulate(results.begin(), results.end(), T{}) /
         static_cast<T>(results.size());
}

// Reduction functor that increments one entry of the wrapped view per call
// and adds the number of clock tics spent on that increment to the reduction
// value.
template <typename ViewType>
class IncrementFunctor {
 private:
  // Index type is whatever ViewType::size() returns.
  using index_type = decltype(std::declval<ViewType>().size());
  ViewType view_;

 public:
  // A functor without a view to operate on is not meaningful.
  IncrementFunctor() = delete;

  explicit IncrementFunctor(ViewType view) : view_(view) {}

  // Increments view_(idx) and accumulates into `clockTics` the difference
  // between the clock_tic readings taken right after and right before the
  // increment.
  KOKKOS_INLINE_FUNCTION
  void operator()(const index_type idx, uint64_t& clockTics) const {
    const uint64_t ticsBefore = Kokkos::Impl::clock_tic();
    ++view_(idx);
    const uint64_t ticsElapsed = Kokkos::Impl::clock_tic() - ticsBefore;
    clockTics += ticsElapsed;
  }
};

// TIMING CAPTURED KERNEL
// PREMISE: This kernel should always be memory bound, as we are measuring
// memory access times. The compute load of an increment is small enough on
// current hardware but this could be different for new hardware. As we count
// the clocks in the kernel, the core frequency of the device has to be fast
// enough to guarantee that the kernel stays memory bound.
// Launches `numRepetitions` parallel_reduce kernels in ExecSpace, each of
// which increments every entry of `view` once via IncrementFunctor and sums
// the clock tics measured around the individual increments.
//
// Returns one value per repetition: the summed clock tics divided by the
// number of view entries (integer division). For a view without entries the
// recorded value is 0.
template <typename ExecSpace, typename ViewType>
std::vector<uint64_t> incrementInLoop(ViewType& view,
                                      unsigned int numRepetitions) {
  using index_type            = decltype(view.size());
  const index_type numEntries = view.size();

  std::vector<uint64_t> results;
  results.reserve(numRepetitions);

  // Make sure previously submitted work is finished before timing starts.
  Kokkos::fence();
  for (unsigned i = 0; i < numRepetitions; ++i) {
    // Start from a defined value instead of relying on parallel_reduce to
    // overwrite an indeterminate one.
    uint64_t sum_clockTics = 0;
    IncrementFunctor<ViewType> func(view);
    Kokkos::parallel_reduce(
        "increment",
        Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<index_type>>{
            0, numEntries},
        func, sum_clockTics);
    // Wait for the kernel so that sum_clockTics holds the final result.
    Kokkos::fence();
    // Guard the average against an empty view (division by zero).
    results.push_back(numEntries > 0 ? sum_clockTics / numEntries
                                     : uint64_t{0});
  }
  return results;
}

// Verifies that data in Kokkos::SharedSpace can be incremented from both the
// default device and the default host execution space, and that - apart from
// the first access after switching execution spaces, which is not evaluated -
// the measured per-entry clock tics stay below `threshold` times the mean
// measured for memory local to the respective execution space.
//
// The whole measurement is attempted up to three times; the test passes as
// soon as one attempt satisfies the criterion. If none does, the timings of
// the last attempt are printed before the test fails.
TEST(defaultdevicetype, shared_space) {
  ASSERT_TRUE(Kokkos::has_shared_space);

  if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace,
                               Kokkos::DefaultHostExecutionSpace>)
    GTEST_SKIP() << "Skipping as host and device are the same space";

#if defined(KOKKOS_ARCH_AMD_GPU) && defined(KOKKOS_ENABLE_HIP)
  if (!Kokkos::SharedSpace().impl_hip_driver_check_page_migration())
    GTEST_SKIP()
        << "skipping because specified arch does not support page migration";
#endif
#if defined(KOKKOS_ENABLE_SYCL) &&      \
    (!defined(KOKKOS_ARCH_INTEL_GPU) || \
     !defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE))
  GTEST_SKIP() << "skipping because clock_tic is only defined for sycl+intel "
                  "gpu and with rdc support";
#endif
#if defined(KOKKOS_ENABLE_DEBUG)
  GTEST_SKIP()
      << "skipping due to spurious failures when compiling in Debug mode";
#endif

  const unsigned int numRepetitions      = 10;
  const unsigned int numDeviceHostCycles = 3;
  const double threshold                 = 1.5;
  const size_t numPages                  = 100;
  const size_t numBytes                  = numPages * getBytesPerPage();
  const size_t numEntries                = numBytes / sizeof(int);

  using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace;
  using HostExecutionSpace   = Kokkos::DefaultHostExecutionSpace;

  std::vector<uint64_t> deviceLocalTimings{};
  std::vector<uint64_t> hostLocalTimings{};
  std::vector<decltype(deviceLocalTimings)> deviceSharedTimings{};
  std::vector<decltype(hostLocalTimings)> hostSharedTimings{};

  // Initialized so they never hold an indeterminate value, even though the
  // loop below always assigns them before they are read.
  int64_t deviceLocalMean = 0;
  int64_t hostLocalMean   = 0;

  int testRepetition               = 0;
  bool passed                      = false;
  bool fastAsLocalOnRepeatedAccess = false;

  // True if every timing except the first one is below threshold * localMean.
  // The first entry is skipped because it is the first access after the data
  // was last touched from the other execution space.
  const auto repeatedAccessIsFast = [&](std::vector<uint64_t> const& timings,
                                        const int64_t localMean) {
    return std::all_of(std::next(timings.begin()), timings.end(),
                       [&](const uint64_t timing) {
                         return timing < threshold * localMean;
                       });
  };

  while (!passed && testRepetition < 3) {
    // ALLOCATION
    Kokkos::View<int*, Kokkos::SharedSpace> sharedData("sharedData",
                                                       numEntries);
    Kokkos::View<int*, DeviceExecutionSpace::memory_space> deviceData(
        "deviceData", numEntries);
    Kokkos::View<int*, HostExecutionSpace::memory_space> hostData("hostData",
                                                                  numEntries);
    Kokkos::fence();

    // GET DEFAULT EXECSPACE LOCAL TIMINGS
    deviceLocalTimings =
        incrementInLoop<DeviceExecutionSpace>(deviceData, numRepetitions);

    // GET DEFAULT HOSTEXECSPACE LOCAL TIMINGS
    hostLocalTimings =
        incrementInLoop<HostExecutionSpace>(hostData, numRepetitions);

    // GET PAGE MIGRATING TIMINGS DATA
    // Alternate between device and host so that every batch starts with an
    // access from the other execution space.
    deviceSharedTimings.clear();
    hostSharedTimings.clear();
    for (unsigned i = 0; i < numDeviceHostCycles; ++i) {
      // GET RESULTS DEVICE
      deviceSharedTimings.push_back(
          incrementInLoop<DeviceExecutionSpace>(sharedData, numRepetitions));

      // GET RESULTS HOST
      hostSharedTimings.push_back(
          incrementInLoop<HostExecutionSpace>(sharedData, numRepetitions));
    }

    // COMPUTE STATISTICS OF HOST AND DEVICE LOCAL KERNELS
    deviceLocalMean = computeMean(deviceLocalTimings);
    hostLocalMean   = computeMean(hostLocalTimings);

    // ASSESS RESULTS
    fastAsLocalOnRepeatedAccess = true;

    for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) {
      if (!repeatedAccessIsFast(deviceSharedTimings[cycle], deviceLocalMean))
        fastAsLocalOnRepeatedAccess = false;

      if (!repeatedAccessIsFast(hostSharedTimings[cycle], hostLocalMean))
        fastAsLocalOnRepeatedAccess = false;
    }

    // CHECK IF PASSED
    passed = fastAsLocalOnRepeatedAccess;
    ++testRepetition;
  }

  // PRINT IF NOT PASSED
  if (!passed) {
    std::cout << "Page size as reported by os: " << getBytesPerPage()
              << " bytes \n";
    std::cout << "Allocating " << numPages
              << " pages of memory in SharedSpace.\n";

    std::cout << "Behavior found: \n";
    std::cout << "SharedSpace is as fast as local space on repeated access: "
              << fastAsLocalOnRepeatedAccess << ", we expect true \n\n";

    std::cout
        << "Please look at the following timings. The first access in a "
           "different ExecutionSpace is not evaluated for the test. As we "
           "expect the memory to migrate during the first access it might have "
           "a higher cycle count than subsequent accesses, depending on your "
           "hardware. If the cycles are more than "
        << threshold
        << " times the cycles for pure local memory access, we assume a page "
           "migration happened.\n\n";

    // printTimings takes an integral threshold, so the scaled means are
    // converted explicitly.
    const auto deviceThreshold =
        static_cast<uint64_t>(threshold * deviceLocalMean);
    const auto hostThreshold = static_cast<uint64_t>(threshold * hostLocalMean);

    std::cout << "################SHARED SPACE####################\n";
    for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) {
      std::cout << "DeviceExecutionSpace timings of run " << cycle << ":\n";
      printTimings(std::cout, deviceSharedTimings[cycle], deviceThreshold);
      std::cout << "HostExecutionSpace timings of run " << cycle << ":\n";
      printTimings(std::cout, hostSharedTimings[cycle], hostThreshold);
    }
    std::cout << "################LOCAL SPACE####################\n";
    printTimings(std::cout, deviceLocalTimings);
    printTimings(std::cout, hostLocalTimings);
  }
  ASSERT_TRUE(passed);
}
}  // namespace