File: test_sharedSpace.cpp

package info (click to toggle)
kokkos 4.7.01-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 16,636 kB
  • sloc: cpp: 223,676; sh: 2,446; makefile: 2,437; python: 91; fortran: 4; ansic: 2
file content (311 lines) | stat: -rw-r--r-- 11,657 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#include <Kokkos_Core.hpp>

#if defined _WIN32
#include <windows.h>
unsigned getBytesPerPage() {
  SYSTEM_INFO si;
  GetSystemInfo(&si);
  return si.dwPageSize;
}
#else  // unix/posix system
#include <unistd.h>
unsigned getBytesPerPage() { return sysconf(_SC_PAGESIZE); }
#endif

#include <algorithm>
#include <numeric>
#include <iostream>

namespace {
void printTimings(std::ostream& out, std::vector<double> const& tr,
                  size_t numBytes,
                  double threshold = (std::numeric_limits<double>::max)()) {
  out << "TimingResult contains " << tr.size() << " results:\n";
  for (auto it = tr.begin(); it != tr.end(); ++it) {
    out << "Duration of loop " << it - tr.begin() << " is " << *it
        << " seconds.";
    if ((*it) > threshold) {
      out << " Marked as page migation.";
    }
    out << " The transfer rate is "
        << (double)numBytes / std::pow(1000.0, 3) / (*it) *
               2.0  // as we read and write
        << " GB/s \n";
  }
}

template <typename T>
T computeMean(std::vector<T> const& results) {
  return std::accumulate(results.begin(), results.end(), T{}) / results.size();
}

template <typename ViewType>
class IncrementFunctor {
 private:
  using index_type = decltype(std::declval<ViewType>().size());
  ViewType view_;

 public:
  IncrementFunctor() = delete;

  explicit IncrementFunctor(ViewType view) : view_(view) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const index_type idx) const { ++view_(idx); }
};

// TIMING CAPTURED KERNEL
// PREMISE: This kernel should always be memory bound, as we are measuring
// memory access times. The compute load of an increment is small enough on
// current hardware but this could be different for new hardware. As we count
// the wall-clock time in the kernel, the core frequency of the device has to be
// at the maximum to guarantee accurate masurements.
template <typename ExecSpace, typename ViewType>
std::vector<double> incrementInLoop(ViewType& view,
                                    unsigned int numRepetitions) {
  using index_type = decltype(view.size());
  Kokkos::Timer timer;
  std::vector<double> results;

  Kokkos::fence();
  for (unsigned i = 0; i < numRepetitions; ++i) {
    IncrementFunctor<ViewType> func(view);
    timer.reset();
    Kokkos::parallel_for(
        "increment",
        Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<index_type>>{
            0, view.size()},
        func);
    Kokkos::fence();
    results.push_back(timer.seconds());
  }
  return results;
}

size_t getDeviceMemorySize() {
#if defined KOKKOS_ENABLE_CUDA
  return Kokkos::Cuda{}.cuda_device_prop().totalGlobalMem;
#elif defined KOKKOS_ENABLE_HIP
  return Kokkos::HIP{}.hip_device_prop().totalGlobalMem;
#elif defined KOKKOS_ENABLE_SYCL
  auto device = Kokkos::SYCL{}.sycl_queue().get_device();
  return device.get_info<sycl::info::device::global_mem_size>();
#else
#error \
    "The sharedMemory test is only defined for Kokkos::Cuda, Kokkos::HIP, and Kokkos::SYCL"
  return 0;
#endif
}

struct Arguments {
  unsigned int numRepetitions       = 10;
  unsigned int numWarmupRepetitions = 100;
  unsigned int numDeviceHostCycles  = 3;
  double fractionOfDeviceMemory     = 0.4;
  double threshold                  = 2.0;
};

void test_sharedSpace(Arguments args) {
  const unsigned int numRepetitions       = args.numRepetitions;
  const unsigned int numWarmupRepetitions = args.numWarmupRepetitions;
  const unsigned int numDeviceHostCycles  = args.numDeviceHostCycles;
  double fractionOfDeviceMemory           = args.fractionOfDeviceMemory;
  double threshold                        = args.threshold;
  size_t numBytes = fractionOfDeviceMemory * getDeviceMemorySize();
  size_t numPages = numBytes / getBytesPerPage();

  // ALLOCATION
  Kokkos::View<int*, Kokkos::SharedSpace> migratableData(
      "migratableData", numPages * getBytesPerPage() / sizeof(int));
  Kokkos::View<int*, Kokkos::DefaultExecutionSpace::memory_space> deviceData(
      "deviceData", numPages * getBytesPerPage() / sizeof(int));
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace::memory_space> hostData(
      "hostData", numPages * getBytesPerPage() / sizeof(int));
  Kokkos::fence();

  // WARMUP GPU
  incrementInLoop<Kokkos::DefaultExecutionSpace>(
      deviceData,
      numWarmupRepetitions);  // warming up gpu

  // GET DEVICE LOCAL TIMINGS
  auto deviceLocalResults = incrementInLoop<Kokkos::DefaultExecutionSpace>(
      deviceData, numRepetitions);

  // WARMUP HOST
  incrementInLoop<Kokkos::DefaultHostExecutionSpace>(
      hostData,
      numWarmupRepetitions);  // warming up host
  // GET HOST LOCAL TIMINGS
  auto hostLocalResults = incrementInLoop<Kokkos::DefaultHostExecutionSpace>(
      hostData, numRepetitions);

  // GET PAGE MIGRATING TIMINGS DATA
  std::vector<decltype(deviceLocalResults)> deviceResults{};
  std::vector<decltype(hostLocalResults)> hostResults{};
  for (unsigned i = 0; i < numDeviceHostCycles; ++i) {
    // WARMUP GPU
    incrementInLoop<Kokkos::DefaultExecutionSpace>(
        deviceData,
        numWarmupRepetitions);  // warming up gpu without touching the
                                // migratableData to get measurements of initial
                                // position
    // GET RESULTS DEVICE
    deviceResults.push_back(incrementInLoop<Kokkos::DefaultExecutionSpace>(
        migratableData, numRepetitions));

    // WARMUP HOST
    incrementInLoop<Kokkos::DefaultHostExecutionSpace>(
        hostData,
        numWarmupRepetitions);  // warming up host without touching the
                                // migratableData to get measurements of initial
                                // position
    // GET RESULTS HOST
    hostResults.push_back(incrementInLoop<Kokkos::DefaultHostExecutionSpace>(
        migratableData, numRepetitions));
  }

  // COMPUTE STATISTICS OF HOST AND DEVICE LOCAL KERNELS
  auto hostLocalMean   = computeMean(hostLocalResults);
  auto deviceLocalMean = computeMean(deviceLocalResults);

  // ASSESS PAGE MIGRATIONS
  bool initialPlacementOnDevice   = false;
  bool migratesOnEverySpaceAccess = true;
  bool migratesOnlyOncePerAccess  = true;

  for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) {
    unsigned int indicatedPageMigrationsDevice = std::count_if(
        deviceResults[cycle].begin(), deviceResults[cycle].end(),
        [&](auto const& val) { return val > (threshold * deviceLocalMean); });

    if (cycle == 0 && indicatedPageMigrationsDevice == 0)
      initialPlacementOnDevice = true;
    else {
      if (indicatedPageMigrationsDevice != 1) migratesOnlyOncePerAccess = false;
    }

    unsigned int indicatedPageMigrationsHost = std::count_if(
        hostResults[cycle].begin(), hostResults[cycle].end(),
        [&](auto const& val) { return val > (threshold * hostLocalMean); });

    if (indicatedPageMigrationsHost != 1) migratesOnlyOncePerAccess = false;

    if (cycle != 0 && indicatedPageMigrationsDevice != 1 &&
        indicatedPageMigrationsHost != 1)
      migratesOnEverySpaceAccess = false;
  }

  std::cout << "Page size as reported by os: " << getBytesPerPage()
            << " bytes \n";
  std::cout << "Allocating " << numPages
            << " pages of memory in pageMigratingMemorySpace.\n"
            << "This corresponds to " << fractionOfDeviceMemory * 100
            << " % of the device memory.\n"
            << "The view size is " << migratableData.size() << "\n";

  std::cout << "Behavior found: \n";
  std::cout << "Initial placement on device is " << initialPlacementOnDevice
            << "\n";
  std::cout << "Memory migrates on every space access is "
            << migratesOnEverySpaceAccess << "\n";
  std::cout << "Memory migrates only once per access "
            << migratesOnlyOncePerAccess << "\n\n";

  std::cout << "Please look at the following timings. A migration was "
               "marked detected if the time was larger than "
            << threshold * hostLocalMean << " for the host and "
            << threshold * deviceLocalMean << " for the device\n\n";

  std::cout << "#############TIMINGS WITH SHAREDSPACE##################\n";

  for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) {
    std::cout << "device timings of run " << cycle << ":\n";
    printTimings(std::cout, deviceResults[cycle], numBytes,
                 threshold * deviceLocalMean);
    std::cout << "host timings of run " << cycle << ":\n";
    printTimings(std::cout, hostResults[cycle], numBytes,
                 threshold * hostLocalMean);
  }
  std::cout << "\n#############TIMINGS WITH LOCALSPACE##################\n";
  std::cout << "Device local memory timings for comparison:\n";
  printTimings(std::cout, deviceLocalResults, numBytes);
  std::cout << "Host local memory timings for comparison:\n";
  printTimings(std::cout, hostLocalResults, numBytes);
}
}  // namespace

int main(int argc, char* argv[]) {
  static const char help_flag[]                   = "--help";
  static const char numRepetitions_flag[]         = "--numRepetitions=";
  static const char numWarmupRepetitions_flag[]   = "--numWarmupRepetitions=";
  static const char numDeviceHostCycles_flag[]    = "--numDeviceHostCycles=";
  static const char fractionOfDeviceMemory_flag[] = "--fractionOfDeviceMemory=";
  static const char threshold_flag[]              = "--threshold=";

  int ask_help = 0;
  Arguments args;

  for (int i = 1; i < argc; i++) {
    const char* const a = argv[i];

    if (!strncmp(a, help_flag, strlen(help_flag))) ask_help = 1;

    if (!strncmp(a, numRepetitions_flag, strlen(numRepetitions_flag)))
      args.numRepetitions = std::stoi(a + strlen(numRepetitions_flag));

    if (!strncmp(a, numWarmupRepetitions_flag,
                 strlen(numWarmupRepetitions_flag)))
      args.numWarmupRepetitions =
          std::stoi(a + strlen(numWarmupRepetitions_flag));

    if (!strncmp(a, numDeviceHostCycles_flag, strlen(numDeviceHostCycles_flag)))
      args.numDeviceHostCycles =
          std::stoi(a + strlen(numDeviceHostCycles_flag));

    if (!strncmp(a, fractionOfDeviceMemory_flag,
                 strlen(fractionOfDeviceMemory_flag)))
      args.fractionOfDeviceMemory =
          std::stod(a + strlen(fractionOfDeviceMemory_flag));

    if (!strncmp(a, threshold_flag, strlen(threshold_flag)))
      args.threshold = std::stod(a + strlen(threshold_flag));
  }

  if (ask_help) {
    std::cout << "command line options:"
              << " " << help_flag << " " << numRepetitions_flag << "##"
              << " " << numWarmupRepetitions_flag << "##"
              << " " << numDeviceHostCycles_flag << "##"
              << " " << fractionOfDeviceMemory_flag << "##"
              << " " << threshold_flag << "##"
              << " any given Kokkos args are passed to Kokkos::initialize ##"
              << std::endl;
    return 0;
  }

  Kokkos::initialize(argc, argv);
  if constexpr (Kokkos::has_shared_space)
    test_sharedSpace(args);
  else
    std::cout
        << "The used Kokkos configuration does not support SharedSpace \n";
  Kokkos::finalize();

  return 0;
}