File: uvm_example.cpp

package info (click to toggle)
kokkos 5.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 15,140 kB
  • sloc: cpp: 225,293; sh: 1,250; python: 78; makefile: 16; fortran: 4; ansic: 2
file content (108 lines) | stat: -rw-r--r-- 3,469 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <Kokkos_Timer.hpp>
#include <cstdio>
#include <cstdlib>

// View aliases used by the localsum functor and by main().
//   view_type: rank-1 View of double (double*)
//   idx_type : rank-2 View of int    (int**)
// When KOKKOS_ENABLE_CUDA is defined, both aliases use Kokkos::CudaUVMSpace as
// their memory space; otherwise both fall back to Kokkos::HostSpace.
#ifdef KOKKOS_ENABLE_CUDA
using view_type = Kokkos::View<double*, Kokkos::CudaUVMSpace>;
using idx_type  = Kokkos::View<int**, Kokkos::CudaUVMSpace>;
#else
using view_type = Kokkos::View<double*, Kokkos::HostSpace>;
using idx_type  = Kokkos::View<int**, Kokkos::HostSpace>;
#endif

// Gather-and-accumulate functor: for every row i of idx it reads src at each
// index stored in that row, folds the values into a single double and adds the
// result to dest(i).
template <class Device>
struct localsum {
  // Execution space the functor is dispatched to. It is taken from the
  // template argument and therefore overrides the DefaultExecutionSpace.
  using execution_space = Device;

  // View handles held by the functor. Their types are derived from the
  // file-level idx_type / view_type aliases:
  //   idx  - const-qualified variant of idx_type (read only inside the kernel)
  //   dest - output array, updated in place
  //   src  - view over const data with the same layout and device as
  //          view_type, tagged with the Kokkos::MemoryRandomAccess trait
  idx_type::const_type idx;
  view_type dest;
  Kokkos::View<view_type::const_data_type, view_type::array_layout,
               view_type::device_type, Kokkos::MemoryRandomAccess>
      src;

  // Stores the three views in the functor's members.
  localsum(idx_type idx_, view_type dest_, view_type src_)
      : idx(idx_), dest(dest_), src(src_) {}

  // Processes one row: accumulates over all columns of idx(row, *) and adds
  // the total to dest(row).
  KOKKOS_INLINE_FUNCTION
  void operator()(int row) const {
    // Column count does not change during the call, so read it once.
    const int num_cols = static_cast<int>(idx.extent(1));

    double acc = 0.0;
    for (int col = 0; col < num_cols; ++col) {
      const double v = src(idx(row, col));
      acc += v * v + 0.5 * (idx.extent(0) * v - idx.extent(1) * v);
    }
    dest(row) += acc;
  }
};

// Times the localsum kernel twice in the view's execution space and twice in
// the host execution space, then prints the four wall-clock times. In each
// pair, the first run is the one the comments below expect to include data
// migration.
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg, arg);

  // The inner scope makes sure every View below is released before
  // Kokkos::finalize() is called.
  {
    int num_rows = 1000000;

    // Allocate the index table (num_rows x 64) and the two data arrays.
    idx_type idx("Idx", num_rows, 64);
    view_type dest("Dest", num_rows);
    view_type src("Src", num_rows);

    // Fixed seed so the generated index pattern is reproducible.
    std::srand(134231);

    Kokkos::fence();

    // With UVM, Cuda views can be written directly from host code.
    // Each entry points at a row within +/-250 of its own row, wrapped into
    // [0, num_rows).
    const int num_cols = int(idx.extent(1));
    for (int row = 0; row < num_rows; ++row) {
      for (int col = 0; col < num_cols; ++col) {
        const int jitter = std::rand() % 500 - 250;
        idx(row, col)    = (num_rows + row + jitter) % num_rows;
      }
    }

    Kokkos::fence();

    // --- Device, first pass -------------------------------------------------
    // idx was just modified on the host, so this launch triggers a sync of
    // idx to the device.
    Kokkos::Timer timer;
    Kokkos::parallel_for(num_rows,
                         localsum<view_type::execution_space>(idx, dest, src));
    Kokkos::fence();
    double dev_with_sync = timer.seconds();

    // --- Device, second pass ------------------------------------------------
    // Nothing has been touched on the host in between, so no data transfer
    // happens here.
    timer.reset();
    Kokkos::parallel_for(num_rows,
                         localsum<view_type::execution_space>(idx, dest, src));
    Kokkos::fence();
    double dev_without_sync = timer.seconds();

    // --- Host, first pass ---------------------------------------------------
    // dest was changed on the device, so it is synced back to the host here.
    // Compare this runtime with the dual_view example: dest is copied back in
    // 4k blocks as they are first accessed during the parallel_for. Because of
    // the latency of each memcpy, this yields a lower effective bandwidth than
    // a manual copy via dual views.
    timer.reset();
    Kokkos::parallel_for(
        num_rows, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
    Kokkos::fence();
    double host_with_sync = timer.seconds();

    // --- Host, second pass --------------------------------------------------
    // No data transfers happen now.
    timer.reset();
    Kokkos::parallel_for(
        num_rows, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
    Kokkos::fence();
    double host_without_sync = timer.seconds();

    std::printf("Device Time with Sync: %e without Sync: %e \n", dev_with_sync,
                dev_without_sync);
    std::printf("Host   Time with Sync: %e without Sync: %e \n", host_with_sync,
                host_without_sync);
  }

  Kokkos::finalize();
}