File: dual_view.cpp

package info (click to toggle)
kokkos 5.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 15,140 kB
  • sloc: cpp: 225,293; sh: 1,250; python: 78; makefile: 16; fortran: 4; ansic: 2
file content (184 lines) | stat: -rw-r--r-- 7,064 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <Kokkos_Timer.hpp>
#include <cstdio>
#include <cstdlib>

// DualView helps you manage data and computations that take place on
// two different memory spaces.  Examples include CUDA device memory
// and (CPU) host memory (currently implemented), or Intel Knights
// Landing MCDRAM and DRAM (not yet implemented).  For example, if you
// have ported only some parts of your application to run in CUDA,
// DualView can help manage moving data between the parts of your
// application that work best with CUDA, and the parts that work
// better on the CPU.
//
// A DualView takes the same template parameters as a View, but
// contains two Views: One that lives in the DualView's memory space,
// and one that lives in that memory space's host mirror space.  If
// both memory spaces are the same, then the two Views just alias one
// another.  This means that you can use DualView all the time, even
// when not running in a memory space like CUDA.  DualView's
// operations to help you manage memory take almost no time in that
// case.  This makes your code even more performance portable.

// One-dimensional DualView of doubles; main() creates the "Dest" and "Src"
// vectors with this type.
using view_type = Kokkos::DualView<double*>;
// Two-dimensional DualView of ints; main() creates the "Idx" table with this
// type and fills it with positions that are later used to index "Src".
using idx_type  = Kokkos::DualView<int**>;

// Functor that, for each row i of the index table, gathers the src entries
// named by that row, folds them into a single number and adds the result to
// dest(i).  It is templated on the execution space so the very same kernel
// can be instantiated for either side of the DualViews it is built from.
template <class ExecutionSpace>
struct localsum {
  // A public 'execution_space' alias tells Kokkos where this functor is meant
  // to run; it takes precedence over Kokkos' default execution space.
  using execution_space = ExecutionSpace;

  // Memory space whose Views this instantiation operates on: the DualView's
  // own memory space when ExecutionSpace is the default execution space, and
  // the DualView's host mirror space for any other execution space.
  using memory_space = std::conditional_t<
      std::is_same_v<ExecutionSpace, Kokkos::DefaultExecutionSpace>,
      idx_type::memory_space, idx_type::host_mirror_space>;

  // The three Views below are declared with the DualView's own aliases so
  // that they match the DualView's element type and layout, but live in the
  // memory space selected above.
  //
  // Index table, read-only: 'const_data_type' is the const-qualified form of
  // the DualView's first template argument.
  Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space>
      idx;
  // Output vector, writable: 'data_type' is the (non-const) first template
  // argument of the DualView.
  Kokkos::View<view_type::data_type, view_type::array_layout, memory_space>
      dest;
  // Input vector, read-only and tagged with the MemoryRandomAccess trait
  // because it is read through the indirection stored in idx.
  Kokkos::View<view_type::const_data_type, view_type::array_layout,
               memory_space, Kokkos::MemoryRandomAccess>
      src;

  // Builds the functor from three DualViews.
  //
  // The member Views are taken from the DualViews with view<memory_space>(),
  // which hands back the View that lives in the requested memory space.
  // The body then brings that side of every DualView up to date and records
  // that dest is about to be written there.
  localsum(idx_type dv_idx, view_type dv_dest, view_type dv_src)
      : idx(dv_idx.view<memory_space>()),
        dest(dv_dest.template view<memory_space>()),
        src(dv_src.template view<memory_space>()) {
    // sync<Space>() is a one-way copy into Space from the other side of the
    // DualView.  It is a no-op when the two sides are already consistent,
    // which the DualView tracks through the user's modify<Space>() calls.
    dv_idx.sync<memory_space>();
    dv_dest.template sync<memory_space>();
    dv_src.template sync<memory_space>();

    // The kernel writes dest in memory_space, so flag that side as modified;
    // a later sync to the other side will then actually copy the data.
    dv_dest.template modify<memory_space>();
  }

  // Kernel body for row i: accumulate the contribution of every column of
  // idx(i, :) and add the total to dest(i).
  KOKKOS_INLINE_FUNCTION
  void operator()(const int i) const {
    // Both extents are loop-invariant; read them once.
    const auto num_rows = idx.extent(0);
    const auto num_cols = idx.extent(1);
    const int col_end   = static_cast<int>(num_cols);

    double row_sum = 0.0;
    for (int col = 0; col < col_end; ++col) {
      const double v = src(idx(i, col));
      row_sum += v * v + 0.5 * (num_rows * v - num_cols * v);
    }
    dest(i) += row_sum;
  }
};

// Plain value type with three public doubles: q, m and their ratio.
// NOTE(review): q and m presumably stand for charge and mass -- confirm.
class ParticleType {
 public:
  double q;
  double m;
  double q_over_m;  // q / m, computed once at construction

  // Stores the two arguments and caches their quotient.  Because both
  // arguments are defaulted this also serves as the default constructor,
  // producing q = -1, m = 1.
  KOKKOS_INLINE_FUNCTION
  ParticleType(double q_ = -1, double m_ = 1)
      : q(q_), m(m_), q_over_m(q_ / m_) {}
};

using ParticleTypes = Kokkos::DualView<ParticleType[10]>;
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg, arg);

  // If View is non-trivial constructible type then add braces so it is out of
  // scope before Kokkos::finalize() call
  {
    ParticleTypes test("Test");
    Kokkos::fence();
    test.view_host()(0) = ParticleType(-1e4, 1);
    Kokkos::fence();

    int size = 1000000;

    // Create DualViews. This will allocate on both the device and its
    // host_mirror_device.
    idx_type idx("Idx", size, 64);
    view_type dest("Dest", size);
    view_type src("Src", size);

    srand(134231);

    // Get a reference to the host view of idx directly (equivalent to
    // idx.view<idx_type::host_mirror_space>() )
    idx_type::t_host h_idx = idx.view_host();
    using size_type        = view_type::size_type;
    for (int i = 0; i < size; ++i) {
      for (size_type j = 0; j < static_cast<size_type>(h_idx.extent(1)); ++j) {
        h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size;
      }
    }

    // Mark idx as modified on the host_mirror_space so that a
    // sync to the device will actually move data.  The sync happens in
    // the functor's constructor.
    idx.modify<idx_type::host_mirror_space>();

    // Run on the device.  This will cause a sync of idx to the device,
    // since it was marked as modified on the host.
    Kokkos::Timer timer;
    Kokkos::parallel_for(size,
                         localsum<view_type::execution_space>(idx, dest, src));
    Kokkos::fence();
    double sec1_dev = timer.seconds();

    timer.reset();
    Kokkos::parallel_for(size,
                         localsum<view_type::execution_space>(idx, dest, src));
    Kokkos::fence();
    double sec2_dev = timer.seconds();

    // Run on the host's default execution space (could be the same as device).
    // This will cause a sync back to the host of dest.  Note that if the Device
    // is CUDA, the data layout will not be optimal on host, so performance is
    // lower than what it would be for a pure host compilation.
    timer.reset();
    Kokkos::parallel_for(
        size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
    Kokkos::fence();
    double sec1_host = timer.seconds();

    timer.reset();
    Kokkos::parallel_for(
        size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src));
    Kokkos::fence();
    double sec2_host = timer.seconds();

    printf("Device Time with Sync: %f without Sync: %f \n", sec1_dev, sec2_dev);
    printf("Host   Time with Sync: %f without Sync: %f \n", sec1_host,
           sec2_host);
  }

  Kokkos::finalize();
}