File: overlapping_deepcopy.cpp

package info (click to toggle)
kokkos 5.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 15,140 kB
  • sloc: cpp: 225,293; sh: 1,250; python: 78; makefile: 16; fortran: 4; ansic: 2
file content (109 lines) | stat: -rw-r--r-- 4,104 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#include <Kokkos_Core.hpp>
#include <cstdio>
#include <typeinfo>
#include <cmath>
#include <iostream>
#include <Kokkos_Timer.hpp>

struct FillDevice {
  double value;
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a;
  FillDevice(
      const double& val,
      const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a)
      : value(val), a(d_a) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const int& i) const { a(i) = value; }
};

struct ComputeADevice {
  int iter;
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a;
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> b;
  ComputeADevice(
      const int& iter_,
      const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a,
      const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_b)
      : iter(iter_), a(d_a), b(d_b) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const int& i) const {
    for (int j = 1; j < iter; j++) {
      a(i) += std::pow(b(i), 1.0 + 1.0 / iter);
    }
  }
};

struct ComputeAHost {
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> a;
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> b;
  ComputeAHost(const Kokkos::View<double*, Kokkos::LayoutLeft,
                                  Kokkos::CudaHostPinnedSpace>& d_a,
               const Kokkos::View<double*, Kokkos::LayoutLeft,
                                  Kokkos::CudaHostPinnedSpace>& d_b)
      : a(d_a), b(d_b) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const int& i) const { a(i) += b(i); }
};

struct MergeDevice {
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a;
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> b;
  MergeDevice(
      const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a,
      const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_b)
      : a(d_a), b(d_b) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const int& i) const { a(i) += b(i); }
};

int main(int argc, char* argv[]) {
  int size = 100000000;
  Kokkos::initialize();
  int synch = std::stoi(argv[1]);
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_a("Device A",
                                                                   size);
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_b("Device B",
                                                                   size);
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_tmp(
      "Device tmp", size);
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> h_a(
      "Host A", size);
  Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> h_b(
      "Host B", size);

  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
                       FillDevice(0.0, d_a));
  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
                       FillDevice(1.3513, d_b));
  Kokkos::fence();
  Kokkos::Timer timer;
  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
                       ComputeADevice(20, d_a, d_b));

  if (synch == 1) Kokkos::deep_copy(Kokkos::OpenMP(), h_b, d_b);
  if (synch == 2) Kokkos::deep_copy(h_b, d_b);

  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, size),
                       [=](const int& i) { h_a(i) = 0.0; });
  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, size),
                       ComputeAHost(h_a, h_b));
  Kokkos::OpenMP().fence();
  if (synch == 1) Kokkos::deep_copy(Kokkos::OpenMP(), d_tmp, h_a);
  if (synch == 2) Kokkos::deep_copy(d_tmp, h_a);
  Kokkos::fence();

  std::cout << "Time " << timer.seconds() << std::endl;
  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size),
                       MergeDevice(d_a, d_tmp));

  Kokkos::deep_copy(h_a, d_a);
  std::cout << "h_a(0): " << h_a(0) << " ( Correct: 27.4154 )" << std::endl;
  Kokkos::finalize();
}