File: data_layouts.cpp

package info (click to toggle)
kokkos 5.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 15,140 kB
  • sloc: cpp: 225,293; sh: 1,250; python: 78; makefile: 16; fortran: 4; ansic: 2
file content (136 lines) | stat: -rw-r--r-- 5,096 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
#include <cstdio>

// These two View types are both 2-D arrays of double.  However, they
// have different layouts in memory.  left_type has "layout left,"
// which means "column major," the same as in Fortran, the BLAS, or
// LAPACK.  right_type has "layout right," which means "row major,"
// the same as in C, C++, or Java.
using left_type  = Kokkos::View<double**, Kokkos::LayoutLeft>;
using right_type = Kokkos::View<double**, Kokkos::LayoutRight>;
// This is a one-dimensional View, so the layout matters less.
// However, it still has a layout!  Since its layout is not specified
// explicitly in the type, its layout is a function of the memory
// space.  For example, the default Cuda layout is LayoutLeft, and the
// default Host layout is LayoutRight.
using view_type = Kokkos::View<double*>;

// parallel_for functor that fills the given View with some data.  It
// expects to access the View by rows in parallel: each call i of
// operator() accesses a row.
template <class ViewType>
struct init_view {
  ViewType a;
  init_view(ViewType a_) : a(a_) {}

  using size_type = typename ViewType::size_type;

  KOKKOS_INLINE_FUNCTION
  void operator()(const typename ViewType::size_type i) const {
    // On CPUs this loop could be vectorized so j should do stride 1
    // access on a for optimal performance. I.e. a should be LayoutRight.
    // On GPUs threads should do coalesced loads and stores. That means
    // that i should be the stride one access for optimal performance.
    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
      a(i, j) = 1.0 * a.extent(0) * i + 1.0 * j;
    }
  }
};

// Compute a contraction of v1 and v2 into a:
//
//   a(i) := sum_j (v1(i,j) * v2(j,i))
//
// Since the functor is templated on the ViewTypes itself it doesn't matter what
// there layouts are. That means you can use different layouts on different
// architectures.
template <class ViewType1, class ViewType2>
struct contraction {
  view_type a;
  typename ViewType1::const_type v1;
  typename ViewType2::const_type v2;
  contraction(view_type a_, ViewType1 v1_, ViewType2 v2_)
      : a(a_), v1(v1_), v2(v2_) {}

  using size_type = typename view_type::size_type;

  // As with the initialization functor the performance of this operator
  // depends on the architecture and the chosen data layouts.
  // On CPUs optimal would be to vectorize the inner loop, so j should be the
  // stride 1 access. That means v1 should be LayoutRight and v2 LayoutLeft.
  // In order to get coalesced access on GPUs where i corresponds closely to
  // the thread Index, i must be the stride 1 dimension. That means v1 should be
  // LayoutLeft and v2 LayoutRight.
  KOKKOS_INLINE_FUNCTION
  void operator()(const view_type::size_type i) const {
    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
      a(i) = v1(i, j) * v2(j, i);
    }
  }
};

// Compute a dot product. This is used for result verification.
struct dot {
  view_type a;
  dot(view_type a_) : a(a_) {}
  using value_type = double;  // Specify type for reduction target, lsum
  KOKKOS_INLINE_FUNCTION
  void operator()(const view_type::size_type i, double& lsum) const {
    lsum += a(i) * a(i);
  }
};

int main(int narg, char* arg[]) {
  // When initializing Kokkos, you may pass in command-line arguments,
  // just like with MPI_Init().  Kokkos reserves the right to remove
  // arguments from the list that start with '--kokkos-'.
  Kokkos::initialize(narg, arg);

  {
    int size = 10000;
    view_type a("A", size);

    // Define two views with LayoutLeft and LayoutRight.
    left_type l("L", size, 10000);
    right_type r("R", size, 10000);

    // Initialize the data in the views.
    Kokkos::parallel_for(size, init_view<left_type>(l));
    Kokkos::parallel_for(size, init_view<right_type>(r));
    Kokkos::fence();

    // Measure time to execute the contraction kernel when giving it a
    // LayoutLeft view for v1 and a LayoutRight view for v2. This should be
    // fast on GPUs and slow on CPUs
    Kokkos::Timer time1;
    Kokkos::parallel_for(size, contraction<left_type, right_type>(a, l, r));
    Kokkos::fence();
    double sec1 = time1.seconds();

    double sum1 = 0;
    Kokkos::parallel_reduce(size, dot(a), sum1);
    Kokkos::fence();

    // Measure time to execute the contraction kernel when giving it a
    // LayoutRight view for v1 and a LayoutLeft view for v2. This should be
    // fast on CPUs and slow on GPUs
    Kokkos::Timer time2;
    Kokkos::parallel_for(size, contraction<right_type, left_type>(a, r, l));
    Kokkos::fence();
    double sec2 = time2.seconds();

    double sum2 = 0;
    Kokkos::parallel_reduce(size, dot(a), sum2);

    // Kokkos' reductions are deterministic.
    // The results should always be equal.
    printf("Result Left/Right %f Right/Left %f (equal result: %i)\n", sec1,
           sec2, sum2 == sum1);
  }

  Kokkos::finalize();
}