File: data_layouts.cpp

package info (click to toggle)
kokkos 5.0.1-1
links: PTS, VCS
area: main
in suites: experimental
size: 15,140 kB
sloc: cpp: 225,293; sh: 1,250; python: 78; makefile: 16; fortran: 4; ansic: 2
file content (136 lines) | stat: -rw-r--r-- 5,096 bytes
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
#include <cstdio>

// These two View types are both 2-D arrays of double.  However, they
// have different layouts in memory.  left_type has "layout left,"
// which means "column major," the same as in Fortran, the BLAS, or
// LAPACK.  right_type has "layout right," which means "row major,"
// the same as in C, C++, or Java.
using left_type  = Kokkos::View<double**, Kokkos::LayoutLeft>;
using right_type = Kokkos::View<double**, Kokkos::LayoutRight>;
// This is a one-dimensional View, so the layout matters less.
// However, it still has a layout!  Since its layout is not specified
// explicitly in the type, its layout is a function of the memory
// space.  For example, the default Cuda layout is LayoutLeft, and the
// default Host layout is LayoutRight.
using view_type = Kokkos::View<double*>;

// parallel_for functor that fills the given View with some data.  It
// expects to access the View by rows in parallel: each call i of
// operator() accesses a row.
template <class ViewType>
struct init_view {
  ViewType a;
  init_view(ViewType a_) : a(a_) {}

  using size_type = typename ViewType::size_type;

  KOKKOS_INLINE_FUNCTION
  void operator()(const typename ViewType::size_type i) const {
    // On CPUs this loop could be vectorized so j should do stride 1
    // access on a for optimal performance. I.e. a should be LayoutRight.
    // On GPUs threads should do coalesced loads and stores. That means
    // that i should be the stride one access for optimal performance.
    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
      a(i, j) = 1.0 * a.extent(0) * i + 1.0 * j;
    }
  }
};

// Compute a contraction of v1 and v2 into a:
//
//   a(i) := sum_j (v1(i,j) * v2(j,i))
//
// Since the functor is templated on the ViewTypes itself it doesn't matter what
// there layouts are. That means you can use different layouts on different
// architectures.
template <class ViewType1, class ViewType2>
struct contraction {
  view_type a;
  typename ViewType1::const_type v1;
  typename ViewType2::const_type v2;
  contraction(view_type a_, ViewType1 v1_, ViewType2 v2_)
      : a(a_), v1(v1_), v2(v2_) {}

  using size_type = typename view_type::size_type;

  // As with the initialization functor the performance of this operator
  // depends on the architecture and the chosen data layouts.
  // On CPUs optimal would be to vectorize the inner loop, so j should be the
  // stride 1 access. That means v1 should be LayoutRight and v2 LayoutLeft.
  // In order to get coalesced access on GPUs where i corresponds closely to
  // the thread Index, i must be the stride 1 dimension. That means v1 should be
  // LayoutLeft and v2 LayoutRight.
  KOKKOS_INLINE_FUNCTION
  void operator()(const view_type::size_type i) const {
    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
      a(i) = v1(i, j) * v2(j, i);
    }
  }
};

// Compute a dot product. This is used for result verification.
struct dot {
  view_type a;
  dot(view_type a_) : a(a_) {}
  using value_type = double;  // Specify type for reduction target, lsum
  KOKKOS_INLINE_FUNCTION
  void operator()(const view_type::size_type i, double& lsum) const {
    lsum += a(i) * a(i);
  }
};

int main(int narg, char* arg[]) {
  // When initializing Kokkos, you may pass in command-line arguments,
  // just like with MPI_Init().  Kokkos reserves the right to remove
  // arguments from the list that start with '--kokkos-'.
  Kokkos::initialize(narg, arg);

  {
    int size = 10000;
    view_type a("A", size);

    // Define two views with LayoutLeft and LayoutRight.
    left_type l("L", size, 10000);
    right_type r("R", size, 10000);

    // Initialize the data in the views.
    Kokkos::parallel_for(size, init_view<left_type>(l));
    Kokkos::parallel_for(size, init_view<right_type>(r));
    Kokkos::fence();

    // Measure time to execute the contraction kernel when giving it a
    // LayoutLeft view for v1 and a LayoutRight view for v2. This should be
    // fast on GPUs and slow on CPUs
    Kokkos::Timer time1;
    Kokkos::parallel_for(size, contraction<left_type, right_type>(a, l, r));
    Kokkos::fence();
    double sec1 = time1.seconds();

    double sum1 = 0;
    Kokkos::parallel_reduce(size, dot(a), sum1);
    Kokkos::fence();

    // Measure time to execute the contraction kernel when giving it a
    // LayoutRight view for v1 and a LayoutLeft view for v2. This should be
    // fast on CPUs and slow on GPUs
    Kokkos::Timer time2;
    Kokkos::parallel_for(size, contraction<right_type, left_type>(a, r, l));
    Kokkos::fence();
    double sec2 = time2.seconds();

    double sum2 = 0;
    Kokkos::parallel_reduce(size, dot(a), sum2);

    // Kokkos' reductions are deterministic.
    // The results should always be equal.
    printf("Result Left/Right %f Right/Left %f (equal result: %i)\n", sec1,
           sec2, sum2 == sum1);
  }

  Kokkos::finalize();
}