File: team_scan.cpp

package info (click to toggle)
kokkos 4.7.01-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 16,636 kB
  • sloc: cpp: 223,676; sh: 2,446; makefile: 2,437; python: 91; fortran: 4; ansic: 2
file content (124 lines) | stat: -rw-r--r-- 4,125 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <Kokkos_Timer.hpp>
#include <cstdio>
#include <cstdlib>

// Execution space the kernel in this file is dispatched on.
using Device = Kokkos::DefaultExecutionSpace;
// Execution space of Kokkos::HostSpace; used below as the tag for host-side
// DualView modify/sync calls.
using Host   = Kokkos::HostSpace::execution_space;

// Team policy on Device and the per-team handle type that the functor's
// operator() receives.
using team_policy = Kokkos::TeamPolicy<Device>;
using team_member = team_policy::member_type;

// Serves three purposes in this file: the number of distinct input values
// (data is generated as rand() % TEAM_SIZE), the extent of each histogram
// dimension, and the threads-per-team count that main() requests.
static const int TEAM_SIZE = 16;

struct find_2_tuples {
  int chunk_size;
  Kokkos::View<const int*> data;
  Kokkos::View<int**> histogram;

  find_2_tuples(int chunk_size_, Kokkos::DualView<int*> data_,
                Kokkos::DualView<int**> histogram_)
      : chunk_size(chunk_size_),
        data(data_.d_view),
        histogram(histogram_.d_view) {
    data_.sync<Device>();
    histogram_.sync<Device>();
    histogram_.modify<Device>();
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member& dev) const {
    Kokkos::View<int**, Kokkos::MemoryUnmanaged> l_histogram(
        dev.team_shmem(), TEAM_SIZE, TEAM_SIZE);
    Kokkos::View<int*, Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),
                                                       chunk_size + 1);

    const int i = dev.league_rank() * chunk_size;
    for (int j = dev.team_rank(); j < chunk_size + 1; j += dev.team_size())
      l_data(j) = data(i + j);

    for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
      for (int l = 0; l < TEAM_SIZE; l++) l_histogram(k, l) = 0;
    dev.team_barrier();

    for (int j = 0; j < chunk_size; j++) {
      for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
        for (int l = 0; l < TEAM_SIZE; l++) {
          if ((l_data(j) == k) && (l_data(j + 1) == l)) l_histogram(k, l)++;
        }
    }

    for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size())
      for (int l = 0; l < TEAM_SIZE; l++) {
        Kokkos::atomic_fetch_add(&histogram(k, l), l_histogram(k, l));
      }
    dev.team_barrier();
  }
  size_t team_shmem_size(int team_size) const {
    return Kokkos::View<int**, Kokkos::MemoryUnmanaged>::shmem_size(TEAM_SIZE,
                                                                    TEAM_SIZE) +
           Kokkos::View<int*, Kokkos::MemoryUnmanaged>::shmem_size(chunk_size +
                                                                   1);
  }
};

int main(int narg, char* args[]) {
  Kokkos::initialize(narg, args);

  {
    int chunk_size = 1024;
    int nchunks    = 100000;  // 1024*1024;
    Kokkos::DualView<int*> data("data", nchunks * chunk_size + 1);

    srand(1231093);

    for (int i = 0; i < (int)data.extent(0); i++) {
      data.h_view(i) = rand() % TEAM_SIZE;
    }
    data.modify<Host>();
    data.sync<Device>();

    Kokkos::DualView<int**> histogram("histogram", TEAM_SIZE, TEAM_SIZE);

    Kokkos::Timer timer;
    // threads/team is automatically limited to maximum supported by the device.
    int const concurrency = Device::execution_space().concurrency();
    int team_size         = TEAM_SIZE;
    if (team_size > concurrency) team_size = concurrency;
    Kokkos::parallel_for(team_policy(nchunks, team_size),
                         find_2_tuples(chunk_size, data, histogram));
    Kokkos::fence();
    double time = timer.seconds();

    histogram.sync<Host>();

    printf("Time: %f \n\n", time);
    int sum = 0;
    for (int k = 0; k < TEAM_SIZE; k++) {
      for (int l = 0; l < TEAM_SIZE; l++) {
        printf("%i ", histogram.h_view(k, l));
        sum += histogram.h_view(k, l);
      }
      printf("\n");
    }
    printf("Result: %i %i\n", sum, chunk_size * nchunks);
  }
  Kokkos::finalize();
}