1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project
#include <Kokkos_Core.hpp>
#include <cstdio>
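//
// Launch bounds (parallel_reduce) example:
// 1. Start up Kokkos
// 2. Execute a parallel_reduce loop in the default execution space,
// using a functor to define the loop body; then execute the same
// reduction again through a RangePolicy that carries LaunchBounds,
// and check that both runs give the same result
// 3. Shut down Kokkos
//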
struct collision {
  // Reduction functor.
  // For each i, we generate 10 hashes, then look for and count collisions.
  // We use parallel_reduce to count the total collisions.
  // Note that we're only counting collisions among the 10 hashes generated
  // for one i, not collisions between different values of i.
  // This function was chosen as one that very simply can increase the
  // register count.
  using value_type = int;

  // Hashes the bytes of q, visited in memory order, and returns the result
  // as an int (which may be negative).
  //
  // The mixing step is carried out in unsigned arithmetic: the running
  // value quickly exceeds the range of int, and overflowing a signed
  // shift/addition is undefined behaviour, whereas unsigned arithmetic
  // wraps. The right shift is still applied to the signed value so the
  // result matches the plain signed formulation on two's-complement
  // targets.
  KOKKOS_INLINE_FUNCTION
  int hash(int q) const {
    // A simple hash by Justin Sobel
    // Thanks to Arash Partow (partow.net)
    const char* fourchars =
        (const char*)&q;  // NOLINT(cppcoreguidelines-pro-type-cstyle-cast)
    int state = 1315423911;
    for (int i = 0; i < static_cast<int>(sizeof(q)); fourchars++, i++) {
      const unsigned int shifted_up = static_cast<unsigned int>(state) << 5;
      // Promote the (possibly signed) char to int first so its value is
      // the same one the signed expression would have used.
      const unsigned int byte_term =
          static_cast<unsigned int>(static_cast<int>(*fourchars));
      const unsigned int shifted_down = static_cast<unsigned int>(state >> 2);
      state ^= static_cast<int>(shifted_up + byte_term + shifted_down);
    }
    return state;
  }

  // Loop body of the reduction.
  //   i    - index of the current iteration
  //   lsum - running collision count; incremented once for every pair of
  //          equal values among the 10 hashes derived from i
  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, int& lsum) const {
    // This is a silly function which generates 10 hashes
    // then checks for collisions.
    // Each hash is reduced with % 64; since % keeps the sign of the
    // dividend, the values lie strictly between -64 and 64.
    int a = hash(i) % 64;
    int b = hash(i * 3) % 64;
    int c = hash(i * 5) % 64;
    int d = hash(i * 7) % 64;
    int e = hash(i * 11) % 64;
    int f = hash(i * 17) % 64;
    int g = hash(i * 23) % 64;
    int h = hash(i * 29) % 64;
    int j = hash(i * 31) % 64;
    int k = hash(i * 37) % 64;

    // All 45 unordered pairs are compared explicitly with ten live
    // scalars; this is deliberately left unrolled (see the note on
    // register count at the top of the struct).
    if (a == b) lsum++;
    if (a == c) lsum++;
    if (a == d) lsum++;
    if (a == e) lsum++;
    if (a == f) lsum++;
    if (a == g) lsum++;
    if (a == h) lsum++;
    if (a == j) lsum++;
    if (a == k) lsum++;

    if (b == c) lsum++;
    if (b == d) lsum++;
    if (b == e) lsum++;
    if (b == f) lsum++;
    if (b == g) lsum++;
    if (b == h) lsum++;
    if (b == j) lsum++;
    if (b == k) lsum++;

    if (c == d) lsum++;
    if (c == e) lsum++;
    if (c == f) lsum++;
    if (c == g) lsum++;
    if (c == h) lsum++;
    if (c == j) lsum++;
    if (c == k) lsum++;

    if (d == e) lsum++;
    if (d == f) lsum++;
    if (d == g) lsum++;
    if (d == h) lsum++;
    if (d == j) lsum++;
    if (d == k) lsum++;

    if (e == f) lsum++;
    if (e == g) lsum++;
    if (e == h) lsum++;
    if (e == j) lsum++;
    if (e == k) lsum++;

    if (f == g) lsum++;
    if (f == h) lsum++;
    if (f == j) lsum++;
    if (f == k) lsum++;

    if (g == h) lsum++;
    if (g == j) lsum++;
    if (g == k) lsum++;

    if (h == j) lsum++;
    if (h == k) lsum++;

    if (j == k) lsum++;
  }
};
// Runs the collision-counting reduction twice, once with the default
// policy and once with a LaunchBounds-constrained RangePolicy, prints the
// first result and checks that both runs agree.
// Returns 0 when the two sums match and -1 otherwise. Kokkos is finalized
// on both paths.
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  const int n = 10000;
  // Compute and count hash collisions in
  // parallel, using Kokkos.
  // This is not really a useful algorithm, but it demonstrates the
  // LaunchBounds functionality
  int sum1 = 0;
  int sum2 = 0;
  // Without LaunchBounds, the kernel uses 56 registers
  Kokkos::parallel_reduce(n, collision(), sum1);
  // With LaunchBounds, we can reduce the register usage to 32
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<Kokkos::LaunchBounds<512, 4>>(0, n), collision(),
      sum2);
  printf(
      "Number of collisions, "
      "computed in parallel, is %i\n",
      sum1);
  // Record the outcome instead of returning early, so that the mismatch
  // path also reaches Kokkos::finalize() below.
  const bool results_match = (sum1 == sum2);
  if (!results_match) {
    printf("Uh-oh! Results do not match\n");
  }
  Kokkos::finalize();
  return results_match ? 0 : -1;
}
|