File: test_fn_cartesian_gpu.cu

package info (click to toggle)
gridtools 2.3.9-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 29,480 kB
  • sloc: cpp: 228,792; python: 17,561; javascript: 9,164; ansic: 4,101; sh: 850; makefile: 231; f90: 201
file content (147 lines) | stat: -rw-r--r-- 6,036 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/*
 * GridTools
 *
 * Copyright (c) 2014-2023, ETH Zurich
 * All rights reserved.
 *
 * Please, refer to the LICENSE file in the root directory.
 * SPDX-License-Identifier: BSD-3-Clause
 */
#include <gridtools/fn/cartesian.hpp>

#include <gtest/gtest.h>

#include <gridtools/fn/backend/gpu.hpp>

namespace gridtools::fn {
    namespace {
        using namespace literals;
        using sid::property;

        template <int I>
        using int_t = integral_constant<int, I>;

        struct stencil {
            constexpr auto operator()() const {
                using namespace cartesian::dim;
                return [](auto const &in) { return deref(shift(in, i(), 1_c)); };
            }
        };

        struct fwd_sum_scan : fwd {
            static GT_FUNCTION constexpr auto body() {
                return scan_pass(
                    [](auto acc, auto const &iter) { return acc + deref(iter); }, [](auto acc) { return acc; });
            }
        };

        struct bwd_sum_scan : bwd {
            static GT_FUNCTION constexpr auto body() {
                return scan_pass(
                    [](auto acc, auto const &iter) { return acc + deref(iter); }, [](auto acc) { return acc; });
            }
        };

        TEST(cartesian, stencil) {
            using block_sizes_t = meta::list<meta::list<cartesian::dim::i, int_t<32>>,
                meta::list<cartesian::dim::j, int_t<8>>,
                meta::list<cartesian::dim::k, int_t<1>>>;
            auto apply_stencil = [](auto executor, auto &out, auto const &in) {
                executor().arg(out).arg(in).assign(0_c, stencil(), 1_c).execute();
            };

            auto fencil = [&](auto const &sizes, auto &out, auto const &in) {
                auto be = backend::gpu<block_sizes_t>();
                auto alloc = tmp_allocator(be);
                auto tmp = allocate_global_tmp<int>(alloc, sizes);
                auto domain = cartesian_domain(std::array<int, 3>{sizes[0] - 1, sizes[1], sizes[2]});
                auto backend = make_backend(be, domain);
                apply_stencil(backend.stencil_executor(), tmp, in);
                apply_stencil(backend.stencil_executor(), out, tmp);
            };

            auto in = cuda_util::cuda_malloc<int>(5 * 3 * 2);
            auto out = cuda_util::cuda_malloc<int>(5 * 3 * 2);
            int inh[5][3][2], outh[5][3][2] = {};
            for (int i = 0; i < 5; ++i)
                for (int j = 0; j < 3; ++j)
                    for (int k = 0; k < 2; ++k)
                        inh[i][j][k] = 6 * i + 2 * j + k;
            cudaMemcpy(in.get(), inh, 5 * 3 * 2 * sizeof(int), cudaMemcpyHostToDevice);
            auto as_synthetic = [](int *x) {
                return sid::synthetic()
                    .set<property::origin>(sid::host_device::simple_ptr_holder(x))
                    .set<property::strides>(tuple(6_c, 2_c, 1_c));
            };

            auto out_s = as_synthetic(out.get());
            auto in_s = as_synthetic(in.get());
            fencil(std::array{5, 3, 2}, out_s, in_s);

            cudaMemcpy(outh, out.get(), 5 * 3 * 2 * sizeof(int), cudaMemcpyDeviceToHost);

            for (int i = 0; i < 3; ++i)
                for (int j = 0; j < 3; ++j)
                    for (int k = 0; k < 2; ++k)
                        EXPECT_EQ(outh[i][j][k], 6 * (i + 2) + 2 * j + k);
        }

        TEST(cartesian, vertical) {
            using block_sizes_t = meta::list<meta::list<cartesian::dim::i, int_t<32>>,
                meta::list<cartesian::dim::j, int_t<8>>,
                meta::list<cartesian::dim::k, int_t<1>>>;
            auto apply_double_scan = [](auto executor, auto &a, auto &b, auto const &c) {
                executor()
                    .arg(a)
                    .arg(b)
                    .arg(c)
                    .assign(1_c, fwd_sum_scan(), 42, 2_c)
                    .assign(0_c, bwd_sum_scan(), 8, 1_c)
                    .execute();
            };

            auto double_scan = [&](auto sizes, auto &a, auto &b, auto const &c) {
                auto domain = cartesian_domain(sizes);
                auto backend = make_backend(backend::gpu<block_sizes_t>(), domain);
                apply_double_scan(backend.vertical_executor(), a, b, c);
            };

            std::array<int, 3> sizes = {5, 3, 2};
            auto a = cuda_util::cuda_malloc<int>(5 * 3 * 2);
            auto b = cuda_util::cuda_malloc<int>(5 * 3 * 2);
            auto c = cuda_util::cuda_malloc<int>(5 * 3 * 2);
            int ah[5][3][2] = {}, bh[5][3][2] = {}, ch[5][3][2];
            for (int i = 0; i < 5; ++i)
                for (int j = 0; j < 3; ++j)
                    for (int k = 0; k < 2; ++k)
                        ch[i][j][k] = 6 * i + 2 * j + k;
            cudaMemcpy(c.get(), ch, 5 * 3 * 2 * sizeof(int), cudaMemcpyHostToDevice);
            auto as_synthetic = [](int *x) {
                return sid::synthetic()
                    .set<property::origin>(sid::host_device::simple_ptr_holder(x))
                    .set<property::strides>(tuple(6_c, 2_c, 1_c));
            };

            auto a_s = as_synthetic(a.get());
            auto b_s = as_synthetic(b.get());
            auto c_s = as_synthetic(c.get());
            double_scan(sizes, a_s, b_s, c_s);
            cudaMemcpy(bh, b.get(), 5 * 3 * 2 * sizeof(int), cudaMemcpyDeviceToHost);
            cudaMemcpy(ah, a.get(), 5 * 3 * 2 * sizeof(int), cudaMemcpyDeviceToHost);

            for (int i = 0; i < 5; ++i)
                for (int j = 0; j < 3; ++j) {
                    int res = 42;
                    for (int k = 0; k < 2; ++k) {
                        res += ch[i][j][k];
                        EXPECT_EQ(bh[i][j][k], res);
                    }
                    res = 8;
                    for (int k = 1; k >= 0; --k) {
                        res += bh[i][j][k];
                        EXPECT_EQ(ah[i][j][k], res);
                    }
                }
        }
    } // namespace
} // namespace gridtools::fn