File: gpu_thread_barrier.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (142 lines) | stat: -rw-r--r-- 3,758 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include "Halide.h"
#include <stdio.h>

using namespace Halide;
using namespace Halide::Internal;

class CountBarriers : public IRVisitor {
public:
    int count;

    CountBarriers()
        : count(0) {
    }

protected:
    using IRVisitor::visit;

    void visit(const Call *op) override {
        if (op->is_intrinsic(Call::gpu_thread_barrier)) {
            count++;
        }
        IRVisitor::visit(op);
    }
};

class CheckBarrierCount : public IRMutator {
    int correct;

public:
    CheckBarrierCount(int correct)
        : correct(correct) {
    }
    using IRMutator::mutate;

    Stmt mutate(const Stmt &s) override {
        CountBarriers c;
        s.accept(&c);

        if (c.count != correct) {
            printf("There were %d barriers. There were supposed to be %d\n", c.count, correct);
            exit(1);
        }

        return s;
    }
};

int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("[SKIP] No GPU target enabled.\n");
        return 0;
    }

    {
        Func f;
        Var x, y;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu.

        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        Var xi, yi;
        g.gpu_tile(x, y, xi, yi, 16, 8);
        f.compute_at(g, x);

        for (int i = 0; i < passes; i++) {
            f.update(i * 4 + 0).gpu_threads(y);
            f.update(i * 4 + 1).gpu_threads(y);
            f.update(i * 4 + 2).gpu_threads(x);
            f.update(i * 4 + 3).gpu_threads(x);
        }

        Buffer<int> out = g.realize({100, 100});
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 7 * 100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return 1;
                }
            }
        }
    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.

        Func f;
        Var x, y;
        f(x, y) = 0;
        f(x, y) += undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);

        Var xi, yi;
        g.gpu_tile(x, y, xi, yi, 8, 8);
        f.compute_at(g, x);

        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be three thread barriers: one after the intial
        // pure definition, one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(3));

        Buffer<int> out = g.realize({100, 100});
    }

    printf("Success!\n");
    return 0;
}