File: gpu_multi_device.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (120 lines) | stat: -rw-r--r-- 3,728 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include "Halide.h"
#include <stdio.h>
#include <stdlib.h>

using namespace Halide;

struct MultiDevicePipeline {
    Var x, y, c, xi, yi;
    Func stage[5];
    size_t current_stage;

    MultiDevicePipeline(Func input) {
        current_stage = 0;

        stage[current_stage](x, y, c) = input(x, y, c);
        current_stage++;

        Target jit_target(get_jit_target_from_environment());
        if (jit_target.has_feature(Target::OpenCL)) {
            stage[current_stage](x, y, c) = stage[current_stage - 1](x, y, c) + 69;
            stage[current_stage]
                .compute_root()
                .reorder(c, x, y)
                .gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::OpenCL);
            current_stage++;
        }
        if (jit_target.has_feature(Target::CUDA)) {
            stage[current_stage](x, y, c) = stage[current_stage - 1](x, y, c) + 69;
            stage[current_stage]
                .compute_root()
                .reorder(c, x, y)
                .gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::CUDA);
            current_stage++;
        }
        if (jit_target.has_feature(Target::Metal)) {
            stage[current_stage](x, y, c) = stage[current_stage - 1](x, y, c) + 69;
            stage[current_stage]
                .compute_root()
                .reorder(c, x, y)
                .gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::Metal);
            current_stage++;
        }
    }

    void run(Buffer<float> &result) {
        stage[current_stage - 1].realize(result);
        if (result.copy_to_host() != halide_error_code_success) {
            fprintf(stderr, "copy_to_host failed\n");
            exit(1);
        }
        if (result.device_free() != halide_error_code_success) {
            fprintf(stderr, "device_free failed\n");
            exit(1);
        }
        result.set_host_dirty();
    }

    bool verify(const Buffer<float> &result, size_t stages, const char *test_case) {
        for (int i = 0; i < 100; i++) {
            for (int j = 0; j < 100; j++) {
                for (int k = 0; k < 3; k++) {
                    float correct = 42.0f + stages * 69;
                    if (result(i, j, k) != correct) {
                        printf("result(%d, %d, %d) = %f instead of %f. (%s).\n", i, j, k, result(i, j, k), correct, test_case);
                        return false;
                    }
                }
            }
        }
        return true;
    }
};

int main(int argc, char **argv) {
    Var x, y, c;
    Func const_input;
    const_input(x, y, c) = 42.0f;

    {
        MultiDevicePipeline pipe1(const_input);
        if (pipe1.current_stage < 3) {
            printf("[SKIP] Need two or more GPU targets enabled.\n");
            return 0;
        }

        Buffer<float> output1(100, 100, 3);
        pipe1.run(output1);

        if (!pipe1.verify(output1, pipe1.current_stage - 1, "const input")) {
            return 1;
        }
    }

    {
        MultiDevicePipeline pipe2(const_input);

        ImageParam gpu_buffer(Float(32), 3);
        gpu_buffer.dim(2).set_bounds(0, 3);
        Func buf_input;
        buf_input(x, y, c) = gpu_buffer(x, y, c);
        MultiDevicePipeline pipe3(buf_input);

        Buffer<float> output2(100, 100, 3);
        pipe2.run(output2);

        if (!pipe2.verify(output2, pipe2.current_stage - 1, "chained buffers intermediate")) {
            return 1;
        }

        Buffer<float> output3(100, 100, 3);
        gpu_buffer.set(output2);
        pipe3.run(output3);

        if (!pipe3.verify(output3, pipe2.current_stage + pipe3.current_stage - 2, "chained buffers")) {
            return 1;
        }
    }

    printf("Success!\n");
    return 0;
}