File: gpu_non_contiguous_copy.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (86 lines) | stat: -rw-r--r-- 3,190 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();

    Var x, y, z, w;
    Buffer<int> full(80, 60, 10, 10);

    const int x_off = 4, y_off = 8, z_off = 2, w_off = 4;
    const int x_size = 16, y_size = 16, z_size = 3, w_size = 3;

    // We want to construct a new Buffer that refers to the same data
    // but a different halide_buffer_t.
    Buffer<int> cropped(*full.raw_buffer());
    cropped.raw_buffer()->host = (uint8_t *)&(full(x_off, y_off, z_off, w_off));
    cropped.raw_buffer()->dim[0].extent = x_size;
    cropped.raw_buffer()->dim[1].extent = y_size;
    cropped.raw_buffer()->dim[2].extent = z_size;
    cropped.raw_buffer()->dim[3].extent = w_size;
    cropped.raw_buffer()->dim[0].stride *= 2;
    cropped.raw_buffer()->dim[1].stride *= 2;
    cropped.raw_buffer()->dim[2].stride *= 2;
    cropped.raw_buffer()->dim[3].stride *= 2;

    // Make a bitmask representing the region inside the crop.
    Buffer<bool> in_subregion(80, 60, 10, 10);
    Expr test = ((x >= x_off) && (x < x_off + x_size * 2) &&
                 (y >= y_off) && (y < y_off + y_size * 2) &&
                 (z >= z_off) && (z < z_off + z_size * 2) &&
                 (w >= w_off) && (w < w_off + w_size * 2) &&
                 (x % 2 == 0) &&
                 (y % 2 == 0) &&
                 (z % 2 == 0) &&
                 (w % 2 == 0));
    Func test_func;
    test_func(x, y, z, w) = test;
    test_func.realize(in_subregion);

    Func f;
    f(x, y, z, w) = 3 * x + 2 * y + z + 4 * w;
    if (target.has_gpu_feature()) {
        Var xi, yi;
        f.gpu_tile(x, y, xi, yi, 16, 16);
    } else if (target.has_feature(Target::HVX)) {
        f.hexagon().vectorize(x, 16);
    }
    f.output_buffer().dim(0).set_stride(Expr());
    f.realize(cropped);

    // Put some data in the full host buffer, avoiding the region
    // being evaluated above.
    Expr change_out_of_subregion = select(test, undef<int>(), 4 * x + 3 * y + 2 * z + w);
    lambda(x, y, z, w, change_out_of_subregion).realize(full);

    // Copy back the output subset from the GPU.
    cropped.copy_to_host();

    for (int w = 0; w < full.extent(3); ++w) {
        for (int z = 0; z < full.extent(2); ++z) {
            for (int y = 0; y < full.extent(1); ++y) {
                for (int x = 0; x < full.extent(0); ++x) {
                    int correct;
                    if (in_subregion(x, y, z, w)) {
                        int x_ = (x - x_off) / 2;
                        int y_ = (y - y_off) / 2;
                        int z_ = (z - z_off) / 2;
                        int w_ = (w - w_off) / 2;
                        correct = 3 * x_ + 2 * y_ + z_ + 4 * w_;
                    } else {
                        correct = 4 * x + 3 * y + 2 * z + w;
                    }
                    if (full(x, y, z, w) != correct) {
                        printf("Error! Incorrect value %i != %i at %i, %i, %i, %i\n", full(x, y, z, w), correct, x, y, z, w);
                        return 1;
                    }
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}