File: gpu_transpose.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (57 lines) | stat: -rw-r--r-- 1,587 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("[SKIP] No GPU target enabled.\n");
        return 0;
    }

    ImageParam in(UInt(8), 2);

    Var x, y;

    // Wrap the input in a dummy func so we can schedule it.
    Func in_func;
    in_func(x, y) = in(x, y);

    // Transpose
    Func out;
    out(x, y) = in_func(y, x);

    // Do a nested tiling of the output into 4x4 tiles of 16x16
    // pixels.  We'll make blockidy be the tile index, and blockidx be
    // the subtile index.
    Var xi, yi, xo, yo, xii, xio, yii, yio, tile_idx, subtile_idx;
    out.tile(x, y, xo, yo, xi, yi, 64, 64)
        .fuse(xo, yo, tile_idx)
        .tile(xi, yi, xio, yio, xii, yii, 16, 16)
        .fuse(xio, yio, subtile_idx)
        .gpu_blocks(subtile_idx, tile_idx)
        .gpu_threads(xii, yii);

    // Load a tile on input and store it into shared.
    in_func.compute_at(out, subtile_idx).gpu_threads(x, y);

    Buffer<uint8_t> input(256, 256);
    lambda(x, y, cast<uint8_t>(x * 17 + y)).realize(input);
    in.set(input);

    Buffer<uint8_t> output = out.realize({256, 256});

    for (int y = 0; y < 256; y++) {
        for (int x = 0; x < 256; x++) {
            uint8_t correct = y * 17 + x;
            if (output(x, y) != correct) {
                printf("output(%d, %d) = %d instead of %d\n",
                       x, y, output(x, y), correct);
                return 1;
            }
        }
    }

    printf("Success!\n");
    return 0;
}