File: gpu_half_throughput.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (83 lines) | stat: -rw-r--r-- 2,493 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#include "Halide.h"
#include "halide_benchmark.h"

using namespace Halide;

// Performance test: times an element-wise multiply on the GPU with float32
// buffers and with float16 buffers, and fails if the float16 pipeline is the
// slower of the two or if either pipeline produces a wrong value.
//
// Returns 0 on success or when the test is skipped, 1 on failure.
int main(int argc, char **argv) {
    Target t = get_jit_target_from_environment();
    if (t.arch == Target::WebAssembly) {
        printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
        return 0;
    }

    if (!(t.has_feature(Target::CUDA) ||
          t.has_feature(Target::Metal))) {
        printf("[SKIP] No GPU target enabled supporting half-precision.\n");
        return 0;
    }

    // Every capability level >= 30 must be listed here; otherwise a target that
    // only declares a newer level (e.g. 70 or 80) would be skipped even though
    // it satisfies the "30 or greater" requirement stated in the skip message.
    const std::vector<Target::Feature> cuda_capabilities{
        Target::CUDACapability30,
        Target::CUDACapability32,
        Target::CUDACapability35,
        Target::CUDACapability50,
        Target::CUDACapability61,
        Target::CUDACapability70,
        Target::CUDACapability75,
        Target::CUDACapability80,
        Target::CUDACapability86,
    };
    if (t.has_feature(Target::CUDA) && !t.features_any_of(cuda_capabilities)) {
        printf("[SKIP] Need CUDA Capability 30 or greater.\n");
        return 0;
    }

    std::cout << t.to_string() << "\n";

    // Compare two variants; the second is expected to be at least as fast:
    // 1) Inputs and output stored as float, multiply expressed on float values
    // 2) Inputs and output stored as half, multiply expressed on half values

    const int size = 1024 * 1024 * 10;
    const int step = 1024;

    // Inputs carry `step` extra elements because each output x reads x + step.
    Buffer<float> f32_in(size + step);
    Buffer<float16_t> f16_in(size + step);

    f32_in.fill(2.0f);
    f16_in.fill(float16_t(2.0f));

    Buffer<float> f32_out(size);
    Buffer<float16_t> f16_out(size);

    Func f1, f2;
    Var x;

    f1(x) = f32_in(x) * f32_in(x + step);
    f2(x) = f16_in(x) * f16_in(x + step);

    // Vector widths give both pipelines 8 bytes per vector:
    // 2 x 4-byte float versus 4 x 2-byte half.
    Var xi;
    f1.bound(x, 0, size).vectorize(x, 2).gpu_tile(x, xi, 32);
    f2.bound(x, 0, size).vectorize(x, 4).gpu_tile(x, xi, 32);

    // Compile up front so the benchmark calls below do not start with an
    // uncompiled pipeline.
    f1.compile_jit(t);
    f2.compile_jit(t);

    // device_sync() is called inside each timed lambda, after realize().
    const double t1 = Tools::benchmark([&]() {
        f1.realize(f32_out);
        f32_out.device_sync();
    });
    const double t2 = Tools::benchmark([&]() {
        f2.realize(f16_out);
        f16_out.device_sync();
    });

    printf("Times: %f %f\n", t1, t2);
    printf("Speed-up from using half type: %f x\n", t1 / t2);

    if (t2 > t1) {
        printf("Half should not have been slower than float\n");
        return 1;
    }

    // Every input element is 2, so every output element must be exactly 4
    // (exactly representable in both float and half).
    f32_out.copy_to_host();
    f16_out.copy_to_host();
    for (int i = 0; i < size; i++) {
        if (f32_out(i) != 4.0f) {
            printf("f32_out(%d) = %f instead of 4\n", i, f32_out(i));
            return 1;
        }
        if (f16_out(i) != float16_t(4.0f)) {
            printf("f16_out(%d) = %f instead of 4\n", i, static_cast<float>(f16_out(i)));
            return 1;
        }
    }

    printf("Success!\n");
    return 0;
}