File: const_division.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (174 lines) | stat: -rw-r--r-- 5,747 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#include "Halide.h"
#include "halide_benchmark.h"
#include <cstdint>
#include <cstdio>
#include <random>

using namespace Halide;
using namespace Halide::Tools;

// Shared pseudo-random source used by test() to fill its input buffers.
// Use std::mt19937 instead of rand() to ensure consistent behavior on all systems.
// Note that this returns an unsigned result of at-least-32 bits.
// It is constructed with seed 0 here, but main() reseeds it (from argv[1] or
// the current time) before any test runs.
std::mt19937 rng(0);

// Benchmarks and validates integer division / modulo by small divisors for
// the element type T.
//
// Three pipelines are built over the same random input; row y always uses the
// divisor (y + 2):
//   f - y is fully unrolled, so each row divides by a compile-time constant
//   g - reference: the same expression with y left as an ordinary loop
//   h - the Halide::fast_integer_* helpers with a runtime uint8 divisor
//
// Parameters:
//   w             - width of the input; x is vectorized when w > 1
//   div           - true to test division, false to test modulo
//   round_to_zero - with div == true, use the round-to-zero division variants;
//                   has no effect on the expression when div == false
//
// Prints one table row (type, speed-up of f over g, speed-up of h over g) and
// returns true only if both f and h reproduce g's output exactly.
template<typename T>
bool test(int w, bool div, bool round_to_zero) {
    Func f, g, h;
    Var x, y;

    const size_t bits = sizeof(T) * 8;
    const bool is_signed = (T)(-1) < (T)(0);

    printf("%sInt(%2d, %2d)    ",
           is_signed ? " " : "U",
           (int)bits, w);

    // Divisors run over min_val .. min_val + num_vals - 1, i.e. 2..255 by default.
    const int min_val = 2;
    int num_vals = 254;

    if (bits <= 8 && is_signed) {
        // The divisor is cast to T, so for int8_t it has to stay within
        // 2..127: anything larger would wrap to a negative value (255 becomes
        // -1). Keeping the range small dodges the two kinds of integer
        // division that cause runtime crashes:
        // 1) Division by zero
        // 2) Division of the smallest negative number by -1 (because
        // the result overflows)
        num_vals = 126;
    }

    Buffer<T> input(w, num_vals);

    // Fill the numerators with random bit patterns truncated to T.
    for (int row = 0; row < num_vals; row++) {
        for (int col = 0; col < input.width(); col++) {
            const uint32_t random_bits = (uint32_t)rng();
            input(col, row) = (T)random_bits;
            // Zero values are replaced by 1 in round-to-zero mode.
            // NOTE(review): the original comment here read "Round-to-zero
            // faults on zero denominators", but `input` is the numerator in
            // every expression below - confirm whether this guard is needed.
            if (round_to_zero && (input(col, row) == 0)) {
                input(col, row) = 1;
            }
        }
    }

    if (div) {
        if (round_to_zero) {
            // Test div. We'll unroll entirely across y to turn the denominator into a constant.
            f(x, y) = div_round_to_zero(input(x, y), cast<T>(y + min_val));

            // Reference good version. Not unrolled across y.
            g(x, y) = div_round_to_zero(input(x, y), cast<T>(y + min_val));

            // Version that uses fast_integer_divide
            h(x, y) = Halide::fast_integer_divide_round_to_zero(input(x, y), cast<uint8_t>(y + min_val));
        } else {
            // Test div
            f(x, y) = input(x, y) / cast<T>(y + min_val);

            // Reference good version
            g(x, y) = input(x, y) / cast<T>(y + min_val);

            // Version that uses fast_integer_divide
            h(x, y) = Halide::fast_integer_divide(input(x, y), cast<uint8_t>(y + min_val));
        }
    } else {
        // Test mod
        f(x, y) = input(x, y) % cast<T>(y + min_val);

        // Reference good version
        g(x, y) = input(x, y) % cast<T>(y + min_val);

        // Version that uses fast_integer_modulo
        h(x, y) = Halide::fast_integer_modulo(input(x, y), cast<uint8_t>(y + min_val));
    }

    // Try dividing by all the known constants using vectors
    f.bound(y, 0, num_vals).bound(x, 0, input.width()).unroll(y);
    h.bound(x, 0, input.width());
    if (w > 1) {
        f.vectorize(x);
        h.vectorize(x);
    }

    // Compile up front so that JIT time is not part of the measurements.
    Target t = get_jit_target_from_environment();
    f.compile_jit(t);
    g.compile_jit(t);
    h.compile_jit(t);

    Buffer<T> correct = g.realize({input.width(), num_vals});
    double t_correct = benchmark([&]() { g.realize(correct); });

    Buffer<T> fast = f.realize({input.width(), num_vals});
    double t_fast = benchmark([&]() { f.realize(fast); });

    Buffer<T> fast_dynamic = h.realize({input.width(), num_vals});
    double t_fast_dynamic = benchmark([&]() { h.realize(fast_dynamic); });

    printf("%6.3f                  %6.3f\n", t_correct / t_fast, t_correct / t_fast_dynamic);

    // Both optimized pipelines must agree with the reference everywhere. The
    // runtime-divisor pipeline is checked too: a speed-up figure for a wrong
    // result would be meaningless.
    for (int row = 0; row < num_vals; row++) {
        // Convert explicitly so that %d always receives an int, whatever T is.
        const int divisor = (int)(T)(row + min_val);
        for (int col = 0; col < input.width(); col++) {
            if (fast(col, row) != correct(col, row)) {
                printf("fast(%d, %d) = %lld instead of %lld (%lld/%d)\n",
                       col, row,
                       (long long int)fast(col, row),
                       (long long int)correct(col, row),
                       (long long int)input(col, row),
                       divisor);
                return false;
            }
            if (fast_dynamic(col, row) != correct(col, row)) {
                printf("fast_dynamic(%d, %d) = %lld instead of %lld (%lld/%d)\n",
                       col, row,
                       (long long int)fast_dynamic(col, row),
                       (long long int)correct(col, row),
                       (long long int)input(col, row),
                       divisor);
                return false;
            }
        }
    }

    return true;
}

// Entry point: runs the constant-division benchmark for every supported
// element type, scalar and vectorized, in three modes (division rounding to
// negative infinity, signed division rounding to zero, modulus).
//
// argv[1], if present, is parsed with atoi() and used as the RNG seed;
// otherwise the current time is used. The seed is printed so that a run can
// be repeated with the same seed. Returns 0 on success (or when skipped under
// WebAssembly) and 1 if any test() call reports a mismatch.
int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (target.arch == Target::WebAssembly) {
        printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
        return 0;
    }

    const int seed = argc > 1 ? atoi(argv[1]) : (int)time(nullptr);
    rng.seed(seed);
    // std::endl flushes, so the seed is emitted right away rather than
    // sitting in a buffer while the tests run.
    std::cout << "const_division test seed: " << seed << std::endl;

    bool success = true;
    for (int i = 0; i < 3; i++) {
        switch (i) {
        case 0:
            printf("division rounding to negative infinity:\n");
            break;
        case 1:
            printf("signed division rounding to zero:\n");
            break;
        case 2:
            printf("modulus:\n");
            break;
        }
        printf("type            const-divisor speed-up  runtime-divisor speed-up\n");

        // Modes 0 and 1 are both divisions; only mode 2 is modulus. Passing
        // `i == 0` as `div` would send mode 1 down test()'s modulo branch and
        // leave the round-to-zero division path unexercised.
        const bool div = (i != 2);
        const bool round_to_zero = (i == 1);

        // Scalar
        success = success && test<int32_t>(1, div, round_to_zero);
        success = success && test<int16_t>(1, div, round_to_zero);
        success = success && test<int8_t>(1, div, round_to_zero);
        if (!round_to_zero) {
            // The round-to-zero mode is only run for signed types.
            success = success && test<uint32_t>(1, div, false);
            success = success && test<uint16_t>(1, div, false);
            success = success && test<uint8_t>(1, div, false);
        }

        // Vector
        success = success && test<int32_t>(8, div, round_to_zero);
        success = success && test<int16_t>(16, div, round_to_zero);
        success = success && test<int8_t>(32, div, round_to_zero);
        if (!round_to_zero) {
            success = success && test<uint32_t>(8, div, false);
            success = success && test<uint16_t>(16, div, false);
            success = success && test<uint8_t>(32, div, false);
        }
    }

    if (!success) {
        return 1;
    }

    printf("Success!\n");
    return 0;
}