File: lots_of_small_allocations.cpp

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (77 lines) | stat: -rw-r--r-- 2,491 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#include "Halide.h"
#include "halide_benchmark.h"

using namespace Halide;

int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (target.arch == Target::WebAssembly) {
        printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
        return 0;
    }

    Param<int> p;

    const char *names[3] = {"heap", "pseudostack", "stack"};

    double t[3];
    for (int i = 0; i < 3; i++) {
        Var x("x");

        Func in;
        in(x) = x;

        std::vector<Func> chain;
        chain.push_back(in);
        for (int j = 0; j < 50; j++) {
            Func next;
            // Iterate the Collatz conjecture
            Expr prev = chain.back()(x);
            next(x) = select(prev % 2 == 0, prev / 2, 3 * prev + 1);
            chain.push_back(next);
        }

        Var xo, xi;
        chain.back().split(x, xo, xi, p, TailStrategy::RoundUp);
        for (size_t j = 0; j < chain.size() - 1; j++) {
            chain[j].compute_at(chain.back(), xo);
            if (i != 0) {
                chain[j].store_in(MemoryType::Stack);
            }
            if (i == 2) {
                chain[j].bound_extent(x, p);
            }
            // Vectorize. Otherwise llvm autovectorizes the stack version, confusing the results
            chain[j].vectorize(x, 8, TailStrategy::RoundUp);
        }
        // One of the problems with frequent heap allocations is that
        // they can serialize in the allocator, so we should
        // parallelize things too.
        Var xoo;
        if (i == 2) {
            chain.back().specialize(p == 200).split(xo, xoo, xo, 100, TailStrategy::RoundUp).parallel(xoo);
            chain.back().specialize_fail("Expected p == 200");
        } else {
            chain.back().split(xo, xoo, xo, 100, TailStrategy::RoundUp).parallel(xoo);
        }
        chain.back().vectorize(xi, 8, TailStrategy::RoundUp);

        // Make it too large for llvm to promote into registers or
        // bother unrolling. We're trying to compare stack to
        // pseudostack, not stack to register.
        p.set(200);

        Buffer<int> out(16 * 1000 * 1000);
        t[i] = Halide::Tools::benchmark([&] { chain.back().realize(out); });

        printf("Time using %s: %f\n", names[i], t[i]);
    }

    if (t[0] < t[1]) {
        printf("Heap allocation was faster than pseudostack!\n");
        return 1;
    }

    printf("Success!\n");
    return 0;
}