File: lesson_22_jit_performance.cpp

// Halide tutorial lesson 22: JIT compilation performance

// This lesson demonstrates the performance implications of the various ways
// Halide can perform "Just-In-Time" (JIT) compilation.

// On linux, you can compile and run it like so:
// g++ lesson_22*.cpp -g -I <path/to/Halide.h> -I <path/to/tools/halide_benchmark.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_22 -std=c++17
// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_22

// On os x:
// g++ lesson_22*.cpp -g -I <path/to/Halide.h> -I <path/to/tools/halide_benchmark.h> -L <path/to/libHalide.so> -lHalide -o lesson_22 -std=c++17
// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_22

// If you have the entire Halide source tree, you can also build it by
// running:
//    make tutorial_lesson_22_jit_performance
// in a shell at the top of the halide source tree.

#include "Halide.h"
#include "halide_benchmark.h"
#include <stdio.h>

using namespace Halide;
using namespace Halide::Tools; // for benchmark()
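
// A quick note on the benchmark() helper from halide_benchmark.h: a call to
// benchmark(samples, iterations, fn) invokes fn in a timing loop (iterations
// calls per sample) and returns the best per-iteration time, in seconds,
// across all samples.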

// Let's define a helper function to construct a simple pipeline that we'll use for our performance tests.
Pipeline make_pipeline() {
    // We'll start with a simple transpose operation...
    Func input("input"), output("output");
    Var x("x"), y("y");

    // Fill the input with a linear combination of the coordinate values...
    input(x, y) = cast<uint16_t>(x + y);
    input.compute_root();

    // Transpose the rows and columns.
    output(x, y) = input(y, x);

    // Schedule it ... there are a number of ways to do an efficient block-wise transpose.
    Var xi("xi"), yi("yi");
    
    // Let's focus on 8x8 subtiles, then vectorize across x and unroll across y within each tile.
    output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);

    // For more advanced scheduling:
    //
    // We can improve this even further by using the .in() directive (see lesson 19),
    // which allows us to interpose new Funcs between input and output.
    //
    // Here we inject a block_transpose Func, which lets us do 8 vectorized loads
    // from the input.
    Func block_transpose("block_transpose"), block("block");
    block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y);

    // Now let's reorder the storage and vectorize in x across the block.
    block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y);
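
    // (Optional) If you'd like to inspect the loop structure this schedule
    // produces, Halide can print it for you; the call is left commented out
    // here so the benchmark output below stays uncluttered:
    //
    //     output.print_loop_nest();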

    // Return the constructed pipeline
    return Pipeline(output);
}

int main(int argc, char **argv) {
    // Since we'll be using the same sample and iteration counts for our benchmarking,
    // let's define them here in the outermost scope.
    constexpr int samples = 100;
    constexpr int iterations = 1;
    
    // Now, let's measure the performance of constructing and executing a simple pipeline from scratch...
    {
        size_t count = 0;
        double t = benchmark(samples, iterations, [&]() {

            // First, create an output buffer to hold the results.
            Buffer<uint16_t> result(1024, 1024);
            
            // Now, construct our pipeline from scratch.
            Pipeline pipeline = make_pipeline();

            // And then call realize to execute the pipeline.
            pipeline.realize(result);
            ++count;
        });

        // On a MacBook Pro M1, we should see roughly 1800 times/sec.
        std::cout << "Compile & Execute Pipeline (from scratch): " << int(count / t) << " times/sec\n";
    }

    // This time, let's create the pipeline outside the timing loop and re-use it for each execution...
    {
        // Create our pipeline, and re-use it in the loop below
        Pipeline pipeline = make_pipeline();

        size_t count = 0;
        double t = benchmark(samples, iterations, [&]() {

            // Create our output buffer
            Buffer<uint16_t> result(1024, 1024);
            
            // Now, call realize
            pipeline.realize(result);
            ++count;
        });

        // On a MacBook Pro M1, we should see roughly 175000 times/sec (about 95-100x faster!).
        std::cout << "Compile & Execute Pipeline (re-use pipeline): " << int(count / t) << " times/sec\n";
    }

    // Let's do the same thing as before, but explicitly JIT compile before we realize...
    {
        Pipeline pipeline = make_pipeline();

        // Let's JIT compile for our target before we realize, and see what happens...
        const Target target = get_jit_target_from_environment();
        pipeline.compile_jit(target);

        size_t count = 0;
        double t = benchmark(samples, iterations, [&]() {
            Buffer<uint16_t> result(1024, 1024);
            pipeline.realize(result);
            ++count;
        });
 
        // On a MacBook Pro M1, this should be about the same as the previous run
        // (roughly 175000 times/sec).
        //
        // This may seem surprising, since compiling before realizing made no
        // measurable difference. However, the first call to realize() implicitly
        // JIT-compiles and caches the generated code associated with the Pipeline
        // object, which is exactly what our explicit compile_jit() call did here.
        // Each subsequent call to realize() uses the cached native code, so there's
        // no additional overhead, and the compilation cost is amortized as we
        // re-use the pipeline.
        std::cout << "Execute Pipeline (compile before realize): " << int(count / t) << " times/sec\n";

        // Another subtlety is the creation of the result buffer ... the declaration
        // implicitly allocates memory, which adds overhead to each loop iteration.
        // This time, let's try the realize({1024, 1024}) form, which asks the
        // pipeline to allocate an output buffer of the requested size and return
        // it to us...
        count = 0;
        t = benchmark(samples, iterations, [&]() {
            Buffer<uint16_t> result = pipeline.realize({1024, 1024});
            ++count;
        });

        // On a MacBook Pro M1, this should be about the same as the previous run
        // (roughly 175000 times/sec), since a fresh output buffer is still being
        // allocated on each iteration.
        std::cout << "Execute Pipeline (same but with realize({})): " << int(count / t) << " times/sec\n";

        // Or ... we could move the declaration of the result buffer outside the timing loop, and
        // re-use the allocation (with the caveat that we will be stomping over its contents on each 
        // execution).
        Buffer<uint16_t> result(1024, 1024);

        count = 0;
        t = benchmark(samples, iterations, [&]() {
            pipeline.realize(result);
            ++count;
        });

        // On a MacBook Pro M1, this should be much more efficient ... ~200000 times/sec (or 10-12% faster).
        std::cout << "Execute Pipeline (re-use buffer with realize): " << int(count / t) << " times/sec\n";
    }

    // Alternatively, we could compile to a Callable object...
    {
        Pipeline pipeline = make_pipeline();
        const Target target = get_jit_target_from_environment();

        // Here, we can ask the pipeline for its argument list (these are either Params,
        // ImageParams, or Buffers) so that we can construct a Callable object with the same 
        // calling convention.
        auto arguments = pipeline.infer_arguments();

        // The Callable object acts as a convenient way of invoking the compiled code like
        // a function call, using an argv-like syntax for the argument list. It also caches 
        // the JIT compiled code, so there's no code generation overhead when invoking the
        // callable object and executing the pipeline.
        Callable callable = pipeline.compile_to_callable(arguments, target);

        // Again, we'll pre-allocate and re-use the result buffer.
        Buffer<uint16_t> result(1024, 1024);

        size_t count = 0;
        double t = benchmark(samples, iterations, [&]() {
            callable(result);
            ++count;
        });

        // This should be about the same as the previous run (roughly 200000 times/sec).
        std::cout << "Execute Pipeline (compile to callable): " << int(count / t) << " times/sec\n";

        // Perhaps even more convenient, we can create a std::function object from
        // the callable, which gives us cleaner type checking for the parameters and
        // slightly less overhead when invoking the function. The template parameter
        // list needs to match the parameter list of the pipeline. Here, we have a
        // single result buffer, so we specify Buffer<uint16_t> in our call to
        // .make_std_function<>(). If we had other scalar parameters, input buffers,
        // or output buffers, we'd pass them in the template parameter list too, as
        // sketched below.
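
        // For example (hypothetical -- our pipeline has no such parameters), if the
        // pipeline also took a scalar gain and an input image, the template
        // arguments would mirror that argument order:
        //
        //     auto f = callable.make_std_function<float, Buffer<uint16_t>, Buffer<uint16_t>>();
        //     f(1.5f, input_buf, output_buf);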
        auto function = callable.make_std_function<Buffer<uint16_t>>();

        count = 0;
        t = benchmark(samples, iterations, [&]() {
            function(result);
            ++count;
        });

        // On a MacBook Pro M1, this should be slightly more efficient than the callable (~1% faster).
        std::cout << "Execute Pipeline (compile to std::function): " << int(count / t) << " times/sec\n";
    }

    // Let's see how much time is spent on just compiling...
    {
        Pipeline pipeline = make_pipeline();

        // Only the first call to compile_jit() is expensive ... after the code is
        // generated, it gets stored in a cache for later re-use, so repeatedly
        // calling compile_jit() has very little overhead once the result has been
        // cached.

        size_t count = 0;
        double t = benchmark(samples, iterations, [&]() {
            pipeline.compile_jit();
            ++count;
        });

        // Only the first call does any work and the rest are essentially free.
        // On a MacBook Pro M1, we should expect ~2 billion times/sec.
        std::cout << "Compile JIT (using cache): " << int(count / t) << " times/sec\n";

        // You can invalidate the cache manually, which will destroy all the compiled state.
        count = 0;
        t = benchmark(samples, iterations, [&]() {
            pipeline.invalidate_cache();
            pipeline.compile_jit();
            ++count;
        });

        // This is an intentionally expensive loop, and very slow!
        // On a MacBook Pro M1, we should see only ~2000 times/sec.
        std::cout << "Compile JIT (from scratch): " << int(count / t) << " times/sec\n";
    }

    // Alternatively, we could compile to a Module...
    {
        Pipeline pipeline = make_pipeline();
        auto args = pipeline.infer_arguments();

        // Compiling to a Module produces a self-contained internal representation
        // of the lowered code, suitable for further compilation. So, it's not
        // directly runnable, but it can be used to link/combine Modules and to
        // generate object files, static libraries, bitcode, etc.
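
        // For instance (a sketch, with made-up filenames), a Module can be written
        // out as an object file or a static library for ahead-of-time use; it's
        // left commented out here since writing files is beside the point of this
        // benchmark:
        //
        //     Module m = pipeline.compile_to_module(args, "transpose");
        //     m.compile({{OutputFileType::object, "transpose.o"},
        //                {OutputFileType::static_library, "transpose.a"}});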

        size_t count = 0;
        double t = benchmark(samples, iterations, [&]() {
            Module m = pipeline.compile_to_module(args, "transpose");
            ++count;
        });

        // On a MacBook Pro M1, this should run at roughly 10000 times/sec.
        std::cout << "Compile to Module: " << int(count / t) << " times/sec\n";
    }

    printf("DONE!\n");
    return 0;
}