// Halide tutorial lesson 24: Async execution
// This lesson demonstrates how to asynchronously execute a function
// using scheduling directives 'async' and 'ring_buffer'.
// On Linux, you can compile and run it like so:
// g++ lesson_24*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_24 -std=c++17
// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_24
// On macOS:
// g++ lesson_24*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -o lesson_24 -std=c++17
// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_24
// If you have the entire Halide source tree, you can also build it by
// running:
// make tutorial_lesson_24_async
// in a shell with the current directory at the top of the Halide
// source tree.
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
// Declare some Vars to use below.
Var x("x"), y("y"), c("c"), xo("xo"), yo("yo"), xi("xi"), yi("yi"), tile("tile");
{
// In this example we simply tell Halide to run `producer` in a
// separate thread. This is not very useful on its own, but it is a good
// starting point for the examples that follow.
Func producer("producer"), consumer("consumer");
producer(x, y) = x + y;
consumer(x, y) = producer(x - 1, y - 1) + producer(x, y) + producer(x + 1, y + 1);
consumer.compute_root();
// Use async() to compute `producer` in a separate thread.
producer.compute_root().async();
// The high-level structure of the generated code will be:
// {
// allocate producer[...]
// thread #1 {
// produce producer {
// ...
// }
// signal that data is ready
// }
// thread #2 {
// consume producer {
// block until producer data is ready
// produce consumer {
// ...
// }
// }
// }
// }
consumer.realize({128, 128});
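// As an extra sanity check (our own addition to the lesson), we can realize
// again into a buffer and verify the result against the closed form:
// consumer(x, y) = (x-1 + y-1) + (x + y) + (x+1 + y+1) = 3 * (x + y).
Buffer<int> result = consumer.realize({128, 128});
for (int yy = 0; yy < 128; yy++) {
    for (int xx = 0; xx < 128; xx++) {
        if (result(xx, yy) != 3 * (xx + yy)) {
            printf("result(%d, %d) = %d instead of %d\n",
                   xx, yy, result(xx, yy), 3 * (xx + yy));
            return -1;
        }
    }
}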
}
{
// Now let's use async() to execute two different producers simultaneously.
// This can be useful in scenarios where you want to overlap the
// computation of different functions in time. For example, you could execute
// producer1 and producer2 on different devices in parallel (e.g., producer1 on the CPU
// and producer2 on the GPU).
Func producer1("producer1"), producer2("producer2"), consumer("consumer");
producer1(x, y) = x + y;
producer2(x, y) = x + y;
consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1);
// With the schedule below, the computations of `producer1` and `producer2` will each be
// launched in a separate thread. Since `consumer` depends on both of them, and the producers
// are scheduled as compute_root(), `consumer` will have to wait until `producer1` and
// `producer2` have fully completed their work. The required synchronization primitives
// will be added between the producers and `consumer` to ensure that it is safe for `consumer`
// to start its work and that its input data is fully ready.
consumer.compute_root();
producer1.compute_root().async();
producer2.compute_root().async();
// The high-level structure of the generated code will be:
// {
// allocate producer1[...]
// allocate producer2[...]
// thread #1 {
// produce producer1 {
// ...
// }
// signal that producer1 data is ready
// }
// thread #2 {
// produce producer2 {
// ...
// }
// signal that producer2 data is ready
// }
// thread #3 {
// consume producer1 {
// consume producer2 {
// block until producer1 data is ready
// block until producer2 data is ready
// produce consumer {
// ...
// }
// }
// }
// }
// }
consumer.realize({128, 128});
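// The same sanity check applies here (our own addition): both producers
// compute x + y, so consumer(x, y) should again equal 3 * (x + y).
Buffer<int> result = consumer.realize({128, 128});
if (result(10, 20) != 3 * (10 + 20)) {
    printf("Unexpected value at (10, 20)\n");
    return -1;
}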
}
{
// In the previous example, we managed to run two producers in parallel, but `consumer` had
// to wait until their data was fully ready. Wouldn't it be great if we could overlap the
// computations of `producer` and `consumer` too? This computational pattern is known as
// 'double buffering' and can be critical for achieving good performance in certain scenarios.
// The high-level idea is that the producer is allowed to run ahead and work on the next
// chunk, without waiting, while the consumer is processing the current chunk. The obvious
// drawback of this method is that it requires twice as much memory for `producer`.
Func producer("producer"), consumer("consumer");
producer(x, y, c) = (x + y) * (c + 1);
consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c);
consumer.compute_root();
// In this example the planes are processed separately, so the producer can run ahead
// and start producing plane `c + 1` while `consumer` consumes the already-produced plane `c`.
// One way to express this with a Halide schedule is very similar to how sliding window
// schedules are expressed (see lesson_8 for details). There are indeed a lot of commonalities
// between the two, because both rely on a circular buffer as the underlying data structure.
producer
.async()
.compute_at(consumer, c)
// fold_storage requires store_at, which is separate from compute_at.
.store_at(consumer, Var::outermost())
// Explicit fold_storage is required here, because otherwise Halide will infer that only
// one plane of `producer` is necessary for `consumer`, but for the purposes of this
// example we want at least 2.
// Note that adding fold_storage(c, 2) will double the amount of storage allocated
// for `producer`.
.fold_storage(c, 2);
// The high-level structure of the generated code will be:
// {
// allocate producer[extent.x, extent.y, 2]
// // There are two semaphores here, because the producer can run ahead, so we need
// // to track how much was consumed and produced separately.
// // This semaphore indicates how much the producer has produced.
// producer.semaphore = 0
// // This semaphore indicates how much `space` for the producer is available.
// producer.folding_semaphore = 2
// thread #1 {
// loop over c {
// // Acquire the semaphore, or block until there is space to produce into.
// // The semaphore is released by the consumer thread when the data has been
// // fully consumed.
// acquire(producer.folding_semaphore, 1)
// produce producer {
// // Produce the next plane of `producer` and store it at index c % 2.
// producer[_, _, c % 2] = ...
// // Release the semaphore to indicate that the plane was produced; the consumer
// // will acquire this semaphore in the other thread.
// release(producer.semaphore)
// }
// }
// }
// thread #2 {
// loop over c {
// // Acquire the semaphore, or block until the data from the producer is ready.
// // The semaphore is released by the producer thread when the data has been
// // fully produced.
// acquire(producer.semaphore, 1)
// consume producer {
// consumer[_, _, c] = <computations which use producer[_, _, c % 2]>
// // Release the semaphore to indicate that the plane was consumed; the producer
// // will acquire this semaphore in the other thread.
// release(producer.folding_semaphore)
// }
// }
// }
// }
consumer.realize({128, 128, 4});
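// Spot-check the output (our own addition): with producer(x, y, c) =
// (x + y) * (c + 1), the three-sample sum gives
// consumer(x, y, c) = 3 * (x + y) * (c + 1).
Buffer<int> result = consumer.realize({128, 128, 4});
if (result(7, 11, 2) != 3 * (7 + 11) * (2 + 1)) {
    printf("Unexpected value at (7, 11, 2)\n");
    return -1;
}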
}
{
// In the previous example, we relied on storage folding to express the double-buffering
// technique, but there is another, more direct way to do it.
Func producer("producer"), consumer("consumer");
producer(x, y, c) = (x + y) * (c + 1);
consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c);
consumer.compute_root();
// As mentioned in the previous example, the planes are processed separately, so the
// producer can run ahead and start producing plane `c + 1` while `consumer` consumes
// the already-produced plane `c`. A more direct way to express this is to hoist the
// storage of `producer` outside of the loop over planes `c`, double its size, and add
// the indices needed to flip between the two planes. The first part is achieved with
// the `hoist_storage` directive and the rest is done with `ring_buffer`. Note that it's
// enough to provide only the extent of the ring buffer; there is no need to specify an
// explicit loop level to tie the ring buffer to, because the ring buffer index is
// computed implicitly as a linear combination of the loop variables between the storage
// and compute_at/store_at levels.
producer
.async()
.compute_at(consumer, c)
.hoist_storage(consumer, Var::outermost())
// Similarly to the previous example, the amount of storage is doubled here.
.ring_buffer(2);
// The high-level structure of the generated code will be very similar to the previous example.
consumer.realize({128, 128, 4});
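// The output is identical to the previous example, so the same spot-check
// applies (our own addition): consumer(x, y, c) = 3 * (x + y) * (c + 1).
Buffer<int> result = consumer.realize({128, 128, 4});
if (result(3, 5, 1) != 3 * (3 + 5) * (1 + 1)) {
    printf("Unexpected value at (3, 5, 1)\n");
    return -1;
}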
}
{
// The advantage of the `hoist_storage` + `ring_buffer` approach is that it can be applied to
// fairly arbitrary loop splits and tilings. For example, in the following schedule,
// instead of double buffering over whole planes, we double buffer over sub-regions
// or tiles of the planes. This is not possible to achieve with fold_storage, because
// fold_storage works over the *storage* dimensions of the function and not the loop splits.
Func producer("producer"), consumer("consumer");
producer(x, y, c) = (x + y) * (c + 1);
consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c);
consumer.compute_root()
.tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::Auto);
producer
.async()
.compute_at(consumer, xo)
.hoist_storage(consumer, Var::outermost())
.ring_buffer(2);
// The high-level structure of the generated code will be:
// {
// // The size of the tile is (16, 16, 1), plus extra to accommodate the 3x3 stencil.
// // The fourth dimension is added by the ring_buffer() directive.
// allocate producer[18, 18, 1, 2]
// // There are two semaphores here, because the producer can run ahead, so we need
// // to track how much was consumed and produced separately.
// // This semaphore indicates how much the producer has produced.
// producer.semaphore = 0
// // This semaphore indicates how much `space` for the producer is available.
// producer.folding_semaphore.ring_buffer = 2
// thread #1 {
// loop over c {
// loop over yo {
// loop over xo {
// // Acquire the semaphore, or block until there is space to produce into.
// // The semaphore is released by the consumer thread when the data has been
// // fully consumed.
// acquire(producer.folding_semaphore.ring_buffer, 1)
// produce producer {
// // The ring buffer index is computed as a linear combination of all the loop
// // variables up to the storage level.
// ring_buffer_index = <linear combination of c, yo, xo> % 2
// // Produce the next tile of `producer` and store it at index ring_buffer_index.
// producer[x, y, 0, ring_buffer_index] = ...
// // Release the semaphore to indicate that the tile was produced; the consumer
// // will acquire this semaphore in the other thread.
// release(producer.semaphore)
// }
// }
// }
// }
// }
// thread #2 {
// loop over c {
// loop over yo {
// loop over xo {
// // Acquire the semaphore, or block until the data from the producer is ready.
// // The semaphore is released by the producer thread when the data has been
// // fully produced.
// acquire(producer.semaphore, 1)
// consume producer {
// ring_buffer_index = <linear combination of c, yo, xo> % 2
// consumer[_, _, c] = <computations which use producer[_, _, 0, ring_buffer_index]>
// // Release the semaphore to indicate that the tile was consumed; the producer
// // will acquire this semaphore in the other thread.
// release(producer.folding_semaphore.ring_buffer)
// }
// }
// }
// }
// }
// }
consumer.realize({128, 128, 4});
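// Tiling changes the loop structure but not the values computed, so the same
// spot-check applies here as well (our own addition).
Buffer<int> result = consumer.realize({128, 128, 4});
if (result(64, 32, 3) != 3 * (64 + 32) * (3 + 1)) {
    printf("Unexpected value at (64, 32, 3)\n");
    return -1;
}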
}
printf("Success!\n");
return 0;
}