#!/usr/bin/python3
# Halide tutorial lesson 9
# This lesson demonstrates how to define a hl.Func in multiple passes,
# including scattering.
# This lesson can be built by invoking the command:
# make test_tutorial_lesson_09_update_definitions
# in a shell with the current directory at python_bindings/
import halide as hl
import halide.imageio
import numpy as np
import os.path
def main():
# Declare some Vars to use below.
x, y = hl.Var("x"), hl.Var("y")
# Load a grayscale image to use as an input.
image_path = os.path.join(
os.path.dirname(__file__), "../../tutorial/images/gray.png"
)
input_data = halide.imageio.imread(image_path)
if True:
# making the image smaller to go faster
input_data = input_data[:150, :160]
assert input_data.dtype == np.uint8
input = hl.Buffer(input_data)
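# Note that hl.Buffer wraps the numpy array (sharing its memory)
# and reverses the dimension order: numpy indexes this data as
# input_data[y, x], while Halide indexes it as input[x, y]. That
# is why the equivalent-Python snippets below index numpy arrays
# as [yy][xx].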
# You can define a hl.Func in multiple passes. Let's see a toy
# example first.
if True:
# The first definition must be one like we have seen already
# - a mapping from Vars to an hl.Expr:
f = hl.Func("f")
f[x, y] = x + y
# We call this first definition the "pure" definition.
# But the later definitions can include computed expressions on
# both sides. The simplest example is modifying a single point:
f[3, 7] = 42
# We call these extra definitions "update" definitions, or
# "reduction" definitions. A reduction definition is an
# update definition that recursively refers back to the
# function's current value at the same site:
f[x, y] = f[x, y] + 17
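# (The Python bindings also accept the augmented-assignment
# shorthand f[x, y] += 17 for this update; later examples in
# this lesson use that form.)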
# If we confine our update to a single row, we can
# recursively refer to values in the same column:
f[x, 3] = f[x, 0] * f[x, 10]
# Similarly, if we confine our update to a single column, we
# can recursively refer to other values in the same row.
f[0, y] = f[0, y] / f[3, y]
# The general rule is: Each hl.Var used in an update definition
# must appear unadorned in the same position as in the pure
# definition in all references to the function on the left-
# and right-hand sides. So the following definitions are
# legal updates:
# x is used, so all uses of f must have x as the first argument.
f[x, 17] = x + 8
# y is used, so all uses of f must have y as the second argument.
f[0, y] = y * 8
f[x, x + 1] = x + 8
f[y / 2, y] = f[0, y] * 17
# But these ones would cause an error:
# f[x, 0] = f[x + 1, 0] <- First argument to f on the right-hand-side must be 'x', not 'x + 1'.
# f[y, y + 1] = y + 8 <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'.
# f[y, x] = y - x <- Arguments to f on the left-hand-side are in the wrong places.
# f[3, 4] = x + y <- Free variables appear on the right-hand-side
# but not the left-hand-side.
# We'll realize this one just to make sure it compiles. The
# second-to-last definition forces us to realize over a
# domain that is taller than it is wide.
f.realize([100, 101])
# For each realization of f, each step runs in its entirety
# before the next one begins. Let's trace the loads and
# stores for a simpler example:
g = hl.Func("g")
g[x, y] = x + y # Pure definition
g[2, 1] = 42 # First update definition
g[x, 0] = g[x, 1] # Second update definition
g.trace_loads()
g.trace_stores()
g.realize([4, 4])
# Reading the log, we see that each pass is applied in turn. The
# equivalent Python is:
result = np.empty((4, 4), dtype=np.int32)
# Pure definition
for yy in range(4):
for xx in range(4):
result[yy][xx] = xx + yy
# First update definition
result[1][2] = 42
# Second update definition
for xx in range(4):
result[0][xx] = result[1][xx]
# end of section
# Putting update passes inside loops.
if True:
# Starting with this pure definition:
f = hl.Func("f")
f[x, y] = x + y
# Say we want an update that squares the first fifty rows. We
# could do this by adding 50 update definitions:
# f[x, 0] = f[x, 0] * f[x, 0]
# f[x, 1] = f[x, 1] * f[x, 1]
# f[x, 2] = f[x, 2] * f[x, 2]
# ...
# f[x, 49] = f[x, 49] * f[x, 49]
# Or equivalently, using a loop in the Python code that builds
# the pipeline (the loop runs once, at pipeline-construction
# time):
# for i in range(50):
#     f[x, i] = f[x, i] * f[x, i]
# But it's more manageable and more flexible to put the loop
# in the generated code. We do this by defining a "reduction
# domain" and using it inside an update definition:
r = hl.RDom([(0, 50)])
f[x, r] = f[x, r] * f[x, r]
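# Note that because this hl.RDom is one-dimensional, r can be
# used directly wherever an index is expected; it is shorthand
# for r.x.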
halide_result = f.realize([100, 100])
# The equivalent Python is:
py_result = np.empty((100, 100), dtype=np.int32)
for yy in range(100):
for xx in range(100):
py_result[yy][xx] = xx + yy
for xx in range(100):
for rr in range(50):
# The loop over the reduction domain occurs inside of
# the loop over any pure variables used in the update
# step:
py_result[rr][xx] = py_result[rr][xx] * py_result[rr][xx]
# Check the results match:
for yy in range(100):
for xx in range(100):
assert halide_result[xx, yy] == py_result[yy][xx], (
f"halide_result({xx}, {yy}) = {halide_result[xx, yy]} instead of {py_result[yy][xx]}"
)
# Now we'll examine a real-world use for an update definition:
# computing a histogram.
if True:
# Some operations on images can't be cleanly expressed as a pure
# function from the output coordinates to the value stored
# there. The classic example is computing a histogram. The
# natural way to do it is to iterate over the input image,
# updating histogram buckets. Here's how you do that in Halide:
histogram = hl.Func("histogram")
# Histogram buckets start as zero.
histogram[x] = 0
# Define a multi-dimensional reduction domain over the input image:
r = hl.RDom([(0, input.width()), (0, input.height())])
# For every point in the reduction domain, increment the
# histogram bucket corresponding to the intensity of the
# input image at that point.
histogram[input[r.x, r.y]] += 1
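# This update is the "scattering" mentioned at the top of this
# lesson: the index on the left-hand side is itself a computed
# expression, so each point in the reduction domain scatters an
# increment into a data-dependent bucket.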
halide_result = histogram.realize([256])
# The equivalent Python is:
py_result = np.empty((256), dtype=np.int32)
for xx in range(256):
py_result[xx] = 0
for r_y in range(input.height()):
for r_x in range(input.width()):
py_result[input_data[r_y, r_x]] += 1
# Check the answers agree:
for xx in range(256):
assert py_result[xx] == halide_result[xx], (
f"halide_result({xx}) = {halide_result[xx]} instead of {py_result[xx]}"
)
# Scheduling update steps
if True:
# The pure variables in an update step can be
# parallelized, vectorized, split, etc. as usual.
# Vectorizing, splitting, or parallelizing the variables
# that are part of the reduction domain is trickier. We'll
# cover that in a later lesson.
# Consider the definition:
f = hl.Func("f")
f[x, y] = x * y
# Set the second row to equal the first row.
f[x, 1] = f[x, 0]
# Set the second column to equal the first column plus 2.
f[1, y] = f[0, y] + 2
# The pure variables in each stage can be scheduled
# independently. To control the pure definition, we schedule
# as we have done in the past. The following code vectorizes
# and parallelizes the pure definition only.
f.vectorize(x, 4).parallel(y)
# We use hl.Func::update(int) to get a handle to an update step
# for the purposes of scheduling. The following line
# vectorizes the first update step across x. We can't do
# anything with y for this update step, because it doesn't
# use y.
f.update(0).vectorize(x, 4)
# Now we parallelize the second update step in chunks of size
# 4.
yo, yi = hl.Var("yo"), hl.Var("yi")
f.update(1).split(y, yo, yi, 4).parallel(yo)
halide_result = f.realize([16, 16])
# Here's the equivalent (serial) Python:
py_result = np.empty((16, 16), dtype=np.int32)
# Pure step. Vectorized in x and parallelized in y.
for yy in range(16): # Should be a parallel for loop
for x_vec in range(4):
xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
py_result[yy][xx[0]] = xx[0] * yy
py_result[yy][xx[1]] = xx[1] * yy
py_result[yy][xx[2]] = xx[2] * yy
py_result[yy][xx[3]] = xx[3] * yy
# First update. Vectorized in x.
for x_vec in range(4):
xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3]
py_result[1][xx[0]] = py_result[0][xx[0]]
py_result[1][xx[1]] = py_result[0][xx[1]]
py_result[1][xx[2]] = py_result[0][xx[2]]
py_result[1][xx[3]] = py_result[0][xx[3]]
# Second update. Parallelized in chunks of size 4 in y.
for yo in range(4): # Should be a parallel for loop
for yi in range(4):
yy = yo * 4 + yi
py_result[yy][1] = py_result[yy][0] + 2
# Check the C and Halide results match:
for yy in range(16):
for xx in range(16):
assert halide_result[xx, yy] == py_result[yy][xx], (
f"halide_result({xx}, {yy}) = {halide_result[xx, yy]} instead of {py_result[yy][xx]}"
)
# That covers how to schedule the variables within a hl.Func that
# uses update steps, but what about producer-consumer
# relationships that involve compute_at and store_at? Let's
# examine a reduction as a producer, in a producer-consumer pair.
if True:
# Because an update definition makes multiple passes over a
# stored array, a Func with updates can't be inlined. So the
# default schedule does the closest thing possible: it computes
# the Func in the innermost loop of its consumer. Consider this
# trivial example:
producer, consumer = hl.Func("producer"), hl.Func("consumer")
producer[x] = x * 17
producer[x] += 1
consumer[x] = 2 * producer[x]
halide_result = consumer.realize([10])
# The equivalent Python is:
py_result = np.empty((10), dtype=np.int32)
for xx in range(10):
producer_storage = np.empty((1), dtype=np.int32)
# Pure step for producer
producer_storage[0] = xx * 17
# Update step for producer
producer_storage[0] = producer_storage[0] + 1
# Pure step for consumer
py_result[xx] = 2 * producer_storage[0]
# Check the results match
for xx in range(10):
assert halide_result[xx] == py_result[xx], (
f"halide_result({xx}) = {halide_result[xx]} instead of {py_result[xx]}"
)
# For all other compute_at/store_at options, the reduction
# gets placed where you would expect, somewhere in the loop
# nest of the consumer.
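# For example, producer.compute_root() would run both producer
# passes over the whole range before the consumer starts. A
# sketch of the equivalent Python under that schedule (not
# executed here):
#
# producer_storage = np.empty((10), dtype=np.int32)
# for xx in range(10):
#     producer_storage[xx] = xx * 17   # Pure step for producer
# for xx in range(10):
#     producer_storage[xx] += 1        # Update step for producer
# for xx in range(10):
#     py_result[xx] = 2 * producer_storage[xx]  # Consumer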
# Now let's consider a reduction as a consumer in a
# producer-consumer pair. This is a little more involved.
if True:
if True:
# Case 1: The consumer references the producer in the pure step
# only.
producer, consumer = hl.Func("producer"), hl.Func("consumer")
# The producer is pure.
producer[x] = x * 17
consumer[x] = 2 * producer[x]
consumer[x] += 1
# The valid schedules for the producer in this case are
# the default schedule (fully inlined), and also:
#
# 1) producer.compute_at(consumer, x), which places the
# computation of the producer inside the loop over x in the
# pure step of the consumer.
#
# 2) producer.compute_root(), which computes all of the
# producer ahead of time.
#
# 3) producer.store_root().compute_at(consumer, x), which
# allocates space for the producer outside the loop over x,
# but fills it in as needed inside the loop.
#
# Let's use option 1.
producer.compute_at(consumer, x)
halide_result = consumer.realize([10])
# The equivalent Python is:
py_result = np.empty((10), dtype=np.int32)
# Pure step for the consumer
for xx in range(10):
# Pure step for producer
producer_storage = np.empty((1), dtype=np.int32)
producer_storage[0] = xx * 17
py_result[xx] = 2 * producer_storage[0]
# Update step for the consumer
for xx in range(10):
py_result[xx] += 1
# All of the pure step is evaluated before any of the
# update step, so there are two separate loops over x.
# Check the results match
for xx in range(10):
assert halide_result[xx] == py_result[xx], (
f"halide_result({xx}) = {halide_result[xx]} instead of {py_result[xx]}"
)
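# For comparison, under option 3
# (producer.store_root().compute_at(consumer, x)) the producer's
# storage would be allocated once, outside the consumer's loops,
# and filled in on demand. A sketch of the equivalent Python (not
# executed here):
#
# producer_storage = np.empty((10), dtype=np.int32)
# for xx in range(10):
#     producer_storage[xx] = xx * 17   # computed as needed
#     py_result[xx] = 2 * producer_storage[xx]
# for xx in range(10):
#     py_result[xx] += 1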
if True:
# Case 2: The consumer references the producer in the update step
# only
producer, consumer = hl.Func("producer"), hl.Func("consumer")
producer[x] = x * 17
consumer[x] = x
consumer[x] += producer[x]
# Again we compute the producer per x coordinate of the
# consumer. This places producer code inside the update
# step of the consumer, because that's the only step that
# uses the producer.
producer.compute_at(consumer, x)
# Note however, that we didn't say:
#
# producer.compute_at(consumer.update(0), x).
#
# Scheduling is done with respect to Vars of a hl.Func, and
# the Vars of a hl.Func are shared across the pure and
# update steps.
halide_result = consumer.realize([10])
# The equivalent Python is:
py_result = np.empty((10), dtype=np.int32)
# Pure step for the consumer
for xx in range(10):
py_result[xx] = xx
# Update step for the consumer
for xx in range(10):
# Pure step for producer
producer_storage = np.empty((1), dtype=np.int32)
producer_storage[0] = xx * 17
py_result[xx] += producer_storage[0]
# Check the results match
for xx in range(10):
assert halide_result[xx] == py_result[xx], (
f"halide_result({xx}) = {halide_result[xx]} instead of {py_result[xx]}"
)
if True:
# Case 3: The consumer references the producer in
# multiple steps that share common variables
producer, consumer = hl.Func("producer"), hl.Func("consumer")
producer[x] = x * 17
consumer[x] = producer[x] * x
consumer[x] += producer[x]
# Again we compute the producer per x coordinate of the
# consumer. This places producer code inside both the
# pure and the update step of the consumer. So there end
# up being two separate realizations of the producer, and
# redundant work occurs.
producer.compute_at(consumer, x)
halide_result = consumer.realize([10])
# The equivalent Python is:
py_result = np.empty((10), dtype=np.int32)
# Pure step for the consumer
for xx in range(10):
# Pure step for producer
producer_storage = np.empty((1), dtype=np.int32)
producer_storage[0] = xx * 17
py_result[xx] = producer_storage[0] * xx
# Update step for the consumer
for xx in range(10):
# Another copy of the pure step for producer
producer_storage = np.empty((1), dtype=np.int32)
producer_storage[0] = xx * 17
py_result[xx] += producer_storage[0]
# Check the results match
for xx in range(10):
assert halide_result[xx] == py_result[xx], (
f"halide_result({xx}) = {halide_result[xx]} instead of {py_result[xx]}"
)
if True:
# Case 4: The consumer references the producer in
# multiple steps that do not share common variables
producer, consumer = hl.Func("producer"), hl.Func("consumer")
producer[x, y] = x * y
consumer[x, y] = x + y
consumer[x, 0] = producer[x, x - 1]
consumer[0, y] = producer[y, y - 1]
# In this case neither producer.compute_at(consumer, x)
# nor producer.compute_at(consumer, y) will work, because
# either one fails to cover one of the uses of the
# producer. So we'd have to inline producer, or use
# producer.compute_root().
# Let's say we really really want producer to be
# compute_at the inner loops of both consumer update
# steps. Halide doesn't allow multiple different
# schedules for a single hl.Func, but we can work around it
# by making two wrappers around producer, and scheduling
# those instead:
producer_wrapper_1, producer_wrapper_2, consumer_2 = (
hl.Func(),
hl.Func(),
hl.Func(),
)
producer_wrapper_1[x, y] = producer[x, y]
producer_wrapper_2[x, y] = producer[x, y]
consumer_2[x, y] = x + y
consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
consumer_2[0, y] += producer_wrapper_2[y, y - 1]
# The wrapper functions give us two separate handles on
# the producer, so we can schedule them differently.
producer_wrapper_1.compute_at(consumer_2, x)
producer_wrapper_2.compute_at(consumer_2, y)
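# (As an aside: Halide can generate such wrapper Funcs for you
# via Func::in, exposed as producer.in_(consumer_2) in the
# Python bindings if your build provides it. The explicit
# wrappers above make the mechanism clear, though.)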
halide_result = consumer_2.realize([10, 10])
# The equivalent Python is:
py_result = np.empty((10, 10), dtype=np.int32)
# Pure step for the consumer
for yy in range(10):
for xx in range(10):
py_result[yy][xx] = xx + yy
# First update step for consumer
for xx in range(10):
producer_wrapper_1_storage = np.empty((1), dtype=np.int32)
producer_wrapper_1_storage[0] = xx * (xx - 1)
py_result[0][xx] += producer_wrapper_1_storage[0]
# Second update step for consumer
for yy in range(10):
producer_wrapper_2_storage = np.empty((1), dtype=np.int32)
producer_wrapper_2_storage[0] = yy * (yy - 1)
py_result[yy][0] += producer_wrapper_2_storage[0]
# Check the results match
for yy in range(10):
for xx in range(10):
assert halide_result[xx, yy] == py_result[yy][xx], (
f"halide_result({xx}, {yy}) = {halide_result[xx, yy]} instead of {py_result[yy][xx]}"
)
if True:
# Case 5: Scheduling a producer under a reduction domain
# variable of the consumer.
# We are not just restricted to scheduling producers at
# the loops over the pure variables of the consumer. If a
# producer is only used within a loop over a reduction
# domain (hl.RDom) variable, we can also schedule the
# producer there.
producer, consumer = hl.Func("producer"), hl.Func("consumer")
r = hl.RDom([(0, 5)])
producer[x] = x * 17
consumer[x] = x + 10
consumer[x] += r + producer[x + r]
producer.compute_at(consumer, r)
halide_result = consumer.realize([10])
# The equivalent Python is:
py_result = np.empty((10), dtype=np.int32)
# Pure step for the consumer.
for xx in range(10):
py_result[xx] = xx + 10
# Update step for the consumer.
for xx in range(10):
# The loop over the reduction domain is always the inner loop.
for rr in range(5):
# We've scheduled the storage and computation of
# the producer here. We just need a single value.
producer_storage = np.empty((1), dtype=np.int32)
# Pure step of the producer.
producer_storage[0] = (xx + rr) * 17
# Now use it in the update step of the consumer.
py_result[xx] += rr + producer_storage[0]
# Check the results match
for xx in range(10):
assert halide_result[xx] == py_result[xx], (
f"halide_result({xx}) = {halide_result[xx]} instead of {py_result[xx]}"
)
# A real-world example of a reduction inside a producer-consumer chain.
if True:
# The default schedule for a reduction is a good one for
# convolution-like operations. For example, the following
# computes a 5x5 box-blur of our grayscale test image with a
# hl.clamp-to-edge boundary condition:
# First add the boundary condition.
clamped = hl.BoundaryConditions.repeat_edge(input)
# Define a 5x5 box that starts at (-2, -2)
r = hl.RDom([(-2, 5), (-2, 5)])
# Compute the 5x5 sum around each pixel.
local_sum = hl.Func("local_sum")
local_sum[x, y] = 0 # Compute the sum as a 32-bit integer
local_sum[x, y] += clamped[x + r.x, y + r.y]
# Divide the sum by 25 to make it an average
blurry = hl.Func("blurry")
blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)
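# (Once you've seen the hl.sum helper at the end of this lesson,
# local_sum could instead be written as one pure definition. A
# sketch, with an explicit cast to widen the 8-bit input:
#
# local_sum[x, y] = hl.sum(hl.cast(hl.Int(32), clamped[x + r.x, y + r.y]))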
halide_result = blurry.realize([input.width(), input.height()])
# The default schedule will inline 'clamped' into the update
# step of 'local_sum', because clamped only has a pure
# definition, and so its default schedule is fully-inlined.
# We will then compute local_sum per x coordinate of blurry,
# because the default schedule for reductions is
# compute-innermost. Here's the equivalent Python:
local_sum = np.empty((1), dtype=np.int32)
py_result = hl.Buffer(hl.UInt(8), [input.width(), input.height()])
for yy in range(input.height()):
for xx in range(input.width()):
# Pure step of local_sum
local_sum[0] = 0
# Update step of local_sum
for r_y in range(-2, 2 + 1):
for r_x in range(-2, 2 + 1):
# The clamping has been inlined into the update step.
clamped_x = min(max(xx + r_x, 0), input.width() - 1)
clamped_y = min(max(yy + r_y, 0), input.height() - 1)
local_sum[0] += input[clamped_x, clamped_y]
# Pure step of blurry. The cast to uint8 happens when the
# value is stored into the 8-bit buffer.
py_result[xx, yy] = int(local_sum[0] // 25)
# Check the results match
for yy in range(input.height()):
for xx in range(input.width()):
assert halide_result[xx, yy] == py_result[xx, yy], (
f"halide_result({xx}, {yy}) = {halide_result[xx, yy]} instead of {py_result[xx, yy]}"
)
# Reduction helpers.
if True:
# There are several reduction helper functions provided in
# Halide.h, which compute small reductions and schedule them
# innermost into their consumer. The most useful one is
# "sum".
f1 = hl.Func("f1")
r = hl.RDom([(0, 100)])
f1[x] = hl.sum(r + x) * 7
# Sum creates a small anonymous hl.Func to do the reduction. It's
# equivalent to:
f2, anon = hl.Func("f2"), hl.Func("anon")
anon[x] = 0
anon[x] += r + x
f2[x] = anon[x] * 7
# So even though f1 references a reduction domain, it is a
# pure function. The reduction domain has been swallowed to
# define the inner anonymous reduction.
halide_result_1 = f1.realize([10])
halide_result_2 = f2.realize([10])
# The equivalent Python is:
py_result = np.empty((10), dtype=np.int32)
for xx in range(10):
anon = np.empty((1), dtype=np.int32)
anon[0] = 0
for rr in range(100):
anon[0] += rr + xx
py_result[xx] = anon[0] * 7
# Check they all match.
for xx in range(10):
assert halide_result_1[xx] == py_result[xx], (
f"halide_result_1({xx}) = {halide_result_1[xx]} instead of {py_result[xx]}"
)
assert halide_result_2[xx] == py_result[xx], (
f"halide_result_2({xx}) = {halide_result_2[xx]} instead of {py_result[xx]}"
)
print("Success!")
return 0
if __name__ == "__main__":
main()