1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
|
"""
Simple blur.
"""
import halide as hl
import enum
class BlurGPUSchedule(enum.Enum):
# Fully inlining schedule.
Inline = 0
# Schedule caching intermedia result of blur_x.
Cache = 1
# Schedule enabling sliding window opt within each work-item or cuda
# thread.
Slide = 2
# The same as above plus vectorization per work-item.
SlideVectorize = 3
_GPU_SCHEDULE_ENUM_MAP = {
"inline": BlurGPUSchedule.Inline,
"cache": BlurGPUSchedule.Cache,
"slide": BlurGPUSchedule.Slide,
"slide_vector": BlurGPUSchedule.SlideVectorize,
}
@hl.generator()
class blur:
gpu_schedule = hl.GeneratorParam("slide_vector")
gpu_tile_x = hl.GeneratorParam(32)
gpu_tile_y = hl.GeneratorParam(8)
# Note: although this is declared as operating on uint16 images,
# it will produce incorrect results if more than 14-bit images are used.
input_buf = hl.InputBuffer(hl.UInt(16), 2)
blur_y = hl.OutputBuffer(hl.UInt(16), 2)
def generate(self):
g = self
x, y, xi, yi = hl.vars("x y xi yi")
# The algorithm
clamped = hl.BoundaryConditions.repeat_edge(g.input_buf)
blur_x = hl.Func("blur_x")
blur_x[x, y] = (clamped[x, y] + clamped[x + 1, y] + clamped[x + 2, y]) // 3
g.blur_y[x, y] = (blur_x[x, y] + blur_x[x, y + 1] + blur_x[x, y + 2]) // 3
# How to schedule it
if g.target().has_gpu_feature():
# GPU schedule.
# This will raise an exception for unknown strings, which is what
# we want
schedule_enum = _GPU_SCHEDULE_ENUM_MAP[g.gpu_schedule]
if schedule_enum == BlurGPUSchedule.Inline:
# - Fully inlining.
g.blur_y.gpu_tile(x, y, xi, yi, g.gpu_tile_x, g.gpu_tile_y)
elif schedule_enum == BlurGPUSchedule.Cache:
# - Cache blur_x calculation.
g.blur_y.gpu_tile(x, y, xi, yi, g.gpu_tile_x, g.gpu_tile_y)
blur_x.compute_at(g.blur_y, x).gpu_threads(x, y)
elif schedule_enum == BlurGPUSchedule.Slide:
# - Instead of caching blur_x calculation explicitly, the
# alternative is to allow each work-item in OpenCL or thread
# in CUDA to calculate more rows of blur_y so that temporary
# blur_x calculation is re-used implicitly. This achieves
# the similar schedule of sliding window.
y_inner = hl.Var("y_inner")
(
g.blur_y.split(y, y, y_inner, g.gpu_tile_y)
.reorder(y_inner, x)
.unroll(y_inner)
.gpu_tile(x, y, xi, yi, g.gpu_tile_x, 1)
)
elif schedule_enum == BlurGPUSchedule.SlideVectorize:
# Vectorization factor.
factor = 2
y_inner = hl.Var("y_inner")
(
g.blur_y.vectorize(x, factor)
.split(y, y, y_inner, g.gpu_tile_y)
.reorder(y_inner, x)
.unroll(y_inner)
.gpu_tile(x, y, xi, yi, g.gpu_tile_x, 1)
)
elif g.target().has_feature(hl.TargetFeature.HVX):
# Hexagon schedule.
# TODO: Try using a schedule like the CPU one below.
vector_size = 128
(
g.blur_y.compute_root()
.hexagon()
.prefetch(g.input_buf, y, y, 2)
.split(y, y, yi, 128)
.parallel(y)
.vectorize(x, vector_size * 2)
)
(
blur_x.store_at(g.blur_y, y)
.compute_at(g.blur_y, yi)
.vectorize(x, vector_size)
)
else:
# CPU schedule.
# Compute blur_x as needed at each vector of the output.
# Halide will store blur_x in a circular buffer so its
# results can be re-used.
vector_size = g.natural_vector_size(g.input_buf.type())
g.blur_y.split(y, y, yi, 32).parallel(y).vectorize(x, vector_size)
(
blur_x.store_at(g.blur_y, y)
.compute_at(g.blur_y, x)
.vectorize(x, vector_size)
)
if __name__ == "__main__":
hl.main()
|