1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
#include "Halide.h"
namespace {
/// Halide generator that runs a chain of `stencils` 5x5 weighted-sum stencils
/// over a 2-D uint16_t image.
///
/// Stage 0 is the input wrapped in a repeat-edge boundary condition. Every
/// later stage reads a 5x5 window of the stage before it, and `output` is a
/// pointwise copy of the last stage. generate() defines the algorithm, sets
/// size estimates, and then applies a schedule: none when an autoscheduler is
/// in use, otherwise a GPU or a CPU schedule depending on the target.
class StencilChain : public Halide::Generator<StencilChain> {
public:
    /// Number of stencil stages in the chain (default 32, allowed range 1..100).
    GeneratorParam<int> stencils{"stencils", 32, 1, 100};

    /// Source image.
    Input<Buffer<uint16_t, 2>> input{"input"};
    /// Result of the final stencil stage.
    Output<Buffer<uint16_t, 2>> output{"output"};

    /// Builds the stencil pipeline and schedules it for the current target.
    void generate() {
        // Image size used both for the estimates and for sizing the CPU
        // tiles. Kept in one place so the two cannot drift apart.
        const int estimated_width = 1536;
        const int estimated_height = 2560;

        /* ALGORITHM */
        std::vector<Func> stages;
        Var x("x"), y("y");

        // Stage 0: the input, readable outside its bounds.
        Func clamped_input = Halide::BoundaryConditions::repeat_edge(input);
        stages.push_back(clamped_input);

        const int num_stencils = static_cast<int>(stencils);
        for (int s = 0; s < num_stencils; s++) {
            Func stage("stage_" + std::to_string(s));
            // Sum the 5x5 neighbourhood of the previous stage, weighting the
            // tap at offset (i, j) by (i + 3) * (j + 3). The accumulator
            // starts as a uint16_t zero.
            Expr e = cast<uint16_t>(0);
            for (int i = -2; i <= 2; i++) {
                for (int j = -2; j <= 2; j++) {
                    e += ((i + 3) * (j + 3)) * stages.back()(x + i, y + j);
                }
            }
            stage(x, y) = e;
            stages.push_back(stage);
        }

        output(x, y) = stages.back()(x, y);

        /* ESTIMATES */
        // (This can be useful in conjunction with RunGen and benchmarks as well
        // as auto-schedule, so we do it in all cases.)
        {
            // Provide estimates on the input image
            input.set_estimates({{0, estimated_width}, {0, estimated_height}});
            // Provide estimates on the pipeline output
            output.set_estimates({{0, estimated_width}, {0, estimated_height}});
        }

        /* SCHEDULE */
        if (using_autoscheduler()) {
            // nothing
        } else if (get_target().has_gpu_feature()) {
            // GPU schedule
            // 2.9 ms on a 2060 RTX

            // It seems that just compute-rooting all the stencils is
            // fastest on this GPU, plus some unrolling and aggressive
            // staging to share loads between adjacent pixels.
            Var xi, yi, xii, yii;
            stages.pop_back();  // Inline the second-last stage into the output
            stages.push_back(output);
            for (size_t i = 1; i < stages.size(); i++) {
                Func &s = stages[i];
                Func prev = stages[i - 1];
                // Use this stage's own pure vars rather than overwriting the
                // function-scope x and y.
                const Var sx = s.args()[0];
                const Var sy = s.args()[1];
                s.compute_root()
                    .gpu_tile(sx, sy, xi, yi, 30 * 2, 12)
                    .tile(xi, yi, xii, yii, 2, 2)
                    .unroll(xii)
                    .unroll(yii);

                // Pre-load the entire region required of the previous
                // stage into shared memory by adding a wrapper Func
                // and scheduling it at blocks. This way instead of
                // every pixel doing 25 loads from global memory, many of
                // which overlap, we load each unique value from
                // global into shared once, and then we use faster
                // loads from shared in the actual stencil.
                prev.in()
                    .compute_at(s, sx)
                    .tile(prev.args()[0], prev.args()[1], xi, yi, 2, 2)
                    .vectorize(xi)
                    .unroll(yi)
                    .gpu_threads(prev.args()[0], prev.args()[1]);

                // A similar benefit applies for the
                // vectorized/unrolled 2x2 tiles. Instead of having
                // each unrolled iteration do its own mix of scalar
                // and vector loads from shared memory in a 5x5
                // window, many of which get deduped across the block,
                // we load a 6x6 window of shared into registers using
                // only aligned vector loads, and then the actual
                // stencil pulls from those registers. We're adding
                // another wrapper Func around the wrapper Func we
                // created above, so we say .in().in()
                prev.in()
                    .in()
                    .compute_at(s, xi)
                    .vectorize(prev.args()[0], 2)
                    .unroll(prev.args()[0])
                    .unroll(prev.args()[1]);
            }
        } else {
            // CPU schedule
            // 4.23ms on an Intel i9-9960X using 16 threads at 3.5
            // GHz.

            // Runtime is pretty noisy, so benchmarked over 1000
            // trials instead of the default of 10 in the
            // Makefile. This uses AVX-512 instructions, but not
            // floating-point ones. My CPU seems to hover at 3.5GHz on
            // this workload.

            const int vec = natural_vector_size<uint16_t>();

            // How many stencils in between each compute-root
            const int group_size = 11;
            Var yi, yo, xo, xi, t;

            const int last_stage_idx = static_cast<int>(stages.size()) - 1;
            // Walk backwards from the output, compute-rooting one stage per
            // group of `group_size` stencils.
            for (int j = last_stage_idx; j > 0; j -= group_size) {
                Func out = (j == last_stage_idx) ? output : stages[j];

                // Every 5x5 stencil between this stage and the output reaches
                // 2 pixels past each edge, so the tile size is derived from
                // the estimated image size grown by 4 pixels per stage.
                const int stages_to_output = last_stage_idx - j;
                const int expansion = 4 * stages_to_output;
                const int w = estimated_width + expansion;
                const int h = estimated_height + expansion;

                out.compute_root()
                    // Break into 16 tiles for our 16 threads
                    .tile(x, y, xo, yo, xi, yi, w / 4, h / 4)
                    .fuse(xo, yo, t)
                    .parallel(t)
                    .vectorize(xi, vec);

                // The earlier stages of this group are stored per tile and
                // computed per row of `out`.
                for (int i = std::max(0, j - group_size + 1); i < j; i++) {
                    Func s = stages[i];
                    s.store_at(out, t)
                        .compute_at(out, yi)
                        .vectorize(s.args()[0], vec);
                }
            }
        }
    }
};
} // namespace
// Register the StencilChain generator with Halide under the name "stencil_chain".
HALIDE_REGISTER_GENERATOR(StencilChain, stencil_chain)
|