// Halide tutorial lesson 21: Auto-Scheduler
// So far we have written Halide schedules by hand, but it is also possible to
// ask Halide to suggest a reasonable schedule. We call this auto-scheduling.
// This lesson demonstrates how to use the autoscheduler to generate a
// copy-pasteable CPU schedule that can be subsequently improved upon.
// On linux or os x, you can compile and run it like so:
// g++ lesson_21_auto_scheduler_generate.cpp <path/to/tools>/GenGen.cpp -g -std=c++17 -fno-rtti -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_21_generate
// export LD_LIBRARY_PATH=<path/to/libHalide.so> # For linux
// export DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> # For OS X
// ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_false -e static_library,h,schedule target=host
// ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_true -e static_library,h,schedule -p <path/to/libautoschedule_mullapudi2016.so> -S Mullapudi2016 target=host autoscheduler=Mullapudi2016 autoscheduler.parallelism=32 autoscheduler.last_level_cache_size=16777216 autoscheduler.balance=40
// g++ lesson_21_auto_scheduler_run.cpp -std=c++17 -I <path/to/Halide.h> -I <path/to/tools> auto_schedule_false.a auto_schedule_true.a -ldl -lpthread -o lesson_21_run
// ./lesson_21_run
// If you have the entire Halide source tree, you can also build it by
// running:
// make tutorial_lesson_21_auto_scheduler_run
// in a shell with the current directory at the top of the Halide
// source tree.
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
// We will define a generator to auto-schedule.
class AutoScheduled : public Halide::Generator<AutoScheduled> {
public:
Input<Buffer<float, 3>> input{"input"};
Input<float> factor{"factor"};
Output<Buffer<float, 2>> output1{"output1"};
Output<Buffer<float, 2>> output2{"output2"};
Expr sum3x3(Func f, Var x, Var y) {
return f(x - 1, y - 1) + f(x - 1, y) + f(x - 1, y + 1) +
f(x, y - 1) + f(x, y) + f(x, y + 1) +
f(x + 1, y - 1) + f(x + 1, y) + f(x + 1, y + 1);
}
void generate() {
// For our algorithm, we'll use Harris corner detection.
Func in_b = BoundaryConditions::repeat_edge(input);
gray(x, y) = 0.299f * in_b(x, y, 0) + 0.587f * in_b(x, y, 1) + 0.114f * in_b(x, y, 2);
Iy(x, y) = gray(x - 1, y - 1) * (-1.0f / 12) + gray(x - 1, y + 1) * (1.0f / 12) +
gray(x, y - 1) * (-2.0f / 12) + gray(x, y + 1) * (2.0f / 12) +
gray(x + 1, y - 1) * (-1.0f / 12) + gray(x + 1, y + 1) * (1.0f / 12);
Ix(x, y) = gray(x - 1, y - 1) * (-1.0f / 12) + gray(x + 1, y - 1) * (1.0f / 12) +
gray(x - 1, y) * (-2.0f / 12) + gray(x + 1, y) * (2.0f / 12) +
gray(x - 1, y + 1) * (-1.0f / 12) + gray(x + 1, y + 1) * (1.0f / 12);
Ixx(x, y) = Ix(x, y) * Ix(x, y);
Iyy(x, y) = Iy(x, y) * Iy(x, y);
Ixy(x, y) = Ix(x, y) * Iy(x, y);
Sxx(x, y) = sum3x3(Ixx, x, y);
Syy(x, y) = sum3x3(Iyy, x, y);
Sxy(x, y) = sum3x3(Ixy, x, y);
det(x, y) = Sxx(x, y) * Syy(x, y) - Sxy(x, y) * Sxy(x, y);
trace(x, y) = Sxx(x, y) + Syy(x, y);
harris(x, y) = det(x, y) - 0.04f * trace(x, y) * trace(x, y);
output1(x, y) = harris(x, y);
output2(x, y) = factor * harris(x, y);
}
void schedule() {
if (using_autoscheduler()) {
// The autoscheduler requires estimates on all the input/output
// sizes and parameter values in order to compare different
// alternatives and decide on a good schedule.
// To provide estimates (min and extent values) for each dimension
// of the input image ('input'), we use the set_estimates() method.
// set_estimates() takes a list of (min, extent) pairs, one per
// dimension, as its argument.
input.set_estimates({{0, 1024}, {0, 1024}, {0, 3}});
// To provide estimates on the parameter values, we use the
// set_estimate() method.
factor.set_estimate(2.0f);
// To provide estimates (min and extent values) for each dimension
// of pipeline outputs, we use the set_estimates() method. set_estimates()
// takes in a list of (min, extent) for each dimension.
output1.set_estimates({{0, 1024}, {0, 1024}});
output2.set_estimates({{0, 1024}, {0, 1024}});
// Technically, the estimate values can be anything, but the closer
// they are to the actual use-case values, the better the generated
// schedule will be.
// To auto-schedule the pipeline, we don't have to do anything else:
// every Generator implicitly has a GeneratorParam named "autoscheduler";
// if this is set to the name of the Autoscheduler we want to use, Halide will
// apply it to all of our pipeline's outputs automatically.
// Every Generator also implicitly has additional, optional GeneratorParams that
// depend on the specific Autoscheduler selected; these let you describe the
// characteristics of the target machine architecture to the autoscheduler, and
// are generally specified in your Makefile. If none are specified, the
// autoscheduler uses default machine parameters for a generic CPU architecture.
// Let's see some arbitrary but plausible values for the machine parameters
// for the Mullapudi2016 Autoscheduler:
//
// autoscheduler=Mullapudi2016
// autoscheduler.parallelism=32
// autoscheduler.last_level_cache_size=16777216
// autoscheduler.balance=40
//
// These are the maximum level of parallelism
// available, the size of the last-level cache (in bytes), and the ratio
// between the cost of a miss at the last-level cache and the cost
// of arithmetic on the target architecture, in that order.
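// For reference, the same parameters can also be supplied when invoking an
// autoscheduler directly on a Pipeline, outside of a Generator. The sketch
// below is illustrative only and assumes a recent Halide release in which the
// autoschedulers ship as loadable plugins; 'p' stands for a Pipeline built
// from your algorithm's output Funcs.
//
//     load_plugin("<path/to/libautoschedule_mullapudi2016.so>");
//     AutoschedulerParams params("Mullapudi2016",
//                                {{"parallelism", "32"},
//                                 {"last_level_cache_size", "16777216"},
//                                 {"balance", "40"}});
//     AutoSchedulerResults results = p.apply_autoscheduler(get_host_target(), params);
//     std::cout << results.schedule_source << "\n";  // the generated schedule
//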
// Note that when using the autoscheduler, no schedule should have
// been applied to the pipeline; otherwise, the autoscheduler will
// throw an error. The current autoscheduler cannot handle a
// partially-scheduled pipeline.
// If HL_DEBUG_CODEGEN is set to 3 or greater, the schedule will be dumped
// to stdout (along with much other information); a more useful way is
// to add "schedule" to the -e flag to the Generator. In CMake, this is
// done by passing the argument SCHEDULE <outvar> to add_halide_library().
// See doc/HalideCMakePackage.md for more detail.
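// As a rough sketch (the CMake target and variable names here are
// hypothetical, not part of this tutorial's build), such an invocation
// might look like:
//
//     add_halide_library(auto_schedule_true
//                        FROM auto_schedule_gen
//                        GENERATOR auto_schedule_gen
//                        AUTOSCHEDULER Halide::Mullapudi2016
//                        SCHEDULE auto_schedule_true_SCHEDULE)
//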
// The generated schedule that is dumped to file is an actual
// Halide C++ source, which is readily copy-pasteable back into
// this very same source file with few modifications. Programmers
// can use this as a starting schedule and iteratively improve the
// schedule. Note that the current autoscheduler is only able to
// generate CPU schedules and only does tiling, simple vectorization
// and parallelization. It doesn't deal with line buffering, storage
// reordering, or factoring reductions.
// At the time of writing, the autoscheduler will produce the
// following schedule for the estimates and machine parameters
// declared above when run on this pipeline:
//
// Var x_i("x_i");
// Var x_i_vi("x_i_vi");
// Var x_i_vo("x_i_vo");
// Var x_o("x_o");
// Var x_vi("x_vi");
// Var x_vo("x_vo");
// Var y_i("y_i");
// Var y_o("y_o");
//
// Func Ix = pipeline.get_func(4);
// Func Iy = pipeline.get_func(7);
// Func gray = pipeline.get_func(3);
// Func harris = pipeline.get_func(14);
// Func output1 = pipeline.get_func(15);
// Func output2 = pipeline.get_func(16);
//
// {
// Var x = Ix.args()[0];
// Ix
// .compute_at(harris, x_o)
// .split(x, x_vo, x_vi, 8)
// .vectorize(x_vi);
// }
// {
// Var x = Iy.args()[0];
// Iy
// .compute_at(harris, x_o)
// .split(x, x_vo, x_vi, 8)
// .vectorize(x_vi);
// }
// {
// Var x = gray.args()[0];
// gray
// .compute_at(harris, x_o)
// .split(x, x_vo, x_vi, 8)
// .vectorize(x_vi);
// }
// {
// Var x = harris.args()[0];
// Var y = harris.args()[1];
// harris
// .compute_root()
// .split(x, x_o, x_i, 256)
// .split(y, y_o, y_i, 128)
// .reorder(x_i, y_i, x_o, y_o)
// .split(x_i, x_i_vo, x_i_vi, 8)
// .vectorize(x_i_vi)
// .parallel(y_o)
// .parallel(x_o);
// }
// {
// Var x = output1.args()[0];
// Var y = output1.args()[1];
// output1
// .compute_root()
// .split(x, x_vo, x_vi, 8)
// .vectorize(x_vi)
// .parallel(y);
// }
// {
// Var x = output2.args()[0];
// Var y = output2.args()[1];
// output2
// .compute_root()
// .split(x, x_vo, x_vi, 8)
// .vectorize(x_vi)
// .parallel(y);
// }
} else {
// This is where you would declare the schedule you have written by
// hand or paste the schedule generated by the autoscheduler.
// We will use a naive schedule here to compare the performance of
// the autoschedule with a basic schedule.
gray.compute_root();
Iy.compute_root();
Ix.compute_root();
// As discussed earlier, the generated schedule that is dumped to
// file is an actual Halide C++ source, which is readily copy-pasteable
// back into this very same source file with few modifications.
// Alternatively, developers can save the generated schedule to the source
// directory and include it here, for example:
//
// #include "tutorial.schedule.h"
// apply_schedule_auto_schedule_true(get_pipeline(), get_target());
}
}
private:
Var x{"x"}, y{"y"}, c{"c"};
Func gray, Iy, Ix, Ixx, Iyy, Ixy, Sxx, Syy, Sxy, det, trace, harris;
};
// As in lesson 15, we register our generator and then compile this
// file along with tools/GenGen.cpp.
HALIDE_REGISTER_GENERATOR(AutoScheduled, auto_schedule_gen)
// After compiling this file, see how to use it in
// lesson_21_auto_scheduler_run.cpp
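// A rough sketch of what that run-side code does (assuming the AOT-compiled
// filters follow the usual Halide convention of taking inputs in declaration
// order followed by outputs):
//
//     #include "HalideBuffer.h"
//     #include "auto_schedule_false.h"
//     #include "auto_schedule_true.h"
//
//     Halide::Runtime::Buffer<float> input(1024, 1024, 3);
//     Halide::Runtime::Buffer<float> output1(1024, 1024), output2(1024, 1024);
//     auto_schedule_false(input, 2.0f, output1, output2);  // naive schedule
//     auto_schedule_true(input, 2.0f, output1, output2);   // auto-scheduled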