#include "Halide.h"

namespace {

using namespace Halide;
using namespace Halide::BoundaryConditions;

class DepthwiseSeparableConvolution : public Generator<DepthwiseSeparableConvolution> {
public:
    // [in_channels, width, height, batch_size]
    Input<Buffer<float, 4>> input{"input"};

    // [channel_multiplier, in_channels, filter_width, filter_height]
    Input<Buffer<float, 4>> depthwise_filter{"depthwise_filter"};

    // [out_channels, channel_multiplier * in_channels]
    Input<Buffer<float, 2>> pointwise_filter{"pointwise_filter"};

    // [out_channels]
    Input<Buffer<float, 1>> bias{"bias"};

    // [out_channels, width, height, batch_size]
    Output<Buffer<float, 4>> output{"output"};
    void generate() {
        // The algorithm. It is a generic depthwise-separable
        // convolution, with no assumptions about input sizes or shapes.
        // This makes it especially challenging to schedule.

        // Some free variables, where x and y represent the spatial dimensions.
        Var x("x"), y("y"), d("d"), b("b");

        // Pad x and y with 0. Unfortunately the built-in boundary
        // condition helpers cause unwanted loop partitioning.
        Func input_bounded;
        Expr in_bounds = (x >= 0 && x < input.dim(1).extent() &&
                          y >= 0 && y < input.dim(2).extent());
        Expr clamped_x = clamp(x, 0, input.dim(1).max());
        Expr clamped_y = clamp(y, 0, input.dim(2).max());
        input_bounded(d, x, y, b) =
            select(in_bounds, input(d, clamped_x, clamped_y, b), 0.0f);
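
        // (Roughly what BoundaryConditions::constant_exterior(input, 0.0f)
        // would produce for dims 1 and 2, but writing the select by hand
        // avoids the loop partitioning the helper would trigger.)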
        Expr channel_multiplier = depthwise_filter.dim(0).extent();

        // Convolve the image depthwise -- for each input channel,
        // generate channel_multiplier intermediate channels using
        // convolution.
        Func depthwise_convolved("depthwise_convolved");
        Expr pad_width = depthwise_filter.dim(2).extent() / 2;
        Expr pad_height = depthwise_filter.dim(3).extent() / 2;
        RDom depthwise_filter_dom(0, depthwise_filter.dim(0).extent(),
                                  0, depthwise_filter.dim(2).extent(),
                                  0, depthwise_filter.dim(3).extent());
        // Give clearer names to the reduction over the channel multiplier
        // (rd) and the filter's spatial footprint (rx, ry).
        RVar rd = depthwise_filter_dom[0];
        RVar rx = depthwise_filter_dom[1];
        RVar ry = depthwise_filter_dom[2];
        depthwise_convolved(d, x, y, b) +=
            depthwise_filter(rd, d, rx, ry) *
            input_bounded(d / channel_multiplier,
                          x + rx - pad_width,
                          y + ry - pad_height,
                          b);
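
        // As a sketch (assuming the default loop nest, before any
        // scheduling), the update above is roughly:
        //
        //   for b: for y: for x: for d:
        //     for ry: for rx: for rd:
        //       depthwise_convolved(d, x, y, b) +=
        //           depthwise_filter(rd, d, rx, ry) *
        //           input_bounded(d / channel_multiplier,
        //                         x + rx - pad_width,
        //                         y + ry - pad_height, b);
        //
        // The schedules below exist mostly to tile and reorder this nest.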

        // Convolve the image pointwise: for each pixel we map from
        // input_channels * channel_multiplier channels to output_channels.
        Func pointwise_convolved("pointwise_convolved");

        // Reduction over the channels in the depthwise output.
        RDom rc(0, pointwise_filter.dim(1).extent());
        pointwise_convolved(d, x, y, b) = bias(d);
        pointwise_convolved(d, x, y, b) +=
            pointwise_filter(d, rc) * depthwise_convolved(rc, x, y, b);

        // ReLU
        output(d, x, y, b) = max(pointwise_convolved(d, x, y, b), 0.0f);
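
        // Schematically, the whole pipeline computes
        //
        //   output(d, x, y, b) = max(0,
        //       bias(d) + sum over rc of
        //           pointwise_filter(d, rc) * depthwise_convolved(rc, x, y, b))
        //
        // i.e. a depthwise convolution, a 1x1 convolution with bias, then a ReLU.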

        // The schedule.
        if (using_autoscheduler()) {
            // Second layer of MobileNet v2
            const int N = 4, CI = 32, CO = 16, CM = 1, W = 112, H = 112;
            input.dim(0).set_estimate(0, CI);
            input.dim(1).set_estimate(0, W);
            input.dim(2).set_estimate(0, H);
            input.dim(3).set_estimate(0, N);
            depthwise_filter.dim(0).set_estimate(0, CM);  // channel_multiplier
            depthwise_filter.dim(1).set_estimate(0, CI);
            depthwise_filter.dim(2).set_estimate(0, 3);
            depthwise_filter.dim(3).set_estimate(0, 3);
            pointwise_filter.dim(0).set_estimate(0, CO);
            pointwise_filter.dim(1).set_estimate(0, CI * CM);
            bias.dim(0).set_estimate(0, CO);
            output.dim(0).set_estimate(0, CO);
            output.dim(1).set_estimate(0, W);
            output.dim(2).set_estimate(0, H);
            output.dim(3).set_estimate(0, N);
        } else if (get_target().has_gpu_feature()) {
            // 0.066ms on an RTX 2060 Super. This is about 1.2 TFlops,
            // which is not a very large fraction of peak. For
            // comparison though, TensorFlow 2.3 achieves 0.13ms via
            // cuDNN 7, so we're twice as fast.

            // This schedule fuses the depthwise conv into the pointwise
            // conv. The results of the depthwise conv are computed inside
            // the outer of the two pointwise reduction loops.

            Var xi, yi, di, dii, xii, yii;
            RVar ro, ri;

            // The pointwise convolution kernel. Produces a 4x4 tile of output.
            Func(output)
                .tile({d, x, y}, {di, xi, yi}, {16, 4, 4})
                .tile({di, xi, yi}, {dii, xii, yii}, {1, 2, 2})
                .gpu_threads(di, xi, yi)
                .fuse(y, b, b)
                .gpu_blocks(d, x, b)
                .unroll(xii)
                .unroll(yii)
                .unroll(dii);
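
            // For reference, the geometry: each GPU block covers a
            // 16x4x4 (d, x, y) tile of output, launched as 16x2x2 = 64
            // threads, and each thread computes an unrolled 1x2x2 sub-tile.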

            pointwise_convolved.compute_at(output, di)
                .reorder(x, y, d)
                .unroll(x)
                .unroll(y)
                .unroll(d)
                .update()
                .unroll(x)
                .unroll(y)
                .unroll(d)
                .split(rc, ro, ri, 4)
                .reorder(ri, x, y, d, ro)
                .unroll(ri);

            // We're going to call in() on depthwise_convolved twice.
            // The first wrapper accumulates in registers before writing
            // the result to shared memory. The second stages the loads
            // from shared memory into registers. They appear in reverse
            // order below.

            // We can do 4-wide vectorized loads from shared memory if
            // we unroll the reduction loop by a factor of four above
            // and stage the loads from the depthwise_convolved output.
            depthwise_convolved.in()
                .in()
                .compute_at(pointwise_convolved, x)
                .bound_extent(d, 4)
                .vectorize(d)
                .unroll(x)
                .unroll(y);

            // The depthwise convolution kernel. Produces a 4x4 tile
            // of intermediate state, storing the result in shared memory.
            depthwise_convolved.in()
                .compute_at(output, d)
                .tile({d, x, y}, {di, xi, yi}, {32, 4, 4}, TailStrategy::RoundUp)
                .tile({di, xi, yi}, {dii, xii, yii}, {2, 2, 2})
                .gpu_threads(di, xi, yi)
                .unroll(xii)
                .unroll(yii)
                .unroll(dii);
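
            // (Sanity check on the geometry: a 32x4x4 tile split by 2x2x2
            // again gives 16x2x2 = 64 threads, matching the pointwise
            // kernel's thread count; each thread produces a 2x2x2 sub-tile.)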

            depthwise_convolved
                .compute_at(depthwise_convolved.in(), di)
                .unroll(x)
                .unroll(y)
                .unroll(d)
                .update()
                .reorder(d, x, y, rx, ry, rd)
                .unroll(x)
                .unroll(y)
                .unroll(d);
        } else {
            // CPU schedule.
            // 0.13ms on an Intel i9-9960X using 16 threads pinned to
            // 3.0 GHz, which is only about 20% of peak flops.

            int tile_w = 1;
            int tile_h = 1;
            int tile_d = 1;
            const int vec = natural_vector_size<float>();

            // Figure out how many vector registers we have in the
            // register file on this target.
            int num_regs = 16;
            if (get_target().has_feature(Target::AVX512_Skylake) ||
                (get_target().arch == Target::ARM &&
                 get_target().bits == 64)) {
                num_regs = 32;
            }

            // Pick a tile size designed to fit into the register file.
            if (num_regs == 32 && vec == 16) {
                // 32 vector registers available, each 16 floats wide.
                // Use 24 of them for accumulators.
                tile_d = 1;
                tile_w = 6;
                tile_h = 4;
                // A larger tile in the d dimension would be better, but
                // we're tuning for 16 output channels and our vectors
                // are already that wide (on AVX-512).
            } else if (num_regs == 32 && vec == 4) {
                // 32 vector registers, each 4 floats wide. We'll use 24.
                tile_d = 4;
                tile_w = 3;
                tile_h = 2;
            } else if (num_regs == 16 && vec == 8) {
                // 16 registers available, each 8 floats wide. Use 12
                // for accumulators.
                tile_d = 2;
                tile_w = 3;
                tile_h = 2;
            } else {
                // Old x86 or 32-bit ARM. Assume vectors of size 4 and
                // 16 registers. There's no FMA, so we need to reserve a
                // few more registers for things other than the
                // accumulators.
                tile_d = 4;
                tile_w = 2;
                tile_h = 1;
            }
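
            // In each case above, the accumulators occupy
            // tile_d * tile_w * tile_h vector registers, e.g.
            // 1 * 6 * 4 = 24 of the 32 available on AVX-512, leaving
            // the rest for loads and loop bookkeeping.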

            // Change units from vectors to elements.
            tile_d *= vec;

            // This schedule aggressively fuses the depthwise conv into
            // the pointwise conv. We do the depthwise convolution within
            // slices of the channel reduction loop in the pointwise
            // convolution.
            Var di, xi, yi;
            RVar ro, ri;
            Func(output)
                .tile({d, x, y}, {di, xi, yi}, {tile_d, tile_w, tile_h})
                .vectorize(di)
                .unroll(xi)
                .unroll(yi)
                .fuse(y, b, b)
                .parallel(b);

            pointwise_convolved.compute_at(output, d)
                .vectorize(d)
                .unroll(x)
                .unroll(y)
                .update()
                .reorder(d, x, y, rc, b)
                .vectorize(d)
                .unroll(x)
                .unroll(y)
                .split(rc, ro, ri, tile_d);

            depthwise_convolved
                .store_in(MemoryType::Stack)
                .bound_extent(d, tile_d)
                .compute_at(pointwise_convolved, ro)
                .vectorize(d)
                .reorder(x, y, d)
                .unroll(x)
                .unroll(y)
                .update()
                .vectorize(d)
                .reorder(x, y, d, rd, rx, ry, b)
                .unroll(x)
                .unroll(y);

            input_bounded
                .store_in(MemoryType::Stack)
                .compute_at(pointwise_convolved, ro)
                .tile(d, x, di, xi, vec, 4, TailStrategy::RoundUp)
                .vectorize(di)
                .unroll(xi);
        }

        if (!using_autoscheduler()) {
            // We're going to specialize both schedules for
            // channel_multiplier == 1, in which case it's nice to know
            // that depthwise_filter is dense across the second dimension.
            depthwise_filter.dim(1).set_stride(channel_multiplier);
            Expr intermediate_channels = pointwise_filter.dim(1).extent();

            // We'll also specialize for a multiple-of-32 number of
            // intermediate channels, and for a 3x3 depthwise filter.
            output.specialize(channel_multiplier == 1 &&
                              intermediate_channels == (intermediate_channels / 32) * 32 &&
                              depthwise_filter.dim(2).extent() == 3 &&
                              depthwise_filter.dim(3).extent() == 3);
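
            // (specialize() compiles an additional code path guarded by
            // this condition; the fully general path is still emitted as
            // a fallback for all other sizes.)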
        }
    }
};

}  // namespace

HALIDE_REGISTER_GENERATOR(DepthwiseSeparableConvolution, depthwise_separable_conv)
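
// A typical ahead-of-time build of this generator, assuming it is linked
// against Halide's stock GenGen.cpp driver (the paths here are
// illustrative):
//
//   ./depthwise_separable_conv.generator -g depthwise_separable_conv \
//       -o build target=host
//
// This emits a static library and a C header declaring the generated
// pipeline function.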