// RUN: mlir-opt %s --transform-interpreter \
// RUN: --test-transform-dialect-erase-schedule \
// RUN: --math-uplift-to-fma \
// RUN: --convert-bufferization-to-memref \
// RUN: --test-lower-to-llvm |\
// RUN: FileCheck %s
// Fixed-size tensor types to be used in convolution.
// Named sizes are: N=5 OH=80 OW=100 F=C=128 KH=KW=3.
// Input is NHWC.
// Filter is CHWF.
// Output is NHWF.
!tinput = tensor<5x82x102x128xf32>
!tfilter = tensor<128x3x3x128xf32>
!tbias = tensor<128xf32>
!toutput = tensor<5x80x100x128xf32>
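// The spatial sizes are related by the usual valid-convolution formula:
// OH = IH - KH + 1 = 82 - 3 + 1 = 80 and OW = IW - KW + 1 = 102 - 3 + 1 = 100,
// which is why the input carries two extra rows and columns compared to the
// output.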
// Function containing the convolution. Note that its arguments and results are
// tensors annotated with attributes from the `bufferization` dialect. These
// attributes hint to the bufferization pass that the corresponding buffers can
// be used directly for these tensors without reshaping.
func.func @conv(
%input: !tinput {bufferization.writable = false,
bufferization.access = "read",
bufferization.buffer_layout =
affine_map<(d0,d1,d2,d3)->(d0,d1,d2,d3)>},
%filter: !tfilter {bufferization.writable = false,
bufferization.access = "read",
bufferization.buffer_layout =
affine_map<(d0,d1,d2,d3)->(d0,d1,d2,d3)>},
%bias: !tbias {bufferization.writable = false,
bufferization.access = "read",
bufferization.buffer_layout = affine_map<(d0)->(d0)>},
%output: !toutput {bufferization.writable = true,
bufferization.buffer_layout =
affine_map<(d0,d1,d2,d3)->(d0,d1,d2,d3)>,
bufferization.access = "write"}) -> !toutput
// This requests a C-compatible interface to be emitted for the function
// when translating to LLVM IR.
attributes { llvm.emit_c_interface }
{
// Bias. Using a named Linalg operation for brevity.
%bias_init = tensor.empty() : !toutput
%biased = linalg.broadcast ins(%bias : !tbias)
outs(%bias_init : !toutput) dimensions = [0, 1, 2]
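// (Here `dimensions = [0, 1, 2]` lists the dimensions added by the broadcast,
// i.e., %biased[n, y, x, c] = %bias[c].)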
// Convolution proper. While Linalg has named operations for 2D convolutions,
// the one in the Halide example uses an uncommon order of filter dimensions
// and is not supported by them. It also takes the filter as the first
// argument. This code recreates it faithfully using the generic form.
%convolved = linalg.generic {
iterator_types = ["parallel", "parallel", "parallel", "parallel",
"reduction", "reduction", "reduction"],
indexing_maps = [
affine_map<(n, y, x, c, rz, ry, rx) -> (rx, rz, ry, c)>,
affine_map<(n, y, x, c, rz, ry, rx) -> (n, y+rz, x+ry, rx)>,
affine_map<(n, y, x, c, rz, ry, rx) -> (n, y, x, c)>
]
} ins(%filter, %input: !tfilter, !tinput) outs(%biased : !toutput) {
^bb0(%in: f32, %f: f32, %b: f32):
// Note the fastmath attributes that allow these operations to be recombined
// into
//   %0 = math.fma %in, %f, %b : f32
// later on (the --math-uplift-to-fma pass in the RUN line performs this
// uplift) and that allow reductions to be reordered.
%m1 = arith.mulf %in, %f {fastmath = #arith.fastmath<fast>} : f32
%0 = arith.addf %b, %m1 {fastmath = #arith.fastmath<fast>} : f32
linalg.yield %0 : f32
} -> !toutput
// ReLU is just a max(0, x).
%c0 = arith.constant 0.0 : f32
%relued = linalg.generic {
iterator_types = ["parallel", "parallel", "parallel", "parallel"],
indexing_maps = [
affine_map<(d0, d1, d2, d3) -> ()>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
]
} ins(%c0, %convolved : f32, !toutput)
outs(%output : !toutput) {
^bb0(%cst: f32, %in: f32, %out: f32):
%0 = llvm.intr.maxnum(%cst, %in) : (f32, f32) -> f32
linalg.yield %0 : f32
} -> !toutput
return %relued : !toutput
}
// Module containing the transformation script to be applied. The attribute
// is required to correctly verify the use of named (macro-like) sequences.
module attributes { transform.with_named_sequence } {
// Apply transformations in a sequence to recreate the following Halide
// schedule:
//
// Var co, ci, xo, xi;
// relu.split(c, co, ci, vec * tile_w)
// .split(x, xo, xi, tile_h)
// .reorder(ci, xi, xo, y, n, co)
// .vectorize(ci, vec)
// .unroll(ci)
// .unroll(xi);
// conv.compute_at(relu, xo)
// .vectorize(c, vec)
// .unroll(c)
// .unroll(x)
// .unroll(y)
// .update()
// .reorder(c, x, y, r.x, r.y, r.z, n)
// .vectorize(c, vec)
// .unroll(c)
// .unroll(x)
// .unroll(y)
// .unroll(r.x, 2);
//
// where tile_w = 4, tile_h = 5, vec = 16. Note that unroll(y) and unroll(r.x)
// have no effect on the Halide IR as of 294f80c49bf3bb8582446613c25fcce03b82.
// Also note that the order of dimensions in Halide is inverted, e.g., co and
// n are the outermost loops in the respective reorder directives.
transform.named_sequence @__transform_main(
// This argument will point to the top-level module.
%arg0: !transform.any_op) {
// 1. Find the operations we are going to transform using their names. This
// is a simplistic approach that works when there are few operations in the
// IR to be transformed. More complex scenarios should rely on operations
// with `transform.match` prefix that are out of scope for this chapter.
%bias = transform.structured.match ops{["linalg.broadcast"]} in %arg0
: (!transform.any_op) -> !transform.any_op
%generics = transform.structured.match ops{["linalg.generic"]} in %arg0
: (!transform.any_op) -> !transform.any_op
%conv, %relu = transform.split_handle %generics
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
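// (The two linalg.generic operations are matched in their order of appearance
// in the function, so the first result of split_handle is the convolution and
// the second is the ReLU.)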
// 2. Initial tiling to start producing the loop structure. Note that the
// linalg.generic operation has the implicit loop order (n, y, x, c). Since
// the desired order of dimensions is (co, n, y, xo, xi, ci), we first tile
// only the c dimension to materialize the outermost co loop, and then tile
// the other dimensions since they are already in the expected order. Tiling
// by 1 produces a loop that iterates along the entire dimension. Tiling by 0
// does not produce a loop. The size 64 is chosen as 4*16, where 16 is the
// AVX512 vector width. Note that structured tiling doesn't remove the
// dimensions that became trivial (unit size), so the resulting structure is
// technically (co, no=n, yo=y, xo, [ni=1, yi=1, xi, ci]), where brackets
// indicate implicit loops of the `linalg.generic` operation inside the loops
// produced by tiling; see the sketch after the two tiling operations below.
//
// [n y x c]
%relu2, %co = transform.structured.tile_using_forall %relu
tile_sizes [0, 0, 0, 64]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%relu3, %n_y_xo = transform.structured.tile_using_forall %relu2
tile_sizes [1, 1, 5, 0]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
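// A rough sketch of the loop structure at this point (names and exact form
// are illustrative, not the actual IR):
//
//   scf.forall (%co) in (2) {                     // 128 / 64 = 2 tiles of c
//     scf.forall (%n, %y, %xo) in (5, 80, 20) {   // 100 / 5 = 20 tiles of x
//       linalg.generic ...  // implicit (ni=1, yi=1, xi=5, ci=64) space
//     }
//   }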
// Halide's compute_at directive corresponds to fusion into the given loop:
// we start from a fully fissioned form, whereas Halide starts from a fused
// form by reusing the loop iterators.
%conv2, %co2 = transform.structured.fuse_into_containing_op %conv into %co
: (!transform.any_op, !transform.any_op)
-> (!transform.any_op, !transform.any_op)
%conv3, %n_y_xo2 = transform.structured.fuse_into_containing_op %conv2
into %n_y_xo
: (!transform.any_op, !transform.any_op)
-> (!transform.any_op, !transform.any_op)
// Also fuse the bias, which we represent as a separate operation whereas
// Halide represents it as the "pure" (as opposed to "update") part of the
// conv expression. Note that fusion consumes both handles and produces new
// handles for chaining purposes.
%bias2, %co3 = transform.structured.fuse_into_containing_op %bias into %co2
: (!transform.any_op, !transform.any_op)
-> (!transform.any_op, !transform.any_op)
%bias3, %n_y_xo3 = transform.structured.fuse_into_containing_op %bias2
into %n_y_xo2
: (!transform.any_op, !transform.any_op)
-> (!transform.any_op, !transform.any_op)
// Clean up the result of fusion, which mechanically duplicates the producer
// operation in the consumer loop without removing the original operation.
// The original operation is now "dead": it has no uses and no side effects,
// so it can be removed by dead-code elimination (DCE) that runs as part of
// pattern rewriting. The transform dialect allows applying a combination
// of named pattern sets, exposed as operations, in one sweep to an
// isolated-from-above container payload operation. Note that we don't
// actually need any patterns for DCE to run, just to trigger the rewriting.
//
// This step is optional: the transformation could continue without it and
// produce the same final IR, but performing it makes the intermediate stages
// easier to examine manually.
%f00 = transform.structured.match ops{["func.func"]} in %arg0
: (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %f00 {
} : !transform.any_op
// The loop reordering requested for the convolution operation requires
// putting reduction loops (r.z, r.y, r.x) before the "inner" loops xi, ci.
// The "inner" loops are still implicit as part of the linalg.generic
// operation, and we need to materialize reduction loops around it by tiling
// with size 1. Since we are producing reduction loops, we indicate that we
// are tiling a reduction and request sequential `scf.for` loops (parallel
// reductions are supported by `scf.forall`, but we don't need those here).
//
// This transform operation is more capable than merely producing
// (reduction) loops: the transformed code performs `tile_size` partial
// reductions of `N / tile_size` elements, potentially in parallel by
// changing the dimension kind of the structured operation inside the loop,
// and then performs a final reduction of these partial results by producing
// a new "combiner" structured operation after the loops. In our case,
// tile_size = 1 along all dimensions, so the reduction is entirely
// performed by the generated loops. The combiner structured operation is
// still produced and adds up the reduction result with the initial value;
// see the sketch after the tiling operation below.
%red_fill, %conv4, %combining, %rz_ry_rx
= transform.structured.tile_reduction_using_for %conv3 by
// n y x c rz ry rx
tile_sizes=[0, 0, 0, 0, 1, 1, 1]
: (!transform.any_op)
-> (!transform.any_op, !transform.any_op, !transform.any_op,
!transform.any_op)
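// A rough sketch of what this produces (illustrative; the handles correspond
// to the listed results in order):
//
//   %red_fill  = linalg.fill ...        // initializes the partial results
//   scf.for %rz ... {
//     scf.for %ry ... {
//       scf.for %rx ... {
//         %conv4 = linalg.generic ...   // one partial-reduction step
//       }
//     }
//   }
//   %combining = linalg.generic ...     // adds partial results to the init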
// At this point, the inner Linalg operations have implicit iteration spaces
// of 5x64 size, with some additional unit-size dimensions. Completely
// replicating the Halide schedule would require materializing the loops with
// 5 and 4 iterations, respectively, unrolling those loops and marking the
// remaining 16-point iteration space for vectorization.
//
// This is unnecessary in MLIR, which supports multi-dimensional vectors;
// these will be decomposed into target-specific sizes during the lowering.
// Therefore, this schedule stops here.
// Transform the named broadcast operation used for bias into the generic
// form before vectorization to prevent special cases from kicking in.
transform.structured.generalize %bias3
: (!transform.any_op) -> !transform.any_op
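// After the generalization above, the broadcast becomes a plain
// linalg.generic with identity indexing on the output, roughly (illustrative;
// the fused operation actually works on tiled subsets by this point):
//
//   linalg.generic {
//     iterator_types = ["parallel", "parallel", "parallel", "parallel"],
//     indexing_maps = [affine_map<(n, y, x, c) -> (c)>,
//                      affine_map<(n, y, x, c) -> (n, y, x, c)>]
//   } ins(%bias) outs(%init) {
//   ^bb0(%in: f32, %out: f32):
//     linalg.yield %in : f32
//   }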
// Use the named macro to perform most of the lowering.
transform.include @lower failures(propagate) (%arg0)
: (!transform.any_op) -> ()
transform.yield
}
// A named sequence of transformations is a macro-like object that can be
// included from other places in the transform dialect script, but does not
// allow recursion. This one can be reused in other scenarios.
transform.named_sequence @lower(
%arg0: !transform.any_op {transform.consumed}) {
%f00 = transform.structured.match ops{["func.func"]} in %arg0
: (!transform.any_op) -> !transform.any_op
// Simplify the code as tiling and fusion may have produced a lot of
// operations computing tensor subsets and loop ranges, some of which may be
// duplicated or excessively complex. Simplification involving
// canonicalization, common subexpression elimination, loop invariant code
// motion and various rewrite patterns can be applied directly from the
// transform dialect. Furthermore, an arbitrary combination of rewrite
// patterns can be applied in one sweep to a given scope, a functionality
// that cannot be achieved with conventional compiler passes that apply each
// group of patterns separately (at least without creating a new pass for
// each combination of pattern groups).
transform.apply_patterns to %f00 {
transform.apply_patterns.canonicalization
transform.apply_patterns.linalg.tiling_canonicalization
} : !transform.any_op
transform.apply_cse to %f00 : !transform.any_op
%all_loops = transform.structured.match interface{LoopLikeInterface}
in %arg0
: (!transform.any_op) -> !transform.any_op
transform.apply_licm to %all_loops : !transform.any_op
// Tiling by one as a way of materializing loops has produced operations
// processing 4+D types where only a handful of dimensions aren't unit-sized,
// e.g., tensor<1x1x1x5x64xf32> where 5 and 64 are tile sizes. Remove such
// unit dimensions before vectorization, for clarity (see the sketch below).
transform.apply_patterns to %f00 {
transform.apply_patterns.linalg.fold_unit_extent_dims_via_reshapes
} : !transform.any_op
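// For instance (illustrative), a value of type tensor<1x1x1x5x64xf32> is
// collapsed to the lower-rank type through something like
//
//   %collapsed = tensor.collapse_shape %t [[0, 1, 2, 3], [4]]
//       : tensor<1x1x1x5x64xf32> into tensor<5x64xf32>
//
// and the structured operations consuming it are rewritten accordingly.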
// Vectorize the remaining non-unit dimensions in structured operations.
// This essentially rewrites operations on `tensor<5x64xf32>` into
// operations on `vector<5x64xf32>` (see the sketch after the transform
// below). Further lowering in MLIR and LLVM will decompose this into a
// sequence of operations on single-dimensional vectors of the
// platform-relevant size, e.g., `vector<16xf32>` for AVX512. High-level
// vector primitives, such as `vector.transpose` and `vector.broadcast`, can
// be introduced at this stage. They will be lowered later to sequences of
// lower-level primitives such as `vector.shuffle`, depending on the selected
// lowering strategy.
%fv = transform.structured.vectorize_children_and_apply_patterns %f00
: (!transform.any_op) -> !transform.any_op
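// Sketch of the rewrite performed by vectorization (illustrative; %f, %acc
// and the exact indices are made up): a structured operation on tensors
// becomes explicit vector transfers around vector arithmetic, roughly
//
//   %v = vector.transfer_read %t[%c0, %c0], %pad
//       : tensor<5x64xf32>, vector<5x64xf32>
//   %m = arith.mulf %v, %f : vector<5x64xf32>
//   %a = arith.addf %acc, %m : vector<5x64xf32>
//   %u = vector.transfer_write %a, %t[%c0, %c0]
//       : vector<5x64xf32>, tensor<5x64xf32>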
// Vectorization may have created new opportunities for cleanups. In
// particular, tensor subsetting operations can be composed with vector
// operations, and vector transfer (multi-dimensional load/store) operations
// can be recombined and hoisted out of loops.
transform.apply_patterns to %fv {
transform.apply_patterns.canonicalization
transform.apply_patterns.tensor.fold_tensor_subset_ops_into_vector_transfers
} : !transform.any_op
transform.apply_cse to %fv : !transform.any_op
transform.structured.hoist_redundant_vector_transfers %fv
: (!transform.any_op) -> !transform.any_op
// Apply bufferization that rewrites the remaining operations on tensors
// as operations on structured buffer (memref) types, including the function
// API (see the sketch after the transform below). MLIR bufferization uses
// destination-passing style, meaning that a buffer is shared between one of
// the operation's operands and its result.
//
// Since bufferization rewrites function signatures, it is applied as a
// module-wide transformation. Therefore, it invalidates all previously
// defined handles. Bufferization is usually a late step in the
// transformation process, so invalidation is not an issue. However, if
// other transformations, such as loop unrolling, are required after
// bufferization, new handles should be produced using the match operations.
//
// One-shot bufferization itself does not produce buffer deallocations,
// which may lead to leaks, so we have to run the buffer deallocation pass
// pipeline to avoid them. Note that the transform dialect seamlessly runs
// named passes and pass pipelines: if desired, one could replace complex
// --pass-pipeline expressions with operations. Note that we apply the
// pipeline to functions rather than to the entire module to avoid running it
// on the transform IR that is contained in the module.
%arg1 = transform.bufferization.one_shot_bufferize %arg0 {
bufferize_function_boundaries = true,
function_boundary_type_conversion = 1 : i32 }
: (!transform.any_op) -> !transform.any_op
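// After this, the function signature roughly becomes (illustrative; identity
// layouts follow from the buffer_layout attributes and the chosen
// function_boundary_type_conversion):
//
//   func.func @conv(%input: memref<5x82x102x128xf32>,
//                   %filter: memref<128x3x3x128xf32>,
//                   %bias: memref<128xf32>,
//                   %output: memref<5x80x100x128xf32>)
//       -> memref<5x80x100x128xf32>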
%f = transform.structured.match ops{["func.func"]} in %arg1
: (!transform.any_op) -> !transform.any_op
transform.apply_registered_pass "buffer-deallocation-pipeline" to %f
: (!transform.any_op) -> !transform.any_op
// Apply general canonicalization and CSE to each function after
// bufferization as new simplification opportunities may have appeared.
%fb = transform.structured.match ops{["func.func"]} in %arg1
: (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %fb {
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %fb : !transform.any_op
// Lower complex, multidimensional vector operations into simpler
// primitives. This particular selection of the pattern groups corresponds
// to vector dialect operations present in the payload IR at this stage.
// Many of these groups can be parameterized to use different strategies or
// lower-level primitives offering performance trade-offs. In this case, we
// are selecting the simplest strategies.
transform.apply_patterns to %fb {
transform.apply_patterns.vector.lower_contraction
lowering_strategy = parallelarith
transform.apply_patterns.vector.lower_transfer
max_transfer_rank = 1
transform.apply_patterns.vector.lower_transpose
lowering_strategy = eltwise
transform.apply_patterns.vector.lower_shape_cast
} : !transform.any_op
// These patterns are applied in a separate sweep to prevent the
// transfer-to-scf patterns from overlapping with the lower-transfer patterns
// above, as both apply to the same kind of operations. These patterns may
// produce local allocations to act as temporary caches deep inside loops,
// which could be catastrophic for performance. Such allocations are instead
// turned into stack allocations and hoisted out of all the surrounding loops
// (see the sketch below).
transform.apply_patterns to %fb {
transform.apply_patterns.vector.transfer_to_scf
transform.apply_patterns.memref.alloc_to_alloca
} : !transform.any_op
transform.bufferization.buffer_loop_hoisting %fb : !transform.any_op
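// Sketch of the effect (illustrative; the buffer type is made up):
//
//   scf.for ... {
//     %buf = memref.alloc() : memref<5x64xf32>   // heap allocation in a loop
//     ...
//   }
//
// becomes, after alloc_to_alloca and buffer_loop_hoisting,
//
//   %buf = memref.alloca() : memref<5x64xf32>    // stack allocation, hoisted
//   scf.for ... {
//     ...
//   }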
// A final round of cleanups additionally includes patterns to simplify
// buffer aliasing operations that may have been introduced during
// bufferization and could result in excessively complex address
// computation.
transform.apply_patterns to %fb {
transform.apply_patterns.memref.fold_memref_alias_ops
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %fb : !transform.any_op
transform.yield
}
}
// The core computation, at the LLVM dialect level, must correspond to five
// immediately adjacent fma operations on vector<64xf32>.
// CHECK: %[[R0:.+]] = llvm.mlir.undef : !llvm.array<5 x vector<64xf32>>
// CHECK: %[[V:.+]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[LINE0:.+]] = llvm.extractvalue %[[V]][0] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA0:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE0]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R1:.+]] = llvm.insertvalue %[[FMA0]], %[[R0]][0]
// CHECK-NEXT: %[[LINE1:.+]] = llvm.extractvalue %[[V]][1] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA1:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE1]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R2:.+]] = llvm.insertvalue %[[FMA1]], %[[R1]][1]
// CHECK-NEXT: %[[LINE2:.+]] = llvm.extractvalue %[[V]][2] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA2:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE2]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R3:.+]] = llvm.insertvalue %[[FMA2]], %[[R2]][2]
// CHECK-NEXT: %[[LINE3:.+]] = llvm.extractvalue %[[V]][3] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA3:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE3]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R4:.+]] = llvm.insertvalue %[[FMA3]], %[[R3]][3]
// CHECK-NEXT: %[[LINE4:.+]] = llvm.extractvalue %[[V]][4] : !llvm.array<5 x vector<64xf32>>
// CHECK-NEXT: %[[FMA4:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE4]])
// CHECK-SAME: -> vector<64xf32>
// CHECK-NEXT: %[[R5:.+]] = llvm.insertvalue %[[FMA4]], %[[R4]][4]