File: Transforms.h

//===- Transforms.h - NVGPU Dialect transformations --------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares functions that assist transformations for the nvgpu
// dialect.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_NVGPU_TRANSFORMS_TRANSFORMS_H_
#define MLIR_DIALECT_NVGPU_TRANSFORMS_TRANSFORMS_H_

#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

namespace mlir {
class RewriterBase;
class RewritePatternSet;

namespace nvgpu {

///
/// Passes
///

/// Optimizes vectorized accesses to a shared memory buffer specified by
/// `memrefValue`. This transformation assumes the following:
/// 1) All relevant accesses to `memrefValue` are contained within `parentOp`.
/// 2) No subviews are taken of `memrefValue`; all reads/writes to
/// `memrefValue` occur through it directly. The function fails its
/// precondition checks otherwise.
///
/// Shared memory bank conflicts occur when multiple threads attempt to read or
/// write locations assigned to the same shared memory bank. For `2^N`-byte
/// vectorized accesses, we need to be concerned with conflicts among threads
/// identified as `(tid) -> tid.floordiv(2^{7-N})`; for example, 16-byte
/// accesses (`N = 4`) give `(tid) -> tid.floordiv(8)`. This transformation
/// therefore changes any indexed memory access (vector.load, memref.load,
/// nvgpu.ldmatrix, etc.) so that the final dimension's index is permuted to
/// `newColIndex = oldColIndex % vectorSize +
/// perm[rowIndex](oldColIndex / vectorSize, rowIndex)`, where `rowIndex` is
/// the index of the second-to-last dimension and `perm[rowIndex]` is a
/// permutation function that depends on the row index. The permutation
/// function is chosen to ensure that sequential distributed+vectorized
/// reads/writes down a single dimension of the memref have minimal conflicts.
mlir::LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
                                                       Value memrefValue);
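
// A minimal usage sketch (not part of this header): applying the optimization
// to every shared memory allocation inside an assumed `func::FuncOp`. The
// walk, `funcOp`, and the address-space filter are illustrative assumptions.
//
//   funcOp.walk([&](memref::AllocOp alloc) {
//     // Only buffers placed in the GPU workgroup (shared) address space
//     // are candidates for this optimization.
//     if (nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(alloc.getType()))
//       (void)nvgpu::optimizeSharedMemoryReadsAndWrites(funcOp,
//                                                       alloc.getMemref());
//   });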

///
/// Rewrite patterns
///

//===----------------------------------------------------------------------===//
// NVGPU transformation options exposed as auxiliary structs.
//===----------------------------------------------------------------------===//
/// Enum to control the lowering of `nvgpu.mma.sync`.
enum class MmaSyncF32Lowering { TF32 = 0, TF32x3 = 1, Unkown = 2 };

/// Collect patterns to convert mma.sync on f32 input and rewrite
/// to use tensor cores with a user-provided level of accuracy:
/// (a) tf32   (1 mma.sync per warp-level matrix-multiply-accumulate)
/// (b) tf32x3 (3 mma.sync per warp-level matrix-multiply-accumulate)
/// Typically, tf32 tensor core acceleration comes at a cost
/// of accuracy from missing precision bits. While f32 has 23 precision
/// bits, tf32 has only 10 precision bits. tf32x3 aims to recover the
/// precision bits by splitting each operand into two tf32 values
/// and issuing three mma.sync tensor core operations.
void populateMmaSyncF32ToTF32Patterns(
    RewritePatternSet &patterns,
    nvgpu::MmaSyncF32Lowering precision = nvgpu::MmaSyncF32Lowering::TF32);
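
// A minimal sketch (illustrative, not part of this header), assuming it runs
// inside a pass's runOnOperation() over an assumed `funcOp`, with
// "mlir/Transforms/GreedyPatternRewriteDriver.h" included:
//
//   RewritePatternSet patterns(&getContext());
//   nvgpu::populateMmaSyncF32ToTF32Patterns(
//       patterns, nvgpu::MmaSyncF32Lowering::TF32x3);
//   if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns))))
//     return signalPassFailure();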

/// Convert global->shared vector transfers to async device copies. This
/// function looks for suitable vector transfers within the specified op and
/// converts them to "nvgpu.device_async_copy" ops. Consecutive copies are put
/// into the same sync group. If `bypassL1` is set, the "bypassL1" attribute is
/// set on suitable (i.e., 16-byte) transfers.
void createAsyncGroups(RewriterBase &rewriter, Operation *op, bool bypassL1);
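
// A minimal sketch (illustrative): converting all suitable transfers under an
// assumed `funcOp`, bypassing L1 for 16-byte copies. `IRRewriter` is one
// concrete `RewriterBase` implementation.
//
//   IRRewriter rewriter(funcOp->getContext());
//   nvgpu::createAsyncGroups(rewriter, funcOp, /*bypassL1=*/true);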

} // namespace nvgpu
} // namespace mlir

#endif // MLIR_DIALECT_NVGPU_TRANSFORMS_TRANSFORMS_H_