File: GPUHeuristics.cpp

package info (click to toggle)
llvm-toolchain-17 1%3A17.0.6-22
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,799,624 kB
  • sloc: cpp: 6,428,607; ansic: 1,383,196; asm: 793,408; python: 223,504; objc: 75,364; f90: 60,502; lisp: 33,869; pascal: 15,282; sh: 9,684; perl: 7,453; ml: 4,937; awk: 3,523; makefile: 2,889; javascript: 2,149; xml: 888; fortran: 619; cs: 573
file content (267 lines) | stat: -rw-r--r-- 11,378 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cmath>
#include <numeric>

using namespace mlir;

#define DEBUG_TYPE "linalg-transforms"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

static Attribute linearIdX(MLIRContext *ctx) {
  return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimX);
}
static Attribute linearIdY(MLIRContext *ctx) {
  return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimY);
}
static Attribute linearIdZ(MLIRContext *ctx) {
  return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimZ);
}

transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
                                                 int totalNumThreads,
                                                 int64_t desiredBitAlignment,
                                                 ArrayRef<int64_t> copySizes,
                                                 bool favorPredication,
                                                 int64_t elementalBitwidth) {
  assert(!copySizes.empty() && copySizes.size() <= 3 &&
         "only 1,2,3-D copies are supported for now");

  LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");
             llvm::dbgs() << "\n";);

  // Greedily find the largest vector size that can be used to copy the most
  // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth
  // contiguous memory transactions with as few threads as possible.
  int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
      desiredBitAlignment, copySizes.back(), elementalBitwidth);

  LDBG("--greedily determined vectorSize: "
       << desiredVectorSize << " elements of " << elementalBitwidth
       << "b each -> " << (desiredVectorSize * elementalBitwidth)
       << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");

  status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
                           favorPredication);
  if (status == Status::Invalid)
    return;

  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: ");
             llvm::dbgs() << "\n"; llvm::interleaveComma(
                 this->numThreads, DBGS() << "--numThreads: ");
             llvm::dbgs() << "\n";);
  LDBG("--vectorSize: " << this->vectorSize);
  assert(this->numThreads.size() == copySizes.size() &&
         "compute copy mapping expected same number of threads and copy sizes");

  // Compute the smallest bounding box.
  this->smallestBoundingTileSizes = llvm::to_vector(
      llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {
        int64_t size, numThreads;
        std::tie(size, numThreads) = pair;
        return mlir::ceilDiv(size, numThreads);
      }));
  SmallVector<Attribute> allThreadMappings{linearIdZ(ctx), linearIdY(ctx),
                                           linearIdX(ctx)};

  // Set the thread mapping.
  this->threadMapping =
      llvm::to_vector(ArrayRef(allThreadMappings)
                          .take_back(this->smallestBoundingTileSizes.size()));
  LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
}

int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
    int64_t desiredBitAlignment, int64_t numContiguousElements,
    int64_t elementalBitwidth) {
  assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
         "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
  assert(desiredBitAlignment % elementalBitwidth == 0 &&
         "elemental bitwidth does not divide desired bit alignment");
  return std::gcd(
      std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),
      kMaxVectorLoadBitWidth / elementalBitwidth);
}

/// Get the list of all factors that divide `val`, not just the prime factors.
static SmallVector<int64_t> getFactors(int64_t val) {
  SmallVector<int64_t> factors;
  factors.reserve(val);
  for (int64_t factor = 1; factor <= val; ++factor) {
    if (val % factor != 0)
      continue;
    factors.push_back(factor);
  }
  factors.push_back(val);
  return factors;
}

static int64_t product(ArrayRef<int64_t> vals) {
  int64_t res = 1;
  for (auto val : vals)
    res *= val;
  return res;
}

/// Extract `result` from `sizes` with the following constraints:
///   1. sizes[i] % result[i] for all i
///   2. product_of_threadsPerDim <= maxNumThreads
///   3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
///      must be sizes[currentIndex].
/// This is used to greedily extract the maximum number of threads usable for
/// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and
/// ensuring coalesced access along the most minor dimension.
/// Return the number of threads used in the range:
///   threadsPerDim[currentIndex .. sizes.end()]
// The implementation uses a dynamic programming approach to greedily extract
// the best combination under the constraints.
// TODO: Implementation details can be improved but putting effort there is a
// tradeoffs: `sizes` is expected to be of small rank and contain small values.
static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
                                               int64_t currentIndex,
                                               int64_t maxNumThreads) {
  assert(static_cast<size_t>(currentIndex) < sizes.size() &&
         "currentIndex out of bounds");
  std::string indent(2 * currentIndex, '-');
  if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
    LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
    return SmallVector<int64_t>{sizes[currentIndex]};
  }

  int64_t best = 0;
  int64_t s = sizes[currentIndex];
  SmallVector<int64_t> factors = getFactors(s);
  SmallVector<int64_t> localThreadsPerDim;
  localThreadsPerDim.reserve(sizes.size());
  LDBG(indent << "maximizeNumThreads in " << s
              << " with limit: " << maxNumThreads);
  for (auto factor : factors) {
    auto nestedThreadsPerDim =
        maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
    int64_t localBest = factor * product(nestedThreadsPerDim);
    if (localBest > best && localBest <= maxNumThreads) {
      LDBG(indent << "new localBest: " << localBest);
      LLVM_DEBUG(
          llvm::interleaveComma(nestedThreadsPerDim,
                                DBGS() << indent << "nestedThreadsPerDim: ");
          llvm::dbgs() << "\n";);
      localThreadsPerDim.clear();
      localThreadsPerDim.push_back(factor);
      llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
      best = localBest;
    }
  }

  LDBG(indent << "found globalBest: " << best);
  LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,
                                   DBGS() << indent << "numThreads: ");
             llvm::dbgs() << "\n";);

  return localThreadsPerDim;
}

transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
                                                 ArrayRef<int64_t> sizes,
                                                 int64_t desiredVectorSize,
                                                 bool favorPredication) {

  if (!favorPredication) {
    int64_t localVectorSize = desiredVectorSize;
    for (; localVectorSize >= 1; localVectorSize /= 2) {
      // Attempt to map the copy with predication and current fixed vector size:
      //   1. if the status is Success, we are done.
      //   2. if the status is Invalid, we fail immediately, no amount of
      //   vector size reduction can offset the bad tile size selection from the
      //   higher-level.
      //   3. if the status is RequiresPredication, we try again with a smaller
      //   vector size.
      Status status =
          inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
      if (status == Status::Success || status == Status::Invalid)
        return status;

      LDBG("requires predication, try reducing vector size to "
           << (localVectorSize / 2));
    }
  }

  // If we have not yet returned, it means that we have tried all vector sizes
  // and we still require predication. Restart from the original vector size and
  // do not attempt to
  return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
}

transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
    int64_t totalNumThreads, ArrayRef<int64_t> sizes,
    int64_t desiredVectorSize) {
  assert(sizes.back() % desiredVectorSize == 0 &&
         "most-minor size not divisible by actualVectorSize");

  LDBG("inferNumThreadsImpl with totalNumThreads: "
       << totalNumThreads << " and vectorSize: " << desiredVectorSize);

  // Scale the most minor size to account for the chosen vector size and
  // maximize the number of threads without exceeding the total number of
  // threads.
  SmallVector<int64_t> scaledSizes{sizes};
  scaledSizes.back() /= desiredVectorSize;
  if (scaledSizes.back() > totalNumThreads) {
    LDBG("--Too few threads given the required vector size -> FAIL");
    return Status::Invalid;
  }
  SmallVector<int64_t> inferredNumThreads =
      maximizeNumThreads(scaledSizes, 0, totalNumThreads);

  LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,
                                   DBGS() << "inferred numThreads: ");
             llvm::dbgs() << "\n";
             LDBG("computed actualVectorSize: " << desiredVectorSize););

  // Corner case: we cannot use more threads than available. If the dimension of
  // the copy is so bad it is because higher-level tiling did not do its job, we
  // do not try to recover from it here.
  int64_t totalNumThreadsUsed = product(inferredNumThreads);
  LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
  if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
    LDBG("--Too few threads given the required vector size -> FAIL");
    return Status::Invalid;
  }

  this->vectorSize = desiredVectorSize;
  this->numThreads = inferredNumThreads;
  if (totalNumThreadsUsed == totalNumThreads)
    return Status::Success;

  return Status::RequiresPredication;
}

void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
  os << "MappingInfo{";
  os << "CopyMappingInfo: ";
  os << "valid: " << (status != Status::Invalid) << ", ";
  os << "vectorSize: " << vectorSize << ", ";
  llvm::interleaveComma(numThreads, os << ", numThreads: {");
  llvm::interleaveComma(smallestBoundingTileSizes,
                        os << "}, smallestBoundingTileSizes: {");
  llvm::interleaveComma(threadMapping, os << "}, threadMapping: {");
  os << "}}";
}