1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466
|
//===- VPlanHelpers.h - VPlan-related auxiliary helpers -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the declarations of different VPlan-related auxiliary
/// helpers.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
#include "VPlanAnalysis.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/Support/InstructionCost.h"
namespace llvm {
class AssumptionCache;
class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
class IRBuilderBase;
class LoopInfo;
class SCEV;
class Type;
class VPBasicBlock;
class VPRegionBlock;
class VPlan;
class Value;
/// Returns a calculation for the total number of elements for a given \p VF.
/// For fixed width vectors this value is a constant, whereas for scalable
/// vectors it is an expression determined at runtime.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step);
/// A helper function that returns how much we should divide the cost of a
/// predicated block by. Typically this is the reciprocal of the block
/// probability, i.e. if we return X we are assuming the predicated block will
/// execute once for every X iterations of the loop header so the block should
/// only contribute 1/X of its cost to the total cost calculation, but when
/// optimizing for code size it will just be 1 as code size costs don't depend
/// on execution probabilities.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
inline unsigned
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
}
/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 16) = {1, 2, 4, 8}
struct VFRange {
// A power of 2.
const ElementCount Start;
// A power of 2. If End <= Start range is empty.
ElementCount End;
bool isEmpty() const {
return End.getKnownMinValue() <= Start.getKnownMinValue();
}
VFRange(const ElementCount &Start, const ElementCount &End)
: Start(Start), End(End) {
assert(Start.isScalable() == End.isScalable() &&
"Both Start and End should have the same scalable flag");
assert(isPowerOf2_32(Start.getKnownMinValue()) &&
"Expected Start to be a power of 2");
assert(isPowerOf2_32(End.getKnownMinValue()) &&
"Expected End to be a power of 2");
}
/// Iterator to iterate over vectorization factors in a VFRange.
class iterator
: public iterator_facade_base<iterator, std::forward_iterator_tag,
ElementCount> {
ElementCount VF;
public:
iterator(ElementCount VF) : VF(VF) {}
bool operator==(const iterator &Other) const { return VF == Other.VF; }
ElementCount operator*() const { return VF; }
iterator &operator++() {
VF *= 2;
return *this;
}
};
iterator begin() { return iterator(Start); }
iterator end() {
assert(isPowerOf2_32(End.getKnownMinValue()));
return iterator(End);
}
};
/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
/// the vectorizer.
/// VPLane provides a way to access lanes in both fixed width and scalable
/// vectors, where for the latter the lane index sometimes needs calculating
/// as a runtime expression.
class VPLane {
public:
/// Kind describes how to interpret Lane.
enum class Kind : uint8_t {
/// For First, Lane is the index into the first N elements of a
/// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
First,
/// For ScalableLast, Lane is the offset from the start of the last
/// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
/// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
/// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
ScalableLast
};
private:
/// in [0..VF)
unsigned Lane;
/// Indicates how the Lane should be interpreted, as described above.
Kind LaneKind = Kind::First;
public:
VPLane(unsigned Lane) : Lane(Lane) {}
VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
"trying to extract with invalid offset");
unsigned LaneOffset = VF.getKnownMinValue() - Offset;
Kind LaneKind;
if (VF.isScalable())
// In this case 'LaneOffset' refers to the offset from the start of the
// last subvector with VF.getKnownMinValue() elements.
LaneKind = VPLane::Kind::ScalableLast;
else
LaneKind = VPLane::Kind::First;
return VPLane(LaneOffset, LaneKind);
}
static VPLane getLastLaneForVF(const ElementCount &VF) {
return getLaneFromEnd(VF, 1);
}
/// Returns a compile-time known value for the lane index and asserts if the
/// lane can only be calculated at runtime.
unsigned getKnownLane() const {
assert(LaneKind == Kind::First &&
"can only get known lane from the beginning");
return Lane;
}
/// Returns an expression describing the lane index that can be used at
/// runtime.
Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
/// Returns the Kind of lane offset.
Kind getKind() const { return LaneKind; }
/// Returns true if this is the first lane of the whole vector.
bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
/// Maps the lane to a cache index based on \p VF.
unsigned mapToCacheIndex(const ElementCount &VF) const {
switch (LaneKind) {
case VPLane::Kind::ScalableLast:
assert(VF.isScalable() && Lane < VF.getKnownMinValue() &&
"ScalableLast can only be used with scalable VFs");
return VF.getKnownMinValue() + Lane;
default:
assert(Lane < VF.getKnownMinValue() &&
"Cannot extract lane larger than VF");
return Lane;
}
}
};
/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
VPTransformState(const TargetTransformInfo *TTI, ElementCount VF,
LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
IRBuilderBase &Builder, VPlan *Plan, Loop *CurrentParentLoop,
Type *CanonicalIVTy);
/// Target Transform Info.
const TargetTransformInfo *TTI;
/// The chosen Vectorization Factor of the loop being vectorized.
ElementCount VF;
/// Hold the index to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
/// instructions.
std::optional<VPLane> Lane;
struct DataState {
// Each value from the original loop, when vectorized, is represented by a
// vector value in the map.
DenseMap<const VPValue *, Value *> VPV2Vector;
DenseMap<const VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
} Data;
/// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
/// is false, otherwise return the generated scalar. \See set.
Value *get(const VPValue *Def, bool IsScalar = false);
/// Get the generated Value for a given VPValue and given Part and Lane.
Value *get(const VPValue *Def, const VPLane &Lane);
bool hasVectorValue(const VPValue *Def) {
return Data.VPV2Vector.contains(Def);
}
bool hasScalarValue(const VPValue *Def, VPLane Lane) {
auto I = Data.VPV2Scalars.find(Def);
if (I == Data.VPV2Scalars.end())
return false;
unsigned CacheIdx = Lane.mapToCacheIndex(VF);
return CacheIdx < I->second.size() && I->second[CacheIdx];
}
/// Set the generated vector Value for a given VPValue, if \p
/// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
void set(const VPValue *Def, Value *V, bool IsScalar = false) {
if (IsScalar) {
set(Def, V, VPLane(0));
return;
}
assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
"scalar values must be stored as (0, 0)");
Data.VPV2Vector[Def] = V;
}
/// Reset an existing vector value for \p Def and a given \p Part.
void reset(const VPValue *Def, Value *V) {
assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
Data.VPV2Vector[Def] = V;
}
/// Set the generated scalar \p V for \p Def and the given \p Lane.
void set(const VPValue *Def, Value *V, const VPLane &Lane) {
auto &Scalars = Data.VPV2Scalars[Def];
unsigned CacheIdx = Lane.mapToCacheIndex(VF);
if (Scalars.size() <= CacheIdx)
Scalars.resize(CacheIdx + 1);
assert(!Scalars[CacheIdx] && "should overwrite existing value");
Scalars[CacheIdx] = V;
}
/// Reset an existing scalar value for \p Def and a given \p Lane.
void reset(const VPValue *Def, Value *V, const VPLane &Lane) {
auto Iter = Data.VPV2Scalars.find(Def);
assert(Iter != Data.VPV2Scalars.end() &&
"need to overwrite existing value");
unsigned CacheIdx = Lane.mapToCacheIndex(VF);
assert(CacheIdx < Iter->second.size() &&
"need to overwrite existing value");
Iter->second[CacheIdx] = V;
}
/// Set the debug location in the builder using the debug location \p DL.
void setDebugLocFrom(DebugLoc DL);
/// Insert the scalar value of \p Def at \p Lane into \p Lane of \p WideValue
/// and return the resulting value.
Value *packScalarIntoVectorizedValue(const VPValue *Def, Value *WideValue,
const VPLane &Lane);
/// Hold state information used when constructing the CFG of the output IR,
/// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
struct CFGState {
/// The previous VPBasicBlock visited. Initially set to null.
VPBasicBlock *PrevVPBB = nullptr;
/// The previous IR BasicBlock created or used. Initially set to the new
/// header BasicBlock.
BasicBlock *PrevBB = nullptr;
/// The last IR BasicBlock in the output IR. Set to the exit block of the
/// vector loop.
BasicBlock *ExitBB = nullptr;
/// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
/// of replication, maps the BasicBlock of the last replica created.
SmallDenseMap<const VPBasicBlock *, BasicBlock *> VPBB2IRBB;
/// Updater for the DominatorTree.
DomTreeUpdater DTU;
CFGState(DominatorTree *DT)
: DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
} CFG;
/// Hold a pointer to LoopInfo to register new basic blocks in the loop.
LoopInfo *LI;
/// Hold a pointer to AssumptionCache to register new assumptions after
/// replicating assume calls.
AssumptionCache *AC;
/// Hold a reference to the IRBuilder used to generate output IR code.
IRBuilderBase &Builder;
/// Pointer to the VPlan code is generated for.
VPlan *Plan;
/// The parent loop object for the current scope, or nullptr.
Loop *CurrentParentLoop = nullptr;
/// VPlan-based type analysis.
VPTypeAnalysis TypeAnalysis;
/// VPlan-based dominator tree.
VPDominatorTree VPDT;
};
/// Struct to hold various analysis needed for cost computations.
struct VPCostContext {
const TargetTransformInfo &TTI;
const TargetLibraryInfo &TLI;
VPTypeAnalysis Types;
LLVMContext &LLVMCtx;
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
Type *CanIVTy, LoopVectorizationCostModel &CM,
TargetTransformInfo::TargetCostKind CostKind)
: TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
CM(CM), CostKind(CostKind) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
/// Return true if the cost for \p UI shouldn't be computed, e.g. because it
/// has already been pre-computed.
bool skipCostComputation(Instruction *UI, bool IsVector) const;
/// Returns the OperandInfo for \p V, if it is a live-in.
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
/// Return true if \p I is considered uniform-after-vectorization in the
/// legacy cost model for \p VF. Only used to check for additional VPlan
/// simplifications.
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const;
};
/// This class can be used to assign names to VPValues. For VPValues without
/// underlying value, assign consecutive numbers and use those as names (wrapped
/// in vp<>). Otherwise, use the name from the underlying value (wrapped in
/// ir<>), appending a .V version number if there are multiple uses of the same
/// name. Allows querying names for VPValues for printing, similar to the
/// ModuleSlotTracker for IR values.
class VPSlotTracker {
/// Keep track of versioned names assigned to VPValues with underlying IR
/// values.
DenseMap<const VPValue *, std::string> VPValue2Name;
/// Keep track of the next number to use to version the base name.
StringMap<unsigned> BaseName2Version;
/// Number to assign to the next VPValue without underlying value.
unsigned NextSlot = 0;
/// Lazily created ModuleSlotTracker, used only when unnamed IR instructions
/// require slot tracking.
std::unique_ptr<ModuleSlotTracker> MST;
void assignName(const VPValue *V);
void assignNames(const VPlan &Plan);
void assignNames(const VPBasicBlock *VPBB);
std::string getName(const Value *V);
public:
VPSlotTracker(const VPlan *Plan = nullptr) {
if (Plan)
assignNames(*Plan);
}
/// Returns the name assigned to \p V, if there is one, otherwise try to
/// construct one from the underlying value, if there's one; else return
/// <badref>.
std::string getOrCreateName(const VPValue *V) const;
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
/// indented and follows the dot format.
class VPlanPrinter {
raw_ostream &OS;
const VPlan &Plan;
unsigned Depth = 0;
unsigned TabWidth = 2;
std::string Indent;
unsigned BID = 0;
SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
VPSlotTracker SlotTracker;
/// Handle indentation.
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
/// Print a given \p Block of the Plan.
void dumpBlock(const VPBlockBase *Block);
/// Print the information related to the CFG edges going out of a given
/// \p Block, followed by printing the successor blocks themselves.
void dumpEdges(const VPBlockBase *Block);
/// Print a given \p BasicBlock, including its VPRecipes, followed by printing
/// its successor blocks.
void dumpBasicBlock(const VPBasicBlock *BasicBlock);
/// Print a given \p Region of the Plan.
void dumpRegion(const VPRegionBlock *Region);
unsigned getOrCreateBID(const VPBlockBase *Block) {
return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
}
Twine getOrCreateName(const VPBlockBase *Block);
Twine getUID(const VPBlockBase *Block);
/// Print the information related to a CFG edge between two VPBlockBases.
void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
const Twine &Label);
public:
VPlanPrinter(raw_ostream &O, const VPlan &P)
: OS(O), Plan(P), SlotTracker(&P) {}
LLVM_DUMP_METHOD void dump();
};
#endif
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
|