From da4504b2d3c6629fbd58634bf76f1b85939d07cf Mon Sep 17 00:00:00 2001
From: Keno Fischer <keno@juliacomputing.com>
Date: Fri, 15 Sep 2017 18:30:59 -0400
Subject: [PATCH] [Mem2Reg] Also handle memcpy
Summary:
In Julia, when we know we're moving data between two memory locations,
we always emit that as a memcpy rather than a load/store pair. However,
this can give worse optimization results in certain cases because some
optimizations that can handle load/store pairs cannot handle memcpys.
Mem2reg is one such optimization. This patch adds rudimentary support
to mem2reg for recognizing memcpys that cover the whole alloca being
promoted. While several more sophisticated passes (SROA, GVN) can
achieve similar results, it is preferable to catch these cases early,
exposing optimization opportunities before those later passes run. The
approach taken here is to split the memcpy into a load/store pair early
(after legality analysis) and retain the rest of the analysis on loads
and stores only. It would of course be possible to leave the memcpy as
is and generate the leftover load or store only on demand, but that
would entail a significantly larger patch for unclear benefit.
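For illustration, a hypothetical before/after (mirroring the test
cases added below; the value names are invented): a full-size copy
between two promotable i64 allocas such as

    %a = alloca i64
    %b = alloca i64
    %acast = bitcast i64* %a to i8*
    %bcast = bitcast i64* %b to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %bcast, i8* %acast, i32 8, i32 0, i1 0)

is split into the equivalent load/store pair

    %val = load i64, i64* %a
    store i64 %val, i64* %b

after which the existing load/store promotion machinery handles both
allocas as usual.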
Reviewers: chandlerc, dberlin
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37939
---
lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 166 ++++++++++++++++++++---
test/Transforms/Mem2Reg/memcpy.ll | 101 ++++++++++++++
2 files changed, 251 insertions(+), 16 deletions(-)
create mode 100644 test/Transforms/Mem2Reg/memcpy.ll
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index ac28f59..b08a0a1 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -49,6 +49,58 @@ STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
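+/// Check whether a memcpy involving this alloca can be split into a plain
+/// load/store pair: the alloca must not be an array allocation, the copy
+/// must be non-volatile with a constant length covering the entire alloca,
+/// and splitting must not spoil the promotability of an alloca on the
+/// other side of the copy.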
+static bool isSplittableMemCpy(const MemCpyInst *MCI, const AllocaInst *AI) {
+ // Punt if this alloca is an array allocation
+ if (AI->isArrayAllocation())
+ return false;
+ if (MCI->isVolatile())
+ return false;
+ Value *Length = MCI->getLength();
+ if (!isa<ConstantInt>(Length))
+ return false;
+ // Anything covering less than the full alloca we leave for SROA.
+ const DataLayout &DL = AI->getModule()->getDataLayout();
+ size_t AIElSize = DL.getTypeAllocSize(AI->getAllocatedType());
+ if (cast<ConstantInt>(Length)->getZExtValue() != AIElSize)
+ return false;
+ // If the other argument is also an alloca, we need to be sure that either
+ // the types are bitcastable, or the other alloca is not eligible for
+ // promotion (e.g. because the memcpy covers less than the whole size of
+ // that alloca); otherwise we risk turning a promotable alloca into a
+ // non-promotable one when splitting the memcpy.
+ AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ AI == MCI->getSource() ? MCI->getDest() : MCI->getSource());
+ if (OtherAI) {
+ if (!CastInst::isBitCastable(AI->getAllocatedType(),
+ OtherAI->getAllocatedType()) &&
+ DL.getTypeAllocSize(OtherAI->getAllocatedType()) == AIElSize)
+ return false;
+ }
+ return true;
+}
+
+/// Look at the result of a bitcast and see if it's only used by lifetime
+/// intrinsics or splittable memcpys. This is needed because IRBuilder
+/// always inserts a bitcast to i8* for these intrinsics.
+static bool onlyHasCanonicalizableUsers(const AllocaInst *AI, const Value *V) {
+ for (const User *U : V->users()) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ if (!II)
+ return false;
+
+ if (isa<MemCpyInst>(II)) {
+ if (!isSplittableMemCpy(cast<MemCpyInst>(II), AI))
+ return false;
+ continue;
+ }
+
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ }
+ return true;
+}
+
bool llvm::isAllocaPromotable(const AllocaInst *AI) {
// FIXME: If the memory unit is of pointer or integer type, we can permit
// assignments to subsections of the memory unit.
@@ -68,6 +120,9 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
// not have any meaning for a local alloca.
if (SI->isVolatile())
return false;
+ } else if (const MemCpyInst *MCI = dyn_cast<MemCpyInst>(U)) {
+ if (!isSplittableMemCpy(MCI, AI))
+ return false;
} else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
II->getIntrinsicID() != Intrinsic::lifetime_end)
@@ -75,7 +130,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
} else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
return false;
- if (!onlyUsedByLifetimeMarkers(BCI))
+ if (!onlyHasCanonicalizableUsers(AI, BCI))
return false;
} else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
@@ -181,7 +235,13 @@ public:
/// This code only looks at accesses to allocas.
static bool isInterestingInstruction(const Instruction *I) {
+ if (isa<MemCpyInst>(I)) {
+ const MemCpyInst *MCI = cast<MemCpyInst>(I);
+ return isa<AllocaInst>(MCI->getSource()) ||
+ isa<AllocaInst>(MCI->getDest());
+ } else {
return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
(isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
}
+ }
/// Get or calculate the index of the specified instruction.
@@ -208,6 +264,25 @@ public:
return It->second;
}
+ // When we split a memcpy intrinsic, we need to update the numbering in this
+ // struct. To make sure the relative ordering remains the same, we give both
+ // the LI and the SI the number that the MCI used to have (if they are both
+ // interesting). This means they will share a number, which ordinarily
+ // cannot happen. However, the two can never reference the same alloca
+ // (memcpy operands may not overlap), and we only ever compare instruction
+ // indices for instructions operating on the same alloca, so the shared
+ // number is harmless.
+ void splitMemCpy(MemCpyInst *MCI, LoadInst *LI, StoreInst *SI) {
+ DenseMap<const Instruction *, unsigned>::iterator It =
+ InstNumbers.find(MCI);
+ if (It == InstNumbers.end())
+ return;
+ unsigned MemCpyNumber = It->second;
+ InstNumbers[LI] = MemCpyNumber;
+ InstNumbers[SI] = MemCpyNumber;
+ deleteValue(MCI);
+ }
+
void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
void clear() { InstNumbers.clear(); }
@@ -305,9 +380,58 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
AC->registerAssumption(CI);
}
-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
- // Knowing that this alloca is promotable, we know that it's safe to kill all
- // instructions except for load and store.
+/// Split a memcpy instruction into the corresponding load/store pair. This is
+/// a little more complicated than one might imagine, because the side of the
+/// copy we are not currently processing might also be a promotable alloca,
+/// and we need to be careful not to break the promotability predicate for
+/// that other alloca (if any).
+static void doMemCpySplit(LargeBlockInfo &LBI, MemCpyInst *MCI,
+ AllocaInst *AI) {
+ AAMDNodes AA;
+ MCI->getAAMetadata(AA);
+ Value *MCISrc = MCI->getSource();
+ Type *LoadType = AI->getAllocatedType();
+ AllocaInst *SrcAI = dyn_cast<AllocaInst>(MCISrc);
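+ // If the source is itself an alloca whose element type differs from ours
+ // but is bitcastable to it, load using the source's element type; that way
+ // the source alloca is still accessed at its own type and stays promotable.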
+ if (SrcAI && SrcAI->getType() != AI->getType()) {
+ if (CastInst::isBitCastable(SrcAI->getAllocatedType(), LoadType))
+ LoadType = SrcAI->getAllocatedType();
+ }
+ if (cast<PointerType>(MCISrc->getType())->getElementType() != LoadType)
+ MCISrc = CastInst::Create(
+ Instruction::BitCast, MCISrc,
+ LoadType->getPointerTo(
+ cast<PointerType>(MCISrc->getType())->getAddressSpace()),
+ "", MCI);
+ // Creating these instructions may append to the alloca's use list, but
+ // that's fine: at worst, the user walk in our caller skips the
+ // instructions we insert here, and it has no need to visit them.
+ LoadInst *LI = new LoadInst(LoadType, MCISrc, "", MCI->isVolatile(),
+ MCI->getAlignment(), MCI);
+ Value *Val = LI;
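+ // Mirror the source-side handling for the destination: prefer storing a
+ // value of the destination alloca's element type, bitcasting the loaded
+ // value when possible, and only bitcast the destination pointer if the
+ // types still disagree.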
+ Value *MCIDest = MCI->getDest();
+ AllocaInst *DestAI = dyn_cast<AllocaInst>(MCIDest);
+ Type *DestElTy = DestAI ? DestAI->getAllocatedType() : AI->getAllocatedType();
+ if (LI->getType() != DestElTy &&
+ CastInst::isBitCastable(LI->getType(), DestElTy))
+ Val = CastInst::Create(Instruction::BitCast, Val, DestElTy, "", MCI);
+ if (cast<PointerType>(MCIDest->getType())->getElementType() != Val->getType())
+ MCIDest = CastInst::Create(
+ Instruction::BitCast, MCIDest,
+ Val->getType()->getPointerTo(
+ cast<PointerType>(MCIDest->getType())->getAddressSpace()),
+ "", MCI);
+ StoreInst *SI =
+ new StoreInst(Val, MCIDest, MCI->isVolatile(), MCI->getAlignment(), MCI);
+ LI->setAAMetadata(AA);
+ SI->setAAMetadata(AA);
+ LBI.splitMemCpy(MCI, LI, SI);
+ MCI->eraseFromParent();
+}
+
+static void canonicalizeUsers(LargeBlockInfo &LBI, AllocaInst *AI) {
+ // Knowing that this alloca is promotable, we know that it's safe to split
+ // memcpys into load/store pairs and to kill all other instructions except
+ // for loads and stores.
for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
Instruction *I = cast<Instruction>(*UI);
@@ -315,14 +439,24 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
if (isa<LoadInst>(I) || isa<StoreInst>(I))
continue;
+ if (isa<MemCpyInst>(I)) {
+ MemCpyInst *MCI = cast<MemCpyInst>(I);
+ doMemCpySplit(LBI, MCI, AI);
+ continue;
+ }
+
if (!I->getType()->isVoidTy()) {
- // The only users of this bitcast/GEP instruction are lifetime intrinsics.
- // Follow the use/def chain to erase them now instead of leaving it for
- // dead code elimination later.
+ // The only users of this bitcast/GEP instruction are lifetime/memcpy
+ // intrinsics. Split memcpys and delete lifetime intrinsics.
for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
Instruction *Inst = cast<Instruction>(*UUI);
++UUI;
- Inst->eraseFromParent();
+ if (isa<MemCpyInst>(Inst)) {
+ doMemCpySplit(LBI, cast<MemCpyInst>(Inst), AI);
+ } else {
+ // Must be a lifetime intrinsic
+ Inst->eraseFromParent();
+ }
}
}
I->eraseFromParent();
@@ -542,7 +676,7 @@ void PromoteMem2Reg::run() {
assert(AI->getParent()->getParent() == &F &&
"All allocas should be in the same function, which is same as DF!");
- removeLifetimeIntrinsicUsers(AI);
+ canonicalizeUsers(LBI, AI);
if (AI->use_empty()) {
// If there are no uses of the alloca, just delete it now.
diff --git a/test/Transforms/Mem2Reg/memcpy.ll b/test/Transforms/Mem2Reg/memcpy.ll
new file mode 100644
index 0000000..fbc4096
--- /dev/null
+++ b/test/Transforms/Mem2Reg/memcpy.ll
@@ -0,0 +1,101 @@
+; RUN: opt < %s -mem2reg -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.p0i128.p0i64.i32(i128 *, i64 *, i32, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *, i8 *, i32, i32, i1)
+declare void @llvm.memcpy.p0i64.p0i64.i32(i64 *, i64 *, i32, i32, i1)
+declare void @llvm.memcpy.p0f64.p0i64.i32(double *, i64 *, i32, i32, i1)
+
+define i128 @test_cpy_different(i64) {
+; CHECK-LABEL: @test_cpy_different
+; CHECK-NOT: alloca i64
+; CHECK: store i64 %0
+ %a = alloca i64
+ %b = alloca i128
+ store i128 0, i128 *%b
+ store i64 %0, i64 *%a
+ call void @llvm.memcpy.p0i128.p0i64.i32(i128 *%b, i64 *%a, i32 8, i32 0, i1 0)
+ %loaded = load i128, i128 *%b
+ ret i128 %loaded
+}
+
+define i64 @test_cpy_same(i64) {
+; CHECK-LABEL: @test_cpy_same
+; CHECK-NOT: alloca
+; CHECK: ret i64 %0
+ %a = alloca i64
+ %b = alloca i64
+ store i64 %0, i64 *%a
+ call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0)
+ %loaded = load i64, i64 *%b
+ ret i64 %loaded
+}
+
+define double @test_cpy_different_type(i64) {
+; CHECK-LABEL: @test_cpy_different_type
+; CHECK-NOT: alloca
+; CHECK: bitcast i64 %0 to double
+ %a = alloca i64
+ %b = alloca double
+ store i64 %0, i64 *%a
+ call void @llvm.memcpy.p0f64.p0i64.i32(double *%b, i64 *%a, i32 8, i32 0, i1 0)
+ %loaded = load double, double *%b
+ ret double %loaded
+}
+
+define i128 @test_cpy_differenti8(i64) {
+; CHECK-LABEL: @test_cpy_differenti8
+; CHECK-NOT: alloca i64
+; CHECK: store i64 %0
+ %a = alloca i64
+ %b = alloca i128
+ store i128 0, i128 *%b
+ store i64 %0, i64 *%a
+ %acast = bitcast i64* %a to i8*
+ %bcast = bitcast i128* %b to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
+ %loaded = load i128, i128 *%b
+ ret i128 %loaded
+}
+
+define i64 @test_cpy_samei8(i64) {
+; CHECK-LABEL: @test_cpy_samei8
+; CHECK-NOT: alloca
+; CHECK: ret i64 %0
+ %a = alloca i64
+ %b = alloca i64
+ store i64 %0, i64 *%a
+ %acast = bitcast i64* %a to i8*
+ %bcast = bitcast i64* %b to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
+ %loaded = load i64, i64 *%b
+ ret i64 %loaded
+}
+
+define double @test_cpy_different_typei8(i64) {
+; CHECK-LABEL: @test_cpy_different_typei8
+; CHECK-NOT: alloca
+; CHECK: bitcast i64 %0 to double
+ %a = alloca i64
+ %b = alloca double
+ store i64 %0, i64 *%a
+ %acast = bitcast i64* %a to i8*
+ %bcast = bitcast double* %b to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
+ %loaded = load double, double *%b
+ ret double %loaded
+}
+
+define i64 @test_cpy_differenti8_reverse(i128) {
+; CHECK-LABEL: @test_cpy_differenti8_reverse
+; CHECK-NOT: alloca i64
+ %a = alloca i64
+ %b = alloca i128
+ store i128 %0, i128 *%b
+ %acast = bitcast i64* %a to i8*
+ %bcast = bitcast i128* %b to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%acast, i8 *%bcast, i32 8, i32 0, i1 0)
+ %loaded = load i64, i64 *%a
+ ret i64 %loaded
+}
--
2.9.3