JuliaLang · Keno · Oct 9, 2017 · Sep 18, 2017 · Sep 9, 2017 · Oct 7, 2017
diff --git a/base/inference.jl b/base/inference.jl
@@ -504,6 +504,8 @@ add_tfunc(sdiv_int, 2, 2, math_tfunc, 30)
 add_tfunc(udiv_int, 2, 2, math_tfunc, 30)
 add_tfunc(srem_int, 2, 2, math_tfunc, 30)
 add_tfunc(urem_int, 2, 2, math_tfunc, 30)
+add_tfunc(add_ptr, 2, 2, math_tfunc, 1)
+add_tfunc(sub_ptr, 2, 2, math_tfunc, 1)
 add_tfunc(neg_float, 1, 1, math_tfunc, 1)
 add_tfunc(add_float, 2, 2, math_tfunc, 1)
 add_tfunc(sub_float, 2, 2, math_tfunc, 1)

diff --git a/base/pointer.jl b/base/pointer.jl
@@ -147,8 +147,8 @@ eltype(::Type{Ptr{T}}) where {T} = T
 isless(x::Ptr, y::Ptr) = isless(UInt(x), UInt(y))
 -(x::Ptr, y::Ptr) = UInt(x) - UInt(y)
 
-+(x::Ptr, y::Integer) = oftype(x, (UInt(x) + (y % UInt) % UInt))
--(x::Ptr, y::Integer) = oftype(x, (UInt(x) - (y % UInt) % UInt))
++(x::Ptr, y::Integer) = oftype(x, Intrinsics.add_ptr(UInt(x), (y % UInt) % UInt))
+-(x::Ptr, y::Integer) = oftype(x, Intrinsics.sub_ptr(UInt(x), (y % UInt) % UInt))
 +(x::Integer, y::Ptr) = y + x
 
 """

diff --git a/deps/llvm.mk b/deps/llvm.mk
@@ -460,6 +460,7 @@ $(eval $(call LLVM_PATCH,llvm-D32593))
 $(eval $(call LLVM_PATCH,llvm-D33179))
 $(eval $(call LLVM_PATCH,llvm-PR29010-i386-xmm)) # Remove for 4.0
 $(eval $(call LLVM_PATCH,llvm-3.9.0-D37576-NVPTX-sm_70)) # NVPTX, Remove for 6.0
+$(eval $(call LLVM_PATCH,llvm-D37939-Mem2Reg-Also-handle-memcpy))
 else ifeq ($(LLVM_VER_SHORT),4.0)
 # Cygwin and openSUSE still use win32-threads mingw, https://llvm.org/bugs/show_bug.cgi?id=26365
 $(eval $(call LLVM_PATCH,llvm-4.0.0_threads))

diff --git a/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch b/deps/patches/llvm-D37939-Mem2Reg-Also-handle-memcpy.patch
@@ -0,0 +1,365 @@
+From da4504b2d3c6629fbd58634bf76f1b85939d07cf Mon Sep 17 00:00:00 2001
+From: Keno Fischer <keno@juliacomputing.com>
+Date: Fri, 15 Sep 2017 18:30:59 -0400
+Subject: [PATCH] [Mem2Reg] Also handle memcpy
+
+Summary:
+In julia, when we know we're moving data between two memory locations,
+we always emit that as a memcpy rather than a load/store pair. However,
+this can give worse optimization results in certain cases because some
+optimizations that can handle load/store pairs cannot handle memcpys.
+Mem2reg is one of these optimizations. This patch adds rudamentary
+support for mem2reg for recognizing memcpys that cover the whole alloca
+we're promoting. While several more sophisticated passes (SROA, GVN)
+can get similar optimizations, it is preferable to have these kinds
+of cases caught early to expose optimization opportunities before
+getting to these later passes. The approach taken here is to split
+the memcpy into a load/store pair early (after legality analysis)
+and retain the rest of the analysis only on loads/stores. It would
+be possible of course to leave the memcpy as is and generate the
+left over load or store only on demand. However, that would entail
+a significantly larger patch for unclear benefit.
+
+Reviewers: chandlerc, dberlin
+
+Subscribers: llvm-commits
+
+Differential Revision: https://reviews.llvm.org/D37939
+---
+ lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 166 ++++++++++++++++++++---
+ test/Transforms/Mem2Reg/memcpy.ll                | 101 ++++++++++++++
+ 2 files changed, 251 insertions(+), 16 deletions(-)
+ create mode 100644 test/Transforms/Mem2Reg/memcpy.ll
+
+diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+index ac28f59..b08a0a1 100644
+--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
++++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+@@ -49,6 +49,58 @@ STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
+ STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
+ STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
+
++static bool isSplittableMemCpy(const MemCpyInst *MCI, const AllocaInst *AI) {
++  // Punt if this alloca is an array allocation
++  if (AI->isArrayAllocation())
++    return false;
++  if (MCI->isVolatile())
++    return false;
++  Value *Length = MCI->getLength();
++  if (!isa<ConstantInt>(Length))
++    return false;
++  // Anything less than the full alloca, we leave for SROA
++  const DataLayout &DL = AI->getModule()->getDataLayout();
++  size_t AIElSize = DL.getTypeAllocSize(AI->getAllocatedType());
++  if (cast<ConstantInt>(Length)->getZExtValue() != AIElSize)
++    return false;
++  // If the other argument is also an alloca, we need to be sure that either
++  // the types are bitcastable, or the other alloca is not eligible for
++  // promotion (e.g. because the memcpy is for less than the whole size of
++  // that alloca), otherwise we risk turning an allocatable alloca into a
++  // non-allocatable one when splitting the memcpy.
++  AllocaInst *OtherAI = dyn_cast<AllocaInst>(
++      AI == MCI->getSource() ? MCI->getDest() : MCI->getSource());
++  if (OtherAI) {
++    if (!CastInst::isBitCastable(AI->getAllocatedType(),
++                                 OtherAI->getAllocatedType()) &&
++        DL.getTypeAllocSize(OtherAI->getAllocatedType()) == AIElSize)
++      return false;
++  }
++  return true;
++}
++
++/// Look at the result of a bitcast and see if it's only used by lifetime
++/// intrinsics or splittable memcpys. This is needed, because IRBuilder
++/// will always insert a bitcast to i8* for these intrinsics.
++static bool onlyHasCanonicalizableUsers(const AllocaInst *AI, const Value *V) {
++  for (const User *U : V->users()) {
++    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
++    if (!II)
++      return false;
++
++    if (isa<MemCpyInst>(II)) {
++      if (!isSplittableMemCpy(cast<MemCpyInst>(II), AI))
++        return false;
++      continue;
++    }
++
++    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
++        II->getIntrinsicID() != Intrinsic::lifetime_end)
++      return false;
++  }
++  return true;
++}
++
+ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+   // FIXME: If the memory unit is of pointer or integer type, we can permit
+   // assignments to subsections of the memory unit.
+@@ -68,6 +120,9 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+       // not have any meaning for a local alloca.
+       if (SI->isVolatile())
+         return false;
++    } else if (const MemCpyInst *MCI = dyn_cast<MemCpyInst>(U)) {
++      if (!isSplittableMemCpy(MCI, AI))
++        return false;
+     } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+       if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+           II->getIntrinsicID() != Intrinsic::lifetime_end)
+@@ -75,7 +130,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+     } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+       if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+         return false;
+-      if (!onlyUsedByLifetimeMarkers(BCI))
++      if (!onlyHasCanonicalizableUsers(AI, BCI))
+         return false;
+     } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+       if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS))
+@@ -181,7 +235,13 @@ public:
+   /// This code only looks at accesses to allocas.
+   static bool isInterestingInstruction(const Instruction *I) {
++    if (isa<MemCpyInst>(I)) {
++      const MemCpyInst *MCI = cast<MemCpyInst>(I);
++      return isa<AllocaInst>(MCI->getSource()) ||
++             isa<AllocaInst>(MCI->getDest());
++    } else {
+     return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+            (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+   }
++  }
+
+   /// Get or calculate the index of the specified instruction.
+@@ -208,6 +264,25 @@ public:
+     return It->second;
+   }
+
++  // When we split a memcpy intrinsic, we need to update the numbering in this
++  // struct. To make sure the relative ordering remains the same, we give both
++  // the LI and the SI the number that the MCI used to have (if they are both
++  // interesting). This means that they will have equal numbers, which usually
++  // can't happen. However, since they can never reference the same alloca
++  // (since memcpy operands may not overlap), this is fine, because we will
++  // never compare instruction indices for instructions that operate on distinct
++  // allocas.
++  void splitMemCpy(MemCpyInst *MCI, LoadInst *LI, StoreInst *SI) {
++    DenseMap<const Instruction *, unsigned>::iterator It =
++        InstNumbers.find(MCI);
++    if (It == InstNumbers.end())
++      return;
++    unsigned MemCpyNumber = It->second;
++    InstNumbers[LI] = MemCpyNumber;
++    InstNumbers[SI] = MemCpyNumber;
++    deleteValue(MCI);
++  }
++
+   void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
+
+   void clear() { InstNumbers.clear(); }
+@@ -305,9 +380,58 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+   AC->registerAssumption(CI);
+ }
+
+-static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+-  // Knowing that this alloca is promotable, we know that it's safe to kill all
+-  // instructions except for load and store.
++/// Split a memcpy instruction into the corresponding load/store. It is a little
++/// more complicated than one might imagine, because we need to deal with the
++/// fact that the side of the copy we're not currently processing might also
++/// be a promotable alloca. We need to be careful to not break the promotable
++/// predicate for that other alloca (if any).
++static void doMemCpySplit(LargeBlockInfo &LBI, MemCpyInst *MCI,
++                          AllocaInst *AI) {
++  AAMDNodes AA;
++  MCI->getAAMetadata(AA);
++  Value *MCISrc = MCI->getSource();
++  Type *LoadType = AI->getAllocatedType();
++  AllocaInst *SrcAI = dyn_cast<AllocaInst>(MCISrc);
++  if (SrcAI && SrcAI->getType() != AI->getType()) {
++    if (CastInst::isBitCastable(SrcAI->getAllocatedType(), LoadType))
++      LoadType = SrcAI->getAllocatedType();
++  }
++  if (cast<PointerType>(MCISrc->getType())->getElementType() != LoadType)
++    MCISrc = CastInst::Create(
++        Instruction::BitCast, MCISrc,
++        LoadType->getPointerTo(
++            cast<PointerType>(MCISrc->getType())->getAddressSpace()),
++        "", MCI);
++  // This might add to the end of the use list, but that's fine. At worst,
++  // we'd not visit the instructions we insert here, but we don't care
++  // about them in this loop anyway.
++  LoadInst *LI = new LoadInst(LoadType, MCISrc, "", MCI->isVolatile(),
++                              MCI->getAlignment(), MCI);
++  Value *Val = LI;
++  Value *MCIDest = MCI->getDest();
++  AllocaInst *DestAI = dyn_cast<AllocaInst>(MCIDest);
++  Type *DestElTy = DestAI ? DestAI->getAllocatedType() : AI->getAllocatedType();
++  if (LI->getType() != DestElTy &&
++      CastInst::isBitCastable(LI->getType(), DestElTy))
++    Val = CastInst::Create(Instruction::BitCast, Val, DestElTy, "", MCI);
++  if (cast<PointerType>(MCIDest->getType())->getElementType() != Val->getType())
++    MCIDest = CastInst::Create(
++        Instruction::BitCast, MCIDest,
++        Val->getType()->getPointerTo(
++            cast<PointerType>(MCIDest->getType())->getAddressSpace()),
++        "", MCI);
++  StoreInst *SI =
++      new StoreInst(Val, MCIDest, MCI->isVolatile(), MCI->getAlignment(), MCI);
++  LI->setAAMetadata(AA);
++  SI->setAAMetadata(AA);
++  LBI.splitMemCpy(MCI, LI, SI);
++  MCI->eraseFromParent();
++}
++
++static void canonicalizeUsers(LargeBlockInfo &LBI, AllocaInst *AI) {
++  // Knowing that this alloca is promotable, we know that it's safe to split
++  // MTIs into load/store and to kill all other instructions except for
++  // load and store.
+
+   for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) {
+     Instruction *I = cast<Instruction>(*UI);
+@@ -315,14 +439,24 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+     if (isa<LoadInst>(I) || isa<StoreInst>(I))
+       continue;
+
++    if (isa<MemCpyInst>(I)) {
++      MemCpyInst *MCI = cast<MemCpyInst>(I);
++      doMemCpySplit(LBI, MCI, AI);
++      continue;
++    }
++
+     if (!I->getType()->isVoidTy()) {
+-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+-      // Follow the use/def chain to erase them now instead of leaving it for
+-      // dead code elimination later.
++      // The only users of this bitcast/GEP instruction are lifetime/memcpy
++      // intrinsics. Split memcpys and delete lifetime intrinsics.
+       for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) {
+         Instruction *Inst = cast<Instruction>(*UUI);
+         ++UUI;
+-        Inst->eraseFromParent();
++        if (isa<MemCpyInst>(Inst)) {
++          doMemCpySplit(LBI, cast<MemCpyInst>(Inst), AI);
++        } else {
++          // Must be a lifetime intrinsic
++          Inst->eraseFromParent();
++        }
+       }
+     }
+     I->eraseFromParent();
+@@ -542,7 +676,7 @@ void PromoteMem2Reg::run() {
+     assert(AI->getParent()->getParent() == &F &&
+            "All allocas should be in the same function, which is same as DF!");
+
+-    removeLifetimeIntrinsicUsers(AI);
++    canonicalizeUsers(LBI, AI);
+
+     if (AI->use_empty()) {
+       // If there are no uses of the alloca, just delete it now.
+diff --git a/test/Transforms/Mem2Reg/memcpy.ll b/test/Transforms/Mem2Reg/memcpy.ll
+new file mode 100644
+index 0000000..fbc4096
+--- /dev/null
++++ b/test/Transforms/Mem2Reg/memcpy.ll
+@@ -0,0 +1,101 @@
++; RUN: opt < %s -mem2reg -S | FileCheck %s
++
++target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
++
++declare void @llvm.memcpy.p0i128.p0i64.i32(i128 *, i64 *, i32, i32, i1)
++declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *, i8 *, i32, i32, i1)
++declare void @llvm.memcpy.p0i64.p0i64.i32(i64 *, i64 *, i32, i32, i1)
++declare void @llvm.memcpy.p0f64.p0i64.i32(double *, i64 *, i32, i32, i1)
++
++define i128 @test_cpy_different(i64) {
++; CHECK-LABEL: @test_cpy_different
++; CHECK-NOT: alloca i64
++; CHECK: store i64 %0
++    %a = alloca i64
++    %b = alloca i128
++    store i128 0, i128 *%b
++    store i64 %0, i64 *%a
++    call void @llvm.memcpy.p0i128.p0i64.i32(i128 *%b, i64 *%a, i32 8, i32 0, i1 0)
++    %loaded = load i128, i128 *%b
++    ret i128 %loaded
++}
++
++define i64 @test_cpy_same(i64) {
++; CHECK-LABEL: @test_cpy_same
++; CHECK-NOT: alloca
++; CHECK: ret i64 %0
++    %a = alloca i64
++    %b = alloca i64
++    store i64 %0, i64 *%a
++    call void @llvm.memcpy.p0i64.p0i64.i32(i64 *%b, i64 *%a, i32 8, i32 0, i1 0)
++    %loaded = load i64, i64 *%b
++    ret i64 %loaded
++}
++
++define double @test_cpy_different_type(i64) {
++; CHECK-LABEL: @test_cpy_different_type
++; CHECK-NOT: alloca
++; CHECK: bitcast i64 %0 to double
++    %a = alloca i64
++    %b = alloca double
++    store i64 %0, i64 *%a
++    call void @llvm.memcpy.p0f64.p0i64.i32(double *%b, i64 *%a, i32 8, i32 0, i1 0)
++    %loaded = load double, double *%b
++    ret double %loaded
++}
++
++define i128 @test_cpy_differenti8(i64) {
++; CHECK-LABEL: @test_cpy_differenti8
++; CHECK-NOT: alloca i64
++; CHECK: store i64 %0
++    %a = alloca i64
++    %b = alloca i128
++    store i128 0, i128 *%b
++    store i64 %0, i64 *%a
++    %acast = bitcast i64* %a to i8*
++    %bcast = bitcast i128* %b to i8*
++    call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++    %loaded = load i128, i128 *%b
++    ret i128 %loaded
++}
++
++define i64 @test_cpy_samei8(i64) {
++; CHECK-LABEL: @test_cpy_samei8
++; CHECK-NOT: alloca
++; CHECK: ret i64 %0
++    %a = alloca i64
++    %b = alloca i64
++    store i64 %0, i64 *%a
++    %acast = bitcast i64* %a to i8*
++    %bcast = bitcast i64* %b to i8*
++    call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++    %loaded = load i64, i64 *%b
++    ret i64 %loaded
++}
++
++define double @test_cpy_different_typei8(i64) {
++; CHECK-LABEL: @test_cpy_different_typei8
++; CHECK-NOT: alloca
++; CHECK: bitcast i64 %0 to double
++    %a = alloca i64
++    %b = alloca double
++    store i64 %0, i64 *%a
++    %acast = bitcast i64* %a to i8*
++    %bcast = bitcast double* %b to i8*
++    call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%bcast, i8 *%acast, i32 8, i32 0, i1 0)
++    %loaded = load double, double *%b
++    ret double %loaded
++}
++
++define i64 @test_cpy_differenti8_reverse(i128) {
++; CHECK-LABEL: @test_cpy_differenti8_reverse
++; CHECK-NOT: alloca i64
++    %a = alloca i64
++    %b = alloca i128
++    store i128 %0, i128 *%b
++    %acast = bitcast i64* %a to i8*
++    %bcast = bitcast i128* %b to i8*
++    call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%acast, i8 *%bcast, i32 8, i32 0, i1 0)
++    %loaded = load i64, i64 *%a
++    ret i64 %loaded
++}
+-- 
+2.9.3
+