From ce61def529e2d9ef46b79c9d1f489d69b45b95bf Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Mon, 8 Mar 2021 20:55:53 -0800 Subject: [PATCH] [regalloc] Ensure Query::collectInterferingVRegs is called before interval iteration The main part of the patch is the change in RegAllocGreedy.cpp: Q.collectInterferingVRegs() needs to be called before iterating the interfering live ranges. The rest of the patch helps ensure that this is the case: instead of clearing the query's InterferingVRegs field, we invalidate it. The clearing happens when the live reg matrix is invalidated (existing triggering mechanism). Without the change in RegAllocGreedy.cpp, the compiler ICEs (crashes with an internal compiler error). This patch should make it more easily discoverable by developers that collectInterferingVRegs needs to be called before iterating. I will follow up with a subsequent patch to improve the usability and maintainability of Query. Differential Revision: https://reviews.llvm.org/D98232 --- llvm/include/llvm/CodeGen/LiveIntervalUnion.h | 20 ++-- llvm/lib/CodeGen/LiveIntervalUnion.cpp | 19 ++-- llvm/lib/CodeGen/LiveRegMatrix.cpp | 16 ++- llvm/lib/CodeGen/RegAllocGreedy.cpp | 40 +++----- llvm/lib/Target/AArch64/AArch64Subtarget.h | 2 +- llvm/lib/Target/X86/X86Subtarget.h | 2 +- .../AArch64/ragreedy-local-interval-cost.ll | 2 +- llvm/test/CodeGen/X86/bug26810.ll | 2 +- .../greedy_regalloc_bad_eviction_sequence.ll | 2 +- llvm/test/CodeGen/X86/i128-mul.ll | 4 +- llvm/test/CodeGen/X86/mmx-arith.ll | 15 +-- llvm/test/CodeGen/X86/optimize-max-0.ll | 98 ++++++++++--------- 12 files changed, 119 insertions(+), 103 deletions(-) diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h index ad9e06d2bcf0..4ebe0f2dcfd8 100644 --- a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h @@ -114,30 +114,30 @@ public: const LiveRange *LR = nullptr; LiveRange::const_iterator LRI; ///< current position in LR ConstSegmentIter LiveUnionI; 
///< current position in LiveUnion - SmallVector InterferingVRegs; + Optional> InterferingVRegs; bool CheckedFirstInterference = false; bool SeenAllInterferences = false; unsigned Tag = 0; unsigned UserTag = 0; + public: + Query() = default; + Query(const LiveRange &LR, const LiveIntervalUnion &LIU) + : LiveUnion(&LIU), LR(&LR) {} + Query(const Query &) = delete; + Query &operator=(const Query &) = delete; + void reset(unsigned NewUserTag, const LiveRange &NewLR, const LiveIntervalUnion &NewLiveUnion) { LiveUnion = &NewLiveUnion; LR = &NewLR; - InterferingVRegs.clear(); + InterferingVRegs = None; CheckedFirstInterference = false; SeenAllInterferences = false; Tag = NewLiveUnion.getTag(); UserTag = NewUserTag; } - public: - Query() = default; - Query(const LiveRange &LR, const LiveIntervalUnion &LIU): - LiveUnion(&LIU), LR(&LR) {} - Query(const Query &) = delete; - Query &operator=(const Query &) = delete; - void init(unsigned NewUserTag, const LiveRange &NewLR, const LiveIntervalUnion &NewLiveUnion) { if (UserTag == NewUserTag && LR == &NewLR && LiveUnion == &NewLiveUnion && @@ -164,7 +164,7 @@ public: // Vector generated by collectInterferingVRegs. const SmallVectorImpl &interferingVRegs() const { - return InterferingVRegs; + return *InterferingVRegs; } }; diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp index 7ccb8df4bc05..dfa523d4bf41 100644 --- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp +++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp @@ -112,7 +112,7 @@ LiveInterval *LiveIntervalUnion::getOneVReg() const { // Scan the vector of interfering virtual registers in this union. Assume it's // quite small. 
bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const { - return is_contained(InterferingVRegs, VirtReg); + return is_contained(*InterferingVRegs, VirtReg); } // Collect virtual registers in this union that interfere with this @@ -126,9 +126,12 @@ bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const { // unsigned LiveIntervalUnion::Query:: collectInterferingVRegs(unsigned MaxInterferingRegs) { + if (!InterferingVRegs) + InterferingVRegs.emplace(); + // Fast path return if we already have the desired information. - if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs) - return InterferingVRegs.size(); + if (SeenAllInterferences || InterferingVRegs->size() >= MaxInterferingRegs) + return InterferingVRegs->size(); // Set up iterators on the first call. if (!CheckedFirstInterference) { @@ -157,14 +160,14 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { LiveInterval *VReg = LiveUnionI.value(); if (VReg != RecentReg && !isSeenInterference(VReg)) { RecentReg = VReg; - InterferingVRegs.push_back(VReg); - if (InterferingVRegs.size() >= MaxInterferingRegs) - return InterferingVRegs.size(); + InterferingVRegs->push_back(VReg); + if (InterferingVRegs->size() >= MaxInterferingRegs) + return InterferingVRegs->size(); } // This LiveUnion segment is no longer interesting. 
if (!(++LiveUnionI).valid()) { SeenAllInterferences = true; - return InterferingVRegs.size(); + return InterferingVRegs->size(); } } @@ -185,7 +188,7 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { LiveUnionI.advanceTo(LRI->start); } SeenAllInterferences = true; - return InterferingVRegs.size(); + return InterferingVRegs->size(); } void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc, diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index a69aa6557e46..4c0172a930b5 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -216,7 +216,21 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End, // Check for interference with that segment for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - if (query(LR, *Units).checkInterference()) + // LR is stack-allocated. LiveRegMatrix caches queries by a key that + // includes the address of the live range. If (for the same reg unit) this + // checkInterference overload is called twice, without any other query() + // calls in between (on heap-allocated LiveRanges) - which would invalidate + // the cached query - the LR address seen the second time may well be the + // same as that seen the first time, while the Start/End/valno may not - yet + // the same cached result would be fetched. To avoid that, we don't cache + // this query. + // + // FIXME: the usability of the Query API needs to be improved to avoid + // subtle bugs due to query identity. Avoiding caching, for example, would + // greatly simplify things. 
+ LiveIntervalUnion::Query Q; + Q.reset(UserTag, LR, Matrix[*Units]); + if (Q.checkInterference()) return true; } return false; diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 4ec275fdc0bf..26e7a1f17a22 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -471,12 +471,13 @@ private: bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; bool canEvictInterference(LiveInterval &, MCRegister, bool, EvictionCost &, const SmallVirtRegSet &) const; - bool canEvictInterferenceInRange(LiveInterval &VirtReg, MCRegister PhysReg, - SlotIndex Start, SlotIndex End, - EvictionCost &MaxCost) const; + bool canEvictInterferenceInRange(const LiveInterval &VirtReg, + MCRegister PhysReg, SlotIndex Start, + SlotIndex End, EvictionCost &MaxCost) const; MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order, - LiveInterval &VirtReg, SlotIndex Start, - SlotIndex End, float *BestEvictWeight); + const LiveInterval &VirtReg, + SlotIndex Start, SlotIndex End, + float *BestEvictWeight) const; void evictInterference(LiveInterval &, MCRegister, SmallVectorImpl &); bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg, @@ -979,7 +980,7 @@ bool RAGreedy::canEvictInterference( /// \param MaxCost Only look for cheaper candidates and update with new cost /// when returning true. /// \return True when interference can be evicted cheaper than MaxCost. 
-bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg, +bool RAGreedy::canEvictInterferenceInRange(const LiveInterval &VirtReg, MCRegister PhysReg, SlotIndex Start, SlotIndex End, EvictionCost &MaxCost) const { @@ -987,6 +988,7 @@ bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg, for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + Q.collectInterferingVRegs(); // Check if any interfering live range is heavier than MaxWeight. for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) { @@ -1031,9 +1033,9 @@ bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg, /// \return The PhysReg which is the best candidate for eviction and the /// eviction cost in BestEvictweight MCRegister RAGreedy::getCheapestEvicteeWeight(const AllocationOrder &Order, - LiveInterval &VirtReg, + const LiveInterval &VirtReg, SlotIndex Start, SlotIndex End, - float *BestEvictweight) { + float *BestEvictweight) const { EvictionCost BestEvictCost; BestEvictCost.setMax(); BestEvictCost.MaxWeight = VirtReg.weight(); @@ -1556,25 +1558,9 @@ bool RAGreedy::splitCanCauseLocalSpill(unsigned VirtRegToSplit, return false; } - // Check if the local interval will evict a cheaper interval. - float CheapestEvictWeight = 0; - MCRegister FutureEvictedPhysReg = getCheapestEvicteeWeight( - Order, LIS->getInterval(VirtRegToSplit), Cand.Intf.first(), - Cand.Intf.last(), &CheapestEvictWeight); - - // Have we found an interval that can be evicted? - if (FutureEvictedPhysReg) { - float splitArtifactWeight = - VRAI->futureWeight(LIS->getInterval(VirtRegToSplit), - Cand.Intf.first().getPrevIndex(), Cand.Intf.last()); - // Will the weight of the local interval be higher than the cheapest evictee - // weight? If so it will evict it and will not cause a spill. 
- if (splitArtifactWeight >= 0 && splitArtifactWeight > CheapestEvictWeight) - return false; - } - - // The local interval is not able to find non interferencing assignment and - // not able to evict a less worthy interval, therfore, it can cause a spill. + // The local interval is not able to find non interferencing assignment + // and not able to evict a less worthy interval, therfore, it can cause a + // spill. return true; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 8fe2f125982f..6447103128a5 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -557,7 +557,7 @@ public: bool enableEarlyIfConversion() const override; - bool enableAdvancedRASplitCost() const override { return true; } + bool enableAdvancedRASplitCost() const override { return false; } std::unique_ptr getCustomPBQPConstraints() const override; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index fa2622333d60..96bb96060543 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -941,7 +941,7 @@ public: return TargetSubtargetInfo::ANTIDEP_CRITICAL; } - bool enableAdvancedRASplitCost() const override { return true; } + bool enableAdvancedRASplitCost() const override { return false; } }; } // end namespace llvm diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index f3bd66ceae8c..1a9bff7915ba 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-arm-none-eabi < %s | FileCheck %s +; RUN: llc -consider-local-interval-cost -mtriple=aarch64-arm-none-eabi < %s | FileCheck %s @A = external dso_local local_unnamed_addr global [8 x [8 x 
i64]], align 8 @B = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8 diff --git a/llvm/test/CodeGen/X86/bug26810.ll b/llvm/test/CodeGen/X86/bug26810.ll index 7146f4cc3850..f0ea14e97eda 100644 --- a/llvm/test/CodeGen/X86/bug26810.ll +++ b/llvm/test/CodeGen/X86/bug26810.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s +; RUN: llc -consider-local-interval-cost < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s ; Make sure bad eviction sequence doesnt occur ; Fix for bugzilla 26810. diff --git a/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll b/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll index f300c8ce3d81..053225c1b547 100644 --- a/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll +++ b/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s +; RUN: llc -consider-local-interval-cost < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s ; Make sure bad eviction sequence doesnt occur ; Part of the fix for bugzilla 26810. 
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index 552a383e6da7..1142c31b69ec 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -162,9 +162,9 @@ define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind ; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload ; X86-NOBMI-NEXT: adcl $0, %edi ; X86-NOBMI-NEXT: movl %ebp, %esi -; X86-NOBMI-NEXT: xorl %ebx, %esi +; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NOBMI-NEXT: xorl %ebx, %edi ; X86-NOBMI-NEXT: orl %esi, %edi ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index c81520b98cdb..36dcdb967f1e 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -390,25 +390,28 @@ define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind { ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: testl %ecx, %ecx ; X32-NEXT: je .LBB3_1 ; X32-NEXT: # %bb.2: # %bb26.preheader -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: xorl %ebx, %ebx ; X32-NEXT: xorl %eax, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: .p2align 4, 0x90 ; X32-NEXT: .LBB3_3: # %bb26 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: movl (%edi,%ebx,8), %ebp +; X32-NEXT: movl %ecx, %esi ; X32-NEXT: movl 4(%edi,%ebx,8), %ecx -; X32-NEXT: addl (%esi,%ebx,8), %ebp -; X32-NEXT: adcl 4(%esi,%ebx,8), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: addl (%edi,%ebx,8), %ebp +; X32-NEXT: adcl 4(%edi,%ebx,8), %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl %esi, %ecx ; X32-NEXT: incl 
%ebx -; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: cmpl %esi, %ebx ; X32-NEXT: jb .LBB3_3 ; X32-NEXT: jmp .LBB3_4 ; X32-NEXT: .LBB3_1: diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 5367f390d1cb..b001e1cb0437 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -450,49 +450,51 @@ define void @bar(i8* %r, i32 %s, i32 %w, i32 %x, i8* %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl %ebp, %eax -; CHECK-NEXT: imull %ecx, %eax +; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: imull %eax, %edx ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: je LBB1_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: shrl $2, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shrl $2, %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: movl %eax, %edi ; CHECK-NEXT: je LBB1_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: je LBB1_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB1_6: ## %bb7.preheader -; CHECK-NEXT: ## =>This Loop Header: Depth=1 -; CHECK-NEXT: ## Child Loop BB1_4 Depth 2 +; CHECK-NEXT: movl %esi, %edx ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: movl 
{{[0-9]+}}(%esp), %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB1_4: ## %bb6 -; CHECK-NEXT: ## Parent Loop BB1_6 Depth=1 -; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx ; CHECK-NEXT: movb %bl, (%edx,%esi) ; CHECK-NEXT: incl %esi ; CHECK-NEXT: cmpl %edi, %esi ; CHECK-NEXT: jb LBB1_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 -; CHECK-NEXT: ## in Loop: Header=BB1_6 Depth=1 +; CHECK-NEXT: ## in Loop: Header=BB1_4 Depth=1 +; CHECK-NEXT: movl %edi, %ebx ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: addl %edi, %edx ; CHECK-NEXT: cmpl %ebp, %ecx -; CHECK-NEXT: jne LBB1_6 +; CHECK-NEXT: je LBB1_12 +; CHECK-NEXT: ## %bb.6: ## %bb7.preheader +; CHECK-NEXT: ## in Loop: Header=BB1_4 Depth=1 +; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: jmp LBB1_4 ; CHECK-NEXT: LBB1_12: ## %bb18.loopexit ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -501,10 +503,10 @@ define void @bar(i8* %r, i32 %s, i32 %w, i32 %x, i8* %j, i32 %d) nounwind { ; CHECK-NEXT: cmpl $1, %ebp ; CHECK-NEXT: jbe LBB1_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, {{[0-9]+}}(%esp) +; CHECK-NEXT: cmpl $2, %edi ; CHECK-NEXT: jb LBB1_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: shrl %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: shrl %eax @@ -518,14 +520,14 @@ define void @bar(i8* %r, i32 %s, i32 %w, i32 %x, i8* %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB1_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB1_10 Depth 2 -; CHECK-NEXT: movl %edi, 
%ebx +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %ebx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill ; CHECK-NEXT: addl %edx, %ebx ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload @@ -543,26 +545,27 @@ define void @bar(i8* %r, i32 %s, i32 %w, i32 %x, i8* %j, i32 %d) nounwind { ; CHECK-NEXT: jb LBB1_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1 -; CHECK-NEXT: incl %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; CHECK-NEXT: incl %ebx ; CHECK-NEXT: addl %ebp, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload ; CHECK-NEXT: addl $2, %edx ; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; CHECK-NEXT: jb LBB1_9 ; CHECK-NEXT: LBB1_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: cmpl $1, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: cmpl $1, %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: je LBB1_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %edx +; CHECK-NEXT: cmpl $3, %esi ; CHECK-NEXT: jne LBB1_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: testl %ebp, %ebp ; CHECK-NEXT: je LBB1_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph @@ -570,9 +573,11 @@ define void @bar(i8* %r, 
i32 %s, i32 %w, i32 %x, i8* %j, i32 %d) nounwind { ; CHECK-NEXT: leal 15(%ebp), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: leal 15(%ecx), %ebx -; CHECK-NEXT: andl $-16, %ebx -; CHECK-NEXT: addl %eax, %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl $15, %edx +; CHECK-NEXT: andl $-16, %edx +; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: leal (%edx,%eax), %ebp @@ -580,14 +585,16 @@ define void @bar(i8* %r, i32 %s, i32 %w, i32 %x, i8* %j, i32 %d) nounwind { ; CHECK-NEXT: LBB1_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %ecx -; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ecx, %edi ; CHECK-NEXT: calll _memcpy -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %edi, %ecx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ecx, %ebp -; CHECK-NEXT: addl %ebx, %edi +; CHECK-NEXT: addl %ebx, %ebp +; CHECK-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload ; CHECK-NEXT: decl %esi ; CHECK-NEXT: jne LBB1_17 ; CHECK-NEXT: LBB1_18: ## %bb26 @@ -607,21 +614,24 @@ define void @bar(i8* %r, i32 %s, i32 %w, i32 %x, i8* %j, i32 %d) nounwind { ; CHECK-NEXT: je LBB1_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 ; CHECK-NEXT: movl %ebp, %esi -; CHECK-NEXT: leal 15(%ecx), %ebx -; CHECK-NEXT: andl $-16, %ebx +; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: addl $15, %eax +; CHECK-NEXT: andl $-16, %eax +; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB1_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ecx ; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %ecx ; 
CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ecx, %ebx ; CHECK-NEXT: calll _memcpy -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ebx, %ecx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ecx, %ebp -; CHECK-NEXT: addl %ebx, %edi +; CHECK-NEXT: addl %edi, %ebp +; CHECK-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload ; CHECK-NEXT: decl %esi ; CHECK-NEXT: jne LBB1_21 ; CHECK-NEXT: LBB1_22: ## %bb33