[LoopInterchange] Support loop interchange with floating point reductions

Enable loop interchange for floating point reductions when
reordering of the floating point operations involved is allowed.

Previously, when we encountered a floating point PHI node in the
outer loop exit block, we bailed out, because floating point
reductions could not be detected at the time. Now we remove this
limitation, since we are able to detect floating point reductions.

Reviewed By: #loopoptwg, Meinersbur

Differential Revision: https://reviews.llvm.org/D117450
This commit is contained in:
Congzhe Cao 2022-02-06 16:55:20 -05:00 committed by CongzheUalberta
parent 0d8850ae2c
commit 1ef04326ec
3 changed files with 107 additions and 26 deletions

View File

@ -733,8 +733,12 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
if (PHI->getNumIncomingValues() == 1) if (PHI->getNumIncomingValues() == 1)
continue; continue;
RecurrenceDescriptor RD; RecurrenceDescriptor RD;
if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
// Detect floating point reduction only when it can be reordered.
if (RD.getExactFPMathInst() != nullptr)
return nullptr;
return PHI; return PHI;
}
return nullptr; return nullptr;
} }
} }
@ -893,28 +897,23 @@ areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
for (PHINode &PHI : LoopNestExit->phis()) { for (PHINode &PHI : LoopNestExit->phis()) {
// FIXME: We currently are not able to detect floating point reductions
// and have to use floating point PHIs as a proxy to prevent
// interchanging in the presence of floating point reductions.
if (PHI.getType()->isFloatingPointTy())
return false;
for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
continue; continue;
// The incoming value is defined in the outer loop latch. Currently we // The incoming value is defined in the outer loop latch. Currently we
// only support that in case the outer loop latch has a single predecessor. // only support that in case the outer loop latch has a single predecessor.
// This guarantees that the outer loop latch is executed if and only if // This guarantees that the outer loop latch is executed if and only if
// the inner loop is executed (because tightlyNested() guarantees that the // the inner loop is executed (because tightlyNested() guarantees that the
// outer loop header only branches to the inner loop or the outer loop // outer loop header only branches to the inner loop or the outer loop
// latch). // latch).
// FIXME: We could weaken this logic and allow multiple predecessors, // FIXME: We could weaken this logic and allow multiple predecessors,
// if the values are produced outside the loop latch. We would need // if the values are produced outside the loop latch. We would need
// additional logic to update the PHI nodes in the exit block as // additional logic to update the PHI nodes in the exit block as
// well. // well.
if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
return false; return false;
} }
} }
return true; return true;

View File

@ -135,9 +135,8 @@ for.end16: ; preds = %for.exit
ret void ret void
} }
; FIXME: We currently do not support LCSSA phi nodes involving floating point ; Loops with floating point reductions are interchanged with fastmath.
; types, as we fail to detect floating point reductions for now. ; REMARK: Interchanged
; REMARK: UnsupportedPHIOuter
; REMARK-NEXT: lcssa_04 ; REMARK-NEXT: lcssa_04
define void @lcssa_04() { define void @lcssa_04() {
@ -146,28 +145,31 @@ entry:
outer.header: ; preds = %outer.inc, %entry outer.header: ; preds = %outer.inc, %entry
%iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
%float.outer = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %outer.inc ] %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.outer.next, %outer.inc ]
br label %for.body3 br label %for.body3
for.body3: ; preds = %for.body3, %outer.header for.body3: ; preds = %for.body3, %outer.header
%iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
%float.inner = phi float [ %float.inner.next, %for.body3 ], [ %float.outer, %outer.header ]
%arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
%vA = load i32, i32* %arrayidx5 %vA = load i32, i32* %arrayidx5
%arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
%vC = load i32, i32* %arrayidx9 %vC = load i32, i32* %arrayidx9
%add = add nsw i32 %vA, %vC %add = add nsw i32 %vA, %vC
%float.inner.next = fadd fast float %float.inner, 1.000000e+00
store i32 %add, i32* %arrayidx5 store i32 %add, i32* %arrayidx5
%iv.inner.next = add nuw nsw i64 %iv.inner, 1 %iv.inner.next = add nuw nsw i64 %iv.inner, 1
%exitcond = icmp eq i64 %iv.inner.next, 100 %exitcond = icmp eq i64 %iv.inner.next, 100
br i1 %exitcond, label %outer.inc, label %for.body3 br i1 %exitcond, label %outer.inc, label %for.body3
outer.inc: ; preds = %for.body3 outer.inc: ; preds = %for.body3
%float.outer.next = phi float [ %float.inner.next, %for.body3 ]
%iv.outer.next = add nsw i64 %iv.outer, 1 %iv.outer.next = add nsw i64 %iv.outer, 1
%cmp = icmp eq i64 %iv.outer.next, 100 %cmp = icmp eq i64 %iv.outer.next, 100
br i1 %cmp, label %outer.header, label %for.exit br i1 %cmp, label %outer.header, label %for.exit
for.exit: ; preds = %outer.inc for.exit: ; preds = %outer.inc
%float.outer.lcssa = phi float [ %float.outer, %outer.inc ] %float.outer.lcssa = phi float [ %float.outer.next, %outer.inc ]
store float %float.outer.lcssa, float* @F store float %float.outer.lcssa, float* @F
br label %for.end16 br label %for.end16

View File

@ -227,3 +227,83 @@ for1.loopexit: ; preds = %for1.inc
%il.res.lcssa2 = phi i64 [ %sum.inc.amend, %for1.inc ] %il.res.lcssa2 = phi i64 [ %sum.inc.amend, %for1.inc ]
ret i64 %il.res.lcssa2 ret i64 %il.res.lcssa2
} }
; Floating point reductions are interchanged if all the fp instructions
; involved allow reassociation.
; REMARKS: --- !Passed
; REMARKS-NEXT: Pass: loop-interchange
; REMARKS-NEXT: Name: Interchanged
; REMARKS-NEXT: Function: test5
; test5: two-level loop nest that accumulates a float over elements of %Arr
; and %Arr2 and returns the final sum. Every fadd in the reduction chain
; carries the `fast` flag.
define float @test5([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
entry:
br label %outer.header
outer.header: ; preds = %outer.inc, %entry
%iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
; Accumulator carried around the outer loop: 1.0 on entry, otherwise the
; value the inner loop produced on the previous outer iteration.
%float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
br label %for.body3
for.body3: ; preds = %for.body3, %outer.header
; Inner-loop accumulator: seeded from the outer accumulator and updated by
; the second fadd below on every inner iteration.
%float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
%iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
; Both arrays are indexed as [%iv.inner][%iv.outer].
%arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
%vA = load float, float* %arrayidx5
; First link of the reduction chain (fast-math flagged).
%float.inner.inc = fadd fast float %float.inner, %vA
%arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
%vB = load float, float* %arrayidx6
; Second link of the reduction chain (fast-math flagged); feeds the PHI above.
%float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
%iv.inner.next = add nuw nsw i64 %iv.inner, 1
%exitcond = icmp eq i64 %iv.inner.next, 100
br i1 %exitcond, label %outer.inc, label %for.body3
outer.inc: ; preds = %for.body3
; Single-input PHI forwarding the inner loop's result out of the inner loop.
%float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
%iv.outer.next = add nsw i64 %iv.outer, 1
%cmp = icmp eq i64 %iv.outer.next, 100
; NOTE(review): the branch back to %outer.header is taken when %cmp (eq 100)
; is true - confirm this branch sense is intended for the test.
br i1 %cmp, label %outer.header, label %for.exit
for.exit: ; preds = %outer.inc
; Floating point PHI in the outer loop's exit block; its value is returned.
%float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
ret float %float.outer.lcssa
}
; Floating point reductions are not interchanged if not all the fp instructions
; involved allow reassociation.
; REMARKS: --- !Missed
; REMARKS-NEXT: Pass: loop-interchange
; REMARKS-NEXT: Name: UnsupportedPHIOuter
; REMARKS-NEXT: Function: test6
; test6: same loop nest shape as a float sum over %Arr and %Arr2, but the
; first fadd in the reduction chain has no fast-math flags while the second
; one is `fast`. The REMARKS lines preceding this function expect an
; UnsupportedPHIOuter missed-optimization remark for it.
define float @test6([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
entry:
br label %outer.header
outer.header: ; preds = %outer.inc, %entry
%iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
; Accumulator carried around the outer loop: 1.0 on entry, otherwise the
; value the inner loop produced on the previous outer iteration.
%float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
br label %for.body3
for.body3: ; preds = %for.body3, %outer.header
; Inner-loop accumulator: seeded from the outer accumulator and updated by
; the second fadd below on every inner iteration.
%float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
%iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
; Both arrays are indexed as [%iv.inner][%iv.outer].
%arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
%vA = load float, float* %arrayidx5
; This link of the chain deliberately omits the `fast` flag.
%float.inner.inc = fadd float %float.inner, %vA ; do not allow reassociation
%arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
%vB = load float, float* %arrayidx6
; Second link of the chain is still fast-math flagged; feeds the PHI above.
%float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
%iv.inner.next = add nuw nsw i64 %iv.inner, 1
%exitcond = icmp eq i64 %iv.inner.next, 100
br i1 %exitcond, label %outer.inc, label %for.body3
outer.inc: ; preds = %for.body3
; Single-input PHI forwarding the inner loop's result out of the inner loop.
%float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
%iv.outer.next = add nsw i64 %iv.outer, 1
%cmp = icmp eq i64 %iv.outer.next, 100
; NOTE(review): the branch back to %outer.header is taken when %cmp (eq 100)
; is true - confirm this branch sense is intended for the test.
br i1 %cmp, label %outer.header, label %for.exit
for.exit: ; preds = %outer.inc
; Floating point PHI in the outer loop's exit block; its value is returned.
%float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
ret float %float.outer.lcssa
}