[LoopInterchange] Support loop interchange with floating point reductions

Enable loop interchange support for floating point reductions when reordering of floating point operations is allowed. Previously, when we encountered a floating point PHI node in the outer loop exit block, we bailed out, since we could not detect floating point reductions at the time. Now we remove this limitation, since we are able to detect floating point reductions. Reviewed By: #loopoptwg, Meinersbur Differential Revision: https://reviews.llvm.org/D117450
2022-02-06 16:55:20 -05:00 · 2022-02-06 16:55:20 -05:00 · 1ef04326ec
parent 0d8850ae2c
commit 1ef04326ec
3 changed files with 107 additions and 26 deletions
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@ -733,8 +733,12 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
      if (PHI->getNumIncomingValues() == 1)
        continue;
      RecurrenceDescriptor RD;
-      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
+        // Detect floating point reduction only when it can be reordered.
+        if (RD.getExactFPMathInst() != nullptr)
+          return nullptr;
        return PHI;
+      }
      return nullptr;
    }
  }
@ -893,28 +897,23 @@ areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
 static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
  BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
  for (PHINode &PHI : LoopNestExit->phis()) {
-    //  FIXME: We currently are not able to detect floating point reductions
-    //         and have to use floating point PHIs as a proxy to prevent
-    //         interchanging in the presence of floating point reductions.
-    if (PHI.getType()->isFloatingPointTy())
-      return false;
    for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
-     Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
-     if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
-       continue;
+      Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+      if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+        continue;

-     // The incoming value is defined in the outer loop latch. Currently we
-     // only support that in case the outer loop latch has a single predecessor.
-     // This guarantees that the outer loop latch is executed if and only if
-     // the inner loop is executed (because tightlyNested() guarantees that the
-     // outer loop header only branches to the inner loop or the outer loop
-     // latch).
-     // FIXME: We could weaken this logic and allow multiple predecessors,
-     //        if the values are produced outside the loop latch. We would need
-     //        additional logic to update the PHI nodes in the exit block as
-     //        well.
-     if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
-       return false;
+      // The incoming value is defined in the outer loop latch. Currently we
+      // only support that in case the outer loop latch has a single predecessor.
+      // This guarantees that the outer loop latch is executed if and only if
+      // the inner loop is executed (because tightlyNested() guarantees that the
+      // outer loop header only branches to the inner loop or the outer loop
+      // latch).
+      // FIXME: We could weaken this logic and allow multiple predecessors,
+      //        if the values are produced outside the loop latch. We would need
+      //        additional logic to update the PHI nodes in the exit block as
+      //        well.
+      if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+        return false;
    }
  }
  return true;
--- a/llvm/test/Transforms/LoopInterchange/lcssa.ll
+++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll
@ -135,9 +135,8 @@ for.end16:                                        ; preds = %for.exit
  ret void
 }

-; FIXME: We currently do not support LCSSA phi nodes involving floating point
-;        types, as we fail to detect floating point reductions for now.
-; REMARK: UnsupportedPHIOuter
+; Loops with floating point reductions are interchanged with fastmath.
+; REMARK: Interchanged
 ; REMARK-NEXT: lcssa_04

 define void @lcssa_04() {
@ -146,28 +145,31 @@ entry:

 outer.header:                                     ; preds = %outer.inc, %entry
  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
-  %float.outer = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.outer.next, %outer.inc ]
  br label %for.body3

 for.body3:                                        ; preds = %for.body3, %outer.header
  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %float.inner = phi float [ %float.inner.next, %for.body3 ], [ %float.outer, %outer.header ]
  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
  %vA = load i32, i32* %arrayidx5
  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
  %vC = load i32, i32* %arrayidx9
  %add = add nsw i32 %vA, %vC
+  %float.inner.next = fadd fast float %float.inner, 1.000000e+00
  store i32 %add, i32* %arrayidx5
  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
  %exitcond = icmp eq i64 %iv.inner.next, 100
  br i1 %exitcond, label %outer.inc, label %for.body3

 outer.inc:                                        ; preds = %for.body3
+  %float.outer.next = phi float [ %float.inner.next, %for.body3 ]
  %iv.outer.next = add nsw i64 %iv.outer, 1
  %cmp = icmp eq i64 %iv.outer.next, 100
  br i1 %cmp, label %outer.header, label %for.exit

 for.exit:                                         ; preds = %outer.inc
-  %float.outer.lcssa = phi float [ %float.outer, %outer.inc ]
+  %float.outer.lcssa = phi float [ %float.outer.next, %outer.inc ]
  store float %float.outer.lcssa, float* @F
  br label %for.end16

--- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@ -227,3 +227,83 @@ for1.loopexit:                                 ; preds = %for1.inc
  %il.res.lcssa2 = phi i64 [ %sum.inc.amend, %for1.inc ]
  ret i64 %il.res.lcssa2
 }
+
+; Floating point reductions are interchanged if all the fp instructions
+; involved allow reassociation.
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        test5
+
+define float @test5([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load float, float* %arrayidx5
+  %float.inner.inc = fadd fast float %float.inner, %vA
+  %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vB = load float, float* %arrayidx6
+  %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %for.body3
+  %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
+  ret float %float.outer.lcssa
+}
+
+; Floating point reductions are not interchanged if not all the fp instructions
+; involved allow reassociation.
+; REMARKS: --- !Missed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            UnsupportedPHIOuter
+; REMARKS-NEXT: Function:        test6
+
+define float @test6([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load float, float* %arrayidx5
+  %float.inner.inc = fadd float %float.inner, %vA ; do not allow reassociation
+  %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vB = load float, float* %arrayidx6
+  %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %for.body3
+  %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
+  ret float %float.outer.lcssa
+}