[SLP]Fix PR51320: Try to vectorize single store operands.

Currently, we try to vectorize values feeding into stores only if the slp-vectorize-hor-store option is provided. We can safely enable vectorization of the value operand of a single store in the basic block if the operand value is used only by that store. This should enable extra vectorization and should not increase compile time significantly. Fixes https://github.com/llvm/llvm-project/issues/51320 Differential Revision: https://reviews.llvm.org/D131894
2022-08-15 07:22:21 -07:00 · 2022-08-15 07:22:21 -07:00 · 65c7cecb13
parent b812db1464
commit 65c7cecb13
3 changed files with 297 additions and 573 deletions
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -12238,7 +12238,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
        (it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {
      KeyNodes.insert(&*it);
      bool OpsChanged = false;
-      if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+      auto *SI = dyn_cast<StoreInst>(it);
+      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
+      if (SI) {
+        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
+        // Try to vectorize chain in store, if this is the only store to the
+        // address in the block.
+        // TODO: This is just a temporary solution to save compile time. Need
+        // to investigate if we can safely turn on slp-vectorize-hor-store
+        // instead to allow lookup for reduction chains in all non-vectorized
+        // stores (need to check side effects and compile time).
+        TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&
+                             SI->getValueOperand()->hasOneUse();
+      }
+      if (TryToVectorizeRoot) {
        for (auto *V : it->operand_values()) {
          // Try to match and vectorize a horizontal reduction.
          OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
@ -37,13 +37,13 @@ define float @f(<2 x float> %x) {

 define float @f_used_out_of_tree(<2 x float> %x) {
 ; CHECK-LABEL: @f_used_out_of_tree(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
-; CHECK-NEXT:    [[X0X0:%.*]] = fmul float [[X0]], [[X0]]
-; CHECK-NEXT:    [[X1X1:%.*]] = fmul float [[X1]], [[X1]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    store float [[ADD]], float* @a, align 4
-; CHECK-NEXT:    ret float [[X0]]
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
 ; THRESH1-LABEL: @f_used_out_of_tree(
 ; THRESH1-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll