[SLP]Fix PR51320: Try to vectorize single store operands.

Currently, we try to vectorize values feeding into stores only if the
slp-vectorize-hor-store option is provided. We can safely enable
vectorization of the value operand of a single store in the basic block
if the operand value is used only in that store.
This should enable extra vectorization and should not increase compile
time significantly.
Fixes https://github.com/llvm/llvm-project/issues/51320

Differential Revision: https://reviews.llvm.org/D131894
This commit is contained in:
Alexey Bataev 2022-08-15 07:22:21 -07:00
parent b812db1464
commit 65c7cecb13
3 changed files with 297 additions and 573 deletions

View File

@ -12238,7 +12238,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
(it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {
KeyNodes.insert(&*it);
bool OpsChanged = false;
if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
auto *SI = dyn_cast<StoreInst>(it);
bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
if (SI) {
auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
// Try to vectorize chain in store, if this is the only store to the
// address in the block.
// TODO: This is just a temporary solution to save compile time. Need
// to investigate if we can safely turn on slp-vectorize-hor-store
// instead to allow lookup for reduction chains in all non-vectorized
// stores (need to check side effects and compile time).
TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&
SI->getValueOperand()->hasOneUse();
}
if (TryToVectorizeRoot) {
for (auto *V : it->operand_values()) {
// Try to match and vectorize a horizontal reduction.
OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);

View File

@ -37,13 +37,13 @@ define float @f(<2 x float> %x) {
define float @f_used_out_of_tree(<2 x float> %x) {
; CHECK-LABEL: @f_used_out_of_tree(
; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
; CHECK-NEXT: [[X0X0:%.*]] = fmul float [[X0]], [[X0]]
; CHECK-NEXT: [[X1X1:%.*]] = fmul float [[X1]], [[X1]]
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
; CHECK-NEXT: store float [[ADD]], float* @a, align 4
; CHECK-NEXT: ret float [[X0]]
; CHECK-NEXT: ret float [[TMP1]]
;
; THRESH1-LABEL: @f_used_out_of_tree(
; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0

File diff suppressed because it is too large Load Diff