[AArch64] Order STP Q's by ascending address

This patch adds an AArch64-specific PostRA MachineScheduler strategy that tries
to schedule STP Q's to the same base address in ascending order of offsets. We
have found this to improve performance on Neoverse N1, and it should not hurt
other AArch64 cores.

Differential Revision: https://reviews.llvm.org/D125377
This commit is contained in:
Andre Vieira 2022-05-23 09:43:39 +01:00
parent 0cc981e021
commit 572fc7d2fd
10 changed files with 125 additions and 48 deletions

View File

@ -0,0 +1,39 @@
//===- AArch64MachineScheduler.cpp - MI Scheduler for AArch64 -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AArch64MachineScheduler.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
using namespace llvm;
/// Compare \p TryCand against the current best candidate \p Cand.
///
/// The generic post-RA heuristics are always run first. Their verdict is
/// returned unchanged unless both candidates are STPQi instructions that use
/// an identical base operand and whose offset immediates differ by at least 2
/// (the criterion used here for "non-overlapping writes"). In that case the
/// candidate with the lower offset wins, so such stores end up ordered by
/// ascending offset.
///
/// \returns true if \p TryCand should be preferred over \p Cand.
bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                              SchedCandidate &TryCand) {
  // Run the generic comparison unconditionally; it is the fallback answer on
  // every path that does not match the STPQi special case below.
  const bool GenericVerdict = PostGenericScheduler::tryCandidate(Cand, TryCand);

  // Without an established best candidate there is nothing to order against.
  if (!Cand.isValid())
    return GenericVerdict;

  MachineInstr *TryMI = TryCand.SU->getInstr();
  MachineInstr *BestMI = Cand.SU->getInstr();
  if (!TryMI || !BestMI)
    return GenericVerdict;

  // The override only applies when both candidates are STPQi.
  if (TryMI->getOpcode() != BestMI->getOpcode() ||
      TryMI->getOpcode() != AArch64::STPQi)
    return GenericVerdict;

  // Operand 2 is the base address operand and operand 3 the offset immediate.
  MachineOperand &TryBase = TryMI->getOperand(2);
  MachineOperand &BestBase = BestMI->getOperand(2);
  int64_t TryOffset = TryMI->getOperand(3).getImm();
  int64_t BestOffset = BestMI->getOperand(3).getImm();

  // Require the same base address and non-overlapping writes.
  if (!TryBase.isIdenticalTo(BestBase) || llabs(TryOffset - BestOffset) < 2)
    return GenericVerdict;

  // Prefer the store with the smaller offset so they come out ascending.
  TryCand.Reason = NodeOrder;
  return TryOffset < BestOffset;
}

View File

@ -0,0 +1,33 @@
//===- AArch64MachineScheduler.h - Custom AArch64 MI scheduler --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Custom AArch64 MI scheduler.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
/// A MachineSchedStrategy implementation for AArch64 post RA scheduling.
/// Post-RA scheduling strategy for AArch64.
///
/// Derives from PostGenericScheduler and replaces only the candidate
/// comparison hook, tryCandidate().
class AArch64PostRASchedStrategy : public PostGenericScheduler {
public:
  /// Forward the scheduling context \p C to the generic post-RA scheduler.
  AArch64PostRASchedStrategy(const MachineSchedContext *C)
      : PostGenericScheduler(C) {}

protected:
  /// AArch64-specific comparison of \p TryCand against the current best
  /// candidate \p Cand; returns true when \p TryCand should be preferred.
  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
};
} // end namespace llvm
#endif

View File

@ -12,6 +12,7 @@
#include "AArch64TargetMachine.h"
#include "AArch64.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64MachineScheduler.h"
#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetObjectFile.h"
@ -474,15 +475,17 @@ public:
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
ScheduleDAGMI *DAG =
new ScheduleDAGMI(C, std::make_unique<AArch64PostRASchedStrategy>(C),
/* RemoveKillFlags=*/true);
if (ST.hasFusion()) {
// Run the Macro Fusion after RA again since literals are expanded from
// pseudos then (v. addPreSched2()).
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
DAG->addMutation(createAArch64MacroFusionDAGMutation());
return DAG;
}
return nullptr;
return DAG;
}
void addIRPasses() override;

View File

@ -65,6 +65,7 @@ add_llvm_target(AArch64CodeGen
AArch64LoadStoreOptimizer.cpp
AArch64LowerHomogeneousPrologEpilog.cpp
AArch64MachineFunctionInfo.cpp
AArch64MachineScheduler.cpp
AArch64MacroFusion.cpp
AArch64MIPeepholeOpt.cpp
AArch64MCInstLower.cpp

View File

@ -62,11 +62,11 @@ define i32 @test_musttail_variadic_spill(i32 %arg0, ...) {
; CHECK-NEXT: mov x24, x5
; CHECK-NEXT: mov x25, x6
; CHECK-NEXT: mov x26, x7
; CHECK-NEXT: stp q1, q0, [sp, #96] ; 32-byte Folded Spill
; CHECK-NEXT: mov x27, x8
; CHECK-NEXT: stp q3, q2, [sp, #64] ; 32-byte Folded Spill
; CHECK-NEXT: stp q5, q4, [sp, #32] ; 32-byte Folded Spill
; CHECK-NEXT: stp q7, q6, [sp] ; 32-byte Folded Spill
; CHECK-NEXT: mov x27, x8
; CHECK-NEXT: stp q5, q4, [sp, #32] ; 32-byte Folded Spill
; CHECK-NEXT: stp q3, q2, [sp, #64] ; 32-byte Folded Spill
; CHECK-NEXT: stp q1, q0, [sp, #96] ; 32-byte Folded Spill
; CHECK-NEXT: bl _puts
; CHECK-NEXT: ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
; CHECK-NEXT: mov w0, w19
@ -132,11 +132,11 @@ define void @f_thunk(i8* %this, ...) {
; CHECK-NEXT: mov x24, x5
; CHECK-NEXT: mov x25, x6
; CHECK-NEXT: mov x26, x7
; CHECK-NEXT: stp q1, q0, [sp, #96] ; 32-byte Folded Spill
; CHECK-NEXT: mov x27, x8
; CHECK-NEXT: stp q3, q2, [sp, #64] ; 32-byte Folded Spill
; CHECK-NEXT: stp q5, q4, [sp, #32] ; 32-byte Folded Spill
; CHECK-NEXT: stp q7, q6, [sp] ; 32-byte Folded Spill
; CHECK-NEXT: mov x27, x8
; CHECK-NEXT: stp q5, q4, [sp, #32] ; 32-byte Folded Spill
; CHECK-NEXT: stp q3, q2, [sp, #64] ; 32-byte Folded Spill
; CHECK-NEXT: stp q1, q0, [sp, #96] ; 32-byte Folded Spill
; CHECK-NEXT: str x10, [x9]
; CHECK-NEXT: bl _get_f
; CHECK-NEXT: ldp q1, q0, [sp, #96] ; 32-byte Folded Reload

View File

@ -53,8 +53,8 @@ define [ 9 x double ] @array_9() {
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: str xzr, [x8, #64]
; CHECK-NEXT: stp q0, q0, [x8, #32]
; CHECK-NEXT: stp q0, q0, [x8]
; CHECK-NEXT: stp q0, q0, [x8, #32]
; CHECK-NEXT: ret
ret [ 9 x double ] zeroinitializer
}
@ -232,8 +232,8 @@ define [ 5 x %T_STRUCT_SAMEM ] @array_of_struct_in_memory() {
; CHECK-LABEL: array_of_struct_in_memory:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: stp q0, q0, [x8, #48]
; CHECK-NEXT: stp q0, q0, [x8, #16]
; CHECK-NEXT: stp q0, q0, [x8, #48]
; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: ret
ret [ 5 x %T_STRUCT_SAMEM ] zeroinitializer
@ -350,8 +350,8 @@ define [ 2 x %T_NESTED_STRUCT_SAMEM ] @array_of_struct_nested_same_field_types_2
; CHECK-LABEL: array_of_struct_nested_same_field_types_2:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: stp q0, q0, [x8, #48]
; CHECK-NEXT: stp q0, q0, [x8, #16]
; CHECK-NEXT: stp q0, q0, [x8, #48]
; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: ret
ret [ 2 x %T_NESTED_STRUCT_SAMEM ] zeroinitializer
@ -440,8 +440,8 @@ define %T_IN_MEMORY @return_in_memory() {
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: str xzr, [x8, #64]
; CHECK-NEXT: stp q0, q0, [x8, #32]
; CHECK-NEXT: stp q0, q0, [x8]
; CHECK-NEXT: stp q0, q0, [x8, #32]
; CHECK-NEXT: ret
ret %T_IN_MEMORY zeroinitializer
}

View File

@ -52,8 +52,8 @@ define void @bzero_64_heap(i8* nocapture %c) {
; CHECK-LABEL: bzero_64_heap:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ret
call void @llvm.memset.p0i8.i64(i8* align 8 %c, i8 0, i64 64, i1 false)
ret void
@ -230,8 +230,8 @@ define void @bzero_64_stack() {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #80
@ -253,8 +253,8 @@ define void @bzero_72_stack() {
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: str xzr, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
@ -275,10 +275,10 @@ define void @bzero_128_stack() {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #144
@ -300,14 +300,14 @@ define void @bzero_256_stack() {
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp q0, q0, [sp, #224]
; CHECK-NEXT: stp q0, q0, [sp, #192]
; CHECK-NEXT: stp q0, q0, [sp, #160]
; CHECK-NEXT: stp q0, q0, [sp, #128]
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: stp q0, q0, [sp, #128]
; CHECK-NEXT: stp q0, q0, [sp, #160]
; CHECK-NEXT: stp q0, q0, [sp, #192]
; CHECK-NEXT: stp q0, q0, [sp, #224]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #272
@ -497,8 +497,8 @@ define void @memset_64_stack() {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: movi v0.16b, #170
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #80
@ -521,8 +521,8 @@ define void @memset_72_stack() {
; CHECK-NEXT: mov x8, #-6148914691236517206
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: str x8, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
@ -543,10 +543,10 @@ define void @memset_128_stack() {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: movi v0.16b, #170
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #144
@ -568,14 +568,14 @@ define void @memset_256_stack() {
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: movi v0.16b, #170
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: stp q0, q0, [sp, #224]
; CHECK-NEXT: stp q0, q0, [sp, #192]
; CHECK-NEXT: stp q0, q0, [sp, #160]
; CHECK-NEXT: stp q0, q0, [sp, #128]
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp, #96]
; CHECK-NEXT: stp q0, q0, [sp, #128]
; CHECK-NEXT: stp q0, q0, [sp, #160]
; CHECK-NEXT: stp q0, q0, [sp, #192]
; CHECK-NEXT: stp q0, q0, [sp, #224]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #272

View File

@ -152,15 +152,11 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: stp q13, q12, [x8]
; CHECK-NEXT: stp q11, q10, [x8, #32]
; CHECK-NEXT: stp q9, q8, [x8, #64]
; CHECK-NEXT: stp q4, q15, [x8, #432]
; CHECK-NEXT: stp q14, q3, [x8, #464]
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: stp q31, q30, [x8, #96]
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: stp q29, q28, [x8, #144]
; CHECK-NEXT: stp q31, q30, [x8, #96]
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: stp q29, q28, [x8, #144]
; CHECK-NEXT: stp q27, q26, [x8, #176]
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: str q25, [x8, #208]
; CHECK-NEXT: stp q24, q23, [x8, #240]
; CHECK-NEXT: stp q22, q21, [x8, #272]
@ -168,7 +164,11 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: stp q18, q17, [x8, #336]
; CHECK-NEXT: stp q16, q7, [x8, #368]
; CHECK-NEXT: stp q6, q5, [x8, #400]
; CHECK-NEXT: stp q4, q15, [x8, #432]
; CHECK-NEXT: stp q14, q3, [x8, #464]
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: str q2, [x8, #496]
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore b8

View File

@ -249,9 +249,9 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
@ -531,9 +531,9 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
@ -813,9 +813,9 @@ define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) #0 {
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b

View File

@ -131,6 +131,7 @@ static_library("LLVMAArch64CodeGen") {
"AArch64MCInstLower.cpp",
"AArch64MIPeepholeOpt.cpp",
"AArch64MachineFunctionInfo.cpp",
"AArch64MachineScheduler.cpp",
"AArch64MacroFusion.cpp",
"AArch64PBQPRegAlloc.cpp",
"AArch64PromoteConstant.cpp",