[AMDGPU] Add support for GFX11 LDSDIR hazards

Detect LDS direct WAR/WAW hazards and compute values for
wait_vdst (va_vdst) parameter.  Where appropriate this
raises wait_vdst from the default 0 to allow concurrent
issue of LDS direct with VALU execution.

Also detect LDS direct versus VMEM source VGPR hazards
and insert vm_vsrc=0 waits using s_waitcnt_depctr.

Differential Revision: https://reviews.llvm.org/D127963
This commit is contained in:
Jay Foad 2022-06-16 15:02:06 +01:00
parent bbf3fd4af1
commit 13107c2770
3 changed files with 499 additions and 8 deletions

View File

@ -427,6 +427,7 @@ void GCNHazardRecognizer::RecedeCycle() {
typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
// Search for a hazard in a block and its predecessors.
template <typename StateT>
@ -473,11 +474,11 @@ hasHazard(StateT State,
// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
const MachineBasicBlock *MBB,
MachineBasicBlock::const_reverse_instr_iterator I,
int WaitStates, IsExpiredFn IsExpired,
DenseSet<const MachineBasicBlock *> &Visited) {
static int getWaitStatesSince(
GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
for (auto E = MBB->instr_rend(); I != E; ++I) {
// Don't add WaitStates for parent BUNDLE instructions.
if (I->isBundle())
@ -489,7 +490,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
if (I->isInlineAsm())
continue;
WaitStates += SIInstrInfo::getNumWaitStates(*I);
WaitStates += GetNumWaitStates(*I);
if (IsExpired(*I, WaitStates))
return std::numeric_limits<int>::max();
@ -500,8 +501,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
if (!Visited.insert(Pred).second)
continue;
int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
WaitStates, IsExpired, Visited);
int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
IsExpired, Visited, GetNumWaitStates);
MinWaitStates = std::min(MinWaitStates, W);
}
@ -1075,6 +1076,10 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixSMEMtoVectorWriteHazards(MI);
fixVcmpxExecWARHazard(MI);
fixLdsBranchVmemWARHazard(MI);
if (ST.hasLdsDirect()) {
fixLdsDirectVALUHazard(MI);
fixLdsDirectVMEMHazard(MI);
}
fixVALUPartialForwardingHazard(MI);
fixVALUTransUseHazard(MI);
}
@ -1366,6 +1371,81 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
return true;
}
// Compute the wait_vdst (va_vdst) immediate for the LDS direct instruction
// \p MI and store it in the instruction's waitvdst operand.
//
// The preceding instructions are scanned backwards, counting only VALU
// instructions, until a VALU that reads or writes the LDS direct destination
// register is found.  The number of VALUs counted before that one becomes the
// wait value, capped at 15.  The value is forced to 0 if any TRANS instruction
// was examined during the scan.
//
// Returns false if \p MI is not an LDS direct instruction, true otherwise
// (the operand is always rewritten in that case).
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  // Largest value ever written to wait_vdst; the scan stops once it is reached.
  const int MaxVaVdst = 15;
  const Register DstReg =
      TII.getNamedOperand(*MI, AMDGPU::OpName::vdst)->getReg();

  // Set as a side effect of the hazard predicate below.
  bool SawTrans = false;

  auto TouchesDst = [this, DstReg, &SawTrans](const MachineInstr &Prev) {
    if (!SIInstrInfo::isVALU(Prev))
      return false;
    if (SIInstrInfo::isTRANS(Prev))
      SawTrans = true;
    // A read of the destination is a WAR hazard, a write is a WAW hazard.
    if (Prev.readsRegister(DstReg, &TRI))
      return true;
    return Prev.modifiesRegister(DstReg, &TRI);
  };

  auto HazardGone = [&](const MachineInstr &Prev, int WaitStates) {
    if (WaitStates >= MaxVaVdst)
      return true;
    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(Prev) || SIInstrInfo::isFLAT(Prev))
      return true;
    return SIInstrInfo::isDS(Prev) || SIInstrInfo::isEXP(Prev);
  };

  // Only VALU instructions advance the count.
  auto CountVALU = [](const MachineInstr &Prev) {
    if (SIInstrInfo::isVALU(Prev))
      return 1;
    return 0;
  };

  DenseSet<const MachineBasicBlock *> SeenBlocks;
  const int Distance = ::getWaitStatesSince(
      TouchesDst, MI->getParent(), std::next(MI->getReverseIterator()), 0,
      HazardGone, SeenBlocks, CountVALU);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS,
  // so fall back to 0 whenever one was seen.
  int VaVdst = 0;
  if (!SawTrans)
    VaVdst = std::min(Distance, MaxVaVdst);

  TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst)->setImm(VaVdst);
  return true;
}
// Resolve the hazard between the LDS direct instruction \p MI and an earlier
// VMEM, FLAT or DS instruction that reads or writes the LDS direct
// destination VGPR.
//
// If such an instruction is found before the hazard expires, an
// S_WAITCNT_DEPCTR requesting vm_vsrc=0 is inserted immediately before \p MI.
// The hazard is considered expired by any VALU or EXP instruction, by
// S_WAITCNT 0, and by any S_WAITCNT_DEPCTR whose vm_vsrc field is already 0
// (whatever its other fields request).
//
// Returns true if a wait was inserted, false otherwise.
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  // S_WAITCNT_DEPCTR immediate requesting vm_vsrc=0: every other bit of the
  // 16-bit immediate is set, i.e. no other field waits.
  const int64_t WaitVmVsrcZeroImm = 0xffe3;
  // The bits WaitVmVsrcZeroImm leaves clear, i.e. the vm_vsrc field itself.
  const int64_t VmVsrcFieldMask = 0xffff & ~WaitVmVsrcZeroImm;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
        !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };

  auto IsExpiredFn = [=](const MachineInstr &I, int) {
    if (SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I))
      return true;
    if (I.getOpcode() == AMDGPU::S_WAITCNT)
      return I.getOperand(0).getImm() == 0;
    // Test only the vm_vsrc bits so that a pre-existing wait which combines
    // vm_vsrc=0 with other fields is recognised and no redundant wait is
    // inserted.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)
      return (I.getOperand(0).getImm() & VmVsrcFieldMask) == 0;
    return false;
  };

  // int max means the search expired without finding a hazard.
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(WaitVmVsrcZeroImm);

  return true;
}
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
if (!ST.isWave64())
return false;

View File

@ -96,6 +96,8 @@ private:
bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
bool fixVcmpxExecWARHazard(MachineInstr *MI);
bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
// For an LDS direct instruction, scan preceding VALU instructions that read or
// write its destination VGPR and set the instruction's waitvdst operand
// accordingly. Returns false if MI is not an LDS direct instruction.
bool fixLdsDirectVALUHazard(MachineInstr *MI);
// For an LDS direct instruction, insert an S_WAITCNT_DEPCTR (vm_vsrc=0) before
// it when an unexpired preceding VMEM/FLAT/DS instruction reads or writes its
// destination VGPR. Returns true if a wait was inserted.
bool fixLdsDirectVMEMHazard(MachineInstr *MI);
bool fixVALUPartialForwardingHazard(MachineInstr *MI);
bool fixVALUTransUseHazard(MachineInstr *MI);

View File

@ -0,0 +1,409 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
---
name: lds_param_load_no_war
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_no_war
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_va_vdst0_war
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_va_vdst0_war
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_va_vdst0_war_salu
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_va_vdst0_war_salu
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $m0 = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$m0 = S_MOV_B32 killed $sgpr0
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_va_vdst1_war
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_va_vdst1_war
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 1, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_va_vdst10_war
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_va_vdst10_war
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 10, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_va_vdst10_waw
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_va_vdst10_waw
; GCN: $vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 10, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_va_vdst20_war
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_va_vdst20_war
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr20 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr7 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr8 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr9 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr10 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr11 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr12 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr13 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr14 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr15 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr16 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr17 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr18 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr19 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr20 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr21 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_valu_war_trans
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_valu_war_trans
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: S_WAITCNT_DEPCTR 4095
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_trans_war_valu
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_trans_war_valu
; GCN: $vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr4 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_valu_war_vmem
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_valu_war_vmem
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr4 = IMAGE_LOAD_V1_V4 $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_valu_war_lds
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_valu_war_lds
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr10 = DS_READ_B32 $vgpr2, 0, 0, implicit $m0, implicit $exec
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_valu_war_ldsdir
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_valu_war_ldsdir
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr10 = LDS_PARAM_LOAD 0, 1, 15, implicit $m0, implicit $exec
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 4, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr3 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr10 = LDS_PARAM_LOAD 0, 1, 15, implicit $m0, implicit $exec
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr6 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 4, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_vmem_war
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_vmem_war
; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: S_WAITCNT_DEPCTR 65507
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_vmem_war_valu
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_vmem_war_valu
; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
$vgpr5 = V_MUL_F32_e32 $vgpr2, $vgpr2, implicit $mode, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_vmem_war_exp
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_vmem_war_exp
; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_vmem_war_waitcnt
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt
; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
S_WAITCNT 0
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_vmem_war_waitcnt_depctr
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt_depctr
; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: S_WAITCNT_DEPCTR 65507
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
S_WAITCNT_DEPCTR 65507
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_param_load_vmem_war_waitcnt_depctr2
body: |
bb.0:
; GCN-LABEL: name: lds_param_load_vmem_war_waitcnt_depctr2
; GCN: $vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: S_WAITCNT_DEPCTR 65535
; GCN-NEXT: S_WAITCNT_DEPCTR 65507
; GCN-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
S_WAITCNT_DEPCTR 65535
$vgpr1 = LDS_PARAM_LOAD 0, 0, 15, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_direct_load_no_war
body: |
bb.0:
; GCN-LABEL: name: lds_direct_load_no_war
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_DIRECT_LOAD 15, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
$vgpr1 = LDS_DIRECT_LOAD 0, implicit $m0, implicit $exec
S_ENDPGM 0
...
---
name: lds_direct_load_va_vdst0_war
body: |
bb.0:
; GCN-LABEL: name: lds_direct_load_va_vdst0_war
; GCN: $vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr1 = LDS_DIRECT_LOAD 0, implicit $m0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
$vgpr1 = LDS_DIRECT_LOAD 15, implicit $m0, implicit $exec
S_ENDPGM 0
...