[AMDGPU][GFX11] Mitigate VALU mask write hazard

VALU use of an SGPR (pair) as mask followed by SALU write to the
same SGPR can cause incorrect execution of subsequent SALU reads
of the SGPR.

Reviewed By: foad, rampitec

Differential Revision: https://reviews.llvm.org/D134151
This commit is contained in:
Carl Ritson 2022-10-01 09:17:42 +09:00
parent a5c46bf952
commit a35013bec6
4 changed files with 701 additions and 0 deletions

View File

@ -1102,6 +1102,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUTransUseHazard(MI);
fixWMMAHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@ -2709,3 +2710,140 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
return false;
}
// Mitigate the VALU mask write hazard for the SALU instruction MI.
//
// The hazard is a three-instruction sequence:
//   1. a VALU reads an SGPR (pair) as a mask,
//   2. a SALU writes that SGPR,
//   3. a SALU reads that SGPR.
// Enough distance between steps 2 and 3 lets the hazard expire, but in
// practice that happens less than 10% of the time, so step 3 is never
// searched for: finding steps 1 and 2 is taken as proof of the hazard.
//
// Returns true if an "s_waitcnt_depctr sa_sdst(0)" was inserted after MI,
// and false if MI needed no mitigation.
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  // Only SALU instructions in wave64 on affected subtargets are candidates.
  if (!ST.isWave64() || !ST.hasVALUMaskWriteHazard() ||
      !SIInstrInfo::isSALU(*MI))
    return false;

  const auto IsExecReg = [](Register Reg) {
    return Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
           Reg == AMDGPU::EXEC_HI;
  };
  const auto IsVCCReg = [](Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
           Reg == AMDGPU::VCC_HI;
  };

  // Step 2 of the sequence: MI has to write a register through sdst.
  const MachineOperand *DstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!DstOp || !DstOp->isReg())
    return false;

  // Writes to EXEC and M0 are never treated as hazardous.
  const Register WrittenReg = DstOp->getReg();
  if (IsExecReg(WrittenReg) || WrittenReg == AMDGPU::M0)
    return false;

  // Step 1 of the sequence: does instruction I read WrittenReg as a mask?
  const bool WritesVCC = IsVCCReg(WrittenReg);
  auto ReadsWrittenRegAsMask = [WrittenReg, WritesVCC,
                                this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    default:
      return false;

    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_e64:
    case AMDGPU::V_CNDMASK_B16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // The mask is the explicit src2 operand; only its overlap with the
      // written register matters.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), WrittenReg);
    }

    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_e32:
    case AMDGPU::V_CNDMASK_B16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These take their mask implicitly from VCC.
      return WritesVCC;
    }
  };

  // Does instruction I, found between the mask read and MI, already
  // mitigate the hazard?
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto MitigatesHazard = [&MRI, IsExecReg, IsVCCReg,
                          this](const MachineInstr &I, int) {
    // An existing s_waitcnt_depctr sa_sdst(0), i.e. one whose immediate has
    // bit 0 clear, is sufficient.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        !(I.getOperand(0).getImm() & 0x1))
      return true;

    // Otherwise only a VALU that accesses some SGPR or a literal constant
    // helps. The hazard register itself needs no special handling because
    // this predicate is only evaluated for instructions the hazard
    // predicate has already rejected.
    if (!SIInstrInfo::isVALU(I))
      return false;

    for (unsigned Idx = 0, NumOps = I.getNumOperands(); Idx != NumOps;
         ++Idx) {
      const MachineOperand &Op = I.getOperand(Idx);

      if (!Op.isReg()) {
        if (TII.isLiteralConstant(Op, I.getDesc().OpInfo[Idx]))
          return true;
        continue;
      }

      // Definitions do not count, and neither does any use of EXEC.
      if (!Op.isUse())
        continue;
      const Register Reg = Op.getReg();
      if (IsExecReg(Reg))
        continue;

      // VCC is the only implicit use that counts.
      if (Op.isImplicit()) {
        if (IsVCCReg(Reg))
          return true;
        continue;
      }

      if (TRI.isSGPRReg(MRI, Reg))
        return true;
    }
    return false;
  };

  // No unmitigated mask read precedes MI: nothing to do.
  if (::getWaitStatesSince(ReadsWrittenRegAsMask, MI, MitigatesHazard) ==
      std::numeric_limits<int>::max())
    return false;

  // Place s_waitcnt_depctr sa_sdst(0) directly after the SALU write.
  auto &MBB = *MI->getParent();
  auto AfterMI = std::next(MI->getIterator());
  BuildMI(MBB, AfterMI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);

  if (MI->getOpcode() != AMDGPU::S_GETPC_B64)
    return true;

  // The write may be an s_getpc at the head of a bundle. Offsets of global
  // references in the remaining bundled instructions are bumped by 4 to
  // account for the instruction just inserted.
  for (auto It = AfterMI; It != MBB.end() && It->isBundledWithPred(); ++It) {
    for (MachineOperand &MO : It->operands()) {
      if (MO.isGlobal())
        MO.setOffset(MO.getOffset() + 4);
    }
  }
  return true;
}

View File

@ -106,6 +106,7 @@ private:
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);

View File

@ -1058,6 +1058,8 @@ public:
/// Return true if the subtarget's generation is GFX11 or later.
bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; }
/// Return true if the subtarget's generation is GFX11 or later. The hazard
/// recognizer uses this, together with a wave64 check, to decide whether the
/// VALU mask write hazard (VALU reads an SGPR as mask, SALU then writes it)
/// has to be mitigated.
bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts; }

View File

@ -0,0 +1,560 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
--- |
@mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>]
define amdgpu_gs void @mask_hazard_getpc1() { ret void }
define amdgpu_gs void @mask_hazard_getpc2() { ret void }
define amdgpu_gs void @mask_hazard_vcc1() { ret void }
define amdgpu_gs void @mask_hazard_vcc2() { ret void }
define amdgpu_gs void @mask_hazard_cndmask_dpp1() { ret void }
define amdgpu_gs void @mask_hazard_cndmask_dpp2() { ret void }
define amdgpu_gs void @mask_hazard_cndmask_dpp3() { ret void }
define amdgpu_gs void @mask_hazard_cndmask_dpp4() { ret void }
define amdgpu_gs void @mask_hazard_addc1() { ret void }
define amdgpu_gs void @mask_hazard_addc2() { ret void }
define amdgpu_gs void @mask_hazard_addc3() { ret void }
define amdgpu_gs void @mask_hazard_addc4() { ret void }
define amdgpu_gs void @mask_hazard_subb1() { ret void }
define amdgpu_gs void @mask_hazard_subb2() { ret void }
define amdgpu_gs void @mask_hazard_subb3() { ret void }
define amdgpu_gs void @mask_hazard_subb4() { ret void }
define amdgpu_gs void @mask_hazard_subbrev1() { ret void }
define amdgpu_gs void @mask_hazard_subbrev2() { ret void }
define amdgpu_gs void @mask_hazard_subbrev3() { ret void }
define amdgpu_gs void @mask_hazard_subbrev4() { ret void }
define amdgpu_gs void @mask_hazard_div_fmas_f32() { ret void }
define amdgpu_gs void @mask_hazard_div_fmas_f64() { ret void }
define amdgpu_gs void @mask_hazard_subreg1() { ret void }
define amdgpu_gs void @mask_hazard_subreg2() { ret void }
define amdgpu_gs void @mask_hazard_subreg3() { ret void }
define amdgpu_gs void @mask_hazard_subreg4() { ret void }
define amdgpu_gs void @mask_hazard_subreg5() { ret void }
define amdgpu_gs void @mask_hazard_waitcnt() { ret void }
define amdgpu_gs void @mask_hazard_gap1() { ret void }
define amdgpu_gs void @mask_hazard_gap2() { ret void }
define amdgpu_gs void @mask_hazard_gap3() { ret void }
define amdgpu_gs void @mask_hazard_no_hazard1() { ret void }
define amdgpu_gs void @mask_hazard_no_hazard2() { ret void }
define amdgpu_gs void @mask_hazard_no_hazard3() { ret void }
...
---
name: mask_hazard_getpc1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_getpc1
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
S_ENDPGM 0
...
# S_GETPC_B64 at the head of a bundle: the wait is inserted inside the bundle
# and the @mem offsets of the following bundled instructions grow by 4.
---
name: mask_hazard_getpc2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_getpc2
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc
; GCN-NEXT: }
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
BUNDLE implicit-def $sgpr0_sgpr1 {
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc
$sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc, implicit $scc
}
S_ENDPGM 0
...
---
name: mask_hazard_vcc1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_vcc1
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_vcc2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_vcc2
; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_cndmask_dpp1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_cndmask_dpp1
; GCN: $vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_cndmask_dpp2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_cndmask_dpp2
; GCN: $vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_cndmask_dpp3
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_cndmask_dpp3
; GCN: $vgpr0 = V_CNDMASK_B16_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_CNDMASK_B16_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_cndmask_dpp4
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_cndmask_dpp4
; GCN: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_addc1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_addc1
; GCN: $vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_addc2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_addc2
; GCN: $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_addc3
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_addc3
; GCN: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_addc4
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_addc4
; GCN: $vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subb1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subb1
; GCN: $vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subb2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subb2
; GCN: $vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subb3
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subb3
; GCN: $vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subb4
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subb4
; GCN: $vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subbrev1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subbrev1
; GCN: $vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subbrev2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subbrev2
; GCN: $vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subbrev3
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subbrev3
; GCN: $vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_subbrev4
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subbrev4
; GCN: $vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
$sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_div_fmas_f32
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_div_fmas_f32
; GCN: $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
---
name: mask_hazard_div_fmas_f64
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_div_fmas_f64
; GCN: $vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec
$vcc = S_CSELECT_B64 -1, 0, implicit $scc
S_ENDPGM 0
...
# Check low word overlap
---
name: mask_hazard_subreg1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subreg1
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GCN-NEXT: $sgpr2 = S_MOV_B32 0
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
$sgpr2 = S_MOV_B32 0
S_ENDPGM 0
...
# Check high word overlap
---
name: mask_hazard_subreg2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subreg2
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GCN-NEXT: $sgpr3 = S_MOV_B32 0
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
$sgpr3 = S_MOV_B32 0
S_ENDPGM 0
...
# Check multiple subreg overlap
---
name: mask_hazard_subreg3
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subreg3
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GCN-NEXT: $sgpr2 = S_MOV_B32 0
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr3 = S_MOV_B32 0
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
$sgpr2 = S_MOV_B32 0
$sgpr3 = S_MOV_B32 0
S_ENDPGM 0
...
# Check vcc_lo overlap
---
name: mask_hazard_subreg4
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subreg4
; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc_lo = S_MOV_B32 0
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vcc_lo = S_MOV_B32 0
$sgpr2 = S_MOV_B32 $vcc_lo
S_ENDPGM 0
...
# Check vcc_hi overlap
---
name: mask_hazard_subreg5
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_subreg5
; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GCN-NEXT: $vcc_hi = S_MOV_B32 0
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vcc_hi = S_MOV_B32 0
$sgpr2 = S_MOV_B32 $vcc_hi
S_ENDPGM 0
...
# S_WAITCNT does not mitigate hazard
---
name: mask_hazard_waitcnt
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_waitcnt
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
S_WAITCNT 0
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
S_ENDPGM 0
...
# Check that intervening VALUs using only implicit $exec do not mitigate the hazard
---
name: mask_hazard_gap1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_gap1
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
$vgpr3 = V_MOV_B32_e32 0, implicit $exec
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
S_ENDPGM 0
...
# Check that an intervening VALU with an implicit $mode use does not mitigate the hazard
---
name: mask_hazard_gap2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_gap2
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
S_ENDPGM 0
...
# Check that an intervening VALU with an explicit $exec_lo use does not mitigate the hazard
---
name: mask_hazard_gap3
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_gap3
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
; GCN-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
S_ENDPGM 0
...
# Different SGPR write
---
name: mask_hazard_no_hazard1
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_no_hazard1
; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
$sgpr0 = S_MOV_B32 0
S_ENDPGM 0
...
# Different SGPR write with mask read overlap
---
name: mask_hazard_no_hazard2
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_no_hazard2
; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $vcc
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$sgpr0_sgpr1 = S_MOV_B64 $vcc
S_ENDPGM 0
...
# Overlapping VGPR write
---
name: mask_hazard_no_hazard3
body: |
bb.0:
; GCN-LABEL: name: mask_hazard_no_hazard3
; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
S_ENDPGM 0
...