[AMDGPU] Shrink MAD/FMA to MADAK/MADMK/FMAAK/FMAMK on GFX10

On GFX10 VOP3 instructions can have a literal operand, so the conversion
from VOP3 MAD/FMA to VOP2 MADAK/MADMK/FMAAK/FMAMK will not happen in
SIFoldOperands. The only benefit of the VOP2 form is code size, so do it
in SIShrinkInstructions instead.

Differential Revision: https://reviews.llvm.org/D125567
This commit is contained in:
Jay Foad 2022-02-18 17:24:50 +00:00
parent 8ab819ad90
commit 27fa41583f
3 changed files with 90 additions and 3 deletions

View File

@ -46,6 +46,7 @@ public:
void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
void shrinkScalarCompare(MachineInstr &MI) const;
void shrinkMIMG(MachineInstr &MI) const;
void shrinkMadFma(MachineInstr &MI) const;
bool shrinkScalarLogicOp(MachineInstr &MI) const;
bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
Register Reg, unsigned SubReg) const;
@ -324,6 +325,82 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
}
}
// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
//
// Rewrites a V_MAD_F32_e64 / V_FMA_F32_e64 that uses a non-inline literal into
// the matching AK/MK opcode. \p MI is either updated in place or, when the two
// multiplicands have to trade places, replaced by a freshly built instruction
// and erased.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Only attempt this on subtargets where VOP3 can take a literal operand,
  // and only when the instruction has no modifiers set.
  if (!ST->hasVOP3Literal() || TII->hasAnyModifiersSet(MI))
    return;

  MachineOperand &Mul0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Mul1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Addend = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);

  // Operand classifiers used by both patterns below.
  const auto IsVGPROperand = [&](const MachineOperand &MO) {
    return MO.isReg() && TRI->isVGPR(*MRI, MO.getReg());
  };
  const auto IsLiteralOperand = [&](const MachineOperand &MO) {
    return MO.isImm() && !TII->isInlineConstant(MO);
  };

  // Map the current opcode to its MAD- or FMA-flavoured replacement.
  const unsigned OldOpc = MI.getOpcode();
  const auto PickOpcode = [OldOpc](unsigned MadForm,
                                   unsigned FmaForm) -> unsigned {
    switch (OldOpc) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      return MadForm;
    case AMDGPU::V_FMA_F32_e64:
      return FmaForm;
    }
  };

  unsigned ShrunkOpc;
  bool NeedSwap;
  if (IsLiteralOperand(Addend)) {
    // "Dst = VSrc * VGPR + Imm" -> AK form. The VGPR multiplicand has to sit
    // in the second source position; swap if it is currently in the first.
    if (IsVGPROperand(Mul1))
      NeedSwap = false;
    else if (IsVGPROperand(Mul0))
      NeedSwap = true;
    else
      return;
    ShrunkOpc = PickOpcode(AMDGPU::V_MADAK_F32, AMDGPU::V_FMAAK_F32);
  } else if (IsVGPROperand(Addend)) {
    // "Dst = VSrc * Imm + VGPR" -> MK form. The literal multiplicand has to
    // sit in the second source position; swap if it is currently in the first.
    if (IsLiteralOperand(Mul1))
      NeedSwap = false;
    else if (IsLiteralOperand(Mul0))
      NeedSwap = true;
    else
      return;
    ShrunkOpc = PickOpcode(AMDGPU::V_MADMK_F32, AMDGPU::V_FMAMK_F32);
  } else {
    // Neither pattern applies; leave the instruction untouched.
    return;
  }

  if (!NeedSwap) {
    // Operands are already in the right order: drop the modifier operands and
    // retarget the existing instruction.
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(ShrunkOpc));
    return;
  }

  // Swap Src0 and Src1 by building a new instruction in front of MI, carrying
  // over the destination register and MI flags, then remove the original.
  MachineBasicBlock &MBB = *MI.getParent();
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(ShrunkOpc),
          MI.getOperand(0).getReg())
      .add(Mul1)
      .add(Mul0)
      .add(Addend)
      .setMIFlags(MI.getFlags());
  MI.eraseFromParent();
}
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
@ -726,6 +803,16 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
if (!TII->isVOP3(MI))
continue;
// TODO: Also shrink F16 forms.
if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F32_e64) {
shrinkMadFma(MI);
continue;
}
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;

View File

@ -68,7 +68,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v4, v4, v10
; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a
; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
; GCN-NEXT: v_mul_f32_e32 v1, v3, v1
; GCN-NEXT: v_mul_f32_e32 v2, v7, v4
; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0

View File

@ -135,8 +135,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
; GFX10-FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone