[AMDGPU] Shrink MAD/FMA to MADAK/MADMK/FMAAK/FMAMK on GFX10
On GFX10 VOP3 instructions can have a literal operand, so the conversion from VOP3 MAD/FMA to VOP2 MADAK/MADMK/FMAAK/FMAMK will not happen in SIFoldOperands. The only benefit of the VOP2 form is code size, so do it in SIShrinkInstructions instead. Differential Revision: https://reviews.llvm.org/D125567
This commit is contained in:
parent
8ab819ad90
commit
27fa41583f
|
@ -46,6 +46,7 @@ public:
|
|||
void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
|
||||
void shrinkScalarCompare(MachineInstr &MI) const;
|
||||
void shrinkMIMG(MachineInstr &MI) const;
|
||||
void shrinkMadFma(MachineInstr &MI) const;
|
||||
bool shrinkScalarLogicOp(MachineInstr &MI) const;
|
||||
bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
|
||||
Register Reg, unsigned SubReg) const;
|
||||
|
@ -324,6 +325,82 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
|
|||
}
|
||||
}
|
||||
|
||||
// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
|
||||
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
|
||||
if (!ST->hasVOP3Literal())
|
||||
return;
|
||||
|
||||
if (TII->hasAnyModifiersSet(MI))
|
||||
return;
|
||||
|
||||
const unsigned Opcode = MI.getOpcode();
|
||||
MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
|
||||
MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
|
||||
MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
|
||||
unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
|
||||
|
||||
bool Swap;
|
||||
|
||||
// Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
|
||||
if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
|
||||
if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
|
||||
Swap = false;
|
||||
else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
|
||||
Swap = true;
|
||||
else
|
||||
return;
|
||||
|
||||
switch (Opcode) {
|
||||
default:
|
||||
llvm_unreachable("Unexpected mad/fma opcode!");
|
||||
case AMDGPU::V_MAD_F32_e64:
|
||||
NewOpcode = AMDGPU::V_MADAK_F32;
|
||||
break;
|
||||
case AMDGPU::V_FMA_F32_e64:
|
||||
NewOpcode = AMDGPU::V_FMAAK_F32;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
|
||||
if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
|
||||
if (Src1.isImm() && !TII->isInlineConstant(Src1))
|
||||
Swap = false;
|
||||
else if (Src0.isImm() && !TII->isInlineConstant(Src0))
|
||||
Swap = true;
|
||||
else
|
||||
return;
|
||||
|
||||
switch (Opcode) {
|
||||
default:
|
||||
llvm_unreachable("Unexpected mad/fma opcode!");
|
||||
case AMDGPU::V_MAD_F32_e64:
|
||||
NewOpcode = AMDGPU::V_MADMK_F32;
|
||||
break;
|
||||
case AMDGPU::V_FMA_F32_e64:
|
||||
NewOpcode = AMDGPU::V_FMAMK_F32;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
|
||||
return;
|
||||
|
||||
if (Swap) {
|
||||
// Swap Src0 and Src1 by building a new instruction.
|
||||
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
|
||||
MI.getOperand(0).getReg())
|
||||
.add(Src1)
|
||||
.add(Src0)
|
||||
.add(Src2)
|
||||
.setMIFlags(MI.getFlags());
|
||||
MI.eraseFromParent();
|
||||
} else {
|
||||
TII->removeModOperands(MI);
|
||||
MI.setDesc(TII->get(NewOpcode));
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
|
||||
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
|
||||
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
|
||||
|
@ -726,6 +803,16 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (!TII->isVOP3(MI))
|
||||
continue;
|
||||
|
||||
// TODO: Also shrink F16 forms.
|
||||
if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
|
||||
MI.getOpcode() == AMDGPU::V_FMA_F32_e64) {
|
||||
shrinkMadFma(MI);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
|
||||
continue;
|
||||
|
||||
|
|
|
@ -68,7 +68,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
|
|||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_add_f32_e32 v4, v4, v10
|
||||
; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
|
||||
; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a
|
||||
; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
|
||||
; GCN-NEXT: v_mul_f32_e32 v1, v3, v1
|
||||
; GCN-NEXT: v_mul_f32_e32 v2, v7, v4
|
||||
; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0
|
||||
|
|
|
@ -135,8 +135,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out
|
|||
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
|
||||
; GCN-NOT: v_madak_f32
|
||||
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
|
||||
; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
|
||||
; GFX10-FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
|
||||
; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
|
||||
; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
|
||||
; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
|
||||
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
|
|
Loading…
Reference in New Issue