[AMDGPU] Shrink MAD/FMA to MADAK/MADMK/FMAAK/FMAMK on GFX10

On GFX10 VOP3 instructions can have a literal operand, so the conversion from VOP3 MAD/FMA to VOP2 MADAK/MADMK/FMAAK/FMAMK will not happen in SIFoldOperands. The only benefit of the VOP2 form is code size, so do it in SIShrinkInstructions instead. Differential Revision: https://reviews.llvm.org/D125567
2022-02-18 17:24:50 +00:00 · 2022-02-18 17:24:50 +00:00 · 27fa41583f
parent 8ab819ad90
commit 27fa41583f
3 changed files with 90 additions and 3 deletions
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@ -46,6 +46,7 @@ public:
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
+  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
@ -324,6 +325,82 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  }
 }

+// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
+void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
+  if (!ST->hasVOP3Literal())
+    return;
+
+  if (TII->hasAnyModifiersSet(MI))
+    return;
+
+  const unsigned Opcode = MI.getOpcode();
+  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
+
+  bool Swap;
+
+  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
+  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
+    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
+      Swap = false;
+    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
+      Swap = true;
+    else
+      return;
+
+    switch (Opcode) {
+    default:
+      llvm_unreachable("Unexpected mad/fma opcode!");
+    case AMDGPU::V_MAD_F32_e64:
+      NewOpcode = AMDGPU::V_MADAK_F32;
+      break;
+    case AMDGPU::V_FMA_F32_e64:
+      NewOpcode = AMDGPU::V_FMAAK_F32;
+      break;
+    }
+  }
+
+  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
+  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
+    if (Src1.isImm() && !TII->isInlineConstant(Src1))
+      Swap = false;
+    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
+      Swap = true;
+    else
+      return;
+
+    switch (Opcode) {
+    default:
+      llvm_unreachable("Unexpected mad/fma opcode!");
+    case AMDGPU::V_MAD_F32_e64:
+      NewOpcode = AMDGPU::V_MADMK_F32;
+      break;
+    case AMDGPU::V_FMA_F32_e64:
+      NewOpcode = AMDGPU::V_FMAMK_F32;
+      break;
+    }
+  }
+
+  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+    return;
+
+  if (Swap) {
+    // Swap Src0 and Src1 by building a new instruction.
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
+            MI.getOperand(0).getReg())
+        .add(Src1)
+        .add(Src0)
+        .add(Src2)
+        .setMIFlags(MI.getFlags());
+    MI.eraseFromParent();
+  } else {
+    TII->removeModOperands(MI);
+    MI.setDesc(TII->get(NewOpcode));
+  }
+}
+
 /// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals.
 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or
@ -726,6 +803,16 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
        continue;
      }

+      if (!TII->isVOP3(MI))
+        continue;
+
+      // TODO: Also shrink F16 forms.
+      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
+          MI.getOpcode() == AMDGPU::V_FMA_F32_e64) {
+        shrinkMadFma(MI);
+        continue;
+      }
+
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@ -68,7 +68,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v4, v4, v10
 ; GCN-NEXT:    v_mul_f32_e32 v3, v4, v6
-; GCN-NEXT:    v_fma_f32 v4, v5, s0, 0x3ca3d70a
+; GCN-NEXT:    v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
 ; GCN-NEXT:    v_mul_f32_e32 v1, v3, v1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v7, v4
 ; GCN-NEXT:    v_fmac_f32_e32 v1, v2, v0
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@ -135,8 +135,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out
 ; GCN-DAG:      {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
 ; GCN-NOT:      v_madak_f32
 ; GFX6_8_9:     v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-; GFX10-MAD:    v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
-; GFX10-FMA:    v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
+; GFX10-MAD:    v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
+; GFX10-FMA:    v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
 ; GFX940-FMA:   v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
 define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone