//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

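// Return true if an fneg of the result of \p MI can be folded into \p MI
// itself, i.e. the operation has a cheap negated form.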
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() >
             (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
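  // Instructions that read or write memory have no floating-point source
  // modifiers.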
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and folding the modifier into each of them would force a
  // VOP3 encoding, code size will increase. Try to avoid increasing code size
  // unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

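// Return true if the sign of a zero result does not need to be preserved,
// either because of the global NoSignedZerosFPMath option or the nsz flag
// on \p MI.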
static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

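// Check whether \p APF is the half, single, or double bit pattern of
// 1.0 / (2.0 * pi).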
static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// 0 and 1.0 / (2.0 * pi) do not have negated inline immediates, so there is
// an additional cost to negate them.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  Optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

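// Map a min opcode to the corresponding max opcode and vice versa.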
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

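// Match a G_FNEG \p MI whose negation can instead be folded into the
// instruction defining its source; that instruction is returned in
// \p MatchInfo for use by applyFoldableFneg.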
bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
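  // Distributing the negation over an add/sub/fma changes only the sign of a
  // zero result, so these folds are only valid when signed zeros can be
  // ignored.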
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

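// Fold the fneg \p MI into the instruction recorded in \p MatchInfo by
// matchFoldableFneg: rewrite that instruction to produce the negated value
// directly, then erase the fneg.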
void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace register in operand with a register holding negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace either register in operands with a register holding negated value.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate appropriate operands so that resulting value of MatchInfo is
  // negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace one
    // def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for other uses of the old MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}