forked from OSchip/llvm-project
458 lines
15 KiB
C++
458 lines
15 KiB
C++
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
#include "SIInstrInfo.h"
|
|
#include "llvm/ADT/SetVector.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "amdgpu-insert-delay-alu"
|
|
|
|
namespace {
|
|
|
|
class AMDGPUInsertDelayAlu : public MachineFunctionPass {
|
|
public:
|
|
static char ID;
|
|
|
|
const SIInstrInfo *SII;
|
|
const TargetRegisterInfo *TRI;
|
|
|
|
TargetSchedModel SchedModel;
|
|
|
|
AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesCFG();
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
// Return true if MI waits for all outstanding VALU instructions to complete.
|
|
static bool instructionWaitsForVALU(const MachineInstr &MI) {
|
|
// These instruction types wait for VA_VDST==0 before issuing.
|
|
const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
|
|
SIInstrFlags::FLAT | SIInstrFlags::MIMG |
|
|
SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
|
|
if (MI.getDesc().TSFlags & VA_VDST_0)
|
|
return true;
|
|
if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
|
|
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
|
|
return true;
|
|
if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
|
|
(MI.getOperand(0).getImm() & 0xf000) == 0)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
// Types of delay that can be encoded in an s_delay_alu instruction.
|
|
enum DelayType { VALU, TRANS, SALU, OTHER };
|
|
|
|
// Get the delay type for an instruction with the specified TSFlags.
|
|
static DelayType getDelayType(uint64_t TSFlags) {
|
|
if (TSFlags & SIInstrFlags::TRANS)
|
|
return TRANS;
|
|
if (TSFlags & SIInstrFlags::VALU)
|
|
return VALU;
|
|
if (TSFlags & SIInstrFlags::SALU)
|
|
return SALU;
|
|
return OTHER;
|
|
}
|
|
|
|
// Information about the last instruction(s) that wrote to a particular
|
|
// regunit. In straight-line code there will only be one such instruction, but
|
|
// when control flow converges we merge the delay information from each path
|
|
// to represent the union of the worst-case delays of each type.
|
|
struct DelayInfo {
|
|
// One larger than the maximum number of (non-TRANS) VALU instructions we
|
|
// can encode in an s_delay_alu instruction.
|
|
static const unsigned VALU_MAX = 5;
|
|
|
|
// One larger than the maximum number of TRANS instructions we can encode in
|
|
// an s_delay_alu instruction.
|
|
static const unsigned TRANS_MAX = 4;
|
|
|
|
// If it was written by a (non-TRANS) VALU, remember how many clock cycles
|
|
// are left until it completes, and how many other (non-TRANS) VALU we have
|
|
// seen since it was issued.
|
|
uint8_t VALUCycles = 0;
|
|
uint8_t VALUNum = VALU_MAX;
|
|
|
|
// If it was written by a TRANS, remember how many clock cycles are left
|
|
// until it completes, and how many other TRANS we have seen since it was
|
|
// issued.
|
|
uint8_t TRANSCycles = 0;
|
|
uint8_t TRANSNum = TRANS_MAX;
|
|
// Also remember how many other (non-TRANS) VALU we have seen since it was
|
|
// issued. When an instruction depends on both a prior TRANS and a prior
|
|
// non-TRANS VALU, this is used to decide whether to encode a wait for just
|
|
// one or both of them.
|
|
uint8_t TRANSNumVALU = VALU_MAX;
|
|
|
|
// If it was written by an SALU, remember how many clock cycles are left
|
|
// until it completes.
|
|
uint8_t SALUCycles = 0;
|
|
|
|
DelayInfo() = default;
|
|
|
|
DelayInfo(DelayType Type, unsigned Cycles) {
|
|
switch (Type) {
|
|
default:
|
|
llvm_unreachable("unexpected type");
|
|
case VALU:
|
|
VALUCycles = Cycles;
|
|
VALUNum = 0;
|
|
break;
|
|
case TRANS:
|
|
TRANSCycles = Cycles;
|
|
TRANSNum = 0;
|
|
TRANSNumVALU = 0;
|
|
break;
|
|
case SALU:
|
|
SALUCycles = Cycles;
|
|
break;
|
|
}
|
|
}
|
|
|
|
bool operator==(const DelayInfo &RHS) const {
|
|
return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
|
|
TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
|
|
TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
|
|
}
|
|
|
|
bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
|
|
|
|
// Merge another DelayInfo into this one, to represent the union of the
|
|
// worst-case delays of each type.
|
|
void merge(const DelayInfo &RHS) {
|
|
VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
|
|
VALUNum = std::min(VALUNum, RHS.VALUNum);
|
|
TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
|
|
TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
|
|
TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
|
|
SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
|
|
}
|
|
|
|
// Update this DelayInfo after issuing an instruction. IsVALU should be 1
|
|
// when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
|
|
// a TRANS, else 0. Cycles is the number of cycles it takes to issue the
|
|
// instruction. Return true if there is no longer any useful delay info.
|
|
bool advance(DelayType Type, unsigned Cycles) {
|
|
bool Erase = true;
|
|
|
|
VALUNum += (Type == VALU);
|
|
if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
|
|
// Forget about the VALU instruction. It was too far back or has
|
|
// definitely completed by now.
|
|
VALUNum = VALU_MAX;
|
|
VALUCycles = 0;
|
|
} else {
|
|
VALUCycles -= Cycles;
|
|
Erase = false;
|
|
}
|
|
|
|
TRANSNum += (Type == TRANS);
|
|
TRANSNumVALU += (Type == VALU);
|
|
if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
|
|
// Forget about any TRANS instruction. It was too far back or has
|
|
// definitely completed by now.
|
|
TRANSNum = TRANS_MAX;
|
|
TRANSNumVALU = VALU_MAX;
|
|
TRANSCycles = 0;
|
|
} else {
|
|
TRANSCycles -= Cycles;
|
|
Erase = false;
|
|
}
|
|
|
|
if (SALUCycles <= Cycles) {
|
|
// Forget about any SALU instruction. It has definitely completed by
|
|
// now.
|
|
SALUCycles = 0;
|
|
} else {
|
|
SALUCycles -= Cycles;
|
|
Erase = false;
|
|
}
|
|
|
|
return Erase;
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void dump() const {
|
|
if (VALUCycles)
|
|
dbgs() << " VALUCycles=" << (int)VALUCycles;
|
|
if (VALUNum < VALU_MAX)
|
|
dbgs() << " VALUNum=" << (int)VALUNum;
|
|
if (TRANSCycles)
|
|
dbgs() << " TRANSCycles=" << (int)TRANSCycles;
|
|
if (TRANSNum < TRANS_MAX)
|
|
dbgs() << " TRANSNum=" << (int)TRANSNum;
|
|
if (TRANSNumVALU < VALU_MAX)
|
|
dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
|
|
if (SALUCycles)
|
|
dbgs() << " SALUCycles=" << (int)SALUCycles;
|
|
}
|
|
#endif
|
|
};
|
|
|
|
// A map from regunits to the delay info for that regunit.
|
|
struct DelayState : DenseMap<unsigned, DelayInfo> {
|
|
// Merge another DelayState into this one by merging the delay info for each
|
|
// regunit.
|
|
void merge(const DelayState &RHS) {
|
|
for (const auto &KV : RHS) {
|
|
iterator It;
|
|
bool Inserted;
|
|
std::tie(It, Inserted) = insert(KV);
|
|
if (!Inserted)
|
|
It->second.merge(KV.second);
|
|
}
|
|
}
|
|
|
|
// Advance the delay info for each regunit, erasing any that are no longer
|
|
// useful.
|
|
void advance(DelayType Type, unsigned Cycles) {
|
|
iterator Next;
|
|
for (auto I = begin(), E = end(); I != E; I = Next) {
|
|
Next = std::next(I);
|
|
if (I->second.advance(Type, Cycles))
|
|
erase(I);
|
|
}
|
|
}
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
void dump(const TargetRegisterInfo *TRI) const {
|
|
if (empty()) {
|
|
dbgs() << " empty\n";
|
|
return;
|
|
}
|
|
|
|
// Dump DelayInfo for each RegUnit in numerical order.
|
|
SmallVector<const_iterator, 8> Order;
|
|
Order.reserve(size());
|
|
for (const_iterator I = begin(), E = end(); I != E; ++I)
|
|
Order.push_back(I);
|
|
llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
|
|
return A->first < B->first;
|
|
});
|
|
for (const_iterator I : Order) {
|
|
dbgs() << " " << printRegUnit(I->first, TRI);
|
|
I->second.dump();
|
|
dbgs() << "\n";
|
|
}
|
|
}
|
|
#endif
|
|
};
|
|
|
|
// The saved delay state at the end of each basic block.
|
|
DenseMap<MachineBasicBlock *, DelayState> BlockState;
|
|
|
|
// Emit an s_delay_alu instruction if necessary before MI.
|
|
MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
|
|
MachineInstr *LastDelayAlu) {
|
|
unsigned Imm = 0;
|
|
|
|
// Wait for a TRANS instruction.
|
|
if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
|
|
Imm |= 4 + Delay.TRANSNum;
|
|
|
|
// Wait for a VALU instruction (if it's more recent than any TRANS
|
|
// instruction that we're also waiting for).
|
|
if (Delay.VALUNum < DelayInfo::VALU_MAX &&
|
|
Delay.VALUNum <= Delay.TRANSNumVALU) {
|
|
if (Imm & 0xf)
|
|
Imm |= Delay.VALUNum << 7;
|
|
else
|
|
Imm |= Delay.VALUNum;
|
|
}
|
|
|
|
// Wait for an SALU instruction.
|
|
if (Delay.SALUCycles) {
|
|
if (Imm & 0x780) {
|
|
// We have already encoded a VALU and a TRANS delay. There's no room in
|
|
// the encoding for an SALU delay as well, so just drop it.
|
|
} else if (Imm & 0xf) {
|
|
Imm |= (Delay.SALUCycles + 8) << 7;
|
|
} else {
|
|
Imm |= Delay.SALUCycles + 8;
|
|
}
|
|
}
|
|
|
|
// Don't emit the s_delay_alu instruction if there's nothing to wait for.
|
|
if (!Imm)
|
|
return LastDelayAlu;
|
|
|
|
// If we only need to wait for one instruction, try encoding it in the last
|
|
// s_delay_alu that we emitted.
|
|
if (!(Imm & 0x780) && LastDelayAlu) {
|
|
unsigned Skip = 0;
|
|
for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
|
|
E = MachineBasicBlock::instr_iterator(MI);
|
|
++I != E;) {
|
|
if (!I->isBundle() && !I->isMetaInstruction())
|
|
++Skip;
|
|
}
|
|
if (Skip < 6) {
|
|
MachineOperand &Op = LastDelayAlu->getOperand(0);
|
|
unsigned LastImm = Op.getImm();
|
|
assert((LastImm & ~0xf) == 0 &&
|
|
"Remembered an s_delay_alu with no room for another delay!");
|
|
LastImm |= Imm << 7 | Skip << 4;
|
|
Op.setImm(LastImm);
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
auto &MBB = *MI.getParent();
|
|
MachineInstr *DelayAlu =
|
|
BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
|
|
// Remember the s_delay_alu for next time if there is still room in it to
|
|
// encode another delay.
|
|
return (Imm & 0x780) ? nullptr : DelayAlu;
|
|
}
|
|
|
|
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
|
|
DelayState State;
|
|
for (auto *Pred : MBB.predecessors())
|
|
State.merge(BlockState[Pred]);
|
|
|
|
LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
|
|
<< "\n";
|
|
State.dump(TRI););
|
|
|
|
bool Changed = false;
|
|
MachineInstr *LastDelayAlu = nullptr;
|
|
|
|
// Iterate over the contents of bundles, but don't emit any instructions
|
|
// inside a bundle.
|
|
for (auto &MI : MBB.instrs()) {
|
|
if (MI.isBundle() || MI.isMetaInstruction())
|
|
continue;
|
|
|
|
// Ignore some more instructions that do not generate any code.
|
|
switch (MI.getOpcode()) {
|
|
case AMDGPU::SI_RETURN_TO_EPILOG:
|
|
continue;
|
|
}
|
|
|
|
DelayType Type = getDelayType(MI.getDesc().TSFlags);
|
|
|
|
if (instructionWaitsForVALU(MI)) {
|
|
// Forget about all outstanding VALU delays.
|
|
State = DelayState();
|
|
} else if (Type != OTHER) {
|
|
DelayInfo Delay;
|
|
// TODO: Scan implicit uses too?
|
|
for (const auto &Op : MI.explicit_uses()) {
|
|
if (Op.isReg()) {
|
|
// One of the operands of the writelane is also the output operand.
|
|
// This creates the insertion of redundant delays. Hence, we have to
|
|
// ignore this operand.
|
|
if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
|
|
continue;
|
|
for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
|
|
auto It = State.find(*UI);
|
|
if (It != State.end()) {
|
|
Delay.merge(It->second);
|
|
State.erase(*UI);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (Emit && !MI.isBundledWithPred()) {
|
|
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
|
|
// just ignore them?
|
|
LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
|
|
}
|
|
}
|
|
|
|
if (Type != OTHER) {
|
|
// TODO: Scan implicit defs too?
|
|
for (const auto &Op : MI.defs()) {
|
|
unsigned Latency = SchedModel.computeOperandLatency(
|
|
&MI, MI.getOperandNo(&Op), nullptr, 0);
|
|
for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
|
|
State[*UI] = DelayInfo(Type, Latency);
|
|
}
|
|
}
|
|
|
|
// Advance by the number of cycles it takes to issue this instruction.
|
|
// TODO: Use a more advanced model that accounts for instructions that
|
|
// take multiple cycles to issue on a particular pipeline.
|
|
unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
|
|
// TODO: In wave64 mode, double the number of cycles for VALU and VMEM
|
|
// instructions on the assumption that they will usually have to be issued
|
|
// twice?
|
|
State.advance(Type, Cycles);
|
|
|
|
LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
|
|
}
|
|
|
|
if (Emit) {
|
|
assert(State == BlockState[&MBB] &&
|
|
"Basic block state should not have changed on final pass!");
|
|
} else if (State != BlockState[&MBB]) {
|
|
BlockState[&MBB] = std::move(State);
|
|
Changed = true;
|
|
}
|
|
return Changed;
|
|
}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override {
|
|
if (skipFunction(MF.getFunction()))
|
|
return false;
|
|
|
|
LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
|
|
<< "\n");
|
|
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
if (!ST.hasDelayAlu())
|
|
return false;
|
|
|
|
SII = ST.getInstrInfo();
|
|
TRI = ST.getRegisterInfo();
|
|
|
|
SchedModel.init(&ST);
|
|
|
|
// Calculate the delay state for each basic block, iterating until we reach
|
|
// a fixed point.
|
|
SetVector<MachineBasicBlock *> WorkList;
|
|
for (auto &MBB : reverse(MF))
|
|
WorkList.insert(&MBB);
|
|
while (!WorkList.empty()) {
|
|
auto &MBB = *WorkList.pop_back_val();
|
|
bool Changed = runOnMachineBasicBlock(MBB, false);
|
|
if (Changed)
|
|
WorkList.insert(MBB.succ_begin(), MBB.succ_end());
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
|
|
|
|
// Make one last pass over all basic blocks to emit s_delay_alu
|
|
// instructions.
|
|
bool Changed = false;
|
|
for (auto &MBB : MF)
|
|
Changed |= runOnMachineBasicBlock(MBB, true);
|
|
return Changed;
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
char AMDGPUInsertDelayAlu::ID = 0;
|
|
|
|
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
|
|
|
|
INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
|
|
false, false)
|