forked from OSchip/llvm-project
[AMDGPU] New AMDGPUInsertDelayAlu pass
Differential Revision: https://reviews.llvm.org/D128270
This commit is contained in:
parent
8d29f0fdb9
commit
cfb7ffdec0
|
@ -299,6 +299,9 @@ extern char &SIMemoryLegalizerID;
|
|||
void initializeSIModeRegisterPass(PassRegistry&);
|
||||
extern char &SIModeRegisterID;
|
||||
|
||||
void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
|
||||
extern char &AMDGPUInsertDelayAluID;
|
||||
|
||||
void initializeSIInsertHardClausesPass(PassRegistry &);
|
||||
extern char &SIInsertHardClausesID;
|
||||
|
||||
|
|
|
@ -0,0 +1,457 @@
|
|||
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file
|
||||
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "GCNSubtarget.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-insert-delay-alu"
|
||||
|
||||
namespace {
|
||||
|
||||
class AMDGPUInsertDelayAlu : public MachineFunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
const SIInstrInfo *SII;
|
||||
const TargetRegisterInfo *TRI;
|
||||
|
||||
TargetSchedModel SchedModel;
|
||||
|
||||
AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
|
||||
|
||||
// Declare what this pass preserves: it marks the CFG as preserved and then
// defers to MachineFunctionPass for the remaining analysis usage.
void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.setPreservesCFG();
  // Let the base class register its own requirements.
  MachineFunctionPass::getAnalysisUsage(AU);
}
|
||||
|
||||
// Return true if MI waits for all outstanding VALU instructions to complete.
|
||||
static bool instructionWaitsForVALU(const MachineInstr &MI) {
|
||||
// These instruction types wait for VA_VDST==0 before issuing.
|
||||
const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
|
||||
SIInstrFlags::FLAT | SIInstrFlags::MIMG |
|
||||
SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
|
||||
if (MI.getDesc().TSFlags & VA_VDST_0)
|
||||
return true;
|
||||
if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
|
||||
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
|
||||
return true;
|
||||
if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
|
||||
(MI.getOperand(0).getImm() & 0xf000) == 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Types of delay that can be encoded in an s_delay_alu instruction.
|
||||
enum DelayType { VALU, TRANS, SALU, OTHER };
|
||||
|
||||
// Get the delay type for an instruction with the specified TSFlags.
|
||||
static DelayType getDelayType(uint64_t TSFlags) {
|
||||
if (TSFlags & SIInstrFlags::TRANS)
|
||||
return TRANS;
|
||||
if (TSFlags & SIInstrFlags::VALU)
|
||||
return VALU;
|
||||
if (TSFlags & SIInstrFlags::SALU)
|
||||
return SALU;
|
||||
return OTHER;
|
||||
}
|
||||
|
||||
// Information about the last instruction(s) that wrote to a particular
|
||||
// regunit. In straight-line code there will only be one such instruction, but
|
||||
// when control flow converges we merge the delay information from each path
|
||||
// to represent the union of the worst-case delays of each type.
|
||||
struct DelayInfo {
|
||||
// One larger than the maximum number of (non-TRANS) VALU instructions we
|
||||
// can encode in an s_delay_alu instruction.
|
||||
static const unsigned VALU_MAX = 5;
|
||||
|
||||
// One larger than the maximum number of TRANS instructions we can encode in
|
||||
// an s_delay_alu instruction.
|
||||
static const unsigned TRANS_MAX = 4;
|
||||
|
||||
// If it was written by a (non-TRANS) VALU, remember how many clock cycles
|
||||
// are left until it completes, and how many other (non-TRANS) VALU we have
|
||||
// seen since it was issued.
|
||||
uint8_t VALUCycles = 0;
|
||||
uint8_t VALUNum = VALU_MAX;
|
||||
|
||||
// If it was written by a TRANS, remember how many clock cycles are left
|
||||
// until it completes, and how many other TRANS we have seen since it was
|
||||
// issued.
|
||||
uint8_t TRANSCycles = 0;
|
||||
uint8_t TRANSNum = TRANS_MAX;
|
||||
// Also remember how many other (non-TRANS) VALU we have seen since it was
|
||||
// issued. When an instruction depends on both a prior TRANS and a prior
|
||||
// non-TRANS VALU, this is used to decide whether to encode a wait for just
|
||||
// one or both of them.
|
||||
uint8_t TRANSNumVALU = VALU_MAX;
|
||||
|
||||
// If it was written by an SALU, remember how many clock cycles are left
|
||||
// until it completes.
|
||||
uint8_t SALUCycles = 0;
|
||||
|
||||
DelayInfo() = default;
|
||||
|
||||
DelayInfo(DelayType Type, unsigned Cycles) {
|
||||
switch (Type) {
|
||||
default:
|
||||
llvm_unreachable("unexpected type");
|
||||
case VALU:
|
||||
VALUCycles = Cycles;
|
||||
VALUNum = 0;
|
||||
break;
|
||||
case TRANS:
|
||||
TRANSCycles = Cycles;
|
||||
TRANSNum = 0;
|
||||
TRANSNumVALU = 0;
|
||||
break;
|
||||
case SALU:
|
||||
SALUCycles = Cycles;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bool operator==(const DelayInfo &RHS) const {
|
||||
return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
|
||||
TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
|
||||
TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
|
||||
}
|
||||
|
||||
bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
|
||||
|
||||
// Merge another DelayInfo into this one, to represent the union of the
|
||||
// worst-case delays of each type.
|
||||
void merge(const DelayInfo &RHS) {
|
||||
VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
|
||||
VALUNum = std::min(VALUNum, RHS.VALUNum);
|
||||
TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
|
||||
TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
|
||||
TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
|
||||
SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
|
||||
}
|
||||
|
||||
// Update this DelayInfo after issuing an instruction. IsVALU should be 1
|
||||
// when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
|
||||
// a TRANS, else 0. Cycles is the number of cycles it takes to issue the
|
||||
// instruction. Return true if there is no longer any useful delay info.
|
||||
bool advance(DelayType Type, unsigned Cycles) {
|
||||
bool Erase = true;
|
||||
|
||||
VALUNum += (Type == VALU);
|
||||
if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
|
||||
// Forget about the VALU instruction. It was too far back or has
|
||||
// definitely completed by now.
|
||||
VALUNum = VALU_MAX;
|
||||
VALUCycles = 0;
|
||||
} else {
|
||||
VALUCycles -= Cycles;
|
||||
Erase = false;
|
||||
}
|
||||
|
||||
TRANSNum += (Type == TRANS);
|
||||
TRANSNumVALU += (Type == VALU);
|
||||
if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
|
||||
// Forget about any TRANS instruction. It was too far back or has
|
||||
// definitely completed by now.
|
||||
TRANSNum = TRANS_MAX;
|
||||
TRANSNumVALU = VALU_MAX;
|
||||
TRANSCycles = 0;
|
||||
} else {
|
||||
TRANSCycles -= Cycles;
|
||||
Erase = false;
|
||||
}
|
||||
|
||||
if (SALUCycles <= Cycles) {
|
||||
// Forget about any SALU instruction. It has definitely completed by
|
||||
// now.
|
||||
SALUCycles = 0;
|
||||
} else {
|
||||
SALUCycles -= Cycles;
|
||||
Erase = false;
|
||||
}
|
||||
|
||||
return Erase;
|
||||
}
|
||||
|
||||
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
||||
void dump() const {
|
||||
if (VALUCycles)
|
||||
dbgs() << " VALUCycles=" << (int)VALUCycles;
|
||||
if (VALUNum < VALU_MAX)
|
||||
dbgs() << " VALUNum=" << (int)VALUNum;
|
||||
if (TRANSCycles)
|
||||
dbgs() << " TRANSCycles=" << (int)TRANSCycles;
|
||||
if (TRANSNum < TRANS_MAX)
|
||||
dbgs() << " TRANSNum=" << (int)TRANSNum;
|
||||
if (TRANSNumVALU < VALU_MAX)
|
||||
dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
|
||||
if (SALUCycles)
|
||||
dbgs() << " SALUCycles=" << (int)SALUCycles;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// A map from regunits to the delay info for that regunit.
|
||||
struct DelayState : DenseMap<unsigned, DelayInfo> {
|
||||
// Merge another DelayState into this one by merging the delay info for each
|
||||
// regunit.
|
||||
void merge(const DelayState &RHS) {
|
||||
for (const auto &KV : RHS) {
|
||||
iterator It;
|
||||
bool Inserted;
|
||||
std::tie(It, Inserted) = insert(KV);
|
||||
if (!Inserted)
|
||||
It->second.merge(KV.second);
|
||||
}
|
||||
}
|
||||
|
||||
// Advance the delay info for each regunit, erasing any that are no longer
|
||||
// useful.
|
||||
void advance(DelayType Type, unsigned Cycles) {
|
||||
iterator Next;
|
||||
for (auto I = begin(), E = end(); I != E; I = Next) {
|
||||
Next = std::next(I);
|
||||
if (I->second.advance(Type, Cycles))
|
||||
erase(I);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
||||
void dump(const TargetRegisterInfo *TRI) const {
|
||||
if (empty()) {
|
||||
dbgs() << " empty\n";
|
||||
return;
|
||||
}
|
||||
|
||||
// Dump DelayInfo for each RegUnit in numerical order.
|
||||
SmallVector<const_iterator, 8> Order;
|
||||
Order.reserve(size());
|
||||
for (const_iterator I = begin(), E = end(); I != E; ++I)
|
||||
Order.push_back(I);
|
||||
llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
|
||||
return A->first < B->first;
|
||||
});
|
||||
for (const_iterator I : Order) {
|
||||
dbgs() << " " << printRegUnit(I->first, TRI);
|
||||
I->second.dump();
|
||||
dbgs() << "\n";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// The saved delay state at the end of each basic block.
|
||||
DenseMap<MachineBasicBlock *, DelayState> BlockState;
|
||||
|
||||
// Emit an s_delay_alu instruction if necessary before MI.
|
||||
MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
|
||||
MachineInstr *LastDelayAlu) {
|
||||
unsigned Imm = 0;
|
||||
|
||||
// Wait for a TRANS instruction.
|
||||
if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
|
||||
Imm |= 4 + Delay.TRANSNum;
|
||||
|
||||
// Wait for a VALU instruction (if it's more recent than any TRANS
|
||||
// instruction that we're also waiting for).
|
||||
if (Delay.VALUNum < DelayInfo::VALU_MAX &&
|
||||
Delay.VALUNum <= Delay.TRANSNumVALU) {
|
||||
if (Imm & 0xf)
|
||||
Imm |= Delay.VALUNum << 7;
|
||||
else
|
||||
Imm |= Delay.VALUNum;
|
||||
}
|
||||
|
||||
// Wait for an SALU instruction.
|
||||
if (Delay.SALUCycles) {
|
||||
if (Imm & 0x780) {
|
||||
// We have already encoded a VALU and a TRANS delay. There's no room in
|
||||
// the encoding for an SALU delay as well, so just drop it.
|
||||
} else if (Imm & 0xf) {
|
||||
Imm |= (Delay.SALUCycles + 8) << 7;
|
||||
} else {
|
||||
Imm |= Delay.SALUCycles + 8;
|
||||
}
|
||||
}
|
||||
|
||||
// Don't emit the s_delay_alu instruction if there's nothing to wait for.
|
||||
if (!Imm)
|
||||
return LastDelayAlu;
|
||||
|
||||
// If we only need to wait for one instruction, try encoding it in the last
|
||||
// s_delay_alu that we emitted.
|
||||
if (!(Imm & 0x780) && LastDelayAlu) {
|
||||
unsigned Skip = 0;
|
||||
for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
|
||||
E = MachineBasicBlock::instr_iterator(MI);
|
||||
++I != E;) {
|
||||
if (!I->isBundle() && !I->isMetaInstruction())
|
||||
++Skip;
|
||||
}
|
||||
if (Skip < 6) {
|
||||
MachineOperand &Op = LastDelayAlu->getOperand(0);
|
||||
unsigned LastImm = Op.getImm();
|
||||
assert((LastImm & ~0xf) == 0 &&
|
||||
"Remembered an s_delay_alu with no room for another delay!");
|
||||
LastImm |= Imm << 7 | Skip << 4;
|
||||
Op.setImm(LastImm);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto &MBB = *MI.getParent();
|
||||
MachineInstr *DelayAlu =
|
||||
BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
|
||||
// Remember the s_delay_alu for next time if there is still room in it to
|
||||
// encode another delay.
|
||||
return (Imm & 0x780) ? nullptr : DelayAlu;
|
||||
}
|
||||
|
||||
// Process one basic block. The state on entry is the merge of the saved exit
// states of all predecessors; each instruction then consumes the delay info
// of the registers it reads and records new delay info for the registers it
// writes.
//
// With Emit == false this is a dataflow step: the exit state is saved in
// BlockState and the function returns true if it differs from what was saved
// before. With Emit == true s_delay_alu instructions are inserted, the saved
// state is only checked by an assertion, and the function returns false.
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
  DelayState State;
  for (MachineBasicBlock *Pred : MBB.predecessors())
    State.merge(BlockState[Pred]);

  LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                    << "\n";
             State.dump(TRI););

  MachineInstr *LastDelayAlu = nullptr;

  // Iterate over the contents of bundles, but don't emit any instructions
  // inside a bundle.
  for (auto &MI : MBB.instrs()) {
    if (MI.isBundle() || MI.isMetaInstruction())
      continue;

    // Ignore some more instructions that do not generate any code.
    if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
      continue;

    const DelayType Type = getDelayType(MI.getDesc().TSFlags);
    const bool TracksDelay = Type != OTHER;

    if (instructionWaitsForVALU(MI)) {
      // Forget about all outstanding VALU delays.
      State = DelayState();
    } else if (TracksDelay) {
      DelayInfo Delay;
      // TODO: Scan implicit uses too?
      for (const auto &Op : MI.explicit_uses()) {
        if (!Op.isReg())
          continue;
        // One of the operands of the writelane is also the output operand.
        // This creates the insertion of redundant delays. Hence, we have to
        // ignore this operand.
        if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
          continue;
        for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
          auto Known = State.find(*UI);
          if (Known == State.end())
            continue;
          // The dependency is accounted for once; drop it from the state.
          Delay.merge(Known->second);
          State.erase(Known);
        }
      }
      // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
      // just ignore them?
      if (Emit && !MI.isBundledWithPred())
        LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
    }

    if (TracksDelay) {
      // TODO: Scan implicit defs too?
      for (const auto &Op : MI.defs()) {
        const unsigned Latency = SchedModel.computeOperandLatency(
            &MI, MI.getOperandNo(&Op), nullptr, 0);
        for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
          State[*UI] = DelayInfo(Type, Latency);
      }
    }

    // Advance by the number of cycles it takes to issue this instruction.
    // TODO: Use a more advanced model that accounts for instructions that
    // take multiple cycles to issue on a particular pipeline.
    // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
    // instructions on the assumption that they will usually have to be issued
    // twice?
    State.advance(Type, SIInstrInfo::getNumWaitStates(MI));

    LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
  }

  if (Emit) {
    assert(State == BlockState[&MBB] &&
           "Basic block state should not have changed on final pass!");
    return false;
  }

  DelayState &Saved = BlockState[&MBB];
  if (State == Saved)
    return false;

  Saved = std::move(State);
  return true;
}
|
||||
|
||||
// Pass entry point. Does nothing (and returns false) if the function is to be
// skipped or the subtarget has no s_delay_alu. Otherwise it computes the
// delay state at the end of every basic block by iterating to a fixed point,
// then makes one final pass that inserts the s_delay_alu instructions.
// Returns the accumulated result of the final pass over all blocks.
bool runOnMachineFunction(MachineFunction &MF) override {
  if (skipFunction(MF.getFunction()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                    << "\n");

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDelayAlu())
    return false;

  SII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  SchedModel.init(&ST);

  // BlockState is a member of the pass object, so it outlives a single
  // function. Start from a clean slate: otherwise entries keyed by basic
  // blocks of previously processed functions accumulate, and a reused block
  // address could seed the dataflow with unrelated state.
  BlockState.clear();

  // Calculate the delay state for each basic block, iterating until we reach
  // a fixed point.
  SetVector<MachineBasicBlock *> WorkList;
  // Inserted in reverse so that pop_back_val() visits blocks in layout order.
  for (auto &MBB : reverse(MF))
    WorkList.insert(&MBB);
  while (!WorkList.empty()) {
    auto &MBB = *WorkList.pop_back_val();
    bool Changed = runOnMachineBasicBlock(MBB, false);
    // A changed exit state invalidates the entry state of every successor.
    if (Changed)
      WorkList.insert(MBB.succ_begin(), MBB.succ_end());
  }

  LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

  // Make one last pass over all basic blocks to emit s_delay_alu
  // instructions.
  bool Changed = false;
  for (auto &MBB : MF)
    Changed |= runOnMachineBasicBlock(MBB, true);
  return Changed;
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
char AMDGPUInsertDelayAlu::ID = 0;
|
||||
|
||||
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
|
||||
|
||||
INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
|
||||
false, false)
|
|
@ -272,6 +272,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
|
|||
cl::init(true),
|
||||
cl::Hidden);
|
||||
|
||||
// Enable GFX11+ s_delay_alu insertion
|
||||
static cl::opt<bool>
|
||||
EnableInsertDelayAlu("amdgpu-enable-delay-alu",
|
||||
cl::desc("Enable s_delay_alu insertion"),
|
||||
cl::init(true), cl::Hidden);
|
||||
|
||||
// Option is used in lit tests to prevent deadcoding of patterns inspected.
|
||||
static cl::opt<bool>
|
||||
EnableDCEInRA("amdgpu-dce-in-ra",
|
||||
|
@ -363,6 +369,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
|
|||
initializeAMDGPURewriteOutArgumentsPass(*PR);
|
||||
initializeAMDGPUUnifyMetadataPass(*PR);
|
||||
initializeSIAnnotateControlFlowPass(*PR);
|
||||
initializeAMDGPUInsertDelayAluPass(*PR);
|
||||
initializeSIInsertHardClausesPass(*PR);
|
||||
initializeSIInsertWaitcntsPass(*PR);
|
||||
initializeSIModeRegisterPass(*PR);
|
||||
|
@ -1413,6 +1420,10 @@ void GCNPassConfig::addPreEmitPass() {
|
|||
// Here we add a stand-alone hazard recognizer pass which can handle all
|
||||
// cases.
|
||||
addPass(&PostRAHazardRecognizerID);
|
||||
|
||||
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
|
||||
addPass(&AMDGPUInsertDelayAluID);
|
||||
|
||||
addPass(&BranchRelaxationPassID);
|
||||
}
|
||||
|
||||
|
|
|
@ -57,6 +57,7 @@ add_llvm_target(AMDGPUCodeGen
|
|||
AMDGPUFrameLowering.cpp
|
||||
AMDGPUGlobalISelUtils.cpp
|
||||
AMDGPUHSAMetadataStreamer.cpp
|
||||
AMDGPUInsertDelayAlu.cpp
|
||||
AMDGPUInstCombineIntrinsic.cpp
|
||||
AMDGPUInstrInfo.cpp
|
||||
AMDGPUInstructionSelector.cpp
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
|
||||
|
||||
define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
|
||||
; GFX906-LABEL: v_fdot2:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10NSA %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
|
||||
|
||||
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
|
||||
; GFX6-LABEL: gather4_2d:
|
||||
|
|
|
@ -589,6 +589,7 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
|
||||
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
|
||||
; GFX10-LABEL: sample_d_1d:
|
||||
|
|
|
@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
|
|||
; GCN-NEXT: s_mov_b32 exec_lo, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
|
||||
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
|
||||
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
|
||||
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
|
|||
; GCN-NEXT: s_mov_b32 exec_lo, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
|
||||
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
|
||||
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
|
||||
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
|
||||
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
|
||||
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
|
|||
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
|
||||
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
|
||||
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
|
||||
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
|
||||
|
@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
|
|||
; GCN-NEXT: s_mov_b32 exec_lo, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1
|
||||
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
|
||||
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
|
|
|
@ -68,8 +68,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
|
|||
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
|
||||
; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v7, v5
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v8, v11
|
||||
; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v10, v9
|
||||
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
|
||||
|
@ -133,8 +135,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
|
|||
; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v7
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
|
||||
; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v6
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_and_or_b32 v8, 0xffff, v9, v12
|
||||
; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v11, v10
|
||||
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
|
||||
|
@ -235,8 +239,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
|
|||
; GFX11-NEXT: v_readfirstlane_b32 s5, v12
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s6, v13
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s7, v14
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
|
||||
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
|
||||
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7]
|
||||
|
@ -359,8 +365,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
|
|||
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s6, v11
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s7, v12
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
|
||||
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
|
||||
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v13, v14, v[15:17], v[4:6]], s[4:7] a16
|
||||
|
@ -474,8 +482,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
|
|||
; GFX11-NEXT: v_readfirstlane_b32 s5, v13
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s6, v14
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s7, v15
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
|
||||
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
|
||||
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
|
||||
|
@ -605,8 +615,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
|
|||
; GFX11-NEXT: v_readfirstlane_b32 s5, v11
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s6, v12
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s7, v13
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
|
||||
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
|
||||
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
|
||||
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[14:15], v16, v[17:19], v[4:6]], s[4:7] a16
|
||||
|
@ -984,6 +996,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
|
|||
; GFX11-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX11-NEXT: s_mov_b32 s4, 0xb36211c7
|
||||
; GFX11-NEXT: s_movk_i32 s5, 0x102
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v10, s5
|
||||
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
|
||||
|
@ -1123,6 +1136,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
|
|||
; GFX11-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX11-NEXT: s_mov_b32 s4, 0xb36211c6
|
||||
; GFX11-NEXT: s_movk_i32 s5, 0x102
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v7, s5
|
||||
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
|
||||
define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
|
||||
; GFX6-LABEL: v_saddsat_i7:
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
|
||||
define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
|
||||
; GFX6-LABEL: v_ssubsat_i7:
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
|
||||
define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
|
||||
; GFX6-LABEL: v_uaddsat_i7:
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
|
||||
define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
|
||||
; GFX6-LABEL: v_usubsat_i7:
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -265,6 +265,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
|
||||
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
|
@ -342,6 +343,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
|
||||
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
|
@ -423,6 +425,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, f
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
|
||||
; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
|
||||
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
|
||||
|
@ -1650,6 +1653,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, f
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
@ -1788,6 +1792,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
@ -1858,6 +1863,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspac
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
@ -2582,6 +2588,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out,
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
|
||||
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
|
@ -2664,6 +2671,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out,
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
|
||||
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
|
@ -2818,6 +2826,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <
|
|||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
@ -3298,6 +3307,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, flo
|
|||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_add_f32_e64 v0, s4, s5
|
||||
; GFX11-NEXT: v_add_f32_e64 v1, s4, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
|
||||
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
|
|
@ -476,8 +476,10 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
|
|||
; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v10, 1.0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8
|
||||
; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v4
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GFX11-NEXT: v_mov_b32_e32 v7, v4
|
||||
|
|
|
@ -14,24 +14,31 @@ define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpC
|
|||
; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15
|
||||
; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15
|
||||
; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7
|
||||
; GCN-NEXT: v_and_b32_e32 v7, 1, v7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
|
||||
; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2
|
||||
; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1
|
||||
; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6
|
||||
; GCN-NEXT: v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7
|
||||
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6]
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
|
||||
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
|
||||
; GCN-NEXT: s_mov_b32 exec_lo, s1
|
||||
; GCN-NEXT: exp dual_src_blend0 v3, v2, off, off
|
||||
|
|
|
@ -67,6 +67,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
|
||||
|
@ -154,6 +155,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
|
||||
|
@ -241,6 +243,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
|
||||
|
@ -311,6 +314,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
|
|||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
|
||||
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
|
||||
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -327,6 +331,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
|
@ -401,6 +406,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
|
|||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
|
||||
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
|
||||
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -418,6 +424,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
|
@ -492,6 +499,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
|
|||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
|
||||
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
|
||||
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -509,6 +517,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
|
@ -580,6 +589,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
|
|||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0
|
||||
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
|
||||
; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc
|
||||
|
@ -597,6 +607,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
|
@ -671,6 +682,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
|
|||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0
|
||||
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
|
||||
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
|
||||
|
@ -689,6 +701,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
|
@ -763,6 +776,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
|
|||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
|
||||
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
|
||||
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -780,6 +794,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
|
|||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
|
||||
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
|
||||
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
|
||||
|
|
|
@ -54,6 +54,7 @@ define amdgpu_kernel void @zero_init_kernel() {
|
|||
; GFX11-LABEL: zero_init_kernel:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -169,6 +170,7 @@ define amdgpu_kernel void @zero_init_kernel() {
|
|||
; GFX11-PAL-LABEL: zero_init_kernel:
|
||||
; GFX11-PAL: ; %bb.0:
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -231,6 +233,7 @@ define void @zero_init_foo() {
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -304,6 +307,7 @@ define void @zero_init_foo() {
|
|||
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -681,6 +685,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
|
|||
; GFX11: ; %bb.0: ; %bb
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0
|
||||
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -743,6 +748,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
|
|||
; GFX11-PAL: ; %bb.0: ; %bb
|
||||
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0
|
||||
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
|
||||
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -810,6 +816,7 @@ define void @store_load_vindex_foo(i32 %idx) {
|
|||
; GFX11-NEXT: v_and_b32_e32 v1, 15, v0
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 15
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -865,6 +872,7 @@ define void @store_load_vindex_foo(i32 %idx) {
|
|||
; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0
|
||||
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
|
||||
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc
|
||||
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -1021,6 +1029,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
|
|||
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -1148,6 +1157,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
|
|||
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
|
||||
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -1219,6 +1229,7 @@ define void @zero_init_small_offset_foo() {
|
|||
; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -1300,6 +1311,7 @@ define void @zero_init_small_offset_foo() {
|
|||
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc
|
||||
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
|
||||
|
@ -4217,6 +4229,7 @@ define amdgpu_ps void @large_offset() {
|
|||
; GFX11-LABEL: large_offset:
|
||||
; GFX11: ; %bb.0: ; %bb
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, v0
|
||||
|
@ -4317,6 +4330,7 @@ define amdgpu_ps void @large_offset() {
|
|||
; GFX11-PAL-LABEL: large_offset:
|
||||
; GFX11-PAL: ; %bb.0: ; %bb
|
||||
; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0
|
||||
|
|
|
@ -0,0 +1,561 @@
|
|||
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
|
||||
|
||||
---
|
||||
name: valu_dep_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_2:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_3
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_3:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_4
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_4:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
|
||||
$vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
# There's no encoding for VALU_DEP_5. A normal VALU instruction will have
|
||||
# completed already.
|
||||
---
|
||||
name: valu_dep_5
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_5:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
|
||||
$vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
|
||||
$vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: trans32_dep_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}trans32_dep_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_exp_f32_e32 v0, v0
|
||||
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: trans32_dep_2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}trans32_dep_2:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_exp_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_exp_f32_e32 v1, v1
|
||||
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: trans32_dep_3
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}trans32_dep_3:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_exp_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_exp_f32_e32 v1, v1
|
||||
; CHECK-NEXT: v_exp_f32_e32 v2, v2
|
||||
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
|
||||
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
# There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have
|
||||
# completed already.
|
||||
---
|
||||
name: trans32_dep_4
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}trans32_dep_4:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_exp_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_exp_f32_e32 v1, v1
|
||||
; CHECK-NEXT: v_exp_f32_e32 v2, v2
|
||||
; CHECK-NEXT: v_exp_f32_e32 v3, v3
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
|
||||
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
|
||||
$vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: salu_cycle_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}salu_cycle_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: s_mov_b32 s0, 0
|
||||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
|
||||
$sgpr0 = S_MOV_B32 0
|
||||
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
# There's no need for SALU_CYCLE_2 here because the s_mov will have completed
|
||||
# already.
|
||||
---
|
||||
name: salu_cycle_2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}salu_cycle_2:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: s_mov_b32 s0, 0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
|
||||
$sgpr0 = S_MOV_B32 0
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_1_same_trans32_dep_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_exp_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
|
||||
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
|
||||
...
|
||||
|
||||
# There's no need to encode the VALU dependency because it will complete before
|
||||
# the TRANS.
|
||||
---
|
||||
name: trans32_dep_1_only
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}trans32_dep_1_only:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_exp_f32_e32 v1, v1
|
||||
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_1_same_salu_cycle_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: s_mov_b32 s0, 0
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$sgpr0 = S_MOV_B32 0
|
||||
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_1_next_valu_dep_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_2_next_valu_dep_2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
...
|
||||
|
||||
# There's no need to encode a dependency for the second mul, because the
|
||||
# dependency for the first mul has already guaranteed that the add has
|
||||
# completed.
|
||||
---
|
||||
name: valu_dep_1_no_next_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_1_no_next_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
|
||||
; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
|
||||
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
|
||||
...
|
||||
|
||||
# There's no need to encode a dependency for the second add, because the
|
||||
# dependency for the second mul has already guaranteed that a later VALU has
|
||||
# completed.
|
||||
---
|
||||
name: valu_dep_1_no_next_2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_1_no_next_2:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
|
||||
$vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
|
||||
$vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
|
||||
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
|
||||
...
|
||||
|
||||
# There are no wait states between an add/sub/cmp generating carry and an
|
||||
# add/sub/cndmask that consumes it, so no need to encode a dependency.
|
||||
|
||||
---
|
||||
name: implicit_cmp_cndmask
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}implicit_cmp_cndmask:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
|
||||
implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
|
||||
$vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
|
||||
...
|
||||
|
||||
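# TODO: There should be no s_delay_alu here; this is the implicit_cmp_cndmask case above with the condition in an explicit SGPR pair instead of vcc.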
|
||||
---
|
||||
name: explicit_cmp_cndmask
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}explicit_cmp_cndmask:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
|
||||
$sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
|
||||
$vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: implicit_addc_addc
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}implicit_addc_addc:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
|
||||
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
|
||||
$vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
$vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: explicit_addc_addc
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}explicit_addc_addc:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
|
||||
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
|
||||
$vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
|
||||
$vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: valu_dep_3_bundle
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}valu_dep_3_bundle:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
BUNDLE {
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
|
||||
}
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: if
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}if:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: s_cbranch_vccz .LBB23_2
|
||||
; CHECK-NEXT: %bb.1:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: .LBB23_2:
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
S_CBRANCH_VCCZ %bb.2, implicit $vcc
|
||||
bb.1:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
bb.2:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: else
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}else:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: s_cbranch_vccz .LBB24_2
|
||||
; CHECK-NEXT: %bb.1:
|
||||
; CHECK-NEXT: s_branch .LBB24_3
|
||||
; CHECK-NEXT: .LBB24_2:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: .LBB24_3:
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
S_CBRANCH_VCCZ %bb.2, implicit $vcc
|
||||
bb.1:
|
||||
S_BRANCH %bb.3
|
||||
bb.2:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
bb.3:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
---
|
||||
name: if_else
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}if_else:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: s_cbranch_vccz .LBB25_2
|
||||
; CHECK-NEXT: %bb.1:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: s_branch .LBB25_3
|
||||
; CHECK-NEXT: .LBB25_2:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
|
||||
; CHECK-NEXT: .LBB25_3:
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
S_CBRANCH_VCCZ %bb.2, implicit $vcc
|
||||
bb.1:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
S_BRANCH %bb.3
|
||||
bb.2:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
|
||||
bb.3:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
# Dependency from outside the loop.
|
||||
---
|
||||
name: loop_1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}loop_1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: .LBB26_1:
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
|
||||
; CHECK-NEXT: s_cbranch_vccz .LBB26_1
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
bb.1:
|
||||
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
S_CBRANCH_VCCZ %bb.1, implicit $vcc
|
||||
bb.2:
|
||||
...
|
||||
|
||||
# Dependency from inside the loop.
|
||||
---
|
||||
name: loop_2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}loop_2:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: .LBB27_1:
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
; CHECK-NEXT: s_cbranch_vccz .LBB27_1
|
||||
bb.1:
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
S_CBRANCH_VCCZ %bb.1, implicit $vcc
|
||||
bb.2:
|
||||
...
|
||||
|
||||
# No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU
|
||||
# to complete.
|
||||
---
|
||||
name: sendmsg_rtn
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}sendmsg_rtn:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
|
||||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$sgpr0 = S_SENDMSG_RTN_B32 128
|
||||
$sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
# No VALU delay before or across FLAT because it waits for all outstanding VALU
|
||||
# to complete.
|
||||
---
|
||||
name: flat_load
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}flat_load:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: flat_load_b32 v0, v[0:1]
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
|
||||
...
|
||||
|
||||
# No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU
|
||||
# to complete.
|
||||
---
|
||||
name: waitcnt_depctr
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}waitcnt_depctr:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_waitcnt_depctr 0xfff
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
S_WAITCNT_DEPCTR 4095
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
||||
|
||||
# Check that no delays are emitted for writelane instructions.
|
||||
---
|
||||
name: writelane1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}writelane1:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_writelane_b32 v0, s0, 0
|
||||
; CHECK-NEXT: v_writelane_b32 v0, s0, 1
|
||||
; CHECK-NEXT: v_writelane_b32 v0, s0, 2
|
||||
; CHECK-NEXT: v_writelane_b32 v0, s0, 3
|
||||
$vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
|
||||
$vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
|
||||
$vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
|
||||
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
|
||||
...
|
||||
|
||||
# Check that a VALU delay is added after writelane.
|
||||
---
|
||||
name: writelane2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: {{^}}writelane2:
|
||||
; CHECK: %bb.0:
|
||||
; CHECK-NEXT: v_writelane_b32 v0, s0, 3
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
|
||||
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
|
||||
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
|
||||
...
|
|
@ -388,6 +388,7 @@
|
|||
; GCN-O1-NEXT: SI Final Branch Preparation
|
||||
; GCN-O1-NEXT: SI peephole optimizations
|
||||
; GCN-O1-NEXT: Post RA hazard recognizer
|
||||
; GCN-O1-NEXT: AMDGPU Insert Delay ALU
|
||||
; GCN-O1-NEXT: Branch relaxation pass
|
||||
; GCN-O1-NEXT: Register Usage Information Collector Pass
|
||||
; GCN-O1-NEXT: Live DEBUG_VALUE analysis
|
||||
|
@ -676,6 +677,7 @@
|
|||
; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
|
||||
; GCN-O1-OPTS-NEXT: SI peephole optimizations
|
||||
; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU
|
||||
; GCN-O1-OPTS-NEXT: Branch relaxation pass
|
||||
; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass
|
||||
; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis
|
||||
|
@ -966,6 +968,7 @@
|
|||
; GCN-O2-NEXT: SI Final Branch Preparation
|
||||
; GCN-O2-NEXT: SI peephole optimizations
|
||||
; GCN-O2-NEXT: Post RA hazard recognizer
|
||||
; GCN-O2-NEXT: AMDGPU Insert Delay ALU
|
||||
; GCN-O2-NEXT: Branch relaxation pass
|
||||
; GCN-O2-NEXT: Register Usage Information Collector Pass
|
||||
; GCN-O2-NEXT: Live DEBUG_VALUE analysis
|
||||
|
@ -1268,6 +1271,7 @@
|
|||
; GCN-O3-NEXT: SI Final Branch Preparation
|
||||
; GCN-O3-NEXT: SI peephole optimizations
|
||||
; GCN-O3-NEXT: Post RA hazard recognizer
|
||||
; GCN-O3-NEXT: AMDGPU Insert Delay ALU
|
||||
; GCN-O3-NEXT: Branch relaxation pass
|
||||
; GCN-O3-NEXT: Register Usage Information Collector Pass
|
||||
; GCN-O3-NEXT: Live DEBUG_VALUE analysis
|
||||
|
|
|
@ -86,6 +86,7 @@ define amdgpu_kernel void @id_row_i32() #0 {
|
|||
; GFX11-SDAG: ; %bb.0:
|
||||
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
|
||||
; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en
|
||||
; GFX11-SDAG-NEXT: s_endpgm
|
||||
|
|
|
@ -15,6 +15,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
|
|||
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
@ -43,6 +44,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
|
|||
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
|
||||
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; VERDE-LABEL: load_1d:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
|
||||
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
|
||||
; GFX9-LABEL: gather4_2d:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
|
||||
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
|
||||
; GFX9-LABEL: sample_1d:
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
|
||||
define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
|
||||
; TONGA-LABEL: image_sample_2d_f16:
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
|
||||
|
||||
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
|
||||
; VERDE-LABEL: sample_1d:
|
||||
|
|
|
@ -34,6 +34,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
|||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
|
||||
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00]
|
||||
|
@ -62,8 +63,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
|||
; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf]
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
|
||||
; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08]
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
|
||||
|
@ -105,6 +108,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
|
|||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04]
|
||||
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
|
||||
|
@ -147,6 +151,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
|||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
|
||||
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06]
|
||||
|
@ -193,6 +198,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
|||
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04]
|
||||
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08]
|
||||
|
@ -226,6 +232,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
|
|||
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
|
||||
; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08]
|
||||
|
@ -259,6 +266,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
|
|||
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
|
||||
; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
|
||||
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08]
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
|
||||
|
||||
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
|
||||
; GFX10-LABEL: sample_d_1d:
|
||||
|
|
|
@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
|
|||
; GCN-NEXT: s_mov_b32 exec_lo, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
|
||||
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
|
||||
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
|
||||
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
|
|||
; GCN-NEXT: s_mov_b32 exec_lo, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
|
||||
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
|
||||
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
|
||||
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
|
||||
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
|
||||
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
|
|||
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
|
||||
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
|
||||
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
|
||||
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
|
||||
|
@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
|
|||
; GCN-NEXT: s_mov_b32 exec_lo, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1
|
||||
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
|
||||
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
|
||||
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
|
||||
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
|
|
|
@ -233,6 +233,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
|
|||
; GFX11-NEXT: v_mov_b32_e32 v8, 2.0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
|
||||
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
|
||||
|
@ -325,6 +326,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
|
|||
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
|
||||
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
|
||||
|
@ -428,6 +430,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
|
|||
; GFX11-NEXT: v_mov_b32_e32 v10, 0x102
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
|
||||
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
|
@ -515,6 +518,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
|
|||
; GFX11-NEXT: v_mov_b32_e32 v7, 0x102
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
|
||||
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
|
||||
|
|
|
@ -14,6 +14,7 @@ define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
|
|||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_permlane64_b32 v0, v0
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
@ -28,6 +29,7 @@ define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
|
|||
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0x63
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_permlane64_b32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
|
|
|
@ -83,20 +83,25 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v5, v3, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, v1
|
||||
; GFX11-NEXT: v_mul_lo_u32 v5, v5, v2
|
||||
; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3
|
||||
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_add3_u32 v1, v1, v4, v5
|
||||
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v9
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
|
||||
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v11
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
|
||||
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
|
||||
|
@ -223,31 +228,40 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[11:12], null, v5, v3, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, v1
|
||||
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
|
||||
; GFX11-NEXT: v_mul_lo_u32 v8, v5, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v9
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
|
||||
; GFX11-NEXT: v_mul_lo_u32 v9, v4, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2
|
||||
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
|
||||
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
|
||||
; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
||||
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
|
||||
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
|
||||
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
|
||||
|
@ -372,6 +386,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
|
|||
; GFX11-NEXT: s_add_i32 s1, s1, s6
|
||||
; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2
|
||||
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
|
||||
|
@ -548,8 +563,10 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
|
|||
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
|
||||
; GFX11-NEXT: s_cmp_lt_i32 s3, 0
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo
|
||||
; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
|
||||
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
|
||||
; GFX11-NEXT: s_add_i32 s1, s8, s7
|
||||
|
@ -558,7 +575,9 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
|
|||
; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
|
||||
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_mov_b32 s5, s4
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo
|
||||
|
@ -617,9 +636,11 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
|
||||
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -677,8 +698,10 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
|
|||
; GFX11-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
|
|
|
@ -34,6 +34,7 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%sext0 = sext i32 %arg0 to i64
|
||||
|
@ -71,6 +72,7 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%sext0 = sext i32 %arg0 to i64
|
||||
|
@ -108,6 +110,7 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%sext0 = zext i32 %arg0 to i64
|
||||
|
@ -145,6 +148,7 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%sext0 = zext i32 %arg0 to i64
|
||||
|
@ -248,22 +252,29 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
|
|||
; GFX11-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v7, v10
|
||||
; GFX11-NEXT: v_mov_b32_e32 v10, v8
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, v12
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
|
||||
; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v7, v11
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
|
||||
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%sext0 = sext i32 %arg0 to i128
|
||||
|
@ -301,6 +312,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%sext0 = sext i32 %arg0 to i63
|
||||
|
@ -346,6 +358,7 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_bfe_i32 v4, v1, 0, 31
|
||||
; GFX11-NEXT: v_bfe_i32 v5, v0, 0, 31
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%sext0 = sext i31 %arg0 to i63
|
||||
|
@ -394,9 +407,11 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%ext0 = sext i32 %arg0 to i64
|
||||
|
@ -433,6 +448,7 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%trunc.lhs = and i64 %arg0, 4294967295
|
||||
|
@ -481,8 +497,10 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
|
|||
; GFX11-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
|
||||
; GFX11-NEXT: v_and_b32_e32 v5, 1, v6
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -532,9 +550,11 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
|
||||
; GFX11-NEXT: v_and_b32_e32 v4, 1, v3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%trunc.lhs = and i64 %arg0, 4294967295
|
||||
|
@ -571,6 +591,7 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%shl.lhs = shl i64 %arg0, 32
|
||||
|
@ -610,6 +631,7 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -731,6 +753,7 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v6, v2
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v7, v3
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -794,14 +817,17 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[8:9], null, v0, v1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
|
||||
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
|
||||
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -852,8 +878,10 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mad_i64_i32 v[4:5], null, v0, v1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -908,9 +936,11 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
|
|||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v7, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
|
||||
; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3
|
||||
; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%m = mul i48 %arg0, %arg1
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
|
||||
|
||||
define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX9-LABEL: mad_i32_vvv:
|
||||
|
|
|
@ -269,6 +269,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
|
|||
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
|
||||
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
|
||||
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
|
||||
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
|
||||
|
@ -283,6 +284,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
|
|||
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
|
||||
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
|
||||
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
|
||||
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
|
||||
|
@ -561,6 +563,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
|
|||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
|
||||
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
|
||||
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
|
||||
|
@ -575,6 +578,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
|
|||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
|
||||
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
|
||||
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
|
||||
|
|
|
@ -165,6 +165,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
|
|||
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
|
||||
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
|
||||
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
|
||||
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
|
||||
|
@ -180,6 +181,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
|
|||
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
|
||||
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
|
||||
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
|
||||
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
|
||||
|
@ -359,6 +361,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
|
|||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
|
||||
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
|
||||
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
|
||||
|
@ -374,6 +377,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
|
|||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
|
||||
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
|
||||
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
|
||||
|
|
Loading…
Reference in New Issue