[AMDGPU] New AMDGPUInsertDelayAlu pass

Differential Revision: https://reviews.llvm.org/D128270
This commit is contained in:
Jay Foad 2022-06-21 11:46:28 +01:00
parent 8d29f0fdb9
commit cfb7ffdec0
39 changed files with 1442 additions and 15 deletions

View File

@ -299,6 +299,9 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;
void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;

View File

@ -0,0 +1,457 @@
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-insert-delay-alu"
namespace {
class AMDGPUInsertDelayAlu : public MachineFunctionPass {
public:
// Pass identification; passed to the MachineFunctionPass constructor below.
static char ID;
// Target instruction info, register info and scheduling model. All three are
// (re)initialised from the subtarget at the start of runOnMachineFunction.
const SIInstrInfo *SII;
const TargetRegisterInfo *TRI;
TargetSchedModel SchedModel;
AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
// Declare this pass's analysis usage: it marks the CFG as preserved and
// otherwise defers to MachineFunctionPass.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
// Return true if MI waits for all outstanding VALU instructions to complete
// before it issues.
static bool instructionWaitsForVALU(const MachineInstr &MI) {
  // Instruction classes that wait for VA_VDST==0 before issuing.
  const uint64_t WaitingClasses = SIInstrFlags::DS | SIInstrFlags::EXP |
                                  SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                                  SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
  if ((MI.getDesc().TSFlags & WaitingClasses) != 0)
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG_RTN_B32:
  case AMDGPU::S_SENDMSG_RTN_B64:
    return true;
  case AMDGPU::S_WAITCNT_DEPCTR:
    // Only counts when bits 15:12 of the immediate are all zero (presumably
    // the VA_VDST field -- confirm against the depctr encoding).
    return (MI.getOperand(0).getImm() & 0xf000) == 0;
  default:
    return false;
  }
}
// Types of delay that can be encoded in an s_delay_alu instruction. OTHER is
// what getDelayType returns for an instruction that is none of TRANS, VALU or
// SALU; such instructions neither create nor consume tracked delays.
enum DelayType { VALU, TRANS, SALU, OTHER };
// Classify an instruction by its TSFlags. TRANS is tested first, so an
// instruction carrying both the TRANS and VALU flags is reported as TRANS;
// anything that is neither TRANS, VALU nor SALU is OTHER.
static DelayType getDelayType(uint64_t TSFlags) {
  const bool IsTrans = (TSFlags & SIInstrFlags::TRANS) != 0;
  const bool IsVALU = (TSFlags & SIInstrFlags::VALU) != 0;
  const bool IsSALU = (TSFlags & SIInstrFlags::SALU) != 0;
  return IsTrans ? TRANS : IsVALU ? VALU : IsSALU ? SALU : OTHER;
}
// Information about the last instruction(s) that wrote to a particular
// regunit. In straight-line code there will only be one such instruction, but
// when control flow converges we merge the delay information from each path
// to represent the union of the worst-case delays of each type.
struct DelayInfo {
  // One larger than the maximum number of (non-TRANS) VALU instructions we
  // can encode in an s_delay_alu instruction.
  static const unsigned VALU_MAX = 5;

  // One larger than the maximum number of TRANS instructions we can encode in
  // an s_delay_alu instruction.
  static const unsigned TRANS_MAX = 4;

  // If it was written by a (non-TRANS) VALU, remember how many clock cycles
  // are left until it completes, and how many other (non-TRANS) VALU we have
  // seen since it was issued.
  uint8_t VALUCycles = 0;
  uint8_t VALUNum = VALU_MAX;

  // If it was written by a TRANS, remember how many clock cycles are left
  // until it completes, and how many other TRANS we have seen since it was
  // issued.
  uint8_t TRANSCycles = 0;
  uint8_t TRANSNum = TRANS_MAX;
  // Also remember how many other (non-TRANS) VALU we have seen since it was
  // issued. When an instruction depends on both a prior TRANS and a prior
  // non-TRANS VALU, this is used to decide whether to encode a wait for just
  // one or both of them.
  uint8_t TRANSNumVALU = VALU_MAX;

  // If it was written by an SALU, remember how many clock cycles are left
  // until it completes.
  uint8_t SALUCycles = 0;

  // Convert a cycle count to the 8-bit storage type, saturating at the
  // largest representable value. A plain assignment would wrap, so a very
  // long latency could be recorded as a short one or as zero.
  static uint8_t saturateCycles(unsigned Cycles) {
    return static_cast<uint8_t>(std::min<unsigned>(Cycles, UINT8_MAX));
  }

  // The default state records no outstanding delay of any type.
  DelayInfo() = default;

  // Record a freshly issued instruction of the given Type whose result takes
  // Cycles cycles to become available. Type must be VALU, TRANS or SALU.
  DelayInfo(DelayType Type, unsigned Cycles) {
    const uint8_t Stored = saturateCycles(Cycles);
    switch (Type) {
    default:
      llvm_unreachable("unexpected type");
    case VALU:
      VALUCycles = Stored;
      VALUNum = 0;
      break;
    case TRANS:
      TRANSCycles = Stored;
      TRANSNum = 0;
      TRANSNumVALU = 0;
      break;
    case SALU:
      SALUCycles = Stored;
      break;
    }
  }

  bool operator==(const DelayInfo &RHS) const {
    return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
           TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
           TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
  }

  bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

  // Merge another DelayInfo into this one, to represent the union of the
  // worst-case delays of each type: the longest remaining cycle counts and
  // the smallest (i.e. most recent) instruction distances.
  void merge(const DelayInfo &RHS) {
    VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
    VALUNum = std::min(VALUNum, RHS.VALUNum);
    TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
    TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
    TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
    SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
  }

  // Update this DelayInfo after issuing an instruction. Type is the delay
  // type of the issued instruction: VALU bumps the VALU distance counters and
  // TRANS bumps the TRANS distance counter. Cycles is the number of cycles it
  // takes to issue the instruction. Return true if there is no longer any
  // useful delay info.
  bool advance(DelayType Type, unsigned Cycles) {
    bool Erase = true;

    VALUNum += (Type == VALU);
    if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
      // Forget about the VALU instruction. It was too far back or has
      // definitely completed by now.
      VALUNum = VALU_MAX;
      VALUCycles = 0;
    } else {
      VALUCycles -= Cycles;
      Erase = false;
    }

    TRANSNum += (Type == TRANS);
    TRANSNumVALU += (Type == VALU);
    if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
      // Forget about any TRANS instruction. It was too far back or has
      // definitely completed by now.
      TRANSNum = TRANS_MAX;
      TRANSNumVALU = VALU_MAX;
      TRANSCycles = 0;
    } else {
      TRANSCycles -= Cycles;
      Erase = false;
    }

    if (SALUCycles <= Cycles) {
      // Forget about any SALU instruction. It has definitely completed by
      // now.
      SALUCycles = 0;
    } else {
      SALUCycles -= Cycles;
      Erase = false;
    }

    return Erase;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Print every field that differs from its "nothing outstanding" value.
  void dump() const {
    if (VALUCycles)
      dbgs() << " VALUCycles=" << (int)VALUCycles;
    if (VALUNum < VALU_MAX)
      dbgs() << " VALUNum=" << (int)VALUNum;
    if (TRANSCycles)
      dbgs() << " TRANSCycles=" << (int)TRANSCycles;
    if (TRANSNum < TRANS_MAX)
      dbgs() << " TRANSNum=" << (int)TRANSNum;
    if (TRANSNumVALU < VALU_MAX)
      dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
    if (SALUCycles)
      dbgs() << " SALUCycles=" << (int)SALUCycles;
  }
#endif
};
// Per-regunit delay tracking: maps each regunit to the DelayInfo describing
// the outstanding write(s) to it.
struct DelayState : DenseMap<unsigned, DelayInfo> {
  // Fold RHS into this state. Regunits only present in RHS are copied over;
  // regunits present in both have their DelayInfo merged.
  void merge(const DelayState &RHS) {
    for (const auto &Entry : RHS) {
      auto Result = insert(Entry);
      if (Result.second)
        continue;
      Result.first->second.merge(Entry.second);
    }
  }

  // Advance the delay info of every regunit past one issued instruction,
  // dropping entries that no longer carry any useful information.
  void advance(DelayType Type, unsigned Cycles) {
    for (auto It = begin(), End = end(); It != End;) {
      // Step past the current entry before it is possibly erased.
      auto Cur = It++;
      if (Cur->second.advance(Type, Cycles))
        erase(Cur);
    }
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Print the state, one regunit per line, sorted by regunit number so the
  // output does not depend on the map's iteration order.
  void dump(const TargetRegisterInfo *TRI) const {
    if (empty()) {
      dbgs() << " empty\n";
      return;
    }

    SmallVector<const_iterator, 8> Sorted;
    Sorted.reserve(size());
    for (const_iterator It = begin(), End = end(); It != End; ++It)
      Sorted.push_back(It);
    llvm::sort(Sorted, [](const const_iterator &L, const const_iterator &R) {
      return L->first < R->first;
    });

    for (const_iterator It : Sorted) {
      dbgs() << " " << printRegUnit(It->first, TRI);
      It->second.dump();
      dbgs() << "\n";
    }
  }
#endif
};
// The saved delay state at the end of each basic block.
DenseMap<MachineBasicBlock *, DelayState> BlockState;
// Emit an s_delay_alu instruction if necessary before MI.
//
// Delay is the merged delay info of everything MI depends on. LastDelayAlu is
// the most recently emitted s_delay_alu that still has room for a second
// delay, or nullptr. When only one delay has to be encoded and LastDelayAlu is
// close enough, the delay is folded into LastDelayAlu instead of emitting a
// new instruction.
//
// Returns the s_delay_alu to remember for the next call: LastDelayAlu itself
// if nothing had to be encoded, a newly built instruction if it still has
// room for another delay, and nullptr otherwise.
//
// If Modified is non-null, *Modified is set to true whenever the machine code
// was changed (a new instruction inserted or LastDelayAlu rewritten). It is
// never reset to false.
MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                           MachineInstr *LastDelayAlu,
                           bool *Modified = nullptr) {
  unsigned Imm = 0;

  // Wait for a TRANS instruction.
  if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
    Imm |= 4 + Delay.TRANSNum;

  // Wait for a VALU instruction (if it's more recent than any TRANS
  // instruction that we're also waiting for).
  if (Delay.VALUNum < DelayInfo::VALU_MAX &&
      Delay.VALUNum <= Delay.TRANSNumVALU) {
    if (Imm & 0xf)
      Imm |= Delay.VALUNum << 7;
    else
      Imm |= Delay.VALUNum;
  }

  // Wait for an SALU instruction.
  if (Delay.SALUCycles) {
    // Each delay occupies a 4-bit field and SALU waits are encoded as
    // cycles + 8, with only SALU_CYCLE_1..3 (9..11) defined. Clamp longer
    // latencies to the longest expressible wait so that the value can neither
    // produce an undefined encoding nor spill into the neighbouring fields.
    const unsigned MaxEncodableSALUCycles = 3;
    const unsigned SALUField =
        std::min<unsigned>(Delay.SALUCycles, MaxEncodableSALUCycles) + 8;
    if (Imm & 0x780) {
      // We have already encoded a VALU and a TRANS delay. There's no room in
      // the encoding for an SALU delay as well, so just drop it.
    } else if (Imm & 0xf) {
      Imm |= SALUField << 7;
    } else {
      Imm |= SALUField;
    }
  }

  // Don't emit the s_delay_alu instruction if there's nothing to wait for.
  if (!Imm)
    return LastDelayAlu;

  // From here on the code is always modified: either the previous s_delay_alu
  // is rewritten or a new one is inserted.
  if (Modified)
    *Modified = true;

  // If we only need to wait for one instruction, try encoding it in the last
  // s_delay_alu that we emitted.
  if (!(Imm & 0x780) && LastDelayAlu) {
    // Count the real instructions between LastDelayAlu and MI; this becomes
    // the skip field of the combined encoding.
    unsigned Skip = 0;
    for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
              E = MachineBasicBlock::instr_iterator(MI);
         ++I != E;) {
      if (!I->isBundle() && !I->isMetaInstruction())
        ++Skip;
    }
    if (Skip < 6) {
      MachineOperand &Op = LastDelayAlu->getOperand(0);
      unsigned LastImm = Op.getImm();
      assert((LastImm & ~0xf) == 0 &&
             "Remembered an s_delay_alu with no room for another delay!");
      LastImm |= Imm << 7 | Skip << 4;
      Op.setImm(LastImm);
      return nullptr;
    }
  }

  auto &MBB = *MI.getParent();
  MachineInstr *DelayAlu =
      BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
  // Remember the s_delay_alu for next time if there is still room in it to
  // encode another delay.
  return (Imm & 0x780) ? nullptr : DelayAlu;
}

// Process one basic block, starting from the merged end-of-block states of
// its predecessors.
//
// With Emit == false this is an analysis step: the block's end state is
// recomputed and stored in BlockState, and the return value says whether the
// stored state changed (so successors need to be revisited).
//
// With Emit == true s_delay_alu instructions are inserted; the stored state
// is expected to be at its fixed point already, and the return value says
// whether any machine code was modified.
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
  DelayState State;
  for (auto *Pred : MBB.predecessors())
    State.merge(BlockState[Pred]);

  LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                    << "\n";
             State.dump(TRI););

  bool Changed = false;
  MachineInstr *LastDelayAlu = nullptr;

  // Iterate over the contents of bundles, but don't emit any instructions
  // inside a bundle.
  for (auto &MI : MBB.instrs()) {
    if (MI.isBundle() || MI.isMetaInstruction())
      continue;

    // Ignore some more instructions that do not generate any code.
    switch (MI.getOpcode()) {
    case AMDGPU::SI_RETURN_TO_EPILOG:
      continue;
    }

    DelayType Type = getDelayType(MI.getDesc().TSFlags);

    if (instructionWaitsForVALU(MI)) {
      // Forget about all outstanding VALU delays.
      State = DelayState();
    } else if (Type != OTHER) {
      DelayInfo Delay;
      // TODO: Scan implicit uses too?
      for (const auto &Op : MI.explicit_uses()) {
        if (Op.isReg()) {
          // One of the operands of the writelane is also the output operand.
          // This creates the insertion of redundant delays. Hence, we have to
          // ignore this operand.
          if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
            continue;
          for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
            auto It = State.find(*UI);
            if (It != State.end()) {
              // The delay is consumed by this use, so stop tracking it.
              Delay.merge(It->second);
              State.erase(It);
            }
          }
        }
      }
      if (Emit && !MI.isBundledWithPred()) {
        // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
        // just ignore them?
        // In emitting mode Changed records code modification; it is not
        // touched by the state comparison below.
        LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu, &Changed);
      }
    }

    if (Type != OTHER) {
      // TODO: Scan implicit defs too?
      for (const auto &Op : MI.defs()) {
        unsigned Latency = SchedModel.computeOperandLatency(
            &MI, MI.getOperandNo(&Op), nullptr, 0);
        for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
          State[*UI] = DelayInfo(Type, Latency);
      }
    }

    // Advance by the number of cycles it takes to issue this instruction.
    // TODO: Use a more advanced model that accounts for instructions that
    // take multiple cycles to issue on a particular pipeline.
    unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
    // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
    // instructions on the assumption that they will usually have to be issued
    // twice?
    State.advance(Type, Cycles);

    LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
  }

  if (Emit) {
    assert(State == BlockState[&MBB] &&
           "Basic block state should not have changed on final pass!");
  } else if (State != BlockState[&MBB]) {
    BlockState[&MBB] = std::move(State);
    Changed = true;
  }
  return Changed;
}
// Pass entry point. Does nothing for functions that are skipped or whose
// subtarget has no s_delay_alu. Otherwise computes the end-of-block delay
// state of every basic block by iterating to a fixed point, then makes one
// final pass over all blocks to insert s_delay_alu instructions. Returns the
// accumulated result of the emitting runOnMachineBasicBlock calls.
bool runOnMachineFunction(MachineFunction &MF) override {
  if (skipFunction(MF.getFunction()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                    << "\n");

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDelayAlu())
    return false;

  SII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  SchedModel.init(&ST);

  // BlockState is a member, so entries from a previous invocation would
  // otherwise still be present. They are keyed by block pointers that belong
  // to another function; start this analysis from an empty map so that no
  // stale state can be picked up through a predecessor lookup.
  BlockState.clear();

  // Calculate the delay state for each basic block, iterating until we reach
  // a fixed point. Blocks are inserted in reverse so that popping from the
  // back visits them in layout order.
  SetVector<MachineBasicBlock *> WorkList;
  for (auto &MBB : reverse(MF))
    WorkList.insert(&MBB);
  while (!WorkList.empty()) {
    auto &MBB = *WorkList.pop_back_val();
    bool Changed = runOnMachineBasicBlock(MBB, false);
    if (Changed)
      WorkList.insert(MBB.succ_begin(), MBB.succ_end());
  }

  LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

  // Make one last pass over all basic blocks to emit s_delay_alu
  // instructions.
  bool Changed = false;
  for (auto &MBB : MF)
    Changed |= runOnMachineBasicBlock(MBB, true);
  return Changed;
}
};
} // namespace
// Storage for the pass ID, plus the externally visible reference to it that
// other code uses to name this pass.
char AMDGPUInsertDelayAlu::ID = 0;
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
// Register the pass under the DEBUG_TYPE name ("amdgpu-insert-delay-alu").
INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
false, false)

View File

@ -272,6 +272,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
cl::init(true),
cl::Hidden);
// Enable GFX11+ s_delay_alu insertion. Hidden command-line option
// -amdgpu-enable-delay-alu; defaults to true.
static cl::opt<bool>
EnableInsertDelayAlu("amdgpu-enable-delay-alu",
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
@ -363,6 +369,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIModeRegisterPass(*PR);
@ -1413,6 +1420,10 @@ void GCNPassConfig::addPreEmitPass() {
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
addPass(&PostRAHazardRecognizerID);
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
addPass(&AMDGPUInsertDelayAluID);
addPass(&BranchRelaxationPassID);
}

View File

@ -57,6 +57,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp

View File

@ -2,7 +2,7 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2:

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10NSA %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; GFX6-LABEL: gather4_2d:

View File

@ -589,6 +589,7 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX11-NEXT: ; return to shader part epilog

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:

View File

@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
; GCN-NEXT: s_endpgm
@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
; GCN-NEXT: s_endpgm
@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog

View File

@ -68,8 +68,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v7, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v8, v11
; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v10, v9
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
@ -133,8 +135,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v8, 0xffff, v9, v12
; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v11, v10
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
@ -235,8 +239,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX11-NEXT: v_readfirstlane_b32 s5, v12
; GFX11-NEXT: v_readfirstlane_b32 s6, v13
; GFX11-NEXT: v_readfirstlane_b32 s7, v14
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7]
@ -359,8 +365,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-NEXT: v_readfirstlane_b32 s7, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v13, v14, v[15:17], v[4:6]], s[4:7] a16
@ -474,8 +482,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
; GFX11-NEXT: v_readfirstlane_b32 s5, v13
; GFX11-NEXT: v_readfirstlane_b32 s6, v14
; GFX11-NEXT: v_readfirstlane_b32 s7, v15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
@ -605,8 +615,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX11-NEXT: v_readfirstlane_b32 s5, v11
; GFX11-NEXT: v_readfirstlane_b32 s6, v12
; GFX11-NEXT: v_readfirstlane_b32 s7, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[14:15], v16, v[17:19], v[4:6]], s[4:7] a16
@ -984,6 +996,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: s_mov_b32 s4, 0xb36211c7
; GFX11-NEXT: s_movk_i32 s5, 0x102
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v10, s5
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
@ -1123,6 +1136,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: s_mov_b32 s4, 0xb36211c6
; GFX11-NEXT: s_movk_i32 s5, 0x102
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v7, s5
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo

View File

@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_saddsat_i7:

View File

@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_ssubsat_i7:

View File

@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_uaddsat_i7:

View File

@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_usubsat_i7:

View File

@ -265,6 +265,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@ -342,6 +343,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@ -423,6 +425,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, f
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
@ -1650,6 +1653,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, f
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@ -1788,6 +1792,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@ -1858,6 +1863,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspac
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@ -2582,6 +2588,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out,
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@ -2664,6 +2671,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out,
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@ -2818,6 +2826,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@ -3298,6 +3307,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, flo
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, s4, s5
; GFX11-NEXT: v_add_f32_e64 v1, s4, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
; GFX11-NEXT: s_endpgm

View File

@ -476,8 +476,10 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v10, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8
; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v7, v4

View File

@ -14,24 +14,31 @@ define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpC
; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15
; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15
; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7
; GCN-NEXT: v_and_b32_e32 v7, 1, v7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6
; GCN-NEXT: v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6]
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GCN-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo
; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
; GCN-NEXT: s_mov_b32 exec_lo, s1
; GCN-NEXT: exp dual_src_blend0 v3, v2, off, off

View File

@ -67,6 +67,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
@ -154,6 +155,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
@ -241,6 +243,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
@ -311,6 +314,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@ -327,6 +331,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@ -401,6 +406,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@ -418,6 +424,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@ -492,6 +499,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@ -509,6 +517,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@ -580,6 +589,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc
@ -597,6 +607,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@ -671,6 +682,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
@ -689,6 +701,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@ -763,6 +776,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@ -780,6 +794,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2

View File

@ -54,6 +54,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-LABEL: zero_init_kernel:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@ -169,6 +170,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-PAL-LABEL: zero_init_kernel:
; GFX11-PAL: ; %bb.0:
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@ -231,6 +233,7 @@ define void @zero_init_foo() {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@ -304,6 +307,7 @@ define void @zero_init_foo() {
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@ -681,6 +685,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@ -743,6 +748,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
@ -810,6 +816,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX11-NEXT: v_and_b32_e32 v1, 15, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@ -865,6 +872,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
@ -1021,6 +1029,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@ -1148,6 +1157,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@ -1219,6 +1229,7 @@ define void @zero_init_small_offset_foo() {
; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@ -1300,6 +1311,7 @@ define void @zero_init_small_offset_foo() {
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@ -4217,6 +4229,7 @@ define amdgpu_ps void @large_offset() {
; GFX11-LABEL: large_offset:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v3, v0
@ -4317,6 +4330,7 @@ define amdgpu_ps void @large_offset() {
; GFX11-PAL-LABEL: large_offset:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0

View File

@ -0,0 +1,561 @@
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
name: valu_dep_1
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_1:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: valu_dep_2
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_2:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: valu_dep_3
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_3:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: valu_dep_4
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_4:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
$vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
# There's no encoding for VALU_DEP_5. A normal VALU instruction will have
# completed already.
---
name: valu_dep_5
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_5:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
$vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
$vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: trans32_dep_1
body: |
bb.0:
; CHECK-LABEL: {{^}}trans32_dep_1:
; CHECK: %bb.0:
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: trans32_dep_2
body: |
bb.0:
; CHECK-LABEL: {{^}}trans32_dep_2:
; CHECK: %bb.0:
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: trans32_dep_3
body: |
bb.0:
; CHECK-LABEL: {{^}}trans32_dep_3:
; CHECK: %bb.0:
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: v_exp_f32_e32 v2, v2
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
# There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have
# completed already.
---
name: trans32_dep_4
body: |
bb.0:
; CHECK-LABEL: {{^}}trans32_dep_4:
; CHECK: %bb.0:
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: v_exp_f32_e32 v2, v2
; CHECK-NEXT: v_exp_f32_e32 v3, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
$vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: salu_cycle_1
body: |
bb.0:
; CHECK-LABEL: {{^}}salu_cycle_1:
; CHECK: %bb.0:
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...
# There's no need for SALU_CYCLE_2 here because the s_mov will have completed
# already.
---
name: salu_cycle_2
body: |
bb.0:
; CHECK-LABEL: {{^}}salu_cycle_2:
; CHECK: %bb.0:
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
$sgpr0 = S_MOV_B32 0
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...
---
name: valu_dep_1_same_trans32_dep_1
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
; CHECK: %bb.0:
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
...
# There's no need to encode the VALU dependency because it will complete before
# the TRANS.
---
name: trans32_dep_1_only
body: |
bb.0:
; CHECK-LABEL: {{^}}trans32_dep_1_only:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
...
---
name: valu_dep_1_same_salu_cycle_1
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...
---
name: valu_dep_1_next_valu_dep_1
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: valu_dep_2_next_valu_dep_2
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
...
# There's no need to encode a dependency for the second mul, because the
# dependency for the first mul has already guaranteed that the add has
# completed.
---
name: valu_dep_1_no_next_1
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_1_no_next_1:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
$vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
...
# There's no need to encode a dependency for the second add, because the
# dependency for the second mul has already guaranteed that a later VALU has
# completed.
---
name: valu_dep_1_no_next_2
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_1_no_next_2:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
$vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
...
# There are no wait states between an add/sub/cmp generating carry and an
# add/sub/cndmask that consumes it, so no need to encode a dependency.
---
name: implicit_cmp_cndmask
body: |
bb.0:
; CHECK-LABEL: {{^}}implicit_cmp_cndmask:
; CHECK: %bb.0:
; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
...
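# TODO: There should be no s_delay_alu here. As in implicit_cmp_cndmask, there
# are no wait states between the cmp and the cndmask that consumes its result.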
---
name: explicit_cmp_cndmask
body: |
bb.0:
; CHECK-LABEL: {{^}}explicit_cmp_cndmask:
; CHECK: %bb.0:
; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
$sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
$vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
...
---
name: implicit_addc_addc
body: |
bb.0:
; CHECK-LABEL: {{^}}implicit_addc_addc:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
$vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
$vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
...
---
name: explicit_addc_addc
body: |
bb.0:
; CHECK-LABEL: {{^}}explicit_addc_addc:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
$vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
$vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
...
---
name: valu_dep_3_bundle
body: |
bb.0:
; CHECK-LABEL: {{^}}valu_dep_3_bundle:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
BUNDLE {
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
}
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: if
body: |
bb.0:
; CHECK-LABEL: {{^}}if:
; CHECK: %bb.0:
; CHECK-NEXT: s_cbranch_vccz .LBB23_2
; CHECK-NEXT: %bb.1:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
S_CBRANCH_VCCZ %bb.2, implicit $vcc
bb.1:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
bb.2:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: else
body: |
bb.0:
; CHECK-LABEL: {{^}}else:
; CHECK: %bb.0:
; CHECK-NEXT: s_cbranch_vccz .LBB24_2
; CHECK-NEXT: %bb.1
; CHECK-NEXT: s_branch .LBB24_3
; CHECK-NEXT: .LBB24_2:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: .LBB24_3:
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
S_CBRANCH_VCCZ %bb.2, implicit $vcc
bb.1:
S_BRANCH %bb.3
bb.2:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
bb.3:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
---
name: if_else
body: |
bb.0:
; CHECK-LABEL: {{^}}if_else:
; CHECK: %bb.0:
; CHECK-NEXT: s_cbranch_vccz .LBB25_2
; CHECK-NEXT: %bb.1:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: s_branch .LBB25_3
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
; CHECK-NEXT: .LBB25_3:
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
S_CBRANCH_VCCZ %bb.2, implicit $vcc
bb.1:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
S_BRANCH %bb.3
bb.2:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
bb.3:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
# Dependency from outside the loop.
---
name: loop_1
body: |
bb.0:
; CHECK-LABEL: {{^}}loop_1:
; CHECK: %bb.0:
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: .LBB26_1:
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
; CHECK-NEXT: s_cbranch_vccz .LBB26_1
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
bb.1:
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
S_CBRANCH_VCCZ %bb.1, implicit $vcc
bb.2:
...
# Dependency from inside the loop.
---
name: loop_2
body: |
bb.0:
; CHECK-LABEL: {{^}}loop_2:
; CHECK: %bb.0:
; CHECK-NEXT: .LBB27_1:
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
; CHECK-NEXT: s_cbranch_vccz .LBB27_1
bb.1:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
S_CBRANCH_VCCZ %bb.1, implicit $vcc
bb.2:
...
# No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU
# to complete.
---
name: sendmsg_rtn
body: |
bb.0:
; CHECK-LABEL: {{^}}sendmsg_rtn:
; CHECK: %bb.0:
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_add_u32 s0, s0, s0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_SENDMSG_RTN_B32 128
$sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
# No VALU delay before or across FLAT because it waits for all outstanding VALU
# to complete.
---
name: flat_load
body: |
bb.0:
; CHECK-LABEL: {{^}}flat_load:
; CHECK: %bb.0:
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: flat_load_b32 v0, v[0:1]
; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
$vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
...
# No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU
# to complete.
---
name: waitcnt_depctr
body: |
bb.0:
; CHECK-LABEL: {{^}}waitcnt_depctr:
; CHECK: %bb.0:
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt_depctr 0xfff
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT_DEPCTR 4095
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
# Check that no delays are emitted for writelane instructions.
---
name: writelane1
body: |
bb.0:
; CHECK-LABEL: {{^}}writelane1:
; CHECK: %bb.0:
; CHECK-NEXT: v_writelane_b32 v0, s0, 0
; CHECK-NEXT: v_writelane_b32 v0, s0, 1
; CHECK-NEXT: v_writelane_b32 v0, s0, 2
; CHECK-NEXT: v_writelane_b32 v0, s0, 3
$vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
$vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
$vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
...
# Check that a VALU delay is added after writelane.
---
name: writelane2
body: |
bb.0:
; CHECK-LABEL: {{^}}writelane2:
; CHECK: %bb.0:
; CHECK-NEXT: v_writelane_b32 v0, s0, 3
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...

View File

@ -388,6 +388,7 @@
; GCN-O1-NEXT: SI Final Branch Preparation
; GCN-O1-NEXT: SI peephole optimizations
; GCN-O1-NEXT: Post RA hazard recognizer
; GCN-O1-NEXT: AMDGPU Insert Delay ALU
; GCN-O1-NEXT: Branch relaxation pass
; GCN-O1-NEXT: Register Usage Information Collector Pass
; GCN-O1-NEXT: Live DEBUG_VALUE analysis
@ -676,6 +677,7 @@
; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
; GCN-O1-OPTS-NEXT: SI peephole optimizations
; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU
; GCN-O1-OPTS-NEXT: Branch relaxation pass
; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass
; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis
@ -966,6 +968,7 @@
; GCN-O2-NEXT: SI Final Branch Preparation
; GCN-O2-NEXT: SI peephole optimizations
; GCN-O2-NEXT: Post RA hazard recognizer
; GCN-O2-NEXT: AMDGPU Insert Delay ALU
; GCN-O2-NEXT: Branch relaxation pass
; GCN-O2-NEXT: Register Usage Information Collector Pass
; GCN-O2-NEXT: Live DEBUG_VALUE analysis
@ -1268,6 +1271,7 @@
; GCN-O3-NEXT: SI Final Branch Preparation
; GCN-O3-NEXT: SI peephole optimizations
; GCN-O3-NEXT: Post RA hazard recognizer
; GCN-O3-NEXT: AMDGPU Insert Delay ALU
; GCN-O3-NEXT: Branch relaxation pass
; GCN-O3-NEXT: Register Usage Information Collector Pass
; GCN-O3-NEXT: Live DEBUG_VALUE analysis

View File

@ -86,6 +86,7 @@ define amdgpu_kernel void @id_row_i32() #0 {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en
; GFX11-SDAG-NEXT: s_endpgm

View File

@ -15,6 +15,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
@ -43,6 +44,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm

View File

@ -4,7 +4,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
; VERDE-LABEL: load_1d:

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; GFX9-LABEL: gather4_2d:

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
; GFX9-LABEL: sample_1d:

View File

@ -3,7 +3,7 @@
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; TONGA-LABEL: image_sample_2d_f16:

View File

@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
; VERDE-LABEL: sample_1d:

View File

@ -34,6 +34,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00]
@ -62,8 +63,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf]
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
@ -105,6 +108,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04]
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
@ -147,6 +151,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06]
@ -193,6 +198,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04]
; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08]
@ -226,6 +232,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08]
@ -259,6 +266,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08]

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:

View File

@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
; GCN-NEXT: s_endpgm
@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
; GCN-NEXT: s_endpgm
@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog

View File

@ -233,6 +233,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX11-NEXT: v_mov_b32_e32 v8, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
@ -325,6 +326,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
@ -428,6 +430,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX11-NEXT: v_mov_b32_e32 v10, 0x102
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
@ -515,6 +518,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX11-NEXT: v_mov_b32_e32 v7, 0x102
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200

View File

@ -14,6 +14,7 @@ define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
@ -28,6 +29,7 @@ define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x63
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]

View File

@ -83,20 +83,25 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v5, v3, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v8, v1
; GFX11-NEXT: v_mul_lo_u32 v5, v5, v2
; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v1, v1, v4, v5
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v11
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
@ -223,31 +228,40 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT: v_mad_i64_i32 v[11:12], null, v5, v3, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v1
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v8, v5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v9
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v9, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
@ -372,6 +386,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s1, s1, s6
; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@ -548,8 +563,10 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_lt_i32 s3, 0
; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_add_i32 s1, s8, s7
@ -558,7 +575,9 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s5, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo
@ -617,9 +636,11 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@ -677,8 +698,10 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]

View File

@ -34,6 +34,7 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
@ -71,6 +72,7 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
@ -108,6 +110,7 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
@ -145,6 +148,7 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
@ -248,22 +252,29 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
; GFX11-NEXT: v_mov_b32_e32 v7, v10
; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
; GFX11-NEXT: v_mov_b32_e32 v8, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
; GFX11-NEXT: v_mov_b32_e32 v7, v11
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i128
@ -301,6 +312,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i63
@ -346,6 +358,7 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_bfe_i32 v4, v1, 0, 31
; GFX11-NEXT: v_bfe_i32 v5, v0, 0, 31
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i31 %arg0 to i63
@ -394,9 +407,11 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext0 = sext i32 %arg0 to i64
@ -433,6 +448,7 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
@ -481,8 +497,10 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
; GFX11-NEXT: v_and_b32_e32 v5, 1, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
@ -532,9 +550,11 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
; GFX11-NEXT: v_and_b32_e32 v4, 1, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
@ -571,6 +591,7 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl.lhs = shl i64 %arg0, 32
@ -610,6 +631,7 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
@ -731,6 +753,7 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
; GFX11-NEXT: v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v6, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v7, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
@ -794,14 +817,17 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_i64_i32 v[8:9], null, v0, v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
@ -852,8 +878,10 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_i64_i32 v[4:5], null, v0, v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
@ -908,9 +936,11 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3
; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%m = mul i48 %arg0, %arg1

View File

@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GFX9-LABEL: mad_i32_vvv:

View File

@ -269,6 +269,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
@ -283,6 +284,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
@ -561,6 +563,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
@ -575,6 +578,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0

View File

@ -165,6 +165,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
@ -180,6 +181,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
@ -359,6 +361,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
@ -374,6 +377,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0