diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h index c3fd2b2cc667..48aa4b034fee 100644 --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -1130,9 +1130,6 @@ void CodeGenPassBuilder::addMachineLateOptimization( if (!TM.requiresStructuredCFG()) addPass(TailDuplicatePass()); - // Cleanup of redundant (identical) address/immediate loads. - addPass(MachineLateInstrsCleanupPass()); - // Copy propagation. addPass(MachineCopyPropagationPass()); } diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def index 0bb46e405bbe..c1ceff9680d6 100644 --- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def +++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def @@ -151,7 +151,6 @@ DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass, ()) DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ()) -DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass, ()) DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ()) DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ()) DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ()) diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index b121ecbd9627..5701dd13e152 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -334,10 +334,6 @@ namespace llvm { MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr); - /// MachineLateInstrsCleanup - This pass removes redundant identical - /// instructions after register allocation and rematerialization. - extern char &MachineLateInstrsCleanupID; - /// PeepholeOptimizer - This pass performs peephole optimizations - /// like extension and comparison eliminations. extern char &PeepholeOptimizerID; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 680cb37ce4cd..26e2d7e596dd 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -277,7 +277,6 @@ void initializeMachineDominanceFrontierPass(PassRegistry&); void initializeMachineDominatorTreePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeMachineFunctionSplitterPass(PassRegistry &); -void initializeMachineLateInstrsCleanupPass(PassRegistry&); void initializeMachineLICMPass(PassRegistry&); void initializeMachineLoopInfoPass(PassRegistry&); void initializeMachineModuleInfoWrapperPassPass(PassRegistry &); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 35d10120a311..db3b6183b5fd 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -119,7 +119,6 @@ add_llvm_component_library(LLVMCodeGen MachineFunctionSplitter.cpp MachineInstrBundle.cpp MachineInstr.cpp - MachineLateInstrsCleanup.cpp MachineLICM.cpp MachineLoopInfo.cpp MachineLoopUtils.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index beedd9418005..a8bde3b70097 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -78,7 +78,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreePass(Registry); initializeMachineFunctionPrinterPassPass(Registry); - initializeMachineLateInstrsCleanupPass(Registry); initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoWrapperPassPass(Registry); diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp deleted file mode 100644 index 41a0223a3ece..000000000000 --- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp +++ /dev/null @@ -1,240 +0,0 @@ -//==--- MachineLateInstrsCleanup.cpp - Late Instructions Cleanup Pass -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This simple pass removes any identical and redundant immediate or address -// loads to the same register. The immediate loads removed can originally be -// the result of rematerialization, while the addresses are redundant frame -// addressing anchor points created during Frame Indices elimination. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -#define DEBUG_TYPE "machine-latecleanup" - -STATISTIC(NumRemoved, "Number of redundant instructions removed."); - -namespace { - -class MachineLateInstrsCleanup : public MachineFunctionPass { - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - - // Data structures to map regs to their definitions per MBB. - using Reg2DefMap = std::map; - std::vector RegDefs; - - // Walk through the instructions in MBB and remove any redundant - // instructions. - bool processBlock(MachineBasicBlock *MBB); - -public: - static char ID; // Pass identification, replacement for typeid - - MachineLateInstrsCleanup() : MachineFunctionPass(ID) { - initializeMachineLateInstrsCleanupPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } -}; - -} // end anonymous namespace - -char MachineLateInstrsCleanup::ID = 0; - -char &llvm::MachineLateInstrsCleanupID = MachineLateInstrsCleanup::ID; - -INITIALIZE_PASS(MachineLateInstrsCleanup, DEBUG_TYPE, - "Machine Late Instructions Cleanup Pass", false, false) - -bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - bool Changed = false; - - TRI = MF.getSubtarget().getRegisterInfo(); - TII = MF.getSubtarget().getInstrInfo(); - - RegDefs.clear(); - RegDefs.resize(MF.getNumBlockIDs()); - - // Visit all MBBs in an order that maximises the reuse from predecessors. - ReversePostOrderTraversal RPOT(&MF); - for (MachineBasicBlock *MBB : RPOT) - Changed |= processBlock(MBB); - - return Changed; -} - -// Clear any previous kill flag on Reg found before I in MBB. Walk backwards -// in MBB and if needed continue in predecessors until a use/def of Reg is -// encountered. This seems to be faster in practice than tracking kill flags -// in a map. -static void clearKillsForDef(Register Reg, MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - BitVector &VisitedPreds, - const TargetRegisterInfo *TRI) { - VisitedPreds.set(MBB->getNumber()); - while (I != MBB->begin()) { - I--; - bool Found = false; - for (auto &MO : I->operands()) - if (MO.isReg() && TRI->regsOverlap(MO.getReg(), Reg)) { - if (MO.isDef()) - return; - if (MO.readsReg()) { - MO.setIsKill(false); - Found = true; // Keep going for an implicit kill of the super-reg. - } - } - if (Found) - return; - } - - // If an earlier def is not in MBB, continue in predecessors. - if (!MBB->isLiveIn(Reg)) - MBB->addLiveIn(Reg); - assert(!MBB->pred_empty() && "Predecessor def not found!"); - for (MachineBasicBlock *Pred : MBB->predecessors()) - if (!VisitedPreds.test(Pred->getNumber())) - clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds, TRI); -} - -static void removeRedundantDef(MachineInstr *MI, - const TargetRegisterInfo *TRI) { - Register Reg = MI->getOperand(0).getReg(); - BitVector VisitedPreds(MI->getMF()->getNumBlockIDs()); - clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds, TRI); - MI->eraseFromParent(); - ++NumRemoved; -} - -// Return true if MI is a potential candidate for reuse/removal and if so -// also the register it defines in DefedReg. A candidate is a simple -// instruction that does not touch memory, has only one register definition -// and the only reg it may use is FrameReg. Typically this is an immediate -// load or a load-address instruction. -static bool isCandidate(const MachineInstr *MI, Register &DefedReg, - Register FrameReg) { - DefedReg = MCRegister::NoRegister; - bool SawStore = true; - if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() || - MI->isInlineAsm()) - return false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (MO.isReg()) { - if (MO.isDef()) { - if (i == 0 && !MO.isImplicit() && !MO.isDead()) - DefedReg = MO.getReg(); - else - return false; - } else if (MO.getReg() && MO.getReg() != FrameReg) - return false; - } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() || - MO.isGlobal() || MO.isSymbol())) - return false; - } - return DefedReg.isValid(); -} - -bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { - bool Changed = false; - - Reg2DefMap &MBBDefs = RegDefs[MBB->getNumber()]; - - // Find reusable definitions in the predecessor(s). - if (!MBB->pred_empty()) { - MachineBasicBlock *FirstPred = *MBB->pred_begin(); - for (auto [Reg, DefMI] : RegDefs[FirstPred->getNumber()]) - if (llvm::all_of( - drop_begin(MBB->predecessors()), - [&, &Reg = Reg, &DefMI = DefMI](const MachineBasicBlock *Pred) { - auto PredDefI = RegDefs[Pred->getNumber()].find(Reg); - return PredDefI != RegDefs[Pred->getNumber()].end() && - DefMI->isIdenticalTo(*PredDefI->second); - })) { - MBBDefs[Reg] = DefMI; - LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in " - << printMBBReference(*MBB) << ": " << *DefMI;); - } - } - - // Process MBB. - MachineFunction *MF = MBB->getParent(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - Register FrameReg = TRI->getFrameRegister(*MF); - for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { - // If FrameReg is modified, no previous load-address instructions are valid. - if (MI.modifiesRegister(FrameReg, TRI)) { - MBBDefs.clear(); - continue; - } - - Register DefedReg; - bool IsCandidate = isCandidate(&MI, DefedReg, FrameReg); - - // Check for an earlier identical and reusable instruction. - if (IsCandidate) { - auto DefI = MBBDefs.find(DefedReg); - if (DefI != MBBDefs.end() && MI.isIdenticalTo(*DefI->second)) { - LLVM_DEBUG(dbgs() << "Removing redundant instruction in " - << printMBBReference(*MBB) << ": " << MI;); - removeRedundantDef(&MI, TRI); - Changed = true; - continue; - } - } - - // Clear any entries in map that MI clobbers. - for (auto DefI = MBBDefs.begin(); DefI != MBBDefs.end();) { - Register Reg = DefI->first; - if (MI.modifiesRegister(Reg, TRI)) - DefI = MBBDefs.erase(DefI); - else - ++DefI; - } - - // Record this MI for potential later reuse. - if (IsCandidate) { - LLVM_DEBUG(dbgs() << "Found interesting instruction in " - << printMBBReference(*MBB) << ": " << MI;); - MBBDefs[DefedReg] = &MI; - } - } - - return Changed; -} diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index b95d5790e10b..59e714c4f280 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1522,9 +1522,6 @@ void TargetPassConfig::addOptimizedRegAlloc() { /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { - // Cleanup of redundant immediate/address loads. - addPass(&MachineLateInstrsCleanupID); - // Branch folding must be run after regalloc and prolog/epilog insertion. addPass(&BranchFolderPassID); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 259ecbfacb21..56c605d997d4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -291,7 +291,6 @@ void NVPTXPassConfig::addIRPasses() { // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). disablePass(&PrologEpilogCodeInserterID); - disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&TailDuplicateID); disablePass(&StackMapLivenessID); diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 43e8c7f551ea..dff8ccccc3a6 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -286,10 +286,6 @@ void RISCVPassConfig::addPreRegAlloc() { void RISCVPassConfig::addPostRegAlloc() { if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination) addPass(createRISCVRedundantCopyEliminationPass()); - - // Temporarily disabled until post-RA pseudo expansion problem is fixed, - // see D123394 and D139169. - disablePass(&MachineLateInstrsCleanupID); } yaml::MachineFunctionInfo * diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 497ba8d0b2f7..ffc3de244ede 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -501,7 +501,6 @@ void WebAssemblyPassConfig::addPostRegAlloc() { // them. // These functions all require the NoVRegs property. - disablePass(&MachineLateInstrsCleanupID); disablePass(&MachineCopyPropagationID); disablePass(&PostRAMachineSinkingID); disablePass(&PostRASchedulerID); diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 15b906d1eb39..90cf49e8ed8f 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -188,7 +188,6 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization -; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication diff --git a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll index b71e9e2de7c9..bb3397efbefa 100644 --- a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll @@ -29,8 +29,14 @@ define i32 @test_stack_guard_remat2() ssp { ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr x9, [x9] ; CHECK-NEXT: str x8, [sp] +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE ; CHECK-NEXT: stur x9, [x29, #-8] +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF] ; CHECK-NEXT: ldur x9, [x29, #-8] +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: ldr x8, [x8] ; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.ne LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %entry @@ -40,6 +46,7 @@ define i32 @test_stack_guard_remat2() ssp { ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_2: ; %entry ; CHECK-NEXT: bl ___stack_chk_fail +; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh6, Lloh7, Lloh8 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh1, Lloh3, Lloh5 ; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh0, Lloh2, Lloh4 entry: diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index 34d4612fb925..554f9b986b23 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -59,23 +59,26 @@ define float @foo2(double* %x0, double* %x1) nounwind { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w1, #1 ; CHECK-NEXT: mov w2, #2 +; CHECK-NEXT: st1d { z16.d }, p0, [x9] +; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w3, #3 ; CHECK-NEXT: mov w4, #4 ; CHECK-NEXT: mov w5, #5 ; CHECK-NEXT: mov w6, #6 -; CHECK-NEXT: mov w7, #7 -; CHECK-NEXT: add x9, sp, #16 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: st1d { z16.d }, p0, [x9] ; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl] +; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: mov w7, #7 ; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl] +; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: st1d { z19.d }, p0, [x9, #3, mul vl] ; CHECK-NEXT: str x8, [sp] ; CHECK-NEXT: bl callee2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 7ed11c7abbb6..37592a7f99ee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -157,6 +157,8 @@ define amdgpu_kernel void @kernel_caller_byval() { ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 ; FLATSCR-NEXT: s_mov_b32 s11, 0 ; FLATSCR-NEXT: s_mov_b32 s10, 0 ; FLATSCR-NEXT: s_mov_b32 s9, 0 @@ -169,8 +171,9 @@ define amdgpu_kernel void @kernel_caller_byval() { ; FLATSCR-NEXT: s_mov_b32 s4, 0 ; FLATSCR-NEXT: s_mov_b32 s3, 0 ; FLATSCR-NEXT: s_mov_b32 s2, 0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16 +; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_mov_b32 s40, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s11 offset:24 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:32 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s9 offset:40 @@ -185,7 +188,6 @@ define amdgpu_kernel void @kernel_caller_byval() { ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 offset:112 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:120 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:128 -; FLATSCR-NEXT: s_mov_b32 s40, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s40 offset:8 ; FLATSCR-NEXT: s_mov_b32 s39, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s39 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 5f71713f4a6c..d8705bffe7e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1354,6 +1354,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7-NEXT: s_cbranch_execz .LBB13_2 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index a3e22a3bc7db..83647a04467f 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -537,6 +537,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ;;#ASMEND +; GFX803-NEXT: s_mov_b32 s4, 0x40000 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -553,6 +554,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x40000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -567,6 +569,8 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill +; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload @@ -581,6 +585,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill +; GFX1100-NEXT: s_movk_i32 s0, 0x1000 ; GFX1100-NEXT: ;;#ASMSTART ; GFX1100-NEXT: ;;#ASMEND ; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 3f50130cce48..b4c00f331eed 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -76,10 +76,12 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], -1 ; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: ; implicit-def: $sgpr16_sgpr17 ; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 06ee6b4998ea..c0590ce38f28 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -22,12 +22,18 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 vcc_lo, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_kernel: @@ -37,6 +43,7 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -48,12 +55,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_kernel: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -64,6 +74,9 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -87,10 +100,16 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_kernel: @@ -105,6 +124,11 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_kernel: @@ -129,10 +153,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_mov_b32 s2, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:52 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:36 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v4 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_kernel: @@ -147,6 +176,7 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 @@ -158,12 +188,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v4 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_kernel: ; GFX11-PAL: ; %bb.0: ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -174,11 +207,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -198,6 +235,11 @@ define void @zero_init_foo() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -206,6 +248,7 @@ define void @zero_init_foo() { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s32 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -217,6 +260,9 @@ define void @zero_init_foo() { ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -225,7 +271,7 @@ define void @zero_init_foo() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s32 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -236,6 +282,9 @@ define void @zero_init_foo() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -254,6 +303,11 @@ define void @zero_init_foo() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -270,6 +324,11 @@ define void @zero_init_foo() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -278,6 +337,7 @@ define void @zero_init_foo() { ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -289,6 +349,9 @@ define void @zero_init_foo() { ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v4 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -297,7 +360,7 @@ define void @zero_init_foo() { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_mov_b32_e32 v4, s32 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -308,26 +371,15 @@ define void @zero_init_foo() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: zero_init_foo: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -348,6 +400,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_kernel: @@ -368,6 +424,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_kernel: @@ -384,6 +444,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: @@ -406,6 +470,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_kernel: @@ -422,6 +490,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_add_i32 s0, s0, 4 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: @@ -447,6 +519,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_kernel: @@ -463,22 +539,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_sindex_kernel: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 15 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s1, s0, 2 -; GCN-NEXT: s_and_b32 s0, s0, 15 -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: s_add_u32 s1, 4, s1 -; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_add_u32 s0, 4, s0 -; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -489,6 +554,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -507,6 +573,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_foo: @@ -525,6 +595,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_foo: @@ -539,6 +613,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_foo: @@ -560,6 +638,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_foo: @@ -574,6 +656,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX940-NEXT: s_add_i32 s0, s0, 4 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_sindex_foo: @@ -597,6 +683,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_foo: @@ -611,20 +701,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_sindex_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_lshl_b32 s1, s0, 2 -; GCN-NEXT: s_and_b32 s0, s0, 15 -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: s_add_u32 s1, 4, s1 -; GCN-NEXT: v_mov_b32_e32 v0, 15 -; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_add_u32 s0, 4, s0 -; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -635,6 +716,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -651,6 +733,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_kernel: @@ -667,6 +753,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_kernel: @@ -678,6 +768,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: @@ -697,6 +791,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_kernel: @@ -708,6 +806,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: @@ -729,6 +831,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_kernel: @@ -740,17 +846,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_vindex_kernel: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 15 -; GCN-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_u32_e32 v0, 4, v0 -; GCN-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -763,6 +863,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -779,6 +880,9 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_foo: @@ -793,6 +897,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_foo: @@ -807,6 +915,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s32 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_foo: @@ -821,6 +933,9 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_foo: @@ -834,6 +949,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_foo: @@ -848,6 +967,10 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_foo: @@ -862,19 +985,11 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_vindex_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, v0, v2 -; GCN-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -885,6 +1000,7 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -948,13 +1064,6 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: private_ptr_foo: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0x41200000 -; GCN-NEXT: scratch_store_dword v0, v1, off offset:4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 store float 1.000000e+01, float addrspace(5)* %gep, align 4 ret void @@ -977,12 +1086,22 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 vcc_lo, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_small_offset_kernel: @@ -994,6 +1113,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -1001,10 +1121,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_small_offset_kernel: @@ -1012,7 +1139,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 @@ -1023,6 +1150,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1049,10 +1182,20 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_small_offset_kernel: @@ -1069,6 +1212,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: @@ -1095,11 +1247,20 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 ; GFX1010-PAL-NEXT: s_mov_b32 s2, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 +; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:260 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:276 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:308 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v4 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v5 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: @@ -1116,6 +1277,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 @@ -1123,10 +1285,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x104 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v4 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v5 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_small_offset_kernel: @@ -1134,7 +1303,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 @@ -1145,6 +1314,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1153,6 +1328,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -1174,6 +1351,15 @@ define void @zero_init_small_offset_foo() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX9-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1184,6 +1370,7 @@ define void @zero_init_small_offset_foo() { ; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -1191,10 +1378,18 @@ define void @zero_init_small_offset_foo() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s32 +; GFX10-NEXT: v_mov_b32_e32 v5, vcc_lo ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1205,17 +1400,24 @@ define void @zero_init_small_offset_foo() { ; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1236,6 +1438,15 @@ define void @zero_init_small_offset_foo() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1254,6 +1465,16 @@ define void @zero_init_small_offset_foo() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -1264,6 +1485,7 @@ define void @zero_init_small_offset_foo() { ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -1271,10 +1493,18 @@ define void @zero_init_small_offset_foo() { ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32 +; GFX10-PAL-NEXT: v_mov_b32_e32 v5, vcc_lo ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v4 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v5 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -1285,42 +1515,34 @@ define void @zero_init_small_offset_foo() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: zero_init_small_offset_foo: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 -; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -1343,6 +1565,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x104 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: @@ -1355,6 +1585,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 @@ -1365,6 +1596,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: @@ -1372,7 +1610,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -1383,6 +1621,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1408,6 +1653,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: @@ -1426,6 +1679,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x104 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1441,6 +1702,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1454,6 +1716,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1471,6 +1740,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -1481,6 +1751,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: @@ -1488,7 +1765,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -1499,6 +1776,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1513,6 +1797,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1534,6 +1820,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x104 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_small_offset_foo: @@ -1554,13 +1848,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_small_offset_foo: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 @@ -1570,6 +1872,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1594,6 +1903,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_small_offset_foo: @@ -1610,6 +1927,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x104 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1636,6 +1961,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: @@ -1661,13 +1994,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 @@ -1677,6 +2018,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1691,6 +2039,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1710,6 +2060,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: @@ -1728,6 +2086,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: @@ -1738,8 +2104,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1762,6 +2136,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_small_offset_kernel: @@ -1775,6 +2157,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0x104 +; GFX940-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1799,6 +2189,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1822,6 +2220,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: @@ -1832,8 +2238,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) @@ -1850,6 +2264,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -1869,6 +2285,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_small_offset_foo: @@ -1876,17 +2299,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-NEXT: s_add_i32 s1, s32, 0x100 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s0 ; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_small_offset_foo: @@ -1894,6 +2326,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1902,6 +2335,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -1919,6 +2359,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_small_offset_foo: @@ -1934,6 +2381,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s32 +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -1941,17 +2397,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x100 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x100 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s0 ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v1 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo: @@ -1959,6 +2424,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1967,21 +2433,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_vindex_small_offset_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, v0, v2 -; GCN-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -1995,6 +2454,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2022,6 +2483,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_large_offset_kernel: @@ -2044,10 +2514,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX10-NEXT: s_movk_i32 s2, 0x4004 ; GFX10-NEXT: s_movk_i32 s1, 0x4004 ; GFX10-NEXT: s_movk_i32 s0, 0x4004 +; GFX10-NEXT: v_mov_b32_e32 v4, 4 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: zero_init_large_offset_kernel: @@ -2064,11 +2542,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11-NEXT: s_movk_i32 s2, 0x4004 ; GFX11-NEXT: s_movk_i32 s1, 0x4004 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2100,6 +2585,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-PAL-NEXT: s_nop 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: zero_init_large_offset_kernel: @@ -2120,6 +2614,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: @@ -2149,9 +2652,17 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x4004 ; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v4 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v5 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: @@ -2179,10 +2690,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX1030-PAL-NEXT: s_movk_i32 s2, 0x4004 ; GFX1030-PAL-NEXT: s_movk_i32 s1, 0x4004 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x4004 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x4004 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v4 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v5 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: zero_init_large_offset_kernel: @@ -2199,11 +2718,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX11-PAL-NEXT: s_movk_i32 s2, 0x4004 ; GFX11-PAL-NEXT: s_movk_i32 s1, 0x4004 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PAL-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2212,6 +2738,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -2229,14 +2757,24 @@ define void @zero_init_large_offset_foo() { ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_add_i32 s3, s32, 0x4004 +; GFX9-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX9-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s3 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2247,6 +2785,7 @@ define void @zero_init_large_offset_foo() { ; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_add_i32 s4, s32, 0x4004 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_mov_b32 s3, s0 @@ -2254,14 +2793,23 @@ define void @zero_init_large_offset_foo() { ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_add_i32 s3, s32, 4 ; GFX10-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX10-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v4 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v5 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2272,21 +2820,29 @@ define void @zero_init_large_offset_foo() { ; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s4, s32, 4 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_add_i32 s3, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v5 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2303,14 +2859,24 @@ define void @zero_init_large_offset_foo() { ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-PAL-NEXT: s_add_i32 s3, s32, 0x4004 +; GFX9-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s3 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2325,14 +2891,25 @@ define void @zero_init_large_offset_foo() { ; GFX940-NEXT: s_mov_b32 s3, s0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NEXT: s_add_i32 s3, s32, 0x4004 +; GFX940-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX940-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s3 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32 +; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 ; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 -; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -2343,6 +2920,7 @@ define void @zero_init_large_offset_foo() { ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_mov_b32 s0, 0 +; GFX10-PAL-NEXT: s_add_i32 s4, s32, 0x4004 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0 @@ -2350,14 +2928,23 @@ define void @zero_init_large_offset_foo() { ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-PAL-NEXT: s_add_i32 s3, s32, 4 ; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 +; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s4 +; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-PAL-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v4 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v5 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; @@ -2368,21 +2955,29 @@ define void @zero_init_large_offset_foo() { ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_mov_b32 s0, 0 -; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_add_i32 s4, s32, 4 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-PAL-NEXT: s_add_i32 s3, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s2, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v4 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v5 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2391,6 +2986,8 @@ define void @zero_init_large_offset_foo() { %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0 ret void } @@ -2413,6 +3010,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: @@ -2425,6 +3030,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 @@ -2435,6 +3041,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: @@ -2442,7 +3055,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -2453,6 +3066,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2478,6 +3098,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: @@ -2496,6 +3124,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2511,6 +3147,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2524,6 +3161,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2541,6 +3185,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -2551,6 +3196,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: @@ -2558,7 +3210,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -2569,6 +3221,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2583,6 +3242,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2604,6 +3265,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_foo: @@ -2624,13 +3293,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_sindex_large_offset_foo: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 @@ -2640,6 +3317,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -2664,6 +3348,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_sindex_large_offset_foo: @@ -2680,6 +3372,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX940-NEXT: s_addk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -2706,6 +3406,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: @@ -2731,13 +3439,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 @@ -2747,6 +3463,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2761,6 +3484,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2780,6 +3505,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: @@ -2798,6 +3531,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: @@ -2809,8 +3550,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2833,6 +3582,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vindex_large_offset_kernel: @@ -2847,6 +3604,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004 +; GFX940-NEXT: v_mov_b32_e32 v1, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2871,6 +3636,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v1 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2894,6 +3667,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v1 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: @@ -2905,8 +3686,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) @@ -2923,6 +3712,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -2932,8 +3723,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_lo ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 @@ -2942,6 +3733,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_vindex_large_offset_foo: @@ -2949,17 +3748,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX10-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s1 ; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_add_i32 s0, s32, 4 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_vindex_large_offset_foo: @@ -2967,17 +3776,25 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX11-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX11-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX11-NEXT: s_add_i32 s0, s32, 4 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, s2 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v1, s1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -2985,8 +3802,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 @@ -2995,6 +3812,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_vindex_large_offset_foo: @@ -3004,14 +3829,24 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 -; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX940-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1 +; GFX940-NEXT: scratch_store_dword v1, v2, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 -; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 +; GFX940-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3019,17 +3854,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 +; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s1 ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_add_i32 s0, s32, 4 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v1 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo: @@ -3037,34 +3882,26 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004 +; GFX11-PAL-NEXT: s_add_i32 s2, s32, 0x4004 +; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004 +; GFX11-PAL-NEXT: s_add_i32 s0, s32, 4 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s2 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s1 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_vindex_large_offset_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, v0, v2 -; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GCN-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GCN-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -3078,6 +3915,8 @@ bb: %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0 + call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0 ret void } @@ -3097,6 +3936,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: @@ -3115,6 +3958,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: @@ -3127,6 +3974,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3149,6 +4000,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_large_imm_offset_kernel: @@ -3162,6 +4017,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3186,6 +4045,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use v0 +; GFX1010-PAL-NEXT: ;;#ASMEND ; GFX1010-PAL-NEXT: s_endpgm ; ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3209,6 +4072,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use v0 +; GFX1030-PAL-NEXT: ;;#ASMEND ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: @@ -3221,6 +4088,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) @@ -3230,6 +4101,7 @@ bb: store volatile i32 15, i32 addrspace(5)* %i7, align 4 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0 ret void } @@ -3239,15 +4111,20 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi +; GFX9-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: store_load_large_imm_offset_foo: @@ -3257,14 +4134,19 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX10-NEXT: s_add_i32 s1, s32, 4 +; GFX10-NEXT: s_add_i32 s0, s0, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: store_load_large_imm_offset_foo: @@ -3273,12 +4155,17 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: @@ -3286,15 +4173,20 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi +; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_lo ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: store_load_large_imm_offset_foo: @@ -3309,6 +4201,11 @@ define void @store_load_large_imm_offset_foo() { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: @@ -3318,14 +4215,19 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo +; GFX10-PAL-NEXT: s_add_i32 s1, s32, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo: @@ -3334,26 +4236,18 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_large_imm_offset_foo: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 13 -; GCN-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 0x3000 -; GCN-NEXT: v_mov_b32_e32 v1, 15 -; GCN-NEXT: scratch_store_dword v0, v1, s32 offset:3712 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:3712 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -3362,6 +4256,7 @@ bb: store volatile i32 15, i32 addrspace(5)* %i7, align 4 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0 ret void } @@ -3372,14 +4267,17 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-NEXT: scratch_store_dword v0, v2, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v1 +; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_vidx_sidx_offset: @@ -3397,6 +4295,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vidx_sidx_offset: @@ -3409,6 +4311,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: @@ -3418,17 +4324,20 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-PAL-NEXT: scratch_store_dword v0, v2, off offset:1024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: ;;#ASMSTART +; GFX9-PAL-NEXT: ; use v1 +; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; ; GFX940-LABEL: store_load_vidx_sidx_offset: @@ -3441,6 +4350,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: @@ -3463,6 +4376,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-PAL-NEXT: ;;#ASMSTART +; GFX10-PAL-NEXT: ; use v0 +; GFX10-PAL-NEXT: ;;#ASMEND ; GFX10-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: @@ -3475,18 +4392,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4 +; GFX11-PAL-NEXT: ;;#ASMSTART +; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm -; GCN-LABEL: store_load_vidx_sidx_offset: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 15 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_lshl_u32 v0, s0, v0, 2 -; GCN-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -3495,6 +4405,7 @@ bb: %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 store volatile i32 15, i32 addrspace(5)* %gep, align 4 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 + call void asm sideeffect "; use $0", "s"([32 x i32] addrspace(5)* %alloca) #0 ret void } @@ -3577,16 +4488,6 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_i64_aligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 8 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 @@ -3672,16 +4573,6 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_i64_unaligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 1 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 @@ -3774,17 +4665,6 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) ; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_v3i32_unaligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, 2 -; GCN-NEXT: v_mov_b32_e32 v4, 3 -; GCN-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , <3 x i32> addrspace(5)* %arg, align 1 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 @@ -3882,18 +4762,6 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; GCN-LABEL: store_load_v4i32_unaligned: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, 2 -; GCN-NEXT: v_mov_b32_e32 v4, 3 -; GCN-NEXT: v_mov_b32_e32 v5, 4 -; GCN-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , <4 x i32> addrspace(5)* %arg, align 1 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 33d1bd6b09a7..8ceb5ec1ed8a 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -374,7 +374,6 @@ ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Shrink Wrapping analysis ; GCN-O1-NEXT: Prologue/Epilogue Insertion & Frame Finalization -; GCN-O1-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-NEXT: Control Flow Optimizer ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Tail Duplication @@ -671,7 +670,6 @@ ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Shrink Wrapping analysis ; GCN-O1-OPTS-NEXT: Prologue/Epilogue Insertion & Frame Finalization -; GCN-O1-OPTS-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O1-OPTS-NEXT: Control Flow Optimizer ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Tail Duplication @@ -970,7 +968,6 @@ ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Shrink Wrapping analysis ; GCN-O2-NEXT: Prologue/Epilogue Insertion & Frame Finalization -; GCN-O2-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O2-NEXT: Control Flow Optimizer ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Tail Duplication @@ -1282,7 +1279,6 @@ ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Shrink Wrapping analysis ; GCN-O3-NEXT: Prologue/Epilogue Insertion & Frame Finalization -; GCN-O3-NEXT: Machine Late Instructions Cleanup Pass ; GCN-O3-NEXT: Control Flow Optimizer ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Tail Duplication diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 463aacd8e28e..d125f4304c91 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -188,6 +188,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; GCN-NEXT: ; %bb.3: ; %LeafBlock1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_mov_b64 s[4:5], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-NEXT: ; %bb.4: ; %case1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index fb070e830491..50a8d7815b93 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -187,6 +187,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; SI-NEXT: s_branch .LBB3_3 ; SI-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 s[14:15], -1 ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; SI-NEXT: s_and_b64 vcc, exec, s[14:15] @@ -204,6 +206,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; SI-NEXT: s_cbranch_vccz .LBB3_1 ; SI-NEXT: ; %bb.5: ; %if.end ; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; SI-NEXT: s_mov_b64 s[14:15], -1 ; SI-NEXT: s_mov_b64 vcc, s[6:7] ; SI-NEXT: s_cbranch_vccz .LBB3_7 ; SI-NEXT: ; %bb.6: ; %if.else @@ -260,6 +263,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; FLAT-NEXT: s_branch .LBB3_3 ; FLAT-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_mov_b64 s[8:9], 0 +; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 s[14:15], -1 ; FLAT-NEXT: .LBB3_2: ; %Flow ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15] @@ -277,6 +282,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; FLAT-NEXT: s_cbranch_vccz .LBB3_1 ; FLAT-NEXT: ; %bb.5: ; %if.end ; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1 +; FLAT-NEXT: s_mov_b64 s[14:15], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[6:7] ; FLAT-NEXT: s_cbranch_vccz .LBB3_7 ; FLAT-NEXT: ; %bb.6: ; %if.else diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index d94fed4d1294..9af9894110c0 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -60,6 +60,7 @@ define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n) ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 ; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: .LBB0_4: ; %Flow3 ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec @@ -102,6 +103,7 @@ define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n) ; CHECK-NEXT: s_branch .LBB0_10 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 entry: diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll index 56d7fc335911..c3d3993a2736 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll @@ -140,6 +140,7 @@ define void @my_func(i32 %0) { ; GCN-NEXT: s_cbranch_scc1 .LBB0_10 ; GCN-NEXT: ; %bb.9: ; GCN-NEXT: s_mov_b64 s[6:7], -1 +; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] ; GCN-NEXT: s_cbranch_execnz .LBB0_3 ; GCN-NEXT: s_branch .LBB0_4 @@ -172,6 +173,7 @@ define void @my_func(i32 %0) { ; GCN-NEXT: ; %bb.15: ; %LeafBlock9 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 ; GCN-NEXT: s_mov_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc ; GCN-NEXT: ; %bb.16: ; %do.body.i.i.i.i ; GCN-NEXT: s_mov_b64 s[4:5], exec diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index b9b7a5d0f9a2..34bc7523051f 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -34,6 +34,7 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_movk_i32 s0, 0xffc ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 @@ -70,6 +71,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() { ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_mov_b32 s4, 0x40000 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 @@ -88,6 +90,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() { ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 @@ -234,6 +237,7 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_movk_i32 s8, 0x1004 ; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART @@ -316,6 +320,7 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_movk_i32 s0, 0xff8 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART @@ -362,6 +367,7 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_nop 0 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload @@ -385,6 +391,7 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_movk_i32 s0, 0xffc ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index ab8efa9f21a0..bdeb97cede4c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10551,6 +10551,7 @@ define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b32 s2, 0x84800 ; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10795,7 +10796,7 @@ define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_nop 0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload @@ -11031,6 +11032,7 @@ define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v60 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND +; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v65 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v66 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v67 diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index 411335a98c75..253e17cdd303 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -149,7 +149,6 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization -; CHECK-NEXT: Machine Late Instructions Cleanup Pass ; CHECK-NEXT: Control Flow Optimizer ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll index fa9481250953..b5c63af5a348 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -1652,6 +1652,7 @@ define void @infiniteloop3() "frame-pointer"="all" { ; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-ENABLE-NEXT: @ %bb.1: @ %loop2a.preheader +; THUMB-ENABLE-NEXT: movs r0, #0 ; THUMB-ENABLE-NEXT: movs r1, #0 ; THUMB-ENABLE-NEXT: mov r2, r0 ; THUMB-ENABLE-NEXT: b LBB11_3 @@ -1678,6 +1679,7 @@ define void @infiniteloop3() "frame-pointer"="all" { ; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: cbnz r0, LBB11_5 ; THUMB-DISABLE-NEXT: @ %bb.1: @ %loop2a.preheader +; THUMB-DISABLE-NEXT: movs r0, #0 ; THUMB-DISABLE-NEXT: movs r1, #0 ; THUMB-DISABLE-NEXT: mov r2, r0 ; THUMB-DISABLE-NEXT: b LBB11_3 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll index 18fa1ad2f132..48241424ac6e 100644 --- a/llvm/test/CodeGen/ARM/fpclamptosat.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll @@ -3764,6 +3764,7 @@ define i64 @stest_f32i64_mm(float %x) { ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB48_19: @ %entry +; SOFT-NEXT: ldr r0, .LCPI48_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB48_21 @@ -4346,6 +4347,7 @@ define i64 @stest_f16i64_mm(half %x) { ; SOFT-NEXT: @ %bb.18: @ %entry ; SOFT-NEXT: mov r3, r6 ; SOFT-NEXT: .LBB51_19: @ %entry +; SOFT-NEXT: ldr r0, .LCPI51_0 ; SOFT-NEXT: cmp r4, r0 ; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; SOFT-NEXT: beq .LBB51_21 diff --git a/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll index 6e5db3ffa5c2..af2009c7a252 100644 --- a/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ b/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -22,7 +22,7 @@ entry: ; for.body -> for.cond.backedge (100%) ; -> cond.false.i (0%) ; CHECK: bb.1.for.body: -; CHECK: successors: %bb.2(0x80000000), %bb.5(0x00000000) +; CHECK: successors: %bb.2(0x80000000), %bb.4(0x00000000) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 diff --git a/llvm/test/CodeGen/ARM/jump-table-islands.ll b/llvm/test/CodeGen/ARM/jump-table-islands.ll index c327affc0453..755ca30199ad 100644 --- a/llvm/test/CodeGen/ARM/jump-table-islands.ll +++ b/llvm/test/CodeGen/ARM/jump-table-islands.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=armv7-apple-ios8.0 -o - %s | FileCheck %s -%BigInt = type i8500 +%BigInt = type i5500 define %BigInt @test_moved_jumptable(i1 %tst, i32 %sw, %BigInt %l) { ; CHECK-LABEL: test_moved_jumptable: diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll b/llvm/test/CodeGen/ARM/reg_sequence.ll index db620f65855c..976dddc694d8 100644 --- a/llvm/test/CodeGen/ARM/reg_sequence.ll +++ b/llvm/test/CodeGen/ARM/reg_sequence.ll @@ -283,6 +283,7 @@ define arm_aapcs_vfpcc i32 @t10(float %x) nounwind { ; CHECK-NEXT: vst1.32 {d17[1]}, [r0:32] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: movne r0, #0 ; CHECK-NEXT: bxne lr ; CHECK-NEXT: LBB9_1: ; CHECK-NEXT: trap diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll index a1f8fdb28b20..a87c51770523 100644 --- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -14,8 +14,9 @@ define i32 @test(i32, i32) local_unnamed_addr #0 { ;