[AMDGPU] GFX11: automatically release VGPRs at the end of the shader

GFX11 has a new message type MSG_DEALLOC_VGPRS which can be used to
release a shader's VGPRs. Sending this at the end of a shader (just
before the s_endpgm) can help overall system performance in cases where
the s_endpgm would have to wait for outstanding VMEM stores to complete
before releasing the VGPRs.

Differential Revision: https://reviews.llvm.org/D128442
This commit is contained in:
Jay Foad 2022-06-30 16:31:49 +01:00
parent cc88445a91
commit 0f94d2b385
54 changed files with 3789 additions and 774 deletions

View File

@ -299,6 +299,9 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
void initializeAMDGPUReleaseVGPRsPass(PassRegistry &);
extern char &AMDGPUReleaseVGPRsID;
void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;

View File

@ -0,0 +1,140 @@
//===- AMDGPUReleaseVGPRs.cpp - Automatically release vgprs on GFX11+ -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert S_SENDMSG instructions to release vgprs on GFX11+.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;
#define DEBUG_TYPE "release-vgprs"
namespace {
class AMDGPUReleaseVGPRs : public MachineFunctionPass {
public:
static char ID;
const SIInstrInfo *SII;
const SIRegisterInfo *TRI;
AMDGPUReleaseVGPRs() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
// Used to cache the result of isLastInstructionVMEMStore for each block
using BlockVMEMStoreType = DenseMap<MachineBasicBlock *, bool>;
BlockVMEMStoreType BlockVMEMStore;
// Return true if the last instruction referencing a vgpr in this MBB
// is a VMEM store, otherwise return false.
// Visit previous basic blocks to find this last instruction if needed.
// Because this pass is late in the pipeline, it is expected that the
// last vgpr use will likely be one of vmem store, ds, exp.
// Loads and others vgpr operations would have been
// deleted by this point, except for complex control flow involving loops.
// This is why we are just testing the type of instructions rather
// than the operands.
bool isLastVGPRUseVMEMStore(MachineBasicBlock &MBB) {
// Use the cache to break infinite loop and save some time. Initialize to
// false in case we have a cycle.
BlockVMEMStoreType::iterator It;
bool Inserted;
std::tie(It, Inserted) = BlockVMEMStore.insert({&MBB, false});
bool &CacheEntry = It->second;
if (!Inserted)
return CacheEntry;
for (auto &MI : reverse(MBB.instrs())) {
// If it's a VMEM store, a vgpr will be used, return true.
if ((SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) && MI.mayStore())
return CacheEntry = true;
// If it's referencing a VGPR but is not a VMEM store, return false.
if (SIInstrInfo::isDS(MI) || SIInstrInfo::isEXP(MI) ||
SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI) ||
SIInstrInfo::isVALU(MI))
return CacheEntry = false;
}
// Recursive call into parent blocks. Look into predecessors if there is no
// vgpr used in this block.
return CacheEntry = llvm::any_of(MBB.predecessors(),
[this](MachineBasicBlock *Parent) {
return isLastVGPRUseVMEMStore(*Parent);
});
}
bool runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
for (auto &MI : MBB.terminators()) {
// Look for S_ENDPGM instructions
if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
// If the last instruction using a VGPR in the block is a VMEM store,
// release VGPRs. The VGPRs release will be placed just before ending
// the program
if (isLastVGPRUseVMEMStore(MBB)) {
BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_SENDMSG))
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
Changed = true;
}
}
}
return Changed;
}
bool runOnMachineFunction(MachineFunction &MF) override {
Function &F = MF.getFunction();
if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
return false;
// This pass only runs on GFX11+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
return false;
LLVM_DEBUG(dbgs() << "AMDGPUReleaseVGPRs running on " << MF.getName()
<< "\n");
SII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
bool Changed = false;
for (auto &MBB : MF) {
Changed |= runOnMachineBasicBlock(MBB);
}
BlockVMEMStore.clear();
return Changed;
}
};
} // namespace
char AMDGPUReleaseVGPRs::ID = 0;
char &llvm::AMDGPUReleaseVGPRsID = AMDGPUReleaseVGPRs::ID;
INITIALIZE_PASS(AMDGPUReleaseVGPRs, DEBUG_TYPE, "Release VGPRs", false, false)

View File

@ -369,6 +369,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeAMDGPUReleaseVGPRsPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
@ -1421,6 +1422,9 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);
if (getOptLevel() > CodeGenOpt::Less)
addPass(&AMDGPUReleaseVGPRsID);
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
addPass(&AMDGPUInsertDelayAluID);

View File

@ -88,6 +88,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUPromoteKernelArguments.cpp
AMDGPURegBankCombiner.cpp
AMDGPURegisterBankInfo.cpp
AMDGPUReleaseVGPRs.cpp
AMDGPUReplaceLDSUseWithPointer.cpp
AMDGPUResourceUsageAnalysis.cpp
AMDGPURewriteOutArguments.cpp

View File

@ -2,7 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %data) {
; GFX6-LABEL: image_store_f32:
@ -43,6 +43,20 @@ define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, fl
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.f32.i32(float %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -86,6 +100,20 @@ define amdgpu_ps void @image_store_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v2f32.i32(<2 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -129,6 +157,20 @@ define amdgpu_ps void @image_store_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v3f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v3f32.i32(<3 x float> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -172,6 +214,20 @@ define amdgpu_ps void @image_store_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -215,6 +271,20 @@ define amdgpu_ps void @image_store_v4f32_dmask_0001(<8 x i32> inreg %rsrc, i32 %
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v4f32_dmask_0001:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -258,6 +328,20 @@ define amdgpu_ps void @image_store_v4f32_dmask_0010(<8 x i32> inreg %rsrc, i32 %
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v4f32_dmask_0010:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 2, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -301,6 +385,20 @@ define amdgpu_ps void @image_store_v4f32_dmask_0100(<8 x i32> inreg %rsrc, i32 %
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v4f32_dmask_0100:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 4, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -344,6 +442,20 @@ define amdgpu_ps void @image_store_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v4f32_dmask_1000:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 8, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -387,6 +499,20 @@ define amdgpu_ps void @image_store_v4f32_dmask_0011(<8 x i32> inreg %rsrc, i32 %
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v4f32_dmask_0011:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -430,6 +556,20 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 %
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_v4f32_dmask_0110:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 6, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@ -479,6 +619,22 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_f32_dmask_1111:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, s10
; GFX11-NEXT: v_mov_b32_e32 v2, s11
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
tail call void @llvm.amdgcn.image.store.2d.f32.i32(float %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}

View File

@ -743,6 +743,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
@ -891,6 +892,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
@ -1009,6 +1011,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
@ -1149,6 +1152,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid

View File

@ -17,6 +17,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
@ -33,6 +34,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
@ -49,6 +51,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
@ -63,6 +66,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
@ -79,6 +83,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
@ -93,6 +98,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
@ -109,6 +115,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -123,6 +130,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -137,6 +145,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -151,6 +160,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -165,6 +175,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -179,6 +190,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -193,6 +205,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -207,6 +220,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -223,6 +237,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -237,6 +252,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -251,6 +267,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -265,6 +282,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -280,6 +298,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -294,6 +313,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -308,6 +328,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -322,6 +343,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)

View File

@ -15,6 +15,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
@ -29,6 +30,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
@ -43,6 +45,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
@ -55,6 +58,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
@ -69,6 +73,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
@ -81,6 +86,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
@ -95,6 +101,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -108,6 +115,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -120,6 +128,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -132,6 +141,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -144,6 +154,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -156,6 +167,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -168,6 +180,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -180,6 +193,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -194,6 +208,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -206,6 +221,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -218,6 +234,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -230,6 +247,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -242,6 +260,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -254,6 +273,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -266,6 +286,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -278,6 +299,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)

View File

@ -197,6 +197,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
@ -229,6 +230,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
@ -435,6 +437,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
@ -470,6 +473,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
@ -772,6 +776,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_varying:
@ -828,6 +833,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -1262,6 +1268,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_constant:
@ -1295,6 +1302,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
@ -1540,6 +1548,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_uniform:
@ -1580,6 +1589,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
@ -1654,6 +1664,7 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -1852,6 +1863,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_constant:
@ -1885,6 +1897,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
@ -2094,6 +2107,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_uniform:
@ -2130,6 +2144,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
@ -2432,6 +2447,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_varying:
@ -2488,6 +2504,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -2933,6 +2950,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_constant:
@ -2969,6 +2987,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
@ -3225,6 +3244,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_uniform:
@ -3267,6 +3287,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
@ -3341,6 +3362,7 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -3645,6 +3667,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: and_i32_varying:
@ -3701,6 +3724,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -4004,6 +4028,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: or_i32_varying:
@ -4060,6 +4085,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -4363,6 +4389,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: xor_i32_varying:
@ -4419,6 +4446,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -4722,6 +4750,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: max_i32_varying:
@ -4778,6 +4807,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -4993,6 +5023,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: max_i64_constant:
@ -5027,6 +5058,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
@ -5329,6 +5361,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: min_i32_varying:
@ -5385,6 +5418,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -5600,6 +5634,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: min_i64_constant:
@ -5634,6 +5669,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
@ -5936,6 +5972,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: umax_i32_varying:
@ -5992,6 +6029,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -6204,6 +6242,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: umax_i64_constant:
@ -6238,6 +6277,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
@ -6540,6 +6580,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: umin_i32_varying:
@ -6596,6 +6637,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
@ -6808,6 +6850,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: umin_i64_constant:
@ -6842,6 +6885,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel

View File

@ -58,6 +58,7 @@ define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -124,6 +125,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrs
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -191,6 +193,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float ad
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -269,6 +272,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -347,6 +351,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -431,6 +436,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, f
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -499,6 +505,7 @@ define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
@ -566,6 +573,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspa
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
@ -634,6 +642,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addr
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
@ -703,6 +712,7 @@ define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspa
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
@ -769,6 +779,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double add
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
@ -836,6 +847,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
@ -908,6 +920,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -972,6 +985,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1036,6 +1050,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1100,6 +1115,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1164,6 +1180,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1228,6 +1245,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1292,6 +1310,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1343,6 +1362,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out
; GFX11-NEXT: v_mov_b32_e32 v1, 1.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -1391,6 +1411,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %ou
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -1440,6 +1461,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %ou
; GFX11-NEXT: v_mov_b32_e32 v1, 0.5
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -1489,6 +1511,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(
; GFX11-NEXT: v_mov_b32_e32 v1, 0x7fffff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -1537,6 +1560,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -1585,6 +1609,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -1656,6 +1681,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, f
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1723,6 +1749,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out,
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1795,6 +1822,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1866,6 +1894,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspac
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1933,6 +1962,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -1997,6 +2027,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -2061,6 +2092,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -2125,6 +2157,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -2189,6 +2222,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -2253,6 +2287,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@ -2304,6 +2339,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspa
; GFX11-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -2353,6 +2389,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspa
; GFX11-NEXT: v_mov_b32_e32 v1, 0x7f800001
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@ -2423,6 +2460,7 @@ define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x hal
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -2508,6 +2546,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -2592,6 +2631,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out,
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -2675,6 +2715,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out,
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -2750,6 +2791,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -2829,6 +2871,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -2907,6 +2950,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -2984,6 +3028,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -3061,6 +3106,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out,
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -3146,6 +3192,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -3231,6 +3278,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
@ -3310,6 +3358,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, flo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
{
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0

View File

@ -123,6 +123,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noa
; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
@ -264,6 +265,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32
; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
@ -344,6 +346,7 @@ define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%x1 = add i32 %x, 1
@ -400,6 +403,7 @@ define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> in
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
@ -497,6 +501,7 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%s = sitofp i32 %x to float

View File

@ -58,6 +58,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff1_voff1:
@ -76,6 +77,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff1 = mul i32 %soff, 1
@ -145,6 +147,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff1_voff2:
@ -164,6 +167,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff1 = mul i32 %soff, 1
@ -233,6 +237,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff1_voff4:
@ -252,6 +257,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff1 = mul i32 %soff, 1
@ -322,6 +328,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff2_voff1:
@ -341,6 +348,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff2 = mul i32 %soff, 2
@ -414,6 +422,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff2_voff2:
@ -434,6 +443,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff2 = mul i32 %soff, 2
@ -507,6 +517,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff2_voff4:
@ -527,6 +538,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff2 = mul i32 %soff, 2
@ -598,6 +610,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff4_voff1:
@ -617,6 +630,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff4 = mul i32 %soff, 4
@ -691,6 +705,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff4_voff2:
@ -711,6 +726,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff4 = mul i32 %soff, 4
@ -784,6 +800,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff4_voff4:
@ -804,6 +821,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
bb:
%soff4 = mul i32 %soff, 4

View File

@ -67,6 +67,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:48
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:32
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
@ -183,6 +184,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:32
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:16
; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT: s_endpgm
%alloca = alloca [32 x i16], align 2, addrspace(5)
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
@ -1042,6 +1044,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:304
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:320
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
@ -1170,6 +1173,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:320
; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT: s_endpgm
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
@ -2102,6 +2106,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32
; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
@ -2240,6 +2245,7 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32
; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48
; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT: s_endpgm
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)

View File

@ -63,6 +63,7 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
@ -151,6 +152,7 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
@ -239,6 +241,7 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
@ -327,6 +330,7 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
@ -415,6 +419,7 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
@ -515,6 +520,7 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7:
@ -612,6 +618,7 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:

View File

@ -968,6 +968,7 @@
; GCN-O2-NEXT: SI Final Branch Preparation
; GCN-O2-NEXT: SI peephole optimizations
; GCN-O2-NEXT: Post RA hazard recognizer
; GCN-O2-NEXT: Release VGPRs
; GCN-O2-NEXT: AMDGPU Insert Delay ALU
; GCN-O2-NEXT: Branch relaxation pass
; GCN-O2-NEXT: Register Usage Information Collector Pass
@ -1271,6 +1272,7 @@
; GCN-O3-NEXT: SI Final Branch Preparation
; GCN-O3-NEXT: SI peephole optimizations
; GCN-O3-NEXT: Post RA hazard recognizer
; GCN-O3-NEXT: Release VGPRs
; GCN-O3-NEXT: AMDGPU Insert Delay ALU
; GCN-O3-NEXT: Branch relaxation pass
; GCN-O3-NEXT: Register Usage Information Collector Pass

View File

@ -31,6 +31,7 @@ define amdgpu_gs void @test_add_32_use(i32 %arg, i32 addrspace(1)* %out) {
; CHECK-NEXT: buffer_gl0_inv
; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b32 v[1:2], v3, off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
store i32 %res, i32 addrspace(1)* %out, align 4
@ -63,6 +64,7 @@ define amdgpu_gs void @test_add_64_use(i32 %arg, i64 addrspace(1)* %out) {
; CHECK-NEXT: buffer_gl0_inv
; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
store i64 %res, i64 addrspace(1)* %out, align 4

View File

@ -31,6 +31,7 @@ define amdgpu_gs void @test_sub_32_use(i32 %arg, i32 addrspace(1)* %out) {
; CHECK-NEXT: buffer_gl0_inv
; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b32 v[1:2], v3, off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
store i32 %res, i32 addrspace(1)* %out, align 4
@ -63,6 +64,7 @@ define amdgpu_gs void @test_sub_64_use(i32 %arg, i64 addrspace(1)* %out) {
; CHECK-NEXT: buffer_gl0_inv
; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
store i64 %res, i64 addrspace(1)* %out, align 4

View File

@ -16,6 +16,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
i16 addrspace(1)* %r,
<2 x i16> addrspace(1)* %a,

View File

@ -16,6 +16,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
half addrspace(1)* %r,
<2 x half> addrspace(1)* %a,

View File

@ -18,6 +18,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
float addrspace(1)* %r,
<2 x i16> addrspace(1)* %a,
@ -47,6 +48,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
float addrspace(1)* %r,
<2 x i16> addrspace(1)* %a,

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
; GFX9-LABEL: load_1d:
@ -10,11 +10,11 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_1d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@ -28,11 +28,11 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_2d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%t = extractelement <2 x i16> %coords, i32 1
@ -47,11 +47,11 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_3d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -67,11 +67,11 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_cube:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -87,11 +87,11 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_1darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%slice = extractelement <2 x i16> %coords, i32 1
@ -106,11 +106,11 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_2darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -126,11 +126,11 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2dmsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_2dmsaa:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -146,11 +146,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2darraymsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_2darraymsaa:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -167,11 +167,11 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_mip_1d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%mip = extractelement <2 x i16> %coords, i32 1
@ -186,11 +186,11 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_mip_2d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -206,11 +206,11 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_mip_3d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -227,11 +227,11 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_mip_cube:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -248,11 +248,11 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_1darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_mip_1darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%slice = extractelement <2 x i16> %coords_lo, i32 1
@ -268,11 +268,11 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_mip_2darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -292,6 +292,12 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@ -308,6 +314,12 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%t = extractelement <2 x i16> %coords, i32 1
@ -325,6 +337,12 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -343,6 +361,12 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_cube:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -361,6 +385,12 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%slice = extractelement <2 x i16> %coords, i32 1
@ -378,6 +408,12 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -396,6 +432,12 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2dmsaa:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -414,6 +456,12 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2darraymsaa:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -433,6 +481,12 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%mip = extractelement <2 x i16> %coords, i32 1
@ -450,6 +504,12 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -468,6 +528,12 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -487,6 +553,12 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_cube:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -506,6 +578,12 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_1darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%slice = extractelement <2 x i16> %coords_lo, i32 1
@ -524,6 +602,12 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_2darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
%t = extractelement <2 x i16> %coords_lo, i32 1
@ -540,11 +624,11 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_1d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -558,11 +642,11 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_2d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -576,11 +660,11 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_3d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -594,11 +678,11 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_cube:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -612,11 +696,11 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_1darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_1darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -630,11 +714,11 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_2darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -648,11 +732,11 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_2dmsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_2dmsaa:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -666,11 +750,11 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_2darraymsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_2darraymsaa:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@ -684,11 +768,11 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_V1:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_1d_V1:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%v = call float @llvm.amdgcn.image.load.1d.f32.i16(i32 8, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@ -702,11 +786,11 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_V2:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_1d_V2:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i16(i32 9, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@ -723,6 +807,12 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_V1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.f32.i16(float %vdata, i32 2, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@ -739,6 +829,12 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_V2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float> %vdata, i32 12, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@ -752,11 +848,11 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_glc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_1d_glc:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1)
@ -770,11 +866,11 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_1d_slc:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
@ -788,11 +884,11 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_glc_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: load_1d_glc_slc:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%s = extractelement <2 x i16> %coords, i32 0
%v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3)
@ -809,6 +905,12 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_glc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1)
@ -825,6 +927,12 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_slc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
@ -841,6 +949,12 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_glc_slc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%s = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3)
@ -852,9 +966,9 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: getresinfo_dmask0:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: ; return to shader part epilog
; GFX10PLUS-LABEL: getresinfo_dmask0:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: ; return to shader part epilog
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
%r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 0, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)

View File

@ -380,6 +380,7 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
; GFX11-LABEL: store_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -401,6 +402,7 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
; GFX11-LABEL: store_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -423,6 +425,7 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
; GFX11-LABEL: store_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -446,6 +449,7 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
; GFX11-LABEL: store_cube:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -469,6 +473,7 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; GFX11-LABEL: store_1darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -491,6 +496,7 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; GFX11-LABEL: store_2darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -514,6 +520,7 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX11-LABEL: store_2dmsaa:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -537,6 +544,7 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
; GFX11-LABEL: store_2darraymsaa:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -561,6 +569,7 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX11-LABEL: store_mip_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -583,6 +592,7 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX11-LABEL: store_mip_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -606,6 +616,7 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX11-LABEL: store_mip_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -630,6 +641,7 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; GFX11-LABEL: store_mip_cube:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -654,6 +666,7 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
; GFX11-LABEL: store_mip_1darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -677,6 +690,7 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
; GFX11-LABEL: store_mip_2darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords_lo, i32 0
@ -941,6 +955,7 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
; GFX11-LABEL: store_1d_V1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x02,0x19,0xf0,0x01,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -962,6 +977,7 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
; GFX11-LABEL: store_1d_V2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0c,0x19,0xf0,0x02,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -1055,6 +1071,7 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX11-LABEL: store_1d_glc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x80,0x4f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -1076,6 +1093,7 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
; GFX11-LABEL: store_1d_slc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x80,0x1f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0
@ -1097,6 +1115,7 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
; GFX11-LABEL: store_1d_glc_slc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x80,0x5f,0x19,0xf0,0x04,0x00,0x00,0x00]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
main_body:
%s = extractelement <2 x i16> %coords, i32 0

View File

@ -2437,10 +2437,16 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %
; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_1d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2467,10 +2473,16 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %
; NOPRT-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_2d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2497,10 +2509,16 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %
; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_3d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2527,10 +2545,16 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32
; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_cube:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_cube:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2557,10 +2581,16 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; NOPRT-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_1darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_1darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2587,10 +2617,16 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_2darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2617,10 +2653,16 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_2dmsaa:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_2dmsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2dmsaa:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2647,10 +2689,16 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
; NOPRT-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_2darraymsaa:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_2darraymsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_2darraymsaa:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2677,10 +2725,16 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
; NOPRT-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_mip_1d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_mip_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2707,10 +2761,16 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
; NOPRT-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_mip_2d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_mip_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2737,10 +2797,16 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_mip_3d:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_mip_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2767,10 +2833,16 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_mip_cube:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_mip_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_cube:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2797,10 +2869,16 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
; NOPRT-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_mip_1darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_mip_1darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_1darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -2827,10 +2905,16 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_mip_2darray:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_mip_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_mip_2darray:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -3207,10 +3291,16 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s)
; NOPRT-NEXT: image_store v0, v1, s[0:7] dmask:0x2 unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_1d_V1:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_1d_V1:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_V1:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -3237,10 +3327,16 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i3
; NOPRT-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_1d_V2:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_1d_V2:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_V2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
ret void
@ -3372,10 +3468,16 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_1d_glc:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_1d_glc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_glc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
ret void
@ -3402,10 +3504,16 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_1d_slc:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_1d_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_slc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
ret void
@ -3432,10 +3540,16 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: store_1d_glc_slc:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: store_1d_glc_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_1d_glc_slc:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
ret void
@ -3690,13 +3804,22 @@ define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %a
; NOPRT-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
; NOPRT-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: image_store_wait:
; GFX10PLUS: ; %bb.0: ; %main_body
; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX10PLUS-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_endpgm
; GFX10-LABEL: image_store_wait:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_store_wait:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX11-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> %arg, i32 0, i32 0)
%data = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %arg4, <8 x i32> %arg1, i32 0, i32 0)

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
; GFX9-LABEL: store_f16_1d:
@ -13,6 +13,12 @@ define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_f16_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%bitcast = bitcast <2 x i32> %val to <4 x half>
@ -30,6 +36,12 @@ define amdgpu_ps void @store_v2f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v2f16_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%bitcast = bitcast <2 x i32> %val to <4 x half>
@ -47,6 +59,12 @@ define amdgpu_ps void @store_v3f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v3f16_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%bitcast = bitcast <2 x i32> %val to <4 x half>
@ -64,6 +82,12 @@ define amdgpu_ps void @store_v4f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v4f16_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%bitcast = bitcast <2 x i32> %val to <4 x half>
@ -81,6 +105,12 @@ define amdgpu_ps void @store_f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_f16_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -99,6 +129,12 @@ define amdgpu_ps void @store_v2f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v2f16_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -117,6 +153,12 @@ define amdgpu_ps void @store_v3f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v3f16_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -135,6 +177,12 @@ define amdgpu_ps void @store_v4f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v4f16_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -153,6 +201,12 @@ define amdgpu_ps void @store_f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_f16_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
@ -172,6 +226,12 @@ define amdgpu_ps void @store_v2f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v2f16_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
@ -191,6 +251,12 @@ define amdgpu_ps void @store_v3f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v3f16_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
@ -210,6 +276,12 @@ define amdgpu_ps void @store_v4f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v4f16_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 d16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
; GFX9-LABEL: store_f32_1d:
@ -13,6 +13,12 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_f32_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
@ -29,6 +35,12 @@ define amdgpu_ps void @store_v2f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v2f32_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
@ -45,6 +57,12 @@ define amdgpu_ps void @store_v3f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v3f32_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
@ -61,6 +79,12 @@ define amdgpu_ps void @store_v4f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v4f32_1d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
@ -77,6 +101,12 @@ define amdgpu_ps void @store_f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_f32_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -94,6 +124,12 @@ define amdgpu_ps void @store_v2f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v2f32_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -111,6 +147,12 @@ define amdgpu_ps void @store_v3f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v3f32_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -128,6 +170,12 @@ define amdgpu_ps void @store_v4f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v4f32_2d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
%y = extractelement <2 x i16> %coords, i32 1
@ -145,6 +193,12 @@ define amdgpu_ps void @store_f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_f32_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
@ -163,6 +217,12 @@ define amdgpu_ps void @store_v2f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v2f32_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
@ -181,6 +241,12 @@ define amdgpu_ps void @store_v3f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v3f32_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1
@ -199,6 +265,12 @@ define amdgpu_ps void @store_v4f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_v4f32_3d:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
%y = extractelement <2 x i16> %coords_lo, i32 1

View File

@ -247,6 +247,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -340,6 +341,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -439,6 +441,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -527,6 +530,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -17,6 +17,7 @@ define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
store i32 %v, i32 addrspace(1)* %out
@ -33,6 +34,7 @@ define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 99)
store i32 %v, i32 addrspace(1)* %out
@ -47,6 +49,7 @@ define amdgpu_kernel void @test_v(i32 addrspace(1)* %out, i32 %src0) #1 {
; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_v:
@ -56,6 +59,7 @@ define amdgpu_kernel void @test_v(i32 addrspace(1)* %out, i32 %src0) #1 {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)

View File

@ -11,6 +11,7 @@ define amdgpu_kernel void @test_get_doorbell(i32 addrspace(1)* %out) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_doorbell:
@ -21,6 +22,7 @@ define amdgpu_kernel void @test_get_doorbell(i32 addrspace(1)* %out) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128)
store i32 %ret, i32 addrspace(1)* %out
@ -36,6 +38,7 @@ define amdgpu_kernel void @test_get_ddid(i32 addrspace(1)* %out) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_ddid:
@ -46,6 +49,7 @@ define amdgpu_kernel void @test_get_ddid(i32 addrspace(1)* %out) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 129)
store i32 %ret, i32 addrspace(1)* %out
@ -62,6 +66,7 @@ define amdgpu_kernel void @test_get_tma(i64 addrspace(1)* %out) {
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 130)
store i64 %ret, i64 addrspace(1)* %out
@ -78,6 +83,7 @@ define amdgpu_kernel void @test_get_realtime(i64 addrspace(1)* %out) {
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 131)
store i64 %ret, i64 addrspace(1)* %out
@ -93,6 +99,7 @@ define amdgpu_kernel void @test_savewave(i32 addrspace(1)* %out) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_savewave:
@ -103,6 +110,7 @@ define amdgpu_kernel void @test_savewave(i32 addrspace(1)* %out) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 132)
store i32 %ret, i32 addrspace(1)* %out
@ -119,6 +127,7 @@ define amdgpu_kernel void @test_get_tba(i64 addrspace(1)* %out) {
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 133)
store i64 %ret, i64 addrspace(1)* %out
@ -134,6 +143,7 @@ define amdgpu_kernel void @test_get_0_i32(i32 addrspace(1)* %out) {
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_0_i32:
@ -144,6 +154,7 @@ define amdgpu_kernel void @test_get_0_i32(i32 addrspace(1)* %out) {
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
store i32 %ret, i32 addrspace(1)* %out
@ -160,6 +171,7 @@ define amdgpu_kernel void @test_get_99999_i64(i64 addrspace(1)* %out) {
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 99999)
store i64 %ret, i64 addrspace(1)* %out

View File

@ -17,6 +17,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
@ -33,6 +34,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
@ -49,6 +51,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
@ -63,6 +66,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
@ -79,6 +83,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
@ -93,6 +98,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
@ -109,6 +115,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -123,6 +130,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -137,6 +145,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -151,6 +160,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -165,6 +175,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -179,6 +190,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -193,6 +205,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -207,6 +220,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -223,6 +237,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -237,6 +252,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -251,6 +267,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -265,6 +282,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -280,6 +298,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -294,6 +313,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -308,6 +328,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -322,6 +343,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)

View File

@ -15,6 +15,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
@ -29,6 +30,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
@ -43,6 +45,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
@ -55,6 +58,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
@ -69,6 +73,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
@ -81,6 +86,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
@ -95,6 +101,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -108,6 +115,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -120,6 +128,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -132,6 +141,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -144,6 +154,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -156,6 +167,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -168,6 +180,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -180,6 +193,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -194,6 +208,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -206,6 +221,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -218,6 +234,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -230,6 +247,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -242,6 +260,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -254,6 +273,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -266,6 +286,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -278,6 +299,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)

View File

@ -390,6 +390,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@ -582,6 +583,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)

View File

@ -706,6 +706,7 @@ define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ext0 = zext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64

View File

@ -126,6 +126,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_unordered_load:
@ -139,6 +140,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -263,6 +265,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_load:
@ -276,6 +279,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -411,6 +415,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_load:
@ -426,6 +431,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -573,6 +579,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_load:
@ -590,6 +597,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -695,6 +703,7 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_unordered_store:
@ -707,6 +716,7 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -811,6 +821,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_store:
@ -823,6 +834,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -941,6 +953,7 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_store:
@ -955,6 +968,7 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -1073,6 +1087,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_store:
@ -1087,6 +1102,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -1191,6 +1207,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_atomicrmw:
@ -1203,6 +1220,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -1464,6 +1482,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_atomicrmw:
@ -1478,6 +1497,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -1929,6 +1949,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
@ -1945,6 +1966,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -2093,6 +2115,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
@ -2111,6 +2134,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -2259,6 +2283,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
@ -2277,6 +2302,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -2395,6 +2421,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
@ -2408,6 +2435,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -2698,6 +2726,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
@ -2713,6 +2742,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -4913,6 +4943,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
@ -4928,6 +4959,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5085,6 +5117,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
@ -5102,6 +5135,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5262,6 +5296,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
@ -5279,6 +5314,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5450,6 +5486,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
@ -5469,6 +5506,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5640,6 +5678,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
@ -5659,6 +5698,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5816,6 +5856,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
@ -5833,6 +5874,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5990,6 +6032,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
@ -6007,6 +6050,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6178,6 +6222,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
@ -6197,6 +6242,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6368,6 +6414,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
@ -6387,6 +6434,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6558,6 +6606,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
@ -6577,6 +6626,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6748,6 +6798,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
@ -6767,6 +6818,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6938,6 +6990,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
@ -6957,6 +7010,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7128,6 +7182,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
@ -7147,6 +7202,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7318,6 +7374,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
@ -7337,6 +7394,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7508,6 +7566,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
@ -7527,6 +7586,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7653,6 +7713,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_unordered_load:
@ -7666,6 +7727,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -7790,6 +7852,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_load:
@ -7803,6 +7866,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -7945,6 +8009,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_load:
@ -7961,6 +8026,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -8115,6 +8181,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_load:
@ -8133,6 +8200,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -8238,6 +8306,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_unordered_store:
@ -8250,6 +8319,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8354,6 +8424,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_store:
@ -8366,6 +8437,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8484,6 +8556,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_store:
@ -8498,6 +8571,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8616,6 +8690,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_store:
@ -8630,6 +8705,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8734,6 +8810,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
@ -8746,6 +8823,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9003,6 +9081,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_atomicrmw:
@ -9017,6 +9096,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9466,6 +9546,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
@ -9483,6 +9564,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9637,6 +9719,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
@ -9656,6 +9739,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9810,6 +9894,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
@ -9829,6 +9914,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9947,6 +10033,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
@ -9960,6 +10047,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -10246,6 +10334,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
@ -10261,6 +10350,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -12413,6 +12503,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
@ -12428,6 +12519,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -12592,6 +12684,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
@ -12610,6 +12703,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -12770,6 +12864,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
@ -12787,6 +12882,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -12965,6 +13061,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
@ -12985,6 +13082,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13163,6 +13261,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
@ -13183,6 +13282,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13347,6 +13447,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
@ -13365,6 +13466,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13529,6 +13631,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
@ -13547,6 +13650,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13725,6 +13829,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
@ -13745,6 +13850,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13923,6 +14029,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
@ -13943,6 +14050,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14121,6 +14229,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
@ -14141,6 +14250,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14319,6 +14429,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
@ -14339,6 +14450,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14517,6 +14629,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
@ -14537,6 +14650,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14715,6 +14829,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
@ -14735,6 +14850,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14913,6 +15029,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
@ -14933,6 +15050,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -15111,6 +15229,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
@ -15131,6 +15250,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:

View File

@ -126,6 +126,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_0:
@ -139,6 +140,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -277,6 +279,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_1:
@ -292,6 +295,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -418,6 +422,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_0:
@ -431,6 +436,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -569,6 +575,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_1:
@ -584,6 +591,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:

View File

@ -126,6 +126,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_unordered_load:
@ -139,6 +140,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -263,6 +265,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_load:
@ -276,6 +279,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -413,6 +417,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_load:
@ -428,6 +433,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -577,6 +583,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_load:
@ -594,6 +601,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -699,6 +707,7 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_unordered_store:
@ -711,6 +720,7 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -815,6 +825,7 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_store:
@ -827,6 +838,7 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -947,6 +959,7 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_store:
@ -961,6 +974,7 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -1081,6 +1095,7 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_store:
@ -1095,6 +1110,7 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -1199,6 +1215,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw:
@ -1211,6 +1228,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -1476,6 +1494,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_atomicrmw:
@ -1490,6 +1509,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -1951,6 +1971,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw:
@ -1967,6 +1988,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -2119,6 +2141,7 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
@ -2137,6 +2160,7 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -2289,6 +2313,7 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
@ -2307,6 +2332,7 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -2425,6 +2451,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
@ -2438,6 +2465,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -2732,6 +2760,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg:
@ -2747,6 +2776,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -4991,6 +5021,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
@ -5006,6 +5037,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5165,6 +5197,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
@ -5182,6 +5215,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5344,6 +5378,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
@ -5361,6 +5396,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5536,6 +5572,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
@ -5555,6 +5592,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5730,6 +5768,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
@ -5749,6 +5788,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -5908,6 +5948,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
@ -5925,6 +5966,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6084,6 +6126,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
@ -6101,6 +6144,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6276,6 +6320,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
@ -6295,6 +6340,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6470,6 +6516,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
@ -6489,6 +6536,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6664,6 +6712,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
@ -6683,6 +6732,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -6858,6 +6908,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
@ -6877,6 +6928,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7052,6 +7104,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
@ -7071,6 +7124,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7246,6 +7300,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
@ -7265,6 +7320,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7440,6 +7496,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
@ -7459,6 +7516,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7634,6 +7692,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
@ -7653,6 +7712,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -7779,6 +7839,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_unordered_load:
@ -7792,6 +7853,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -7916,6 +7978,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_load:
@ -7929,6 +7992,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -8073,6 +8137,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_load:
@ -8089,6 +8154,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -8245,6 +8311,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_load:
@ -8263,6 +8330,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -8368,6 +8436,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_unordered_store:
@ -8380,6 +8449,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8484,6 +8554,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_store:
@ -8496,6 +8567,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8616,6 +8688,7 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_store:
@ -8630,6 +8703,7 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8750,6 +8824,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_store:
@ -8764,6 +8839,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
@ -8868,6 +8944,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
@ -8880,6 +8957,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9141,6 +9219,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw:
@ -9155,6 +9234,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9614,6 +9694,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
@ -9631,6 +9712,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9789,6 +9871,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
@ -9808,6 +9891,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -9966,6 +10050,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
@ -9985,6 +10070,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
@ -10103,6 +10189,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
@ -10116,6 +10203,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -10406,6 +10494,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
@ -10421,6 +10510,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -12617,6 +12707,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
@ -12632,6 +12723,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -12798,6 +12890,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
@ -12816,6 +12909,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -12978,6 +13072,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
@ -12995,6 +13090,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13177,6 +13273,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
@ -13197,6 +13294,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13379,6 +13477,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
@ -13399,6 +13498,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13565,6 +13665,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
@ -13583,6 +13684,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13749,6 +13851,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
@ -13767,6 +13870,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -13949,6 +14053,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
@ -13969,6 +14074,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14151,6 +14257,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
@ -14171,6 +14278,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14353,6 +14461,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
@ -14373,6 +14482,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14555,6 +14665,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
@ -14575,6 +14686,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14757,6 +14869,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
@ -14777,6 +14890,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -14959,6 +15073,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
@ -14979,6 +15094,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -15161,6 +15277,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
@ -15181,6 +15298,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
@ -15363,6 +15481,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
@ -15383,6 +15502,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:

View File

@ -75,6 +75,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_0:
@ -89,6 +90,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -174,6 +176,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_1:
@ -190,6 +193,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -269,6 +273,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_0:
@ -283,6 +288,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -368,6 +374,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_1:
@ -384,6 +391,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -463,6 +471,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load:
@ -477,6 +486,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
@ -549,6 +559,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_volatile_workgroup_release_store:
@ -562,6 +573,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -123,6 +123,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_unordered_load:
@ -133,6 +134,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -253,6 +255,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_load:
@ -263,6 +266,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -395,6 +399,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_load:
@ -407,6 +412,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -545,6 +551,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_load:
@ -558,6 +565,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -671,6 +679,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_unordered_store:
@ -682,6 +691,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -794,6 +804,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_store:
@ -805,6 +816,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -932,6 +944,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_store:
@ -945,6 +958,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -1072,6 +1086,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_store:
@ -1085,6 +1100,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -1197,6 +1213,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_atomicrmw:
@ -1208,6 +1225,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -1483,6 +1501,7 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_atomicrmw:
@ -1496,6 +1515,7 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -1970,6 +1990,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_ret_atomicrmw:
@ -1985,6 +2006,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -2145,6 +2167,7 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
@ -2162,6 +2185,7 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -2322,6 +2346,7 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
@ -2339,6 +2364,7 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -2460,6 +2486,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
@ -2472,6 +2499,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -2766,6 +2794,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg:
@ -2780,6 +2809,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -4989,6 +5019,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
@ -5003,6 +5034,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5160,6 +5192,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
@ -5176,6 +5209,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5336,6 +5370,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
@ -5352,6 +5387,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5524,6 +5560,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
@ -5542,6 +5579,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5714,6 +5752,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
@ -5732,6 +5771,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5889,6 +5929,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
@ -5905,6 +5946,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6062,6 +6104,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
@ -6078,6 +6121,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6250,6 +6294,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
@ -6268,6 +6313,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6440,6 +6486,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
@ -6458,6 +6505,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6630,6 +6678,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
@ -6648,6 +6697,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6820,6 +6870,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
@ -6838,6 +6889,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -7010,6 +7062,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
@ -7028,6 +7081,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -7200,6 +7254,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
@ -7218,6 +7273,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -7390,6 +7446,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
@ -7408,6 +7465,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -7580,6 +7638,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
@ -7598,6 +7657,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -7720,6 +7780,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_unordered_load:
@ -7730,6 +7791,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -7850,6 +7912,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_load:
@ -7860,6 +7923,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -7992,6 +8056,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_load:
@ -8004,6 +8069,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -8142,6 +8208,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_load:
@ -8155,6 +8222,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -8268,6 +8336,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_unordered_store:
@ -8279,6 +8348,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -8391,6 +8461,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_store:
@ -8402,6 +8473,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -8529,6 +8601,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_store:
@ -8542,6 +8615,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -8669,6 +8743,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_store:
@ -8682,6 +8757,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -8794,6 +8870,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
@ -8805,6 +8882,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -9080,6 +9158,7 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_atomicrmw:
@ -9093,6 +9172,7 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -9567,6 +9647,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
@ -9582,6 +9663,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -9742,6 +9824,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
@ -9759,6 +9842,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -9919,6 +10003,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
@ -9936,6 +10021,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -10057,6 +10143,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
@ -10069,6 +10156,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -10363,6 +10451,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
@ -10377,6 +10466,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12586,6 +12676,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
@ -12600,6 +12691,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12757,6 +12849,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
@ -12773,6 +12866,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12945,6 +13039,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
@ -12963,6 +13058,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13135,6 +13231,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
@ -13153,6 +13250,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13310,6 +13408,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
@ -13326,6 +13425,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13483,6 +13583,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
@ -13499,6 +13600,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13671,6 +13773,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
@ -13689,6 +13792,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13861,6 +13965,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
@ -13879,6 +13984,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14051,6 +14157,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
@ -14069,6 +14176,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14241,6 +14349,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
@ -14259,6 +14368,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14431,6 +14541,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
@ -14449,6 +14560,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14621,6 +14733,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
@ -14639,6 +14752,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14811,6 +14925,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
@ -14829,6 +14944,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -15001,6 +15117,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
@ -15019,6 +15136,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:

View File

@ -127,6 +127,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_0:
@ -138,6 +139,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -271,6 +273,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_1:
@ -282,6 +285,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -408,6 +412,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_0:
@ -419,6 +424,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -547,6 +553,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_1:
@ -558,6 +565,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:

View File

@ -123,6 +123,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_unordered_load:
@ -133,6 +134,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -253,6 +255,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_load:
@ -263,6 +266,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -397,6 +401,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_load:
@ -409,6 +414,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -549,6 +555,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_load:
@ -562,6 +569,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -675,6 +683,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_unordered_store:
@ -686,6 +695,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -798,6 +808,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_store:
@ -809,6 +820,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -938,6 +950,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_store:
@ -951,6 +964,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -1080,6 +1094,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_store:
@ -1093,6 +1108,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -1205,6 +1221,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_atomicrmw:
@ -1216,6 +1233,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -1495,6 +1513,7 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_atomicrmw:
@ -1508,6 +1527,7 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -1992,6 +2012,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw:
@ -2007,6 +2028,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -2171,6 +2193,7 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
@ -2188,6 +2211,7 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -2352,6 +2376,7 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
@ -2369,6 +2394,7 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -2490,6 +2516,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
@ -2502,6 +2529,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -2800,6 +2828,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg:
@ -2814,6 +2843,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -4351,6 +4381,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
@ -4365,6 +4396,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -4524,6 +4556,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
@ -4540,6 +4573,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -4716,6 +4750,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
@ -4734,6 +4769,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -4910,6 +4946,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
@ -4928,6 +4965,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5087,6 +5125,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
@ -5103,6 +5142,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5262,6 +5302,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
@ -5278,6 +5319,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5454,6 +5496,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
@ -5472,6 +5515,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5648,6 +5692,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
@ -5666,6 +5711,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -5842,6 +5888,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
@ -5860,6 +5907,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6036,6 +6084,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
@ -6054,6 +6103,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6230,6 +6280,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
@ -6248,6 +6299,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6424,6 +6476,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
@ -6442,6 +6495,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6618,6 +6672,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
@ -6636,6 +6691,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6812,6 +6868,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
@ -6830,6 +6887,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -6952,6 +7010,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_unordered_load:
@ -6962,6 +7021,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -7082,6 +7142,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_load:
@ -7092,6 +7153,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -7226,6 +7288,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_load:
@ -7238,6 +7301,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -7378,6 +7442,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_load:
@ -7391,6 +7456,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -7504,6 +7570,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_unordered_store:
@ -7515,6 +7582,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -7627,6 +7695,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_store:
@ -7638,6 +7707,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -7767,6 +7837,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_store:
@ -7780,6 +7851,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -7909,6 +7981,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_store:
@ -7922,6 +7995,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
@ -8034,6 +8108,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
@ -8045,6 +8120,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -8324,6 +8400,7 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw:
@ -8337,6 +8414,7 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -8821,6 +8899,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
@ -8836,6 +8915,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -9000,6 +9080,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
@ -9017,6 +9098,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -9181,6 +9263,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
@ -9198,6 +9281,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
@ -9319,6 +9403,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
@ -9331,6 +9416,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -9629,6 +9715,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
@ -9643,6 +9730,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -11896,6 +11984,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
@ -11910,6 +11999,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12069,6 +12159,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
@ -12085,6 +12176,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12247,6 +12339,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
@ -12263,6 +12356,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12439,6 +12533,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
@ -12457,6 +12552,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12633,6 +12729,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
@ -12651,6 +12748,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12810,6 +12908,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
@ -12826,6 +12925,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -12985,6 +13085,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
@ -13001,6 +13102,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13177,6 +13279,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
@ -13195,6 +13298,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13371,6 +13475,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
@ -13389,6 +13494,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13565,6 +13671,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
@ -13583,6 +13690,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13759,6 +13867,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
@ -13777,6 +13886,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -13953,6 +14063,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
@ -13971,6 +14082,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14147,6 +14259,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
@ -14165,6 +14278,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14341,6 +14455,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
@ -14359,6 +14474,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
@ -14535,6 +14651,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
@ -14553,6 +14670,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:

View File

@ -79,6 +79,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_load_0:
@ -89,6 +90,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -178,6 +180,7 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_load_1:
@ -189,6 +192,7 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -277,6 +281,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_store_0:
@ -289,6 +294,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -379,6 +385,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_store_1:
@ -391,6 +398,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -475,6 +483,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load:
@ -485,6 +494,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
@ -566,6 +576,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_workgroup_release_store:
@ -578,6 +589,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -139,6 +139,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_0:
@ -152,6 +153,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
@ -291,6 +293,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_1:
@ -304,6 +307,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:

View File

@ -87,6 +87,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_load_0:
@ -100,6 +101,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
@ -191,6 +193,7 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_load_1:
@ -204,6 +207,7 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:

View File

@ -162,6 +162,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_0:
@ -174,6 +175,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
@ -338,6 +340,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_1:
@ -350,6 +353,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
@ -509,6 +513,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_0:
@ -521,6 +526,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
@ -684,6 +690,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_1:
@ -697,6 +704,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:

View File

@ -106,6 +106,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_load_0:
@ -118,6 +119,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
@ -228,6 +230,7 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_load_1:
@ -240,6 +243,7 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
@ -353,6 +357,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_store_0:
@ -366,6 +371,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
@ -481,6 +487,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_store_1:
@ -495,6 +502,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:

View File

@ -0,0 +1,411 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=release-vgprs -verify-machineinstrs -o - %s | FileCheck %s
--- |
define amdgpu_ps void @tbuffer_store1() { ret void }
define amdgpu_ps void @tbuffer_store2() { ret void }
define amdgpu_ps void @flat_store() { ret void }
define amdgpu_ps void @global_store() { ret void }
define amdgpu_ps void @buffer_store_format() { ret void }
define amdgpu_ps void @ds_write_b32() { ret void }
define amdgpu_ps void @global_store_dword() { ret void }
define amdgpu_ps void @multiple_basic_blocks1() { ret void }
define amdgpu_ps void @multiple_basic_blocks2() { ret void }
define amdgpu_ps void @multiple_basic_blocks3() { ret void }
define amdgpu_ps void @recursive_loop() { ret void }
define amdgpu_ps void @recursive_loop_vmem() { ret void }
define amdgpu_ps void @image_store() { ret void }
define amdgpu_ps void @scratch_store() { ret void }
define amdgpu_ps void @buffer_atomic() { ret void }
define amdgpu_ps void @flat_atomic() { ret void }
define amdgpu_ps void @global_atomic() { ret void }
define amdgpu_ps void @image_atomic() { ret void }
...
---
name: tbuffer_store1
body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store1
; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: tbuffer_store2
body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store2
; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "BufferResource", align 1, addrspace 4)
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "BufferResource", align 1, addrspace 4)
S_ENDPGM 0
...
---
name: flat_store
body: |
bb.0:
; CHECK-LABEL: name: flat_store
; CHECK: FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: global_store
body: |
bb.0:
; CHECK-LABEL: name: global_store
; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_ENDPGM 0
...
---
name: buffer_store_format
body: |
bb.0:
; CHECK-LABEL: name: buffer_store_format
; CHECK: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: ds_write_b32
body: |
bb.0:
; CHECK-LABEL: name: ds_write_b32
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
; CHECK-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF
DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
S_ENDPGM 0
...
---
name: global_store_dword
body: |
bb.0:
liveins: $vgpr0, $sgpr0_sgpr1
; CHECK-LABEL: name: global_store_dword
; CHECK: liveins: $vgpr0, $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: multiple_basic_blocks1
body: |
; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
# One block has a VMEM store as the last instruction, we should release the VGPRS
...
---
name: multiple_basic_blocks2
body: |
; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.2
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
# One parent block has a VMEM store, release VGPRs
---
name: multiple_basic_blocks3
body: |
; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.2:
successors: %bb.4
S_BRANCH %bb.4
bb.3:
successors: %bb.4
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.4
bb.4:
S_ENDPGM 0
...
---
name: recursive_loop
body: |
; CHECK-LABEL: name: recursive_loop
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
name: recursive_loop_vmem
body: |
; CHECK-LABEL: name: recursive_loop_vmem
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, 0, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
name: image_store
body: |
bb.0:
; CHECK-LABEL: name: image_store
; CHECK: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into custom "ImageResource")
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>) into custom "ImageResource")
S_ENDPGM 0
...
---
name: scratch_store
body: |
bb.0:
; CHECK-LABEL: name: scratch_store
; CHECK: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: buffer_atomic
body: |
bb.0:
; CHECK-LABEL: name: buffer_atomic
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 4)
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 4)
S_ENDPGM 0
...
---
name: flat_atomic
body: |
bb.0:
; CHECK-LABEL: name: flat_atomic
; CHECK: renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: global_atomic
body: |
bb.0:
; CHECK-LABEL: name: global_atomic
; CHECK: renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0
...
---
name: image_atomic
body: |
bb.0:
; CHECK-LABEL: name: image_atomic
; CHECK: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on custom "ImageResource")
; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
; CHECK-NEXT: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on custom "ImageResource")
S_ENDPGM 0
...

View File

@ -31,6 +31,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
@ -53,6 +54,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
@ -75,6 +77,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
@ -95,6 +98,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
@ -117,6 +121,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
@ -137,6 +142,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
@ -159,6 +165,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -179,6 +186,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -199,6 +207,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -219,6 +228,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@ -239,6 +249,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -259,6 +270,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -279,6 +291,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -299,6 +312,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@ -321,6 +335,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -341,6 +356,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -361,6 +377,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -381,6 +398,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@ -402,6 +420,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -422,6 +441,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -442,6 +462,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@ -462,6 +483,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)

View File

@ -27,6 +27,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
@ -45,6 +46,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
@ -63,6 +65,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
@ -79,6 +82,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
@ -97,6 +101,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
@ -113,6 +118,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
@ -131,6 +137,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -148,6 +155,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -164,6 +172,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -180,6 +189,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@ -196,6 +206,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -212,6 +223,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -228,6 +240,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -244,6 +257,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@ -262,6 +276,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -278,6 +293,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -294,6 +310,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -310,6 +327,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@ -326,6 +344,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -342,6 +361,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -358,6 +378,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@ -374,6 +395,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)