[AMDGPU] Add intrinsics llvm.amdgcn.{raw|struct}.buffer.load.lds

Differential Revision: https://reviews.llvm.org/D124884
Stanislav Mekhanoshin 2022-05-13 13:31:38 -07:00
parent 0b168a49bf
commit 791ec1c68e
9 changed files with 493 additions and 0 deletions

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

@@ -1270,6 +1270,40 @@ class AMDGPUBufferAtomicFP : Intrinsic <
// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+))
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+))
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;
} // defset AMDGPUBufferIntrinsics
// Uses that do not set the done bit should set IntrWriteMem on the
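
For illustration, a minimal IR-level use of the raw form (a sketch mirroring the tests added below; the function name and the aux value 9, a hypothetical encoding of glc (bit 0) | swz (bit 3), are illustrative):

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32>, i8 addrspace(3)* nocapture, i32, i32, i32, i32, i32)

define amdgpu_ps void @load_dword_to_lds(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset) {
  ; DMA 4 bytes from the buffer at voffset+16 into LDS at %lds; the data
  ; never lands in VGPRs, hence the void result. aux = 9 is hypothetical.
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 16, i32 9)
  ret void
}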

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

@@ -1780,6 +1780,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    return selectSBarrier(I);
  case Intrinsic::amdgcn_global_atomic_fadd:
    return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
    return selectBufferLoadLds(I);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
@@ -3054,6 +3057,98 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == 9;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(4).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Optional<ValueAndVReg> MaybeVOffset =
      getIConstantVRegValWithLookThrough(VOffset, *MRI);
  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();

  switch (Size) {
  default:
    return false;
  case 1:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    break;
  case 2:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    break;
  case 4:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .add(MI.getOperand(2));

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));

  if (HasVIndex && HasVOffset) {
    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
      .addReg(VIndex)
      .addImm(AMDGPU::sub0)
      .addReg(VOffset)
      .addImm(AMDGPU::sub1);

    MIB.addReg(IdxReg);
  } else if (HasVIndex) {
    MIB.addReg(VIndex);
  } else if (HasVOffset) {
    MIB.addReg(VOffset);
  }

  MIB.add(MI.getOperand(1));            // rsrc
  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
  MIB.addImm(Aux & AMDGPU::CPol::ALL);  // cpol
  MIB.addImm((Aux >> 3) & 1);           // swz

  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  StorePtrI.V = nullptr;
  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

  auto F = LoadMMO->getFlags() &
           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());

  MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                               sizeof(int32_t), LoadMMO->getBaseAlign());

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
  MI.removeOperand(1);

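The OFFSET/OFFEN/IDXEN/BOTHEN choice above depends only on whether the call carries a VGPR index (struct form) and a voffset that is not known to be zero. A sketch of the dword mapping, expected opcode per call shown as comments (the %voffset/%vindex values are illustrative):

  ; raw,    voffset == 0      -> BUFFER_LOAD_DWORD_LDS_OFFSET
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
  ; raw,    voffset in a VGPR -> BUFFER_LOAD_DWORD_LDS_OFFEN
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
  ; struct, voffset == 0      -> BUFFER_LOAD_DWORD_LDS_IDXEN
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 0, i32 0)
  ; struct, voffset in a VGPR -> BUFFER_LOAD_DWORD_LDS_BOTHEN (vindex/voffset packed by REG_SEQUENCE)
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)

Note also that the selected instruction carries two memory operands, a Size-byte load from the buffer and a 4-byte store to LDS, so later passes see both sides of the DMA.
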
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

@@ -143,6 +143,7 @@ private:
  bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
  bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
                              MachineOperand &DataOp) const;
  bool selectBufferLoadLds(MachineInstr &MI) const;
  bool selectBVHIntrinsic(MachineInstr &I) const;
  bool selectSMFMACIntrin(MachineInstr &I) const;
  bool selectWaveAddress(MachineInstr &I) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

@@ -3012,6 +3012,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
      return;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4436,6 +4450,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
@@ -4454,6 +4475,14 @@
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
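
In both mappings the rsrc descriptor, the LDS base pointer (operand 2, which becomes M0) and soffset must be uniform; applyMappingImpl above constrains them with readfirstlane when they are not, while vindex/voffset stay in VGPRs. A sketch for the struct form (values illustrative):

  ; SGPR (readfirstlane if divergent): %rsrc, %lds, %soffset
  ; VGPR:                              %vindex, %voffset
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)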

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

@@ -1191,6 +1191,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
      // XXX - Should this be volatile without known ordering?
      Info.flags |= MachineMemOperand::MOVolatile;

      switch (IntrID) {
      default:
        break;
      case Intrinsic::amdgcn_raw_buffer_load_lds:
      case Intrinsic::amdgcn_struct_buffer_load_lds: {
        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
        return true;
      }
      }
    }

    return true;
  }
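
That is, the memory type recorded for the intrinsic follows the byte-size operand; a sketch:

  ; size 2 -> Info.memVT == i16
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 0, i32 0, i32 0, i32 0)
  ; size 4 -> Info.memVT == i32
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)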
@@ -8228,6 +8239,85 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds: {
    unsigned Opc;
    bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
    bool HasVOffset = !CVOffset || !CVOffset->isZero();
    unsigned Size = Op->getConstantOperandVal(4);

    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    }

    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    SmallVector<SDValue, 8> Ops;

    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
                                       { Op.getOperand(5), // VIndex
                                         VOffset }));
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    Ops.push_back(Op.getOperand(2));            // rsrc
    Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
    Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    Ops.push_back(
        DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
    Ops.push_back(
        DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8));          // swz
    Ops.push_back(M0Val.getValue(0)); // Chain
    Ops.push_back(M0Val.getValue(1)); // Glue

    auto *M = cast<MemSDNode>(Op);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
    LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
    MachinePointerInfo StorePtrI = LoadPtrI;
    StorePtrI.V = nullptr;
    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

    auto F = LoadMMO->getFlags() &
             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
    LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                      Size, LoadMMO->getBaseAlign());

    MachineMemOperand *StoreMMO =
        MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                                sizeof(int32_t), LoadMMO->getBaseAlign());

    auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});

    return SDValue(Load, 0);
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain), 0);

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

@@ -1099,6 +1099,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (mayWriteLDSThroughDMA(MI))
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        // VM_CNT is only relevant to vgpr or LDS.
        ScoreBrackets.determineWait(
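
The effect shows up in the tests below: back-to-back LDS DMA loads are no longer separated by s_waitcnt, and only a later LDS read must wait for the DMA to land. A sketch, with the expected lowering in comments:

  ; -> buffer_load_dword ... lds (no wait inserted between the two DMAs)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 0)
  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
  ; -> s_waitcnt vmcnt(0) is still emitted before the ds_read_b32
  %val = load float, float addrspace(3)* %ptr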

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -385,6 +385,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll

@@ -0,0 +1,113 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_dword:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:4 glc lds
; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:8 slc lds
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
  %res = load float, float addrspace(3)* %ptr
  ret float %res
}

define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_dword_imm_voffset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v0, 0x800
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 2048, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset) {
; GCN-LABEL: buffer_load_lds_dword_v_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_s_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword off, s[0:3], s5 lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_ushort:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v0, 0x800
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_ushort v0, s[0:3], 0 offen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 2048, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_ubyte:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_ubyte off, s[0:3], 0 offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
  ret void
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll

@@ -0,0 +1,126 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL
declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; SDAG-LABEL: buffer_load_lds_dword:
; SDAG: ; %bb.0: ; %main_body
; SDAG-NEXT: v_mov_b32_e32 v0, 8
; SDAG-NEXT: s_mov_b32 m0, s4
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
; SDAG-NEXT: v_mov_b32_e32 v0, s4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: ds_read_b32 v0, v0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: buffer_load_lds_dword:
; GISEL: ; %bb.0: ; %main_body
; GISEL-NEXT: s_mov_b32 m0, s4
; GISEL-NEXT: v_mov_b32_e32 v0, 8
; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds
; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: ds_read_b32 v0, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
  %res = load float, float addrspace(3)* %ptr
  ret float %res
}

define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_dword_imm_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset) {
; GCN-LABEL: buffer_load_lds_dword_v_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], 0 idxen offen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_s_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 idxen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_ushort:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v1, 0x800
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_ushort v[0:1], s[0:3], 0 idxen offen lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_ubyte:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_ubyte v0, s[0:3], 0 idxen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
  ret void
}