[AMDGPU] Add intrinsics llvm.amdgcn.{raw|struct}.buffer.load.lds
Differential Revision: https://reviews.llvm.org/D124884
This commit is contained in:
parent 0b168a49bf
commit 791ec1c68e
@@ -1270,6 +1270,40 @@ class AMDGPUBufferAtomicFP : Intrinsic <
// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+),
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+),
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
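For reference, the IR-level shape of the new intrinsics, as exercised by the tests added below (the raw form omits the vindex operand):

  declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
  declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)

  ; Load one dword from %rsrc directly into LDS at %lds, bypassing VGPRs:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)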
@@ -1780,6 +1780,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    return selectSBarrier(I);
  case Intrinsic::amdgcn_global_atomic_fadd:
    return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
    return selectBufferLoadLds(I);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
@@ -3054,6 +3057,98 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == 9;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(4).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Optional<ValueAndVReg> MaybeVOffset =
      getIConstantVRegValWithLookThrough(VOffset, *MRI);
  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();

  switch (Size) {
  default:
    return false;
  case 1:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    break;
  case 2:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    break;
  case 4:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .add(MI.getOperand(2));

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));

  if (HasVIndex && HasVOffset) {
    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
      .addReg(VIndex)
      .addImm(AMDGPU::sub0)
      .addReg(VOffset)
      .addImm(AMDGPU::sub1);

    MIB.addReg(IdxReg);
  } else if (HasVIndex) {
    MIB.addReg(VIndex);
  } else if (HasVOffset) {
    MIB.addReg(VOffset);
  }

  MIB.add(MI.getOperand(1));            // rsrc
  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
  MIB.addImm(Aux & AMDGPU::CPol::ALL);  // cpol
  MIB.addImm((Aux >> 3) & 1);           // swz

  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  StorePtrI.V = nullptr;
  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

  auto F = LoadMMO->getFlags() &
           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());

  MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                               sizeof(int32_t), LoadMMO->getBaseAlign());

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
  MI.removeOperand(1);
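The HasVOffset check above lets a voffset that is provably zero fold away entirely; only then does selection pick the OFFSET/IDXEN address modes. A small IR illustration, matching the raw tests added below:

  ; voffset is constant 0 -> BUFFER_LOAD_DWORD_LDS_OFFSET ("buffer_load_dword off, s[0:3], 0 lds")
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
  ; voffset is a live value -> BUFFER_LOAD_DWORD_LDS_OFFEN ("buffer_load_dword v0, s[0:3], 0 offen lds")
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)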
@@ -143,6 +143,7 @@ private:
  bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
  bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
                              MachineOperand &DataOp) const;
  bool selectBufferLoadLds(MachineInstr &MI) const;
  bool selectBVHIntrinsic(MachineInstr &I) const;
  bool selectSMFMACIntrin(MachineInstr &I) const;
  bool selectWaveAddress(MachineInstr &I) const;
@@ -3012,6 +3012,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
    constrainOpWithReadfirstlane(MI, MRI, 2);
    return;
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds: {
    applyDefaultMapping(OpdMapper);
    constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
    constrainOpWithReadfirstlane(MI, MRI, 2); // M0
    constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
    return;
  }
  case Intrinsic::amdgcn_struct_buffer_load_lds: {
    applyDefaultMapping(OpdMapper);
    constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
    constrainOpWithReadfirstlane(MI, MRI, 2); // M0
    constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
    return;
  }
  default: {
    if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
            AMDGPU::lookupRsrcIntrinsic(IntrID)) {
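Both mappings route the LDS pointer (operand 2) through readfirstlane because selection copies it into M0; in the tests below this surfaces as the s_mov_b32 m0, s4 ahead of each buffer load. A sketch of a conforming call where every scalar operand is already uniform (inreg), so no readfirstlane is inserted:

  ; %rsrc, %lds and %soffset arrive in SGPRs:
  ;   s_mov_b32 m0, s4
  ;   buffer_load_dword off, s[0:3], s5 lds
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)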
@@ -4436,6 +4450,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
    break;
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds: {
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
    break;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store: {
@@ -4454,6 +4475,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
    break;
  }
  case Intrinsic::amdgcn_struct_buffer_load_lds: {
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
    break;
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store: {
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
@@ -1191,6 +1191,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,

    // XXX - Should this be volatile without known ordering?
    Info.flags |= MachineMemOperand::MOVolatile;

    switch (IntrID) {
    default:
      break;
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
      Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
      return true;
    }
    }
  }
  return true;
}
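Since the data-byte-size operand (argument 2) feeds memVT directly, the three legal sizes map to i8/i16/i32 accesses. For example, mirroring the tests below:

  ; size 1 -> memVT i8  -> buffer_load_ubyte ... lds
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 0, i32 0, i32 0, i32 0)
  ; size 2 -> memVT i16 -> buffer_load_ushort ... lds
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 0, i32 0, i32 0, i32 0)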
@@ -8228,6 +8239,85 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds: {
    unsigned Opc;
    bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
    bool HasVOffset = !CVOffset || !CVOffset->isZero();
    unsigned Size = Op->getConstantOperandVal(4);

    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    }

    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    SmallVector<SDValue, 8> Ops;

    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
                                       { Op.getOperand(5), // VIndex
                                         VOffset }));
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    Ops.push_back(Op.getOperand(2));            // rsrc
    Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
    Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    Ops.push_back(
      DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
    Ops.push_back(
      DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8));          // swz
    Ops.push_back(M0Val.getValue(0)); // Chain
    Ops.push_back(M0Val.getValue(1)); // Glue

    auto *M = cast<MemSDNode>(Op);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
    LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
    MachinePointerInfo StorePtrI = LoadPtrI;
    StorePtrI.V = nullptr;
    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

    auto F = LoadMMO->getFlags() &
             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
    LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                      Size, LoadMMO->getBaseAlign());

    MachineMemOperand *StoreMMO =
        MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                                sizeof(int32_t), LoadMMO->getBaseAlign());

    auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});

    return SDValue(Load, 0);
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain), 0);
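The auxiliary operand is decoded the same way on both selection paths: bits 0-2 become the cache-policy (cpol) field and bit 3 selects buffer swizzling. From the raw test below:

  ; aux = 1 (bit 0) -> glc
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
  ; aux = 2 (bit 1) -> slc
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 8, i32 2)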
@@ -1099,6 +1099,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (mayWriteLDSThroughDMA(MI))
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        // VM_CNT is only relevant to vgpr or LDS.
        ScoreBrackets.determineWait(
@@ -385,6 +385,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    if (DataOpIdx == -1) // LDS DMA
      return false;
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }
@@ -0,0 +1,113 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)

define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_dword:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 lds
; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:4 glc lds
; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:8 slc lds
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
  %res = load float, float addrspace(3)* %ptr
  ret float %res
}

define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_dword_imm_voffset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    v_mov_b32_e32 v0, 0x800
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 offen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 2048, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset) {
; GCN-LABEL: buffer_load_lds_dword_v_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 offen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_s_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword off, s[0:3], s5 lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 offen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 offen offset:2048 lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_ushort:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    v_mov_b32_e32 v0, 0x800
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_ushort v0, s[0:3], 0 offen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 2048, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; GCN-LABEL: buffer_load_lds_ubyte:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_ubyte off, s[0:3], 0 offset:2048 lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
  ret void
}
@@ -0,0 +1,126 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL

declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)

define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
; SDAG-LABEL: buffer_load_lds_dword:
; SDAG:       ; %bb.0: ; %main_body
; SDAG-NEXT:    v_mov_b32_e32 v0, 8
; SDAG-NEXT:    s_mov_b32 m0, s4
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen lds
; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
; SDAG-NEXT:    v_mov_b32_e32 v0, s4
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    ds_read_b32 v0, v0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    ; return to shader part epilog
;
; GISEL-LABEL: buffer_load_lds_dword:
; GISEL:       ; %bb.0: ; %main_body
; GISEL-NEXT:    s_mov_b32 m0, s4
; GISEL-NEXT:    v_mov_b32_e32 v0, 8
; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen lds
; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
; GISEL-NEXT:    v_mov_b32_e32 v0, s4
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    ds_read_b32 v0, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
  %res = load float, float addrspace(3)* %ptr
  ret float %res
}

define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_dword_imm_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:2048 lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset) {
; GCN-LABEL: buffer_load_lds_dword_v_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], 0 idxen offen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_s_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 idxen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], s5 idxen offen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], s5 idxen offen offset:2048 lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_ushort:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    v_mov_b32_e32 v1, 0x800
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_ushort v[0:1], s[0:3], 0 idxen offen lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_ubyte:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    s_mov_b32 m0, s4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_ubyte v0, s[0:3], 0 idxen offset:2048 lds
; GCN-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
  ret void
}