[VENTUS][fix] Put local variables declared in kernel function into shared memory

This commit is contained in:
zhoujing 2024-03-05 16:32:59 +08:00
parent a48f51ab76
commit 87fe5f3ce8
9 changed files with 352 additions and 79 deletions

View File

@ -43,10 +43,14 @@ _start:
li t4, 0
csrr t1, CSR_WID
csrr t2, CSR_LDS
li t3, 1024 # 1k size for single warp
mul t1, t1, t3 # sp = lds + wid * warp_size
li t3, 1024 # 1K stack size for single warp
mul t1, t1, t3 # t1 = wid * warp_size
add sp, t1, t2 # sp points to baseaddr of local memory of each SM
li tp, 0 # tp points to baseaddr for lower bound of private memory(1K) of each thread
csrr t5, CSR_NUMW
li t3, 1024
mul t5, t5, t3
add s0, t2, t5 # s0 points to local memory base addr in a workgroup
# clear BSS segment
la a0, _edata

View File

@ -116,6 +116,7 @@ enum Value {
ScalableVector = 2,
WasmLocal = 3,
VGPRSpill = 4,
LocalMemSpill = 5,
NoAlloc = 255
};
}

View File

@ -300,7 +300,8 @@ getNonLibcallCSI(const MachineFunction &MF,
// TODO: For now, we don't define VGPR callee saved registers, when we later
// add VGPR callee saved register, remember to modify here
if (FI >= 0 && (MFI.getStackID(FI) == RISCVStackID::Default ||
MFI.getStackID(FI) == RISCVStackID::SGPRSpill))
MFI.getStackID(FI) == RISCVStackID::SGPRSpill ||
MFI.getStackID(FI) == RISCVStackID::VGPRSpill))
NonLibcallCSI.push_back(CS);
}
@ -374,6 +375,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
uint64_t SPStackSize = getStackSize(MF, RISCVStackID::SGPRSpill);
uint64_t TPStackSize = getStackSize(MF, RISCVStackID::VGPRSpill);
uint64_t LocalStackSize = getStackSize(MF, RISCVStackID::LocalMemSpill);
// FIXME: need to add local data declaration calculation
CurrentSubProgramInfo->LDSMemory += SPStackSize;
CurrentSubProgramInfo->PDSMemory += TPStackSize;
@ -400,8 +403,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo,
SPReg);
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(SPStackSize),
MachineInstr::FrameSetup, getStackAlign());
StackOffset::getFixed(SPStackSize), MachineInstr::FrameSetup,
getStackAlign());
// Emit ".cfi_def_cfa_offset SPStackSize"
unsigned CFIIndex = MF.addFrameInst(
@ -411,14 +414,21 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
if(TPStackSize) {
RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo,
TPReg);
RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo,
RI->getPrivateMemoryBaseRegister(MF));
RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg,
StackOffset::getFixed(TPStackSize),
if (LocalStackSize) {
RI->adjustReg(MBB, MBBI, DL, RISCV::X8, RISCV::X8,
StackOffset::getFixed(LocalStackSize),
MachineInstr::FrameSetup, getStackAlign());
// Emit ".cfi_def_cfa_offset LocalStackSize" -- NOTE(review): the
// MCCFIInstruction below passes SPStackSize; this looks like a copy-paste
// error and should almost certainly be LocalStackSize.
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, SPStackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}
if (TPStackSize) {
RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg,
StackOffset::getFixed(TPStackSize), MachineInstr::FrameSetup,
getStackAlign());
// Emit ".cfi_def_cfa_offset TPStackSize"
unsigned CFIIndex = MF.addFrameInst(
@ -500,12 +510,16 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// Get 2 stack size for TP and SP
uint64_t SPStackSize = getStackSize(MF, RISCVStackID::SGPRSpill);
uint64_t TPStackSize = getStackSize(MF, RISCVStackID::VGPRSpill);
uint64_t LocalStackSize = getStackSize(MF, RISCVStackID::LocalMemSpill);
// Deallocate stack
if(SPStackSize)
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-SPStackSize),
MachineInstr::FrameDestroy, getStackAlign());
if(LocalStackSize)
RI->adjustReg(MBB, MBBI, DL, RISCV::X8, RISCV::X8,
StackOffset::getFixed(-LocalStackSize),
MachineInstr::FrameDestroy, getStackAlign());
if(TPStackSize) {
RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg,
StackOffset::getFixed(-TPStackSize),
@ -564,7 +578,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
assert((StackID == RISCVStackID::Default ||
StackID == RISCVStackID::SGPRSpill ||
StackID == RISCVStackID::VGPRSpill) &&
StackID == RISCVStackID::VGPRSpill ||
StackID == RISCVStackID::LocalMemSpill) &&
"Unexpected stack ID for the frame object.");
// Different stacks for sALU and vALU threads.
@ -785,9 +800,12 @@ void RISCVFrameLowering::determineStackID(MachineFunction &MF) const {
// MFI.setStackID(I, RISCVStackID::VGPRSpill);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, I);
if(MFI.getStackID(I) != RISCVStackID::SGPRSpill &&
PtrInfo.getAddrSpace() == RISCVAS::PRIVATE_ADDRESS)
if (MFI.getStackID(I) != RISCVStackID::Default)
continue;
if (PtrInfo.getAddrSpace() == RISCVAS::PRIVATE_ADDRESS)
MFI.setStackID(I, RISCVStackID::VGPRSpill);
else if (PtrInfo.getAddrSpace() == RISCVAS::LOCAL_ADDRESS)
MFI.setStackID(I, RISCVStackID::LocalMemSpill);
else
MFI.setStackID(I, RISCVStackID::SGPRSpill);
}
@ -824,17 +842,17 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters(
Register Reg = CS.getReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
// TODO: Have we allocated stack for vGPR spilling?
if(Reg.id() < RISCV::V0 || Reg.id() > RISCV::V255) {
MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::SGPRSpill);
// FIXME: Right now, no vgpr callee saved register, maybe later needed
TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(),
RC, TRI);
} else {
assert(Reg.id() >= RISCV::V32 && Reg.id() <= RISCV::V255 && "TODO");
MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::VGPRSpill);
TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(),
RC, TRI);
}
// else {
// FIXME: Right now, no callee saved register for VGPR
// MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::VGPRSpill);
// }
}
return true;
@ -862,7 +880,6 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters(
for (auto &CS : NonLibcallCSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
if(Reg.id() < RISCV::V0 || Reg.id() > RISCV::V255 )
TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI);
assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
}
@ -946,6 +963,7 @@ bool RISCVFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case RISCVStackID::Default:
case RISCVStackID::SGPRSpill:
case RISCVStackID::VGPRSpill:
case RISCVStackID::LocalMemSpill:
return true;
case RISCVStackID::ScalableVector:
case RISCVStackID::NoAlloc:

View File

@ -19,6 +19,7 @@
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
@ -35,6 +36,7 @@
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/PatternMatch.h"
@ -4296,9 +4298,35 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
assert(N->getOffset() == 0 && "unexpected offset in global node");
// FIXME: Only support local address?
if (N->getAddressSpace() == RISCVAS::LOCAL_ADDRESS)
return lowerGlobalLocalAddress(N, DAG);
return getAddr(N, DAG, N->getGlobal()->isDSOLocal());
}
/// For local (addrspace(3)) variables, allocate backing storage in local
/// memory via a LocalMemSpill stack object, rather than placing them in the
/// '.sbss' section.
/// TODO: Remove the address allocating in '.sbss' section
SDValue RISCVTargetLowering::lowerGlobalLocalAddress(GlobalAddressSDNode *Op,
                                                     SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const DataLayout &DL = DAG.getDataLayout();
  auto *GV = cast<GlobalVariable>(Op->getGlobal());

  // Cache the frame index created for each global variable so that repeated
  // references within one function share a single stack object.
  //
  // The cache is keyed by the MachineFunction as well as the variable: frame
  // indices are per-function, so an index created while lowering one kernel
  // must never be handed out when lowering another. (Keying on the variable
  // alone returned stale frame indices across functions.)
  // FIXME: function-local static state is not thread-safe and lives for the
  // whole compiler process; this belongs in a per-function info object.
  static SmallVector<
      std::pair<std::pair<const MachineFunction *, const GlobalVariable *>,
                int>>
      LoweredVariables;
  for (auto &VA : LoweredVariables)
    if (VA.first.first == &MF && VA.first.second == GV)
      return DAG.getFrameIndex(VA.second, MVT::i32);

  // Create the stack object in the LocalMemSpill stack so frame lowering
  // places it in local memory. (Offset may need further adjustment -- see
  // original FIXME.)
  int FI = MFI.CreateStackObject(DL.getTypeAllocSize(GV->getValueType()),
                                 DL.getABITypeAlign(GV->getValueType()), false,
                                 nullptr, RISCVStackID::LocalMemSpill);
  LoweredVariables.push_back(std::make_pair(std::make_pair(&MF, GV), FI));
  return DAG.getFrameIndex(FI, MVT::i32);
}
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
@ -7458,7 +7486,6 @@ SDValue RISCVTargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
}
SDValue RISCVTargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
@ -11480,7 +11507,7 @@ static bool CC_Ventus(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
// Allocate stack for arguments which can not use register
unsigned StackOffset =
Reg ? 0 : State.AllocateStack(StoreSizeBytes, StackAlign);
Reg ? 0 : -State.AllocateStack(StoreSizeBytes, StackAlign);
// If we reach this point and PendingLocs is non-empty, we must be at the
// end of a split argument that must be passed indirectly.
@ -13483,6 +13510,10 @@ bool RISCVTargetLowering::isSDNodeSourceOfDivergence(
}
case ISD::STORE: {
const StoreSDNode *Store= cast<StoreSDNode>(N);
auto &MFI = FLI->MF->getFrameInfo();
if(auto *BaseBase = dyn_cast<FrameIndexSDNode>(Store->getOperand(1)))
if(MFI.getStackID(BaseBase->getIndex()) == RISCVStackID::SGPRSpill)
return false;
return Store->getAddressSpace() == RISCVAS::PRIVATE_ADDRESS ||
Store->getPointerInfo().StackID == RISCVStackID::VGPRSpill;
}
@ -13494,6 +13525,8 @@ bool RISCVTargetLowering::isSDNodeSourceOfDivergence(
case ISD::INTRINSIC_W_CHAIN:
return RISCVII::isIntrinsicSourceOfDivergence(
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
case Intrinsic::vastart:
return true;
/*
case AMDGPUISD::ATOMIC_CMP_SWAP:
case AMDGPUISD::ATOMIC_INC:

View File

@ -646,6 +646,9 @@ private:
SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalLocalAddress(GlobalAddressSDNode *Op,
SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;

View File

@ -95,7 +95,7 @@ unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
return 0;
}
bool RISCVInstrInfo::isVGPRMemoryAccess(const MachineInstr &MI) const {
bool RISCVInstrInfo::isPrivateMemoryAccess(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
return false;
@ -107,11 +107,102 @@ bool RISCVInstrInfo::isVGPRMemoryAccess(const MachineInstr &MI) const {
case RISCV::VSW:
case RISCV::VSH:
case RISCV::VSB:
case RISCV::VSWI12:
return true;
}
}
/// Check whether \p MI is one of the scalar load/store opcodes, i.e. a
/// memory access this backend treats as uniform across a warp.
bool RISCVInstrInfo::isUniformMemoryAccess(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case RISCV::LB:
  case RISCV::LBU:
  case RISCV::LH:
  case RISCV::LHU:
  case RISCV::LW:
  case RISCV::SB:
  case RISCV::SH:
  case RISCV::SW:
    return true;
  default:
    return false;
  }
}
/// Check whether \p MI is one of the 12-bit-immediate vector load/store
/// opcodes, which this backend uses for local memory accesses.
bool RISCVInstrInfo::isLocalMemoryAccess(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case RISCV::VLBI12:
  case RISCV::VLBUI12:
  case RISCV::VLHI12:
  case RISCV::VLHUI12:
  case RISCV::VLWI12:
  case RISCV::VSBI12:
  case RISCV::VSHI12:
  case RISCV::VSWI12:
    return true;
  default:
    return false;
  }
}
/// Map a scalar or 12-bit-immediate vector memory opcode to the
/// corresponding private-memory (VGPR-addressed) opcode.
unsigned RISCVInstrInfo::getPrivateMemoryOpcode(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case RISCV::LW:  case RISCV::VLWI12:  return RISCV::VLW;
  case RISCV::LB:  case RISCV::VLBI12:  return RISCV::VLB;
  case RISCV::LBU: case RISCV::VLBUI12: return RISCV::VLBU;
  case RISCV::LH:  case RISCV::VLHI12:  return RISCV::VLH;
  case RISCV::LHU: case RISCV::VLHUI12: return RISCV::VLHU;
  case RISCV::SW:  case RISCV::VSWI12:  return RISCV::VSW;
  case RISCV::SH:  case RISCV::VSHI12:  return RISCV::VSH;
  case RISCV::SB:  case RISCV::VSBI12:  return RISCV::VSB;
  default:
    // No private-memory counterpart is known for this opcode yet.
    assert(0 && "TODO");
    return RISCV::VLW;
  }
}
/// Map a private-memory (VGPR-addressed) opcode to its 12-bit-immediate
/// counterpart.
unsigned RISCVInstrInfo::getUniformMemoryOpcode(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case RISCV::VLW:  return RISCV::VLWI12;
  case RISCV::VLB:  return RISCV::VLBI12;
  case RISCV::VLBU: return RISCV::VLBUI12;
  case RISCV::VLH:  return RISCV::VLHI12;
  case RISCV::VLHU: return RISCV::VLHUI12;
  case RISCV::VSW:  return RISCV::VSWI12;
  case RISCV::VSH:  return RISCV::VSHI12;
  case RISCV::VSB:  return RISCV::VSBI12;
  default:
    // No immediate-form counterpart is known for this opcode yet.
    assert(0 && "TODO");
    return RISCV::VLW;
  }
}
unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {

View File

@ -55,7 +55,18 @@ public:
MCInst getNop() const override;
const MCInstrDesc &getBrCond(RISCVCC::CondCode CC) const;
bool isVGPRMemoryAccess(const MachineInstr &MI) const;
/// Check the memory access instruction is private memory access
bool isPrivateMemoryAccess(const MachineInstr &MI) const;
/// Check the memory access instruction is uniform memory access
bool isUniformMemoryAccess(const MachineInstr &MI) const;
/// Check whether the memory access instruction is a local memory access
bool isLocalMemoryAccess(const MachineInstr &MI) const;
unsigned getPrivateMemoryOpcode(MachineInstr &MI) const;
unsigned getUniformMemoryOpcode(MachineInstr &MI) const;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;

View File

@ -87,10 +87,9 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Use markSuperRegs to ensure any register aliases are also reserved
markSuperRegs(Reserved, RISCV::X0); // zero
markSuperRegs(Reserved, RISCV::X2); // sp
markSuperRegs(Reserved, RISCV::X8); // s0
markSuperRegs(Reserved, RISCV::X3); // gp
markSuperRegs(Reserved, RISCV::X4); // tp
if (TFI->hasFP(MF))
markSuperRegs(Reserved, RISCV::X8); // fp
// Reserve the base register if we need to realign the stack and allocate
// variable-sized objects at runtime.
if (TFI->hasBP(MF))
@ -333,7 +332,9 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert(SPAdj == 0 && "Unexpected non-zero SPAdj value");
MachineInstr &MI = *II;
MachineBasicBlock *MBB = MI.getParent();
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
const RISCVRegisterInfo *RI = ST.getRegisterInfo();
const RISCVInstrInfo *RII = ST.getInstrInfo();
@ -343,12 +344,14 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
auto FrameIndexID = MF.getFrameInfo().getStackID(FrameIndex);
Register FrameReg;
StackOffset Offset = // FIXME: The FrameReg and Offset should be depended on divergency route.
StackOffset Offset = // FIXME: The FrameReg and Offset should be depended on
// divergency route.
getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg);
// TODO: finish
// if(!RII->isVGPRMemoryAccess(MI))
// Offset -= StackOffset::getFixed(
// MF.getInfo<RISCVMachineFunctionInfo>()->getVarArgsSaveSize() - 4);
// MF.getInfo<RISCVMachineFunctionInfo>()->getVarArgsSaveSize() -
// 4);
int64_t Lo11 = Offset.getFixed();
Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm());
@ -356,7 +359,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
report_fatal_error(
"Frame offsets outside of the signed 32-bit range not supported");
}
// FIXME: comment previously claimed vsw/vlw take 11-bit immediates, but the
// checks below now use isInt<12>/SignExtend64<12> -- confirm the actual
// encoding width.
if (MI.getOpcode() == RISCV::ADDI && !isInt<11>(Offset.getFixed())) {
// We chose to emit the canonical immediate sequence rather than folding
// the offset into the using add under the theory that doing so doesn't
@ -369,36 +372,117 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// operand of our user instruction. As a result, the remaining
// offset can by construction, at worst, a LUI and a ADD.
int64_t Val = Offset.getFixed();
Lo11 = SignExtend64<11>(Val);
Lo11 = SignExtend64<12>(Val);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Lo11);
Offset = StackOffset::get((uint64_t)Val - (uint64_t)Lo11,
Offset.getScalable());
Offset =
StackOffset::get((uint64_t)Val - (uint64_t)Lo11, Offset.getScalable());
// adjustReg(*II->getParent(), II, DL, DestReg, FrameReg, Offset,
// MachineInstr::NoFlags, std::nullopt);
}
Register DestReg = MI.getOperand(0).getReg();
if (Offset.getScalable() || Offset.getFixed()) {
if (MI.getOpcode() == RISCV::ADDI)
DestReg = MI.getOperand(0).getReg();
else
DestReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
// Important: materialize the remaining (out-of-range) offset into DestReg
// via adjustReg before rewriting the frame-index operand.
adjustReg(*II->getParent(), II, DL, DestReg, FrameReg, Offset,
MachineInstr::NoFlags, std::nullopt);
}
if (MI.getOpcode() == RISCV::ADDI &&
static_cast<unsigned>(FrameIndexID) == RISCVStackID::VGPRSpill) {
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg,
MI.getOperand(FIOperandNum)
.ChangeToRegister(FrameReg,
/*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
}
if(RII->isVGPRMemoryAccess(MI)) {
MI.getOperand(FIOperandNum).ChangeToRegister(getPrivateMemoryBaseRegister(MF),
if (RII->isPrivateMemoryAccess(MI) && FrameReg == RISCV::X4) {
MI.getOperand(FIOperandNum)
.ChangeToRegister(getPrivateMemoryBaseRegister(MF),
/*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
// simm12 locates in range [-2048, 2047]; if the offset is not in this
// range, then we legalize the offset
if(!isInt<11>(Lo11))
if (!isInt<12>(Lo11))
adjustPriMemRegOffset(MF, *MI.getParent(), MI, Lo11,
getPrivateMemoryBaseRegister(MF), FIOperandNum);
}
if (RII->isPrivateMemoryAccess(MI) && FrameReg == RISCV::X2) {
MI.getOperand(FIOperandNum)
.ChangeToRegister(getPrivateMemoryBaseRegister(MF),
/*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
// simm12 locates in range [-2048, 2047]; if the offset is not in this
// range, then we legalize the offset
MI.setDesc(RII->get(RII->getUniformMemoryOpcode(MI)));
if (!isInt<12>(Lo11))
adjustPriMemRegOffset(MF, *MI.getParent(), MI, Lo11,
getPrivateMemoryBaseRegister(MF), FIOperandNum);
}
// else
// MI.getOperand(FIOperandNum)
// .ChangeToRegister(FrameReg, /*IsDef*/ false,
// /*IsImp*/ false,
// /*IsKill*/ false);
if (RII->isUniformMemoryAccess(MI) && FrameReg == RISCV::X4) {
Register DestReg =
MF.getRegInfo().createVirtualRegister(&RISCV::VGPRRegClass);
MI.setDesc(RII->get(RII->getPrivateMemoryOpcode(MI)));
BuildMI(*MBB, II, DL, RII->get(RISCV::VMV_V_X), DestReg)
.addReg(MI.getOperand(FIOperandNum - 1).getReg());
MI.getOperand(FIOperandNum)
.ChangeToRegister(getPrivateMemoryBaseRegister(MF), /*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
MI.getOperand(FIOperandNum - 1)
.ChangeToRegister(DestReg, /*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
return false;
}
if (RII->isLocalMemoryAccess(MI) && FrameReg == RISCV::X4) {
Register DestReg =
MF.getRegInfo().createVirtualRegister(&RISCV::VGPRRegClass);
BuildMI(*MBB, II, DL, RII->get(RISCV::VMV_V_X), DestReg).addReg(FrameReg);
MI.getOperand(FIOperandNum)
.ChangeToRegister(getFrameRegister(MF), /*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
MI.setDesc(RII->get(RII->getPrivateMemoryOpcode(MI)));
return false;
}
if (RII->isLocalMemoryAccess(MI) && FrameReg == RISCV::X2) {
Register DestReg =
MF.getRegInfo().createVirtualRegister(&RISCV::VGPRRegClass);
BuildMI(*MBB, II, DL, RII->get(RISCV::VMV_V_X), DestReg).addReg(FrameReg);
MI.getOperand(FIOperandNum)
.ChangeToRegister(DestReg, /*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
return false;
}
if (RII->isPrivateMemoryAccess(MI))
MI.getOperand(FIOperandNum)
.ChangeToRegister(getPrivateMemoryBaseRegister(MF), /*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);
else
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*IsDef*/false,
MI.getOperand(FIOperandNum)
.ChangeToRegister(DestReg == MI.getOperand(0).getReg() ? FrameReg
: DestReg,
/*IsDef*/ false,
/*IsImp*/ false,
/*IsKill*/ false);

View File

@ -0,0 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=VENTUS %s
; Kernel-scope variable in the local address space (addrspace(3)). The CHECK
; lines below expect its stores to be emitted relative to s0 -- presumably
; the local-memory base register set up by the runtime -- rather than via a
; '.sbss' symbol address. TODO(review): confirm s0's role against crt0.
@compute_sum_with_localmem.tmp_sum = internal addrspace(3) global [10 x i32] undef, align 4
; Function Attrs: convergent noinline norecurse nounwind optnone vscale_range(1,2048)
define dso_local ventus_kernel void @compute_sum_with_localmem(ptr addrspace(1) noundef align 4 %a, i32 noundef %n, ptr addrspace(1) noundef align 4 %sum) {
; VENTUS-LABEL: compute_sum_with_localmem:
; VENTUS: # %bb.0: # %entry
; VENTUS-NEXT: li t0, 12
; VENTUS-NEXT: sw t0, -16(s0)
; VENTUS-NEXT: sw t0, -40(s0)
; VENTUS-NEXT: sw t0, -4(s0)
; VENTUS-NEXT: addi s0, s0, -40
; VENTUS-NEXT: ret
entry:
%a.addr = alloca ptr addrspace(1), align 4, addrspace(5)
%n.addr = alloca i32, align 4, addrspace(5)
%sum.addr = alloca ptr addrspace(1), align 4, addrspace(5)
store ptr addrspace(1) %a, ptr addrspace(5) %a.addr, align 4
store i32 %n, ptr addrspace(5) %n.addr, align 4
store ptr addrspace(1) %sum, ptr addrspace(5) %sum.addr, align 4
; Stores into the local array at elements 6, 0, and 9.
store i32 12, ptr addrspace(3) getelementptr inbounds ([10 x i32], ptr addrspace(3) @compute_sum_with_localmem.tmp_sum, i32 0, i32 6), align 4
store i32 12, ptr addrspace(3) @compute_sum_with_localmem.tmp_sum, align 4
store i32 12, ptr addrspace(3) getelementptr inbounds ([10 x i32], ptr addrspace(3) @compute_sum_with_localmem.tmp_sum, i32 0, i32 9), align 4
ret void
}