From 87fe5f3ce8ec579389b657c093582bdcb5b99edc Mon Sep 17 00:00:00 2001 From: zhoujing Date: Tue, 5 Mar 2024 16:32:59 +0800 Subject: [PATCH] [VENTUS][fix] Put local variables declared in kernel function into shared memory --- libclc/riscv32/lib/crt0.S | 10 +- llvm/lib/Target/RISCV/RISCV.h | 1 + llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 100 +++++++------ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 45 +++++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 + llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 95 +++++++++++- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 13 +- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 136 ++++++++++++++---- .../VentusGPGPU/local_addressed_variables.ll | 28 ++++ 9 files changed, 352 insertions(+), 79 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/VentusGPGPU/local_addressed_variables.ll diff --git a/libclc/riscv32/lib/crt0.S b/libclc/riscv32/lib/crt0.S index 212d96501207..2d86a90546c4 100644 --- a/libclc/riscv32/lib/crt0.S +++ b/libclc/riscv32/lib/crt0.S @@ -39,14 +39,18 @@ _start: li t4,32 vsetvli t4,t4,e32,m1,ta,ma li t4,0x2000 - csrrs t4,mstatus,t4 + csrrs t4, mstatus, t4 li t4, 0 csrr t1, CSR_WID csrr t2, CSR_LDS - li t3, 1024 # 1k size for single warp - mul t1, t1, t3 # sp = lds + wid * warp_size + li t3, 1024 # 1K size for single warp + mul t1, t1, t3 # t1 = wid * warp_size add sp, t1, t2 # sp points to baseaddr of local memory of each SM li tp, 0 # tp points to baseaddr for lower bound of private memory(1K) of each thread + csrr t5, CSR_NUMW + li t3, 1024 + mul t5, t5, t3 + add s0, t2, t5 # s0 points to local memory base addr in a workgroup # clear BSS segment la a0, _edata diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index ed6597b7fca3..318a8dc9518b 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -116,6 +116,7 @@ enum Value { ScalableVector = 2, WasmLocal = 3, VGPRSpill = 4, + LocalMemSpill = 5, NoAlloc = 255 }; } diff --git
a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 712d4a59e6fb..10d6418860f3 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -300,7 +300,8 @@ getNonLibcallCSI(const MachineFunction &MF, // TODO: For now, we don't define VGPR callee saved registers, when we later // add VGPR callee saved register, remember to modify here if (FI >= 0 && (MFI.getStackID(FI) == RISCVStackID::Default || - MFI.getStackID(FI) == RISCVStackID::SGPRSpill)) + MFI.getStackID(FI) == RISCVStackID::SGPRSpill || + MFI.getStackID(FI) == RISCVStackID::VGPRSpill)) NonLibcallCSI.push_back(CS); } @@ -374,6 +375,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, uint64_t SPStackSize = getStackSize(MF, RISCVStackID::SGPRSpill); uint64_t TPStackSize = getStackSize(MF, RISCVStackID::VGPRSpill); + uint64_t LocalStackSize = getStackSize(MF, RISCVStackID::LocalMemSpill); + // FIXME: need to add local data declaration calculation CurrentSubProgramInfo->LDSMemory += SPStackSize; CurrentSubProgramInfo->PDSMemory += TPStackSize; @@ -397,11 +400,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Allocate space on the local-mem stack and private-mem stack if necessary. 
if(SPStackSize) { - RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo, + RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo, SPReg); RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, - StackOffset::getFixed(SPStackSize), - MachineInstr::FrameSetup, getStackAlign()); + StackOffset::getFixed(SPStackSize), MachineInstr::FrameSetup, + getStackAlign()); // Emit ".cfi_def_cfa_offset SPStackSize" unsigned CFIIndex = MF.addFrameInst( @@ -411,14 +414,21 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - if(TPStackSize) { - RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo, - TPReg); - RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo, - RI->getPrivateMemoryBaseRegister(MF)); - RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg, - StackOffset::getFixed(TPStackSize), + if (LocalStackSize) { + RI->adjustReg(MBB, MBBI, DL, RISCV::X8, RISCV::X8, + StackOffset::getFixed(LocalStackSize), MachineInstr::FrameSetup, getStackAlign()); + // Emit ".cfi_def_cfa_offset Local memory StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, SPStackSize)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } + if (TPStackSize) { + RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg, + StackOffset::getFixed(TPStackSize), MachineInstr::FrameSetup, + getStackAlign()); // Emit ".cfi_def_cfa_offset TPStackSize" unsigned CFIIndex = MF.addFrameInst( @@ -500,23 +510,27 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // Get 2 stack size for TP and SP uint64_t SPStackSize = getStackSize(MF, RISCVStackID::SGPRSpill); uint64_t TPStackSize = getStackSize(MF, RISCVStackID::VGPRSpill); - + uint64_t LocalStackSize = getStackSize(MF, RISCVStackID::LocalMemSpill); // Deallocate stack if(SPStackSize) RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, 
StackOffset::getFixed(-SPStackSize), MachineInstr::FrameDestroy, getStackAlign()); + if(LocalStackSize) + RI->adjustReg(MBB, MBBI, DL, RISCV::X8, RISCV::X8, + StackOffset::getFixed(-LocalStackSize), + MachineInstr::FrameDestroy, getStackAlign()); if(TPStackSize) { RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg, StackOffset::getFixed(-TPStackSize), MachineInstr::FrameDestroy, getStackAlign()); - + // Restore V32 BuildMI(MBB, MBBI, DL, TII->get(RISCV::VMV_V_X), RI->getPrivateMemoryBaseRegister(MF)) .addReg(TPReg); } - + // Emit epilogue for shadow call stack. emitSCSEpilogue(MF, MBB, MBBI, DL); } @@ -527,8 +541,8 @@ uint64_t RISCVFrameLowering::getStackOffset(const MachineFunction &MF, const MachineFrameInfo &MFI = MF.getFrameInfo(); uint64_t StackSize = 0; - // because the parameters spilling to the stack are not in the current TP - // stack, the offset in the current stack should not be calculated from a + // because the parameters spilling to the stack are not in the current TP + // stack, the offset in the current stack should not be calculated from a // negative FI. for (int I = FI < 0 ? MFI.getObjectIndexBegin() : 0; I != FI + 1; I++) { if (static_cast(MFI.getStackID(I)) == Stack) { @@ -545,7 +559,7 @@ uint64_t RISCVFrameLowering::getStackOffset(const MachineFunction &MF, // instead of current stack. if (FI < 0 && !MF.getFunction().isVarArg()) StackSize += getStackSize(MF, RISCVStackID::VGPRSpill); - + return StackSize; } @@ -564,7 +578,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, assert((StackID == RISCVStackID::Default || StackID == RISCVStackID::SGPRSpill || - StackID == RISCVStackID::VGPRSpill) && + StackID == RISCVStackID::VGPRSpill || + StackID == RISCVStackID::LocalMemSpill) && "Unexpected stack ID for the frame object."); // Different stacks for sALU and vALU threads. 
@@ -586,7 +601,7 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( auto *CurrentProgramInfo = const_cast( MF.getSubtarget().getVentusProgramInfo()); - // When accessing a new function, we need to add a new container to calculate + // When accessing a new function, we need to add a new container to calculate // its resource usage. CurrentProgramInfo->RegisterAddedSetVec.push_back(DenseSet()); CurrentProgramInfo->SubProgramInfoVec.push_back(SubVentusProgramInfo()); @@ -604,14 +619,14 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( if (!Op.isReg()) continue; - RI->insertRegToSet(MRI, CurrentRegisterAddedSet, + RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo, Op.getReg()); } } } // ra register is a special register. - RI->insertRegToSet(MRI, CurrentRegisterAddedSet, + RI->insertRegToSet(MRI, CurrentRegisterAddedSet, CurrentSubProgramInfo, RISCV::X1); } @@ -706,14 +721,14 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount), MachineInstr::NoFlags, getStackAlign()); - + // The value of TP will be re-assigned to V32 at the end of the callee - // function, which is actually the TP value after ADJCALLSTACKUP, so the - // tp value after ADJCALLSTACKDOWN should be reassigned to V32 to ensure - // that it is consistent with the TP value that has not been internally - // adjusted (that is, excluding the initial TP adjustment) within the + // function, which is actually the TP value after ADJCALLSTACKUP, so the + // tp value after ADJCALLSTACKDOWN should be reassigned to V32 to ensure + // that it is consistent with the TP value that has not been internally + // adjusted (that is, excluding the initial TP adjustment) within the // current function. 
- if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN) + if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN) BuildMI(MBB, MI, DL, TII->get(RISCV::VMV_V_X), RI.getPrivateMemoryBaseRegister(MF)) .addReg(TPReg); @@ -765,10 +780,10 @@ uint64_t RISCVFrameLowering::getStackSize(const MachineFunction &MF, RISCVStackID::Value ID) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); uint64_t StackSize = 0; - + for(int I = 0; I != MFI.getObjectIndexEnd(); I++) { if(static_cast(MFI.getStackID(I)) == ID) { - Align Alignment = MFI.getObjectAlign(I).value() <= 4 ? + Align Alignment = MFI.getObjectAlign(I).value() <= 4 ? Align(4) : MFI.getObjectAlign(I); StackSize += MFI.getObjectSize(I); StackSize = alignTo(StackSize, Alignment); @@ -780,16 +795,19 @@ uint64_t RISCVFrameLowering::getStackSize(const MachineFunction &MF, void RISCVFrameLowering::determineStackID(MachineFunction &MF) const { MachineFrameInfo &MFI = MF.getFrameInfo(); - for(int I = MFI.getObjectIndexBegin(); I != MFI.getObjectIndexEnd(); I++) { + for (int I = MFI.getObjectIndexBegin(); I != MFI.getObjectIndexEnd(); I++) { // FIXME: There is no sGPR spill stack! 
// MFI.setStackID(I, RISCVStackID::VGPRSpill); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF,I); - if(MFI.getStackID(I) != RISCVStackID::SGPRSpill && - PtrInfo.getAddrSpace() == RISCVAS::PRIVATE_ADDRESS) + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, I); + if (MFI.getStackID(I) != RISCVStackID::Default) + continue; + if (PtrInfo.getAddrSpace() == RISCVAS::PRIVATE_ADDRESS) MFI.setStackID(I, RISCVStackID::VGPRSpill); + else if (PtrInfo.getAddrSpace() == RISCVAS::LOCAL_ADDRESS) + MFI.setStackID(I, RISCVStackID::LocalMemSpill); else - MFI.setStackID(I, RISCVStackID::SGPRSpill); + MFI.setStackID(I, RISCVStackID::SGPRSpill); } } @@ -824,17 +842,17 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - // TODO: Have we allocated stack for vGPR spilling? if(Reg.id() < RISCV::V0 || Reg.id() > RISCV::V255) { MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::SGPRSpill); // FIXME: Right now, no vgpr callee saved register, maybe later needed TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(), RC, TRI); + } else { + assert(Reg.id() >= RISCV::V32 && Reg.id() <= RISCV::V255 && "TODO"); + MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::VGPRSpill); + TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(), + RC, TRI); } - // else { - // FIXME: Right now, no callee saved register for VGPR - // MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::VGPRSpill); - // } } return true; @@ -862,8 +880,7 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : NonLibcallCSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - if(Reg.id() < RISCV::V0 || Reg.id() > RISCV::V255 ) - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI); 
assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } @@ -946,6 +963,7 @@ bool RISCVFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case RISCVStackID::Default: case RISCVStackID::SGPRSpill: case RISCVStackID::VGPRSpill: + case RISCVStackID::LocalMemSpill: return true; case RISCVStackID::ScalableVector: case RISCVStackID::NoAlloc: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f6ef1418cab4..b04b6357188c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19,6 +19,7 @@ #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/IR/PatternMatch.h" @@ -4296,9 +4298,35 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *N = cast(Op); assert(N->getOffset() == 0 && "unexpected offset in global node"); + // FIXME: Only support local address? 
+ if (N->getAddressSpace() == RISCVAS::LOCAL_ADDRESS) + return lowerGlobalLocalAddress(N, DAG); return getAddr(N, DAG, N->getGlobal()->isDSOLocal()); } +/// For local variables, we need to store variables into local memory, +/// rather than put it into '.sbss' section +/// TODO: Remove the address allocating in '.sbss' section +SDValue RISCVTargetLowering::lowerGlobalLocalAddress(GlobalAddressSDNode *Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + static SmallVector> LoweredVariables; + + MachineFrameInfo &MFI = MF.getFrameInfo(); + const DataLayout &DL = DAG.getDataLayout(); + auto *GV = cast(Op->getGlobal()); + for(auto &VA : LoweredVariables) { + if(VA.first == GV) + return DAG.getFrameIndex(VA.second, MVT::i32); + } + unsigned AlignValue = DL.getABITypeAlignment(GV->getValueType()); + int FI = MFI.CreateStackObject(DL.getTypeAllocSize(GV->getValueType()) + /*Offset need to be modified too*/, + Align(AlignValue), false, nullptr, RISCVStackID::LocalMemSpill); + LoweredVariables.push_back(std::make_pair(GV, FI)); + return DAG.getFrameIndex(FI, MVT::i32); +} + SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { BlockAddressSDNode *N = cast(Op); @@ -7458,7 +7486,6 @@ SDValue RISCVTargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset)); } - SDValue RISCVTargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, @@ -11480,7 +11507,7 @@ static bool CC_Ventus(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // Allocate stack for arguments which can not use register unsigned StackOffset = - Reg ? 0 : State.AllocateStack(StoreSizeBytes, StackAlign); + Reg ? 0 : -State.AllocateStack(StoreSizeBytes, StackAlign); // If we reach this point and PendingLocs is non-empty, we must be at the // end of a split argument that must be passed indirectly. 
@@ -11788,7 +11815,7 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, ValVT = LocVT; } - // Just align to 4 bytes, because parameters more than 4 bytes will be split + // Just align to 4 bytes, because parameters more than 4 bytes will be split // into 4-byte parameters int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), 0, /*IsImmutable=*/true); @@ -11904,7 +11931,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL); else if (VA.isRegLoc()) ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, *this, Ins[i]); - else + else ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL); if (VA.getLocInfo() == CCValAssign::Indirect) { @@ -12264,12 +12291,12 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X4, PtrVT); SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, - DAG.getIntPtrConstant(-((int)VA.getLocMemOffset() + DAG.getIntPtrConstant(-((int)VA.getLocMemOffset() + CurrentFrameSize), DL)); // Emit the store. 
MemOpChains.push_back( - DAG.getStore(Chain, DL, ArgValue, Address, + DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo(RISCVAS::PRIVATE_ADDRESS))); } } @@ -13483,6 +13510,10 @@ bool RISCVTargetLowering::isSDNodeSourceOfDivergence( } case ISD::STORE: { const StoreSDNode *Store= cast(N); + auto &MFI = FLI->MF->getFrameInfo(); + if(auto *BaseBase = dyn_cast(Store->getOperand(1))) + if(MFI.getStackID(BaseBase->getIndex()) == RISCVStackID::SGPRSpill) + return false; return Store->getAddressSpace() == RISCVAS::PRIVATE_ADDRESS || Store->getPointerInfo().StackID == RISCVStackID::VGPRSpill; } @@ -13494,6 +13525,8 @@ bool RISCVTargetLowering::isSDNodeSourceOfDivergence( case ISD::INTRINSIC_W_CHAIN: return RISCVII::isIntrinsicSourceOfDivergence( cast(N->getOperand(1))->getZExtValue()); + case Intrinsic::vastart: + return true; /* case AMDGPUISD::ATOMIC_CMP_SWAP: case AMDGPUISD::ATOMIC_INC: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 0965b7020ed1..6d5641af0cf5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -646,6 +646,9 @@ private: SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerGlobalLocalAddress(GlobalAddressSDNode *Op, + SelectionDAG &DAG) const; SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 18866771df2e..7859e15475ca 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -95,7 +95,7 @@ unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return 0; } -bool RISCVInstrInfo::isVGPRMemoryAccess(const MachineInstr &MI) const { +bool 
RISCVInstrInfo::isPrivateMemoryAccess(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return false; @@ -107,11 +107,102 @@ bool RISCVInstrInfo::isVGPRMemoryAccess(const MachineInstr &MI) const { case RISCV::VSW: case RISCV::VSH: case RISCV::VSB: - case RISCV::VSWI12: return true; } } +bool RISCVInstrInfo::isUniformMemoryAccess(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + return false; + case RISCV::LW: + case RISCV::LB: + case RISCV::LBU: + case RISCV::LH: + case RISCV::LHU: + case RISCV::SW: + case RISCV::SH: + case RISCV::SB: + return true; + } +} + +bool RISCVInstrInfo::isLocalMemoryAccess(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + return false; + case RISCV::VLWI12: + case RISCV::VLBI12: + case RISCV::VLBUI12: + case RISCV::VLHI12: + case RISCV::VLHUI12: + case RISCV::VSWI12: + case RISCV::VSHI12: + case RISCV::VSBI12: + return true; + } +} + + + +unsigned RISCVInstrInfo::getPrivateMemoryOpcode(MachineInstr &MI) const { + switch (MI.getOpcode()) { + case RISCV::LW: + case RISCV::VLWI12: + return RISCV::VLW; + case RISCV::LB: + case RISCV::VLBI12: + return RISCV::VLB; + case RISCV::LBU: + case RISCV::VLBUI12: + return RISCV::VLBU; + case RISCV::LH: + case RISCV::VLHI12: + return RISCV::VLH; + case RISCV::LHU: + case RISCV::VLHUI12: + return RISCV::VLHU; + case RISCV::SW: + case RISCV::VSWI12: + return RISCV::VSW; + case RISCV::SH: + case RISCV::VSHI12: + return RISCV::VSH; + case RISCV::SB: + case RISCV::VSBI12: + return RISCV::VSB; + default: + // MI.dump(); + assert(0 && "TODO"); + return RISCV::VLW; + } +} + +unsigned RISCVInstrInfo::getUniformMemoryOpcode(MachineInstr &MI) const { + switch (MI.getOpcode()) { + case RISCV::VLW: + return RISCV::VLWI12; + case RISCV::VLB: + return RISCV::VLBI12; + case RISCV::VLBU: + return RISCV::VLBUI12; + case RISCV::VLH: + return RISCV::VLHI12; + case RISCV::VLHU: + return RISCV::VLHUI12; + case RISCV::VSW: + return RISCV::VSWI12; + case 
RISCV::VSH: + return RISCV::VSHI12; + case RISCV::VSB: + return RISCV::VSBI12; + default: + // MI.dump(); + assert(0 && "TODO"); + return RISCV::VLW; + } +} + unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 46470aaa5ff0..f8f757380787 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -55,7 +55,18 @@ public: MCInst getNop() const override; const MCInstrDesc &getBrCond(RISCVCC::CondCode CC) const; - bool isVGPRMemoryAccess(const MachineInstr &MI) const; + /// Check the memory access instruction is private memory access + bool isPrivateMemoryAccess(const MachineInstr &MI) const; + + /// Check the memory access instruction is uniform memory access + bool isUniformMemoryAccess(const MachineInstr &MI) const; + + /// Check the memory access instruction is local memory access + bool isLocalMemoryAccess(const MachineInstr &MI) const; + + unsigned getPrivateMemoryOpcode(MachineInstr &MI) const; + + unsigned getUniformMemoryOpcode(MachineInstr &MI) const; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index c8c5a1c06b7e..b9ca2dadd5ef 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -87,10 +87,9 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Use markSuperRegs to ensure any register aliases are also reserved markSuperRegs(Reserved, RISCV::X0); // zero markSuperRegs(Reserved, RISCV::X2); // sp + markSuperRegs(Reserved, RISCV::X8); // s0 markSuperRegs(Reserved, RISCV::X3); // gp markSuperRegs(Reserved, RISCV::X4); // tp - if (TFI->hasFP(MF)) - markSuperRegs(Reserved, RISCV::X8); // fp // Reserve the base register if we need
to realign the stack and allocate // variable-sized objects at runtime. if (TFI->hasBP(MF)) @@ -333,9 +332,11 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(SPAdj == 0 && "Unexpected non-zero SPAdj value"); MachineInstr &MI = *II; + MachineBasicBlock *MBB = MI.getParent(); MachineFunction &MF = *MI.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); const RISCVSubtarget &ST = MF.getSubtarget(); - const RISCVRegisterInfo * RI = ST.getRegisterInfo(); + const RISCVRegisterInfo *RI = ST.getRegisterInfo(); const RISCVInstrInfo *RII = ST.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -343,12 +344,14 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, auto FrameIndexID = MF.getFrameInfo().getStackID(FrameIndex); Register FrameReg; - StackOffset Offset = // FIXME: The FrameReg and Offset should be depended on divergency route. + StackOffset Offset = // FIXME: The FrameReg and Offset should be depended on + // divergency route. getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg); // TODO: finish // if(!RII->isVGPRMemoryAccess(MI)) // Offset -= StackOffset::getFixed( - // MF.getInfo()->getVarArgsSaveSize() - 4); + // MF.getInfo()->getVarArgsSaveSize() - + // 4); int64_t Lo11 = Offset.getFixed(); Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); @@ -356,7 +359,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, report_fatal_error( "Frame offsets outside of the signed 32-bit range not supported"); } - + // FIXME: vsw/vlw has 11 bits immediates if (MI.getOpcode() == RISCV::ADDI && !isInt<11>(Offset.getFixed())) { // We chose to emit the canonical immediate sequence rather than folding // the offset into the using add under the theory that doing so doesn't @@ -369,38 +372,119 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // operand of our user instruction. 
As a result, the remaining // offset can by construction, at worst, a LUI and a ADD. int64_t Val = Offset.getFixed(); - Lo11 = SignExtend64<11>(Val); - + Lo11 = SignExtend64<12>(Val); MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Lo11); - Offset = StackOffset::get((uint64_t)Val - (uint64_t)Lo11, - Offset.getScalable()); + Offset = + StackOffset::get((uint64_t)Val - (uint64_t)Lo11, Offset.getScalable()); + // adjustReg(*II->getParent(), II, DL, DestReg, FrameReg, Offset, + // MachineInstr::NoFlags, std::nullopt); } - if(MI.getOpcode() == RISCV::ADDI && - static_cast(FrameIndexID) == RISCVStackID::VGPRSpill) { - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, - /*IsDef*/false, - /*IsImp*/false, - /*IsKill*/false); + Register DestReg = MI.getOperand(0).getReg(); + if (Offset.getScalable() || Offset.getFixed()) { + if (MI.getOpcode() == RISCV::ADDI) + DestReg = MI.getOperand(0).getReg(); + else + DestReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); + // !!!Very important for adjust + adjustReg(*II->getParent(), II, DL, DestReg, FrameReg, Offset, + MachineInstr::NoFlags, std::nullopt); + } + if (MI.getOpcode() == RISCV::ADDI && + static_cast(FrameIndexID) == RISCVStackID::VGPRSpill) { + MI.getOperand(FIOperandNum) + .ChangeToRegister(FrameReg, + /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); } - if(RII->isVGPRMemoryAccess(MI)) { - MI.getOperand(FIOperandNum).ChangeToRegister(getPrivateMemoryBaseRegister(MF), - /*IsDef*/false, - /*IsImp*/false, - /*IsKill*/false); + if (RII->isPrivateMemoryAccess(MI) && FrameReg == RISCV::X4) { + MI.getOperand(FIOperandNum) + .ChangeToRegister(getPrivateMemoryBaseRegister(MF), + /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); // simm11 locates in range [-1024, 1023], if offset not in this range, then // we legalize the offset - if(!isInt<11>(Lo11)) + if (!isInt<12>(Lo11)) adjustPriMemRegOffset(MF, *MI.getParent(), MI, Lo11, - getPrivateMemoryBaseRegister(MF), FIOperandNum); +
getPrivateMemoryBaseRegister(MF), FIOperandNum); } + if (RII->isPrivateMemoryAccess(MI) && FrameReg == RISCV::X2) { + MI.getOperand(FIOperandNum) + .ChangeToRegister(getPrivateMemoryBaseRegister(MF), + /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); + // simm11 locates in range [-1024, 1023], if offset not in this range, then + // we legalize the offset + MI.setDesc(RII->get(RII->getUniformMemoryOpcode(MI))); + if (!isInt<12>(Lo11)) + adjustPriMemRegOffset(MF, *MI.getParent(), MI, Lo11, + getPrivateMemoryBaseRegister(MF), FIOperandNum); + } + + // else + // MI.getOperand(FIOperandNum) + // .ChangeToRegister(FrameReg, /*IsDef*/ false, + // /*IsImp*/ false, + // /*IsKill*/ false); + if (RII->isUniformMemoryAccess(MI) && FrameReg == RISCV::X4) { + Register DestReg = + MF.getRegInfo().createVirtualRegister(&RISCV::VGPRRegClass); + MI.setDesc(RII->get(RII->getPrivateMemoryOpcode(MI))); + BuildMI(*MBB, II, DL, RII->get(RISCV::VMV_V_X), DestReg) + .addReg(MI.getOperand(FIOperandNum - 1).getReg()); + MI.getOperand(FIOperandNum) + .ChangeToRegister(getPrivateMemoryBaseRegister(MF), /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); + MI.getOperand(FIOperandNum - 1) + .ChangeToRegister(DestReg, /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); + + return false; + } + + if (RII->isLocalMemoryAccess(MI) && FrameReg == RISCV::X4) { + Register DestReg = + MF.getRegInfo().createVirtualRegister(&RISCV::VGPRRegClass); + BuildMI(*MBB, II, DL, RII->get(RISCV::VMV_V_X), DestReg).addReg(FrameReg); + MI.getOperand(FIOperandNum) + .ChangeToRegister(getFrameRegister(MF), /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); + MI.setDesc(RII->get(RII->getPrivateMemoryOpcode(MI))); + return false; + } + + if (RII->isLocalMemoryAccess(MI) && FrameReg == RISCV::X2) { + Register DestReg = + MF.getRegInfo().createVirtualRegister(&RISCV::VGPRRegClass); + BuildMI(*MBB, II, DL, RII->get(RISCV::VMV_V_X), DestReg).addReg(FrameReg); + MI.getOperand(FIOperandNum) + 
.ChangeToRegister(DestReg, /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); + return false; + } + + if (RII->isPrivateMemoryAccess(MI)) + MI.getOperand(FIOperandNum) + .ChangeToRegister(getPrivateMemoryBaseRegister(MF), /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); else - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*IsDef*/false, - /*IsImp*/false, - /*IsKill*/false); + MI.getOperand(FIOperandNum) + .ChangeToRegister(DestReg == MI.getOperand(0).getReg() ? FrameReg + : DestReg, + /*IsDef*/ false, + /*IsImp*/ false, + /*IsKill*/ false); // If after materializing the adjustment, we have a pointless ADDI, remove it if (MI.getOpcode() == RISCV::ADDI && diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/local_addressed_variables.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/local_addressed_variables.ll new file mode 100644 index 000000000000..91f19d76bda6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/local_addressed_variables.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=VENTUS %s + +@compute_sum_with_localmem.tmp_sum = internal addrspace(3) global [10 x i32] undef, align 4 + +; Function Attrs: convergent noinline norecurse nounwind optnone vscale_range(1,2048) +define dso_local ventus_kernel void @compute_sum_with_localmem(ptr addrspace(1) noundef align 4 %a, i32 noundef %n, ptr addrspace(1) noundef align 4 %sum) { +; VENTUS-LABEL: compute_sum_with_localmem: +; VENTUS: # %bb.0: # %entry +; VENTUS-NEXT: li t0, 12 +; VENTUS-NEXT: sw t0, -16(s0) +; VENTUS-NEXT: sw t0, -40(s0) +; VENTUS-NEXT: sw t0, -4(s0) +; VENTUS-NEXT: addi s0, s0, -40 +; VENTUS-NEXT: ret +entry: + %a.addr = alloca ptr addrspace(1), align 4, addrspace(5) + %n.addr = alloca i32, align 4, addrspace(5) + %sum.addr = alloca ptr addrspace(1), align 4, addrspace(5) + store ptr addrspace(1) %a, ptr addrspace(5) 
%a.addr, align 4 + store i32 %n, ptr addrspace(5) %n.addr, align 4 + store ptr addrspace(1) %sum, ptr addrspace(5) %sum.addr, align 4 + store i32 12, ptr addrspace(3) getelementptr inbounds ([10 x i32], ptr addrspace(3) @compute_sum_with_localmem.tmp_sum, i32 0, i32 6), align 4 + store i32 12, ptr addrspace(3) @compute_sum_with_localmem.tmp_sum, align 4 + store i32 12, ptr addrspace(3) getelementptr inbounds ([10 x i32], ptr addrspace(3) @compute_sum_with_localmem.tmp_sum, i32 0, i32 9), align 4 + ret void +}