Merge pull request #89 from THU-DSP-LAB/eliminate_call_frame
[VENTUS][fix] Fix framelowering and calculation method of stack offset
This commit is contained in:
commit
49c039a902
|
@ -81,7 +81,7 @@ __builtin_riscv_workitem_linear_id:
|
|||
.type __builtin_riscv_global_linear_id, @function
|
||||
__builtin_riscv_global_linear_id:
|
||||
addi sp, sp, 4
|
||||
sw ra, 0(sp)
|
||||
sw ra, -4(sp)
|
||||
csrr a3, CSR_KNL # Get kernel metadata buffer
|
||||
lw t0, KNL_WORK_DIM(a3) # Get work_dims
|
||||
call __builtin_riscv_global_id_x
|
||||
|
@ -109,7 +109,7 @@ __builtin_riscv_global_linear_id:
|
|||
vadd.vv v5, v5, v6 # global_linear_id3 = tmp + global_linear_id2
|
||||
.GLR:
|
||||
vadd.vx v0, v5, zero # Return global_linear_id for 1/2/3 dims
|
||||
lw ra, 0(sp)
|
||||
lw ra, -4(sp)
|
||||
addi sp, sp, -4
|
||||
ret
|
||||
|
||||
|
@ -202,7 +202,7 @@ __builtin_riscv_workitem_id_z:
|
|||
.type __builtin_riscv_global_id_x, @function
|
||||
__builtin_riscv_global_id_x:
|
||||
addi sp, sp, 4
|
||||
sw ra, 0(sp)
|
||||
sw ra, -4(sp)
|
||||
call __builtin_riscv_workitem_id_x
|
||||
csrr a0, CSR_KNL # Get kernel metadata buffer
|
||||
csrr t1, CSR_GID_X # Get group_id_x
|
||||
|
@ -211,7 +211,7 @@ __builtin_riscv_global_id_x:
|
|||
mul t6, t1, t3 # CSR_GID_X * local_size_x
|
||||
add t6, t6, t4 # Get global_offset_x + CSR_GID_X * local_size_x
|
||||
vadd.vx v0,v0, t6
|
||||
lw ra, 0(sp)
|
||||
lw ra, -4(sp)
|
||||
addi sp, sp, -4
|
||||
ret
|
||||
|
||||
|
@ -221,7 +221,7 @@ __builtin_riscv_global_id_x:
|
|||
.type __builtin_riscv_global_id_y, @function
|
||||
__builtin_riscv_global_id_y:
|
||||
addi sp, sp, 4
|
||||
sw ra, 0(sp)
|
||||
sw ra, -4(sp)
|
||||
call __builtin_riscv_workitem_id_y
|
||||
csrr t1, CSR_GID_Y # Get group_id_y
|
||||
lw t2, KNL_LC_SIZE_Y(a0) # Get local_size_y
|
||||
|
@ -229,7 +229,7 @@ __builtin_riscv_global_id_y:
|
|||
mul t3, t1, t2 # CSR_GID_Y * local_size_y
|
||||
add t3, t3, t4 # global_offset_y + (CSR_GID_Y * local_size_y)
|
||||
vadd.vx v0, v0, t3 # global_id_y
|
||||
lw ra, 0(sp)
|
||||
lw ra, -4(sp)
|
||||
addi sp, sp, -4
|
||||
ret
|
||||
|
||||
|
@ -239,7 +239,7 @@ __builtin_riscv_global_id_y:
|
|||
.type __builtin_riscv_global_id_z, @function
|
||||
__builtin_riscv_global_id_z:
|
||||
addi sp, sp, 4
|
||||
sw ra, 0(sp)
|
||||
sw ra, -4(sp)
|
||||
call __builtin_riscv_workitem_id_z
|
||||
csrr a0, CSR_KNL # Get kernel metadata buffer
|
||||
csrr t1, CSR_GID_Z # Get group_id_z
|
||||
|
@ -248,7 +248,7 @@ __builtin_riscv_global_id_z:
|
|||
mul t2, t2, t1 # CSR_GID_Z * local_size_z
|
||||
add t2, t2, t3 # global_offset_z + (CSR_GID_Z * local_size_z)
|
||||
vadd.vx v0, v0, t2 # global_id_z
|
||||
lw ra, 0(sp)
|
||||
lw ra, -4(sp)
|
||||
addi sp, sp, -4
|
||||
ret
|
||||
|
||||
|
|
|
@ -466,6 +466,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
MachineBasicBlock &MBB) const {
|
||||
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
|
||||
MachineFrameInfo &MFI = MF.getFrameInfo();
|
||||
const RISCVInstrInfo *TII = STI.getInstrInfo();
|
||||
Register SPReg = getSPReg(STI);
|
||||
Register TPReg = getTPReg(STI);
|
||||
|
||||
|
@ -505,26 +506,46 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
|
||||
StackOffset::getFixed(-SPStackSize),
|
||||
MachineInstr::FrameDestroy, getStackAlign());
|
||||
if(TPStackSize)
|
||||
if(TPStackSize) {
|
||||
RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg,
|
||||
StackOffset::getFixed(-TPStackSize),
|
||||
MachineInstr::FrameDestroy, getStackAlign());
|
||||
|
||||
// Restore V32
|
||||
BuildMI(MBB, MBBI, DL, TII->get(RISCV::VMV_V_X),
|
||||
RI->getPrivateMemoryBaseRegister(MF))
|
||||
.addReg(TPReg);
|
||||
}
|
||||
|
||||
// Emit epilogue for shadow call stack.
|
||||
emitSCSEpilogue(MF, MBB, MBBI, DL);
|
||||
}
|
||||
|
||||
uint64_t RISCVFrameLowering::getExtractedStackOffset(const MachineFunction &MF,
|
||||
unsigned FI, RISCVStackID::Value Stack) const {
|
||||
uint64_t RISCVFrameLowering::getStackOffset(const MachineFunction &MF,
|
||||
int FI,
|
||||
RISCVStackID::Value Stack) const {
|
||||
const MachineFrameInfo &MFI = MF.getFrameInfo();
|
||||
uint64_t StackSize = 0;
|
||||
for(int I = FI + 1; I != MFI.getObjectIndexEnd(); I++) {
|
||||
if(static_cast<unsigned>(MFI.getStackID(I)) != Stack) {
|
||||
|
||||
// Because the parameters spilled to the stack are not in the current TP
|
||||
// stack, the offset within the current stack must not be accumulated
|
||||
// starting from a negative FI; start from index 0 when FI is non-negative.
|
||||
for (int I = FI < 0 ? MFI.getObjectIndexBegin() : 0; I != FI + 1; I++) {
|
||||
if (static_cast<unsigned>(MFI.getStackID(I)) == Stack) {
|
||||
// Need to consider the alignment for different frame index
|
||||
uint64_t Size = MFI.getObjectSize(I);
|
||||
StackSize += Size;
|
||||
Align Alignment =
|
||||
MFI.getObjectAlign(I).value() <= 4 ? Align(4) : MFI.getObjectAlign(I);
|
||||
StackSize += MFI.getObjectSize(I);
|
||||
StackSize = alignTo(StackSize, Alignment);
|
||||
}
|
||||
}
|
||||
|
||||
// For parameters spilled to the stack (FI < 0), the size of the current TP
|
||||
// stack must be added, because those parameters are on the caller's TP
|
||||
// stack instead of the current stack.
|
||||
if (FI < 0 && !MF.getFunction().isVarArg())
|
||||
StackSize += getStackSize(MF, RISCVStackID::VGPRSpill);
|
||||
|
||||
return StackSize;
|
||||
}
|
||||
|
||||
|
@ -545,33 +566,16 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
|
|||
StackID == RISCVStackID::SGPRSpill ||
|
||||
StackID == RISCVStackID::VGPRSpill) &&
|
||||
"Unexpected stack ID for the frame object.");
|
||||
uint8_t Stack = MFI.getStackID(FI);
|
||||
StackOffset Offset =
|
||||
StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea()
|
||||
-getExtractedStackOffset(MF, FI, RISCVStackID::Value(Stack))
|
||||
+ MFI.getOffsetAdjustment());
|
||||
|
||||
|
||||
|
||||
// Different stacks for sALU and vALU threads.
|
||||
FrameReg = StackID == RISCVStackID::SGPRSpill ? RISCV::X2 : RISCV::X4;
|
||||
|
||||
if (CSI.size()) {
|
||||
// For callee saved registers
|
||||
MinCSFI = CSI[0].getFrameIdx();
|
||||
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
|
||||
if (FI >= MinCSFI && FI <= MaxCSFI) {
|
||||
Offset -= StackOffset::getFixed(RVFI->getVarArgsSaveSize());
|
||||
return Offset;
|
||||
}
|
||||
}
|
||||
// TODO: This only saves sGPR CSRs, as we haven't defined vGPR CSRs
|
||||
// within getNonLibcallCSI.
|
||||
// if (FI >= MinCSFI && FI <= MaxCSFI) {
|
||||
Offset -= StackOffset::getFixed(
|
||||
getStackSize(const_cast<MachineFunction&>(MF),
|
||||
(RISCVStackID::Value)StackID));
|
||||
return Offset;
|
||||
if (StackID == RISCVStackID::VGPRSpill)
|
||||
FrameReg = RISCV::X4;
|
||||
else if (StackID == RISCVStackID::SGPRSpill)
|
||||
FrameReg = RISCV::X2;
|
||||
else
|
||||
FrameReg = RISCV::X8;
|
||||
return -StackOffset::getFixed(
|
||||
getStackOffset(MF, FI, (RISCVStackID::Value)StackID));
|
||||
}
|
||||
|
||||
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
|
||||
|
@ -632,7 +636,7 @@ static unsigned estimateFunctionSizeInBytes(const MachineFunction &MF,
|
|||
// by the frame pointer.
|
||||
// Let eliminateCallFramePseudoInstr preserve stack space for it.
|
||||
bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
|
||||
return !MF.getFrameInfo().hasVarSizedObjects();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions.
|
||||
|
@ -644,6 +648,9 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
|
|||
// Kernel and normal functions have different stack pointers for Ventus GPGPU.
|
||||
Register SPReg = RISCV::X4; // MFI->isEntryFunction() ? RISCV::X2 : RISCV::X4;
|
||||
DebugLoc DL = MI->getDebugLoc();
|
||||
Register TPReg = getTPReg(STI);
|
||||
const RISCVInstrInfo *TII = STI.getInstrInfo();
|
||||
const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
|
||||
|
||||
if (!hasReservedCallFrame(MF)) {
|
||||
// If space has not been reserved for a call frame, ADJCALLSTACKDOWN and
|
||||
|
@ -660,9 +667,19 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
|
|||
if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
|
||||
Amount = -Amount;
|
||||
|
||||
const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
|
||||
RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
|
||||
MachineInstr::NoFlags, getStackAlign());
|
||||
|
||||
// The value of TP will be re-assigned to V32 at the end of the callee
|
||||
// function, which is actually the TP value after ADJCALLSTACKUP, so the
|
||||
// TP value after ADJCALLSTACKDOWN should be reassigned to V32 to ensure
|
||||
// that it is consistent with the TP value that has not been internally
|
||||
// adjusted (that is, excluding the initial TP adjustment) within the
|
||||
// current function.
|
||||
if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
|
||||
BuildMI(MBB, MI, DL, TII->get(RISCV::VMV_V_X),
|
||||
RI.getPrivateMemoryBaseRegister(MF))
|
||||
.addReg(TPReg);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -707,19 +724,20 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const {
|
|||
return 0;
|
||||
}
|
||||
|
||||
uint64_t RISCVFrameLowering::getStackSize(MachineFunction &MF,
|
||||
uint64_t RISCVFrameLowering::getStackSize(const MachineFunction &MF,
|
||||
RISCVStackID::Value ID) const {
|
||||
MachineFrameInfo &MFI = MF.getFrameInfo();
|
||||
const MachineFrameInfo &MFI = MF.getFrameInfo();
|
||||
uint64_t StackSize = 0;
|
||||
|
||||
for(int I = MFI.getObjectIndexBegin(); I != MFI.getObjectIndexEnd(); I++) {
|
||||
for(int I = 0; I != MFI.getObjectIndexEnd(); I++) {
|
||||
if(static_cast<unsigned>(MFI.getStackID(I)) == ID) {
|
||||
// Need to consider the alignment for different frame index
|
||||
uint64_t Size = ((MFI.getObjectSize(I) + 3) >> 2) * 4;
|
||||
StackSize += Size;
|
||||
Align Alignment = MFI.getObjectAlign(I).value() <= 4 ?
|
||||
Align(4) : MFI.getObjectAlign(I);
|
||||
StackSize += MFI.getObjectSize(I);
|
||||
StackSize = alignTo(StackSize, Alignment);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return StackSize;
|
||||
}
|
||||
|
||||
|
|
|
@ -68,7 +68,7 @@ public:
|
|||
bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
|
||||
|
||||
/// Get stack size for different stack ID
|
||||
uint64_t getStackSize(MachineFunction &MF, RISCVStackID::Value ID) const;
|
||||
uint64_t getStackSize(const MachineFunction &MF, RISCVStackID::Value ID) const;
|
||||
|
||||
/// Frame Objects:
|
||||
/// fi#0: id=4 size=48, align=4, at location [SP+8]
|
||||
|
@ -77,7 +77,7 @@ public:
|
|||
/// As we can see, if we split the stack, the frame offset calculation also
|
||||
/// needs to be modified: when calculating the TP stack offset, we need to
|
||||
/// extract the stack offset of 'SP' in the machine function frame.
|
||||
uint64_t getExtractedStackOffset(const MachineFunction &MF, unsigned FI,
|
||||
uint64_t getStackOffset(const MachineFunction &MF, int FI,
|
||||
RISCVStackID::Value Stack) const;
|
||||
|
||||
/// Before insert prolog/epilog information, set stack ID for each frame index
|
||||
|
|
|
@ -11787,8 +11787,12 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
|
|||
// type, instead of the scalable vector type.
|
||||
ValVT = LocVT;
|
||||
}
|
||||
int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(),
|
||||
|
||||
// Just align to 4 bytes, because parameters larger than 4 bytes are split
|
||||
// into 4-byte parameters.
|
||||
int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), 0,
|
||||
/*IsImmutable=*/true);
|
||||
MFI.setObjectAlignment(FI, Align(4));
|
||||
// This is essential for calculating stack size for VGPRSpill
|
||||
MFI.setStackID(FI, RISCVStackID::VGPRSpill);
|
||||
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
|
||||
|
@ -11982,6 +11986,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
|
|||
RegInfo.addLiveIn(ArgRegs[I], Reg);
|
||||
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
|
||||
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
|
||||
MFI.setObjectAlignment(FI, Align(4));
|
||||
MFI.setStackID(FI, RISCVStackID::VGPRSpill);
|
||||
// MFI.setStackID(FI, RISCVStackID::VGPRSpill);
|
||||
SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
|
||||
|
@ -12151,6 +12156,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
|
|||
SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
|
||||
SmallVector<SDValue, 8> MemOpChains;
|
||||
SDValue StackPtr;
|
||||
|
||||
// Get the value of adjusting the stack frame before the Call.
|
||||
uint64_t CurrentFrameSize = Chain->getConstantOperandVal(1);
|
||||
for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
|
||||
CCValAssign &VA = ArgLocs[i];
|
||||
SDValue ArgValue = OutVals[i];
|
||||
|
@ -12256,11 +12264,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
|
|||
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X4, PtrVT);
|
||||
SDValue Address =
|
||||
DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
|
||||
DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
|
||||
DAG.getIntPtrConstant(-((int)VA.getLocMemOffset()
|
||||
+ CurrentFrameSize), DL));
|
||||
|
||||
// Emit the store.
|
||||
MemOpChains.push_back(
|
||||
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
|
||||
DAG.getStore(Chain, DL, ArgValue, Address,
|
||||
MachinePointerInfo(RISCVAS::PRIVATE_ADDRESS)));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1463,6 +1463,12 @@ std::string RISCVInstrInfo::createMIROperandComment(
|
|||
return Comment;
|
||||
}
|
||||
|
||||
int RISCVInstrInfo::getSPAdjust(const MachineInstr &MI) const {
|
||||
// FIXME: This value is not needed yet, but the relevant handling can be
|
||||
// added here when the PrologueInsert stage is optimized in the future.
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
|
||||
bool RISCV::isSEXT_W(const MachineInstr &MI) {
|
||||
return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&
|
||||
|
|
|
@ -202,6 +202,8 @@ public:
|
|||
|
||||
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
|
||||
|
||||
int getSPAdjust(const MachineInstr &MI) const;
|
||||
|
||||
protected:
|
||||
const RISCVSubtarget &STI;
|
||||
};
|
||||
|
|
|
@ -0,0 +1,347 @@
|
|||
; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \
|
||||
; RUN: | FileCheck -check-prefix=VENTUS %s
|
||||
|
||||
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) vscale_range(1,2048)
|
||||
define dso_local <16 x double> @func(<16 x double> noundef %x, <16 x double> noundef %y) local_unnamed_addr {
|
||||
; VENTUS: vsw.v v33, -4(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v34, -8(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v35, -12(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v36, -16(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v37, -20(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v38, -24(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v39, -28(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v40, -32(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v41, -36(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v42, -40(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v43, -44(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v44, -48(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v45, -52(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v46, -56(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v47, -60(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v48, -64(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v49, -68(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v50, -72(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v51, -76(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v52, -80(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v53, -84(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v54, -88(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v55, -92(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v56, -96(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v57, -100(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v58, -104(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v59, -108(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v60, -112(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v61, -116(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v62, -120(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 72
|
||||
; VENTUS-NEXT: vsw.v v63, -124(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v64, -128(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v65, -132(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v66, -136(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v67, -140(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v68, -144(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v69, -148(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v70, -152(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v71, -156(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v72, -160(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v73, -164(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v74, -168(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v75, -172(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v76, -176(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v77, -180(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v78, -184(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v79, -188(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v80, -192(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v81, -196(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v82, -200(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v83, -204(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v84, -208(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v85, -212(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v86, -216(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v87, -220(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v88, -224(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v89, -228(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v90, -232(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v91, -236(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v92, -240(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v93, -244(v32) # 4-byte Folded Spill
|
||||
; VENTUS-NEXT: regext zero, zero, 136
|
||||
; VENTUS-NEXT: vsw.v v94, -248(v32) # 4-byte Folded Spill
|
||||
entry:
|
||||
%add = fadd <16 x double> %x, %y
|
||||
ret <16 x double> %add
|
||||
}
|
||||
|
||||
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) vscale_range(1,2048)
|
||||
define dso_local ventus_kernel void @test_fn(ptr addrspace(1) nocapture noundef readonly align 128 %x, ptr addrspace(1) nocapture noundef readonly align 128 %y, ptr addrspace(1) nocapture noundef writeonly align 128 %dst) {
|
||||
; VENTUS: addi tp, tp, 128
|
||||
; VENTUS=NEXT: li t0, 4
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 2
|
||||
; VENTUS=NEXT: vmv.v.x v66, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 80
|
||||
; VENTUS=NEXT: vsw.v v34, 0(v66)
|
||||
; VENTUS=NEXT: li t0, 8
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 136
|
||||
; VENTUS=NEXT: vsw.v v65, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 12
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 136
|
||||
; VENTUS=NEXT: vsw.v v64, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 16
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v63, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 20
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v62, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 24
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v61, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 28
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v60, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 32
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v59, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 36
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v58, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 40
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v57, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 44
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v56, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 48
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v55, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 52
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v54, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 56
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v53, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 60
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v52, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 64
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v51, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 68
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v50, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 72
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v49, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 76
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v48, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 80
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v47, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 84
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v46, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 88
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v45, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 92
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v44, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 96
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v43, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 100
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v42, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 104
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v41, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 108
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v40, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 112
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v39, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 116
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v38, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 120
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v37, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 124
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v36, 0(v34)
|
||||
; VENTUS=NEXT: li t0, 128
|
||||
; VENTUS=NEXT: sub t0, tp, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 1
|
||||
; VENTUS=NEXT: vmv.v.x v34, t0
|
||||
; VENTUS=NEXT: regext zero, zero, 72
|
||||
; VENTUS=NEXT: vsw.v v35, 0(v34)
|
||||
; VENTUS=NEXT: call _Z3minDv16_dS_
|
||||
; VENTUS=NEXT: addi tp, tp, -128
|
||||
entry:
|
||||
%call = call i32 @_Z13get_global_idj(i32 noundef 0)
|
||||
%arrayidx = getelementptr inbounds <16 x double>, ptr addrspace(1) %x, i32 %call
|
||||
%0 = load <16 x double>, ptr addrspace(1) %arrayidx, align 128
|
||||
%arrayidx1 = getelementptr inbounds <16 x double>, ptr addrspace(1) %y, i32 %call
|
||||
%1 = load <16 x double>, ptr addrspace(1) %arrayidx1, align 128
|
||||
%call2 = call <16 x double> @_Z3minDv16_dS_(<16 x double> noundef %0, <16 x double> noundef %1)
|
||||
%arrayidx3 = getelementptr inbounds <16 x double>, ptr addrspace(1) %dst, i32 %call
|
||||
store <16 x double> %call2, ptr addrspace(1) %arrayidx3, align 128
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
|
||||
declare dso_local i32 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #2
|
||||
|
||||
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
|
||||
declare dso_local <16 x double> @_Z3minDv16_dS_(<16 x double> noundef, <16 x double> noundef) local_unnamed_addr #2
|
Loading…
Reference in New Issue