Merge pull request #89 from THU-DSP-LAB/eliminate_call_frame

[VENTUS][fix] Fix framelowering and calculation method of stack offset
zhoujingya 2024-02-01 14:54:42 +08:00 committed by GitHub
commit 49c039a902
7 changed files with 441 additions and 58 deletions

View File

@@ -81,7 +81,7 @@ __builtin_riscv_workitem_linear_id:
.type __builtin_riscv_global_linear_id, @function
__builtin_riscv_global_linear_id:
addi sp, sp, 4
sw ra, 0(sp)
sw ra, -4(sp)
csrr a3, CSR_KNL # Get kernel metadata buffer
lw t0, KNL_WORK_DIM(a3) # Get work_dims
call __builtin_riscv_global_id_x
@@ -109,7 +109,7 @@ __builtin_riscv_global_linear_id:
vadd.vv v5, v5, v6 # global_linear_id3 = tmp + global_linear_id2
.GLR:
vadd.vx v0, v5, zero # Return global_linear_id for 1/2/3 dims
lw ra, 0(sp)
lw ra, -4(sp)
addi sp, sp, -4
ret
@@ -202,7 +202,7 @@ __builtin_riscv_workitem_id_z:
.type __builtin_riscv_global_id_x, @function
__builtin_riscv_global_id_x:
addi sp, sp, 4
sw ra, 0(sp)
sw ra, -4(sp)
call __builtin_riscv_workitem_id_x
csrr a0, CSR_KNL # Get kernel metadata buffer
csrr t1, CSR_GID_X # Get group_id_x
@@ -211,7 +211,7 @@ __builtin_riscv_global_id_x:
mul t6, t1, t3 # CSR_GID_X * local_size_x
add t6, t6, t4 # Get global_offset_x + CSR_GID_X * local_size_x
vadd.vx v0, v0, t6
lw ra, 0(sp)
lw ra, -4(sp)
addi sp, sp, -4
ret
@@ -221,7 +221,7 @@ __builtin_riscv_global_id_x:
.type __builtin_riscv_global_id_y, @function
__builtin_riscv_global_id_y:
addi sp, sp, 4
sw ra, 0(sp)
sw ra, -4(sp)
call __builtin_riscv_workitem_id_y
csrr t1, CSR_GID_Y # Get group_id_y
lw t2, KNL_LC_SIZE_Y(a0) # Get local_size_y
@@ -229,7 +229,7 @@ __builtin_riscv_global_id_y:
mul t3, t1, t2 # CSR_GID_Y * local_size_y
add t3, t3, t4 # global_offset_y + (CSR_GID_Y * local_size_y)
vadd.vx v0, v0, t3 # global_id_y
lw ra, 0(sp)
lw ra, -4(sp)
addi sp, sp, -4
ret
@@ -239,7 +239,7 @@ __builtin_riscv_global_id_y:
.type __builtin_riscv_global_id_z, @function
__builtin_riscv_global_id_z:
addi sp, sp, 4
sw ra, 0(sp)
sw ra, -4(sp)
call __builtin_riscv_workitem_id_z
csrr a0, CSR_KNL # Get kernel metadata buffer
csrr t1, CSR_GID_Z # Get group_id_z
@@ -248,7 +248,7 @@ __builtin_riscv_global_id_z:
mul t2, t2, t1 # CSR_GID_Z * local_size_z
add t2, t2, t3 # global_offset_z + (CSR_GID_Z * local_size_z)
vadd.vx v0, v0, t2 # global_id_z
lw ra, 0(sp)
lw ra, -4(sp)
addi sp, sp, -4
ret
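
For context on the offset fix above: the Ventus per-thread stack grows upward, so "addi sp, sp, 4" allocates a word spanning [old sp, old sp + 4), which is -4(sp) once sp has been bumped. A minimal sketch of that arithmetic, with a hypothetical starting address:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t sp = 0x1000;      // hypothetical stack pointer before the prologue
  uint32_t old_sp = sp;
  sp += 4;                   // addi sp, sp, 4: stack grows upward, allocate a word
  uint32_t ra_slot = sp - 4; // sw ra, -4(sp): the word just allocated
  assert(ra_slot == old_sp); // 0(sp) would point one word past the allocation
  sp -= 4;                   // addi sp, sp, -4: epilogue deallocation
  assert(sp == old_sp);
  return 0;
}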

View File

@@ -466,6 +466,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
Register SPReg = getSPReg(STI);
Register TPReg = getTPReg(STI);
@@ -505,26 +506,46 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-SPStackSize),
MachineInstr::FrameDestroy, getStackAlign());
if(TPStackSize)
if(TPStackSize) {
RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg,
StackOffset::getFixed(-TPStackSize),
MachineInstr::FrameDestroy, getStackAlign());
// Restore V32
BuildMI(MBB, MBBI, DL, TII->get(RISCV::VMV_V_X),
RI->getPrivateMemoryBaseRegister(MF))
.addReg(TPReg);
}
// Emit epilogue for shadow call stack.
emitSCSEpilogue(MF, MBB, MBBI, DL);
}
uint64_t RISCVFrameLowering::getExtractedStackOffset(const MachineFunction &MF,
unsigned FI, RISCVStackID::Value Stack) const {
uint64_t RISCVFrameLowering::getStackOffset(const MachineFunction &MF,
int FI,
RISCVStackID::Value Stack) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint64_t StackSize = 0;
for(int I = FI + 1; I != MFI.getObjectIndexEnd(); I++) {
if(static_cast<unsigned>(MFI.getStackID(I)) != Stack) {
// Because parameters spilled to the stack live on the caller's TP stack
// rather than the current one, the offset within the current stack should
// not be accumulated starting from a negative FI.
for (int I = FI < 0 ? MFI.getObjectIndexBegin() : 0; I != FI + 1; I++) {
if (static_cast<unsigned>(MFI.getStackID(I)) == Stack) {
// Need to account for the alignment of each frame index
uint64_t Size = MFI.getObjectSize(I);
StackSize += Size;
Align Alignment =
MFI.getObjectAlign(I).value() <= 4 ? Align(4) : MFI.getObjectAlign(I);
StackSize += MFI.getObjectSize(I);
StackSize = alignTo(StackSize, Alignment);
}
}
// For parameters spilled to the stack, the size of the current TP stack
// must be added, because the parameters live on the caller's TP stack
// rather than the current one.
if (FI < 0 && !MF.getFunction().isVarArg())
StackSize += getStackSize(MF, RISCVStackID::VGPRSpill);
return StackSize;
}
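
A standalone model of the walk above may help; the Obj record and the sample objects below are hypothetical stand-ins for the MachineFrameInfo queries, assuming objects are visited in frame-index order:

#include <cstdint>
#include <vector>

struct Obj {
  unsigned StackID; // stand-in for MFI.getStackID(I)
  uint64_t Size;    // stand-in for MFI.getObjectSize(I)
  uint64_t Align;   // stand-in for MFI.getObjectAlign(I).value()
};

// Accumulate the sizes of objects [0, FI] on the given stack, aligning the
// running total after each object, as the loop above does.
uint64_t stackOffset(const std::vector<Obj> &Objs, int FI, unsigned ID) {
  uint64_t Off = 0;
  for (int I = 0; I != FI + 1; ++I) {
    if (Objs[I].StackID != ID)
      continue;
    uint64_t A = Objs[I].Align <= 4 ? 4 : Objs[I].Align; // at least word-aligned
    Off += Objs[I].Size;
    Off = (Off + A - 1) / A * A; // alignTo(Off, A)
  }
  return Off;
}

int main() {
  // Two 6-byte objects on the same stack: 6 -> 8 (aligned), then 14 -> 16.
  std::vector<Obj> Objs = {{1, 6, 4}, {1, 6, 4}};
  return stackOffset(Objs, 1, 1) == 16 ? 0 : 1;
}
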
@@ -545,33 +566,16 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
StackID == RISCVStackID::SGPRSpill ||
StackID == RISCVStackID::VGPRSpill) &&
"Unexpected stack ID for the frame object.");
uint8_t Stack = MFI.getStackID(FI);
StackOffset Offset =
StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea()
-getExtractedStackOffset(MF, FI, RISCVStackID::Value(Stack))
+ MFI.getOffsetAdjustment());
// Different stacks for sALU and vALU threads.
FrameReg = StackID == RISCVStackID::SGPRSpill ? RISCV::X2 : RISCV::X4;
if (CSI.size()) {
// For callee saved registers
MinCSFI = CSI[0].getFrameIdx();
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
if (FI >= MinCSFI && FI <= MaxCSFI) {
Offset -= StackOffset::getFixed(RVFI->getVarArgsSaveSize());
return Offset;
}
}
// TODO: This only saves sGPR CSRs, as we haven't defined vGPR CSRs
// within getNonLibcallCSI.
// if (FI >= MinCSFI && FI <= MaxCSFI) {
Offset -= StackOffset::getFixed(
getStackSize(const_cast<MachineFunction&>(MF),
(RISCVStackID::Value)StackID));
return Offset;
if (StackID == RISCVStackID::VGPRSpill)
FrameReg = RISCV::X4;
else if (StackID == RISCVStackID::SGPRSpill)
FrameReg = RISCV::X2;
else
FrameReg = RISCV::X8;
return -StackOffset::getFixed(
getStackOffset(MF, FI, (RISCVStackID::Value)StackID));
}
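
Putting the pieces together: the base register is chosen by stack ID, and since the stacks grow upward the object lands at a negated offset from that base. A toy composition, with register names shown as strings rather than the backend's actual API:

#include <cstdint>
#include <cstdio>

enum StackID { SGPRSpill, VGPRSpill, Other }; // illustrative, not LLVM's enum values

int main() {
  StackID ID = VGPRSpill;
  const char *FrameReg = ID == VGPRSpill   ? "tp"  // RISCV::X4, per-thread stack
                         : ID == SGPRSpill ? "sp"  // RISCV::X2, per-warp stack
                                           : "fp"; // RISCV::X8 fallback
  uint64_t Walked = 8; // what getStackOffset() returned for this object
  std::printf("object addressed at %lld(%s)\n", -(long long)Walked, FrameReg);
  return 0;
}
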
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
@@ -632,7 +636,7 @@ static unsigned estimateFunctionSizeInBytes(const MachineFunction &MF,
// by the frame pointer.
// Let eliminateCallFramePseudoInstr preserve stack space for it.
bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo().hasVarSizedObjects();
return false;
}
// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions.
@@ -644,6 +648,9 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
// Kernel and normal functions have different stack pointers on Ventus GPGPU.
Register SPReg = RISCV::X4; // MFI->isEntryFunction() ? RISCV::X2 : RISCV::X4;
DebugLoc DL = MI->getDebugLoc();
Register TPReg = getTPReg(STI);
const RISCVInstrInfo *TII = STI.getInstrInfo();
const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
if (!hasReservedCallFrame(MF)) {
// If space has not been reserved for a call frame, ADJCALLSTACKDOWN and
@@ -660,9 +667,19 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
Amount = -Amount;
const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
MachineInstr::NoFlags, getStackAlign());
// The value of TP is re-assigned to V32 at the end of the callee function,
// and that value is effectively the TP value after ADJCALLSTACKUP. So the
// TP value after ADJCALLSTACKDOWN should also be re-assigned to V32 here,
// to keep V32 consistent with the TP value that has not been internally
// adjusted within the current function (that is, excluding the initial TP
// adjustment).
if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
BuildMI(MBB, MI, DL, TII->get(RISCV::VMV_V_X),
RI.getPrivateMemoryBaseRegister(MF))
.addReg(TPReg);
}
}
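
A minimal simulation of the sequencing described in the comment above, assuming an upward-growing tp stack and that v32 caches the per-thread private-memory base; all values are illustrative:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t tp = 0x2000; // hypothetical per-thread stack pointer
  uint32_t v32 = tp;    // prologue broadcast: vmv.v.x v32, tp
  uint32_t Amount = 16; // outgoing call-frame size

  // ADJCALLSTACKDOWN: Amount is negated before adjustReg, then tp is
  // re-broadcast into v32 so the cached base matches the tp the callee sees.
  tp -= Amount;
  v32 = tp; // vmv.v.x v32, tp (the BuildMI added in the hunk above)
  assert(v32 == tp);

  // ... call; the callee's epilogue re-broadcasts its own tp, leaving the
  // same value in v32 ...

  // ADJCALLSTACKUP: release the call frame; v32 keeps the pre-UP base.
  tp += Amount;
  assert(v32 == tp - Amount);
  return 0;
}
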
@@ -707,19 +724,20 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const {
return 0;
}
uint64_t RISCVFrameLowering::getStackSize(MachineFunction &MF,
uint64_t RISCVFrameLowering::getStackSize(const MachineFunction &MF,
RISCVStackID::Value ID) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint64_t StackSize = 0;
for(int I = MFI.getObjectIndexBegin(); I != MFI.getObjectIndexEnd(); I++) {
for(int I = 0; I != MFI.getObjectIndexEnd(); I++) {
if(static_cast<unsigned>(MFI.getStackID(I)) == ID) {
// Need to account for the alignment of each frame index
uint64_t Size = ((MFI.getObjectSize(I) + 3) >> 2) * 4;
StackSize += Size;
Align Alignment = MFI.getObjectAlign(I).value() <= 4 ?
Align(4) : MFI.getObjectAlign(I);
StackSize += MFI.getObjectSize(I);
StackSize = alignTo(StackSize, Alignment);
}
}
}
return StackSize;
}
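
To make the behavioral change concrete: the removed code rounded each object size up to a multiple of 4, while the new code keeps raw sizes and aligns the running total to each object's alignment (clamped to at least 4). A worked comparison with two hypothetical objects:

#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

int main() {
  struct { uint64_t Size, Align; } Objs[] = {{6, 4}, {4, 8}}; // hypothetical frame objects
  uint64_t OldSz = 0, NewSz = 0;
  for (auto &O : Objs) {
    OldSz += ((O.Size + 3) >> 2) * 4;                   // removed code: per-object rounding
    NewSz += O.Size;                                    // new code: raw size...
    NewSz = alignTo(NewSz, O.Align <= 4 ? 4 : O.Align); // ...then align the total
  }
  std::printf("old=%llu new=%llu\n", (unsigned long long)OldSz,
              (unsigned long long)NewSz); // old=12 new=16: the 8-byte alignment now pads
  return 0;
}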

View File

@@ -68,7 +68,7 @@ public:
bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
/// Get the stack size for a given stack ID
uint64_t getStackSize(MachineFunction &MF, RISCVStackID::Value ID) const;
uint64_t getStackSize(const MachineFunction &MF, RISCVStackID::Value ID) const;
/// Frame Objects:
/// fi#0: id=4 size=48, align=4, at location [SP+8]
@@ -77,7 +77,7 @@ public:
/// As we can see, if we split the stack, the frame offset calculations
/// need to be modified too; when calculating the TP stack offset, we need
/// to extract the stack offset of 'SP' from the machine function frame
uint64_t getExtractedStackOffset(const MachineFunction &MF, unsigned FI,
uint64_t getStackOffset(const MachineFunction &MF, int FI,
RISCVStackID::Value Stack) const;
/// Before insert prolog/epilog information, set stack ID for each frame index

View File

@@ -11787,8 +11787,12 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
// type, instead of the scalable vector type.
ValVT = LocVT;
}
int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(),
// Just align to 4 bytes, because parameters larger than 4 bytes are split
// into 4-byte pieces
int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), 0,
/*IsImmutable=*/true);
MFI.setObjectAlignment(FI, Align(4));
// This is essential for calculating the stack size for VGPRSpill
MFI.setStackID(FI, RISCVStackID::VGPRSpill);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
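
The offset-0/align-4 choice relies on the splitting behavior stated in the comment: a parameter wider than 4 bytes reaches this point as 4-byte pieces, and the real slot addresses are recovered later by the getStackOffset walk. A tiny illustration with a hypothetical 16-byte argument:

#include <cstdint>
#include <vector>

int main() {
  uint64_t ArgBytes = 16; // e.g. one hypothetical 16-byte vector argument
  // Each 4-byte piece gets its own fixed object, all created alike:
  //   CreateFixedObject(4, /*Offset=*/0, /*IsImmutable=*/true);
  //   setObjectAlignment(FI, Align(4)); setStackID(FI, VGPRSpill);
  std::vector<uint64_t> PieceSizes(ArgBytes / 4, 4);
  uint64_t Total = 0;
  for (uint64_t S : PieceSizes)
    Total += S; // the frame-index walk later lays the pieces out back to back
  return Total == ArgBytes ? 0 : 1; // 0: four pieces cover the argument
}
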
@@ -11982,6 +11986,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
RegInfo.addLiveIn(ArgRegs[I], Reg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
MFI.setObjectAlignment(FI, Align(4));
MFI.setStackID(FI, RISCVStackID::VGPRSpill);
// MFI.setStackID(FI, RISCVStackID::VGPRSpill);
SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
@@ -12151,6 +12156,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// Get the stack-frame adjustment amount applied before the call.
uint64_t CurrentFrameSize = Chain->getConstantOperandVal(1);
for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue = OutVals[i];
@@ -12256,11 +12264,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X4, PtrVT);
SDValue Address =
DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
DAG.getIntPtrConstant(-((int)VA.getLocMemOffset()
+ CurrentFrameSize), DL));
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
DAG.getStore(Chain, DL, ArgValue, Address,
MachinePointerInfo(RISCVAS::PRIVATE_ADDRESS)));
}
}
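
Worked arithmetic for the new address computation, with hypothetical values: the store now lands below tp, inside the space reserved by the call-frame adjustment read from the chain above, whereas the old tp + LocMemOffset pointed above it on an upward-growing stack:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t tp = 0x2000;           // X4, read as StackPtr (hypothetical value)
  uint32_t CurrentFrameSize = 16; // constant operand of the CALLSEQ_START node
  uint32_t LocMemOffset = 4;      // slot assigned by the calling convention
  // New: tp - (LocMemOffset + CurrentFrameSize), inside the reserved frame.
  uint32_t Addr = tp - (LocMemOffset + CurrentFrameSize);
  assert(Addr == 0x1FEC);         // 0x2000 - 20
  return 0;
}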

View File

@@ -1463,6 +1463,12 @@ std::string RISCVInstrInfo::createMIROperandComment(
return Comment;
}
int RISCVInstrInfo::getSPAdjust(const MachineInstr &MI) const {
// FIXME: This value is not needed for now; relevant changes can be added
// here when the prologue-insertion stage is optimized in the future.
return 0;
}
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
bool RISCV::isSEXT_W(const MachineInstr &MI) {
return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&

View File

@@ -202,6 +202,8 @@ public:
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
int getSPAdjust(const MachineInstr &MI) const;
protected:
const RISCVSubtarget &STI;
};

View File

@@ -0,0 +1,347 @@
; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=VENTUS %s
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) vscale_range(1,2048)
define dso_local <16 x double> @func(<16 x double> noundef %x, <16 x double> noundef %y) local_unnamed_addr {
; VENTUS: vsw.v v33, -4(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v34, -8(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v35, -12(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v36, -16(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v37, -20(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v38, -24(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v39, -28(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v40, -32(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v41, -36(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v42, -40(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v43, -44(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v44, -48(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v45, -52(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v46, -56(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v47, -60(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v48, -64(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v49, -68(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v50, -72(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v51, -76(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v52, -80(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v53, -84(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v54, -88(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v55, -92(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v56, -96(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v57, -100(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v58, -104(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v59, -108(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v60, -112(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v61, -116(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v62, -120(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v63, -124(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v64, -128(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v65, -132(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v66, -136(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v67, -140(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v68, -144(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v69, -148(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v70, -152(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v71, -156(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v72, -160(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v73, -164(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v74, -168(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v75, -172(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v76, -176(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v77, -180(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v78, -184(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v79, -188(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v80, -192(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v81, -196(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v82, -200(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v83, -204(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v84, -208(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v85, -212(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v86, -216(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v87, -220(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v88, -224(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v89, -228(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v90, -232(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v91, -236(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v92, -240(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v93, -244(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v94, -248(v32) # 4-byte Folded Spill
entry:
%add = fadd <16 x double> %x, %y
ret <16 x double> %add
}
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) vscale_range(1,2048)
define dso_local ventus_kernel void @test_fn(ptr addrspace(1) nocapture noundef readonly align 128 %x, ptr addrspace(1) nocapture noundef readonly align 128 %y, ptr addrspace(1) nocapture noundef writeonly align 128 %dst) {
; VENTUS: addi tp, tp, 128
; VENTUS-NEXT: li t0, 4
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 2
; VENTUS-NEXT: vmv.v.x v66, t0
; VENTUS-NEXT: regext zero, zero, 80
; VENTUS-NEXT: vsw.v v34, 0(v66)
; VENTUS-NEXT: li t0, 8
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v65, 0(v34)
; VENTUS-NEXT: li t0, 12
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v64, 0(v34)
; VENTUS-NEXT: li t0, 16
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v63, 0(v34)
; VENTUS-NEXT: li t0, 20
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v62, 0(v34)
; VENTUS-NEXT: li t0, 24
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v61, 0(v34)
; VENTUS-NEXT: li t0, 28
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v60, 0(v34)
; VENTUS-NEXT: li t0, 32
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v59, 0(v34)
; VENTUS-NEXT: li t0, 36
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v58, 0(v34)
; VENTUS-NEXT: li t0, 40
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v57, 0(v34)
; VENTUS-NEXT: li t0, 44
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v56, 0(v34)
; VENTUS-NEXT: li t0, 48
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v55, 0(v34)
; VENTUS-NEXT: li t0, 52
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v54, 0(v34)
; VENTUS-NEXT: li t0, 56
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v53, 0(v34)
; VENTUS-NEXT: li t0, 60
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v52, 0(v34)
; VENTUS-NEXT: li t0, 64
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v51, 0(v34)
; VENTUS-NEXT: li t0, 68
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v50, 0(v34)
; VENTUS-NEXT: li t0, 72
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v49, 0(v34)
; VENTUS-NEXT: li t0, 76
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v48, 0(v34)
; VENTUS-NEXT: li t0, 80
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v47, 0(v34)
; VENTUS-NEXT: li t0, 84
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v46, 0(v34)
; VENTUS-NEXT: li t0, 88
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v45, 0(v34)
; VENTUS-NEXT: li t0, 92
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v44, 0(v34)
; VENTUS-NEXT: li t0, 96
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v43, 0(v34)
; VENTUS-NEXT: li t0, 100
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v42, 0(v34)
; VENTUS-NEXT: li t0, 104
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v41, 0(v34)
; VENTUS-NEXT: li t0, 108
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v40, 0(v34)
; VENTUS-NEXT: li t0, 112
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v39, 0(v34)
; VENTUS-NEXT: li t0, 116
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v38, 0(v34)
; VENTUS-NEXT: li t0, 120
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v37, 0(v34)
; VENTUS-NEXT: li t0, 124
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v36, 0(v34)
; VENTUS-NEXT: li t0, 128
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v35, 0(v34)
; VENTUS-NEXT: call _Z3minDv16_dS_
; VENTUS-NEXT: addi tp, tp, -128
entry:
%call = call i32 @_Z13get_global_idj(i32 noundef 0)
%arrayidx = getelementptr inbounds <16 x double>, ptr addrspace(1) %x, i32 %call
%0 = load <16 x double>, ptr addrspace(1) %arrayidx, align 128
%arrayidx1 = getelementptr inbounds <16 x double>, ptr addrspace(1) %y, i32 %call
%1 = load <16 x double>, ptr addrspace(1) %arrayidx1, align 128
%call2 = call <16 x double> @_Z3minDv16_dS_(<16 x double> noundef %0, <16 x double> noundef %1)
%arrayidx3 = getelementptr inbounds <16 x double>, ptr addrspace(1) %dst, i32 %call
store <16 x double> %call2, ptr addrspace(1) %arrayidx3, align 128
ret void
}
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare dso_local i32 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #2
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare dso_local <16 x double> @_Z3minDv16_dS_(<16 x double> noundef, <16 x double> noundef) local_unnamed_addr #2