Merge pull request #89 from THU-DSP-LAB/eliminate_call_frame

[VENTUS][fix] Fix frame lowering and the stack-offset calculation method
zhoujingya committed 2024-02-01 14:54:42 +08:00
commit 49c039a902
7 changed files with 441 additions and 58 deletions


@@ -81,7 +81,7 @@ __builtin_riscv_workitem_linear_id:
 .type __builtin_riscv_global_linear_id, @function
 __builtin_riscv_global_linear_id:
   addi sp, sp, 4
-  sw ra, 0(sp)
+  sw ra, -4(sp)
   csrr a3, CSR_KNL # Get kernel metadata buffer
   lw t0, KNL_WORK_DIM(a3) # Get work_dims
   call __builtin_riscv_global_id_x
@@ -109,7 +109,7 @@ __builtin_riscv_global_linear_id:
   vadd.vv v5, v5, v6 # global_linear_id3 = tmp + global_linear_id2
 .GLR:
   vadd.vx v0, v5, zero # Return global_linear_id for 1/2/3 dims
-  lw ra, 0(sp)
+  lw ra, -4(sp)
   addi sp, sp, -4
   ret
@@ -202,7 +202,7 @@ __builtin_riscv_workitem_id_z:
 .type __builtin_riscv_global_id_x, @function
 __builtin_riscv_global_id_x:
   addi sp, sp, 4
-  sw ra, 0(sp)
+  sw ra, -4(sp)
   call __builtin_riscv_workitem_id_x
   csrr a0, CSR_KNL # Get kernel metadata buffer
   csrr t1, CSR_GID_X # Get group_id_x
@@ -211,7 +211,7 @@ __builtin_riscv_global_id_x:
   mul t6, t1, t3 # CSR_GID_X * local_size_x
   add t6, t6, t4 # Get global_offset_x + CSR_GID_X * local_size_x
   vadd.vx v0, v0, t6
-  lw ra, 0(sp)
+  lw ra, -4(sp)
   addi sp, sp, -4
   ret
@@ -221,7 +221,7 @@ __builtin_riscv_global_id_x:
 .type __builtin_riscv_global_id_y, @function
 __builtin_riscv_global_id_y:
   addi sp, sp, 4
-  sw ra, 0(sp)
+  sw ra, -4(sp)
   call __builtin_riscv_workitem_id_y
   csrr t1, CSR_GID_Y # Get group_id_y
   lw t2, KNL_LC_SIZE_Y(a0) # Get local_size_y
@@ -229,7 +229,7 @@ __builtin_riscv_global_id_y:
   mul t3, t1, t2 # CSR_GID_Y * local_size_y
   add t3, t3, t4 # global_offset_y + (CSR_GID_Y * local_size_y)
   vadd.vx v0, v0, t3 # global_id_y
-  lw ra, 0(sp)
+  lw ra, -4(sp)
   addi sp, sp, -4
   ret
@@ -239,7 +239,7 @@ __builtin_riscv_global_id_y:
 .type __builtin_riscv_global_id_z, @function
 __builtin_riscv_global_id_z:
   addi sp, sp, 4
-  sw ra, 0(sp)
+  sw ra, -4(sp)
   call __builtin_riscv_workitem_id_z
   csrr a0, CSR_KNL # Get kernel metadata buffer
   csrr t1, CSR_GID_Z # Get group_id_z
@@ -248,7 +248,7 @@ __builtin_riscv_global_id_z:
   mul t2, t2, t1 # CSR_GID_Z * local_size_z
   add t2, t2, t3 # global_offset_z + (CSR_GID_Z * local_size_z)
   vadd.vx v0, v0, t2 # global_id_z
-  lw ra, 0(sp)
+  lw ra, -4(sp)
   addi sp, sp, -4
   ret
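All five hunks above are the same fix: the prologue grows this stack upward
("addi sp, sp, 4"), so the word it just reserved for ra sits at -4(sp), not at
0(sp). A minimal C++ sketch of that bookkeeping, assuming only the
upward-growing convention visible in the code above:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t sp = 0x1000;      // stack pointer before the prologue
      sp += 4;                   // addi sp, sp, 4: reserve one word, growing up
      uint32_t ra_slot = sp - 4; // sw ra, -4(sp): the word just reserved
      assert(ra_slot == 0x1000); // storing at 0(sp) would miss the reservation
      sp -= 4;                   // addi sp, sp, -4: epilogue deallocation
      assert(sp == 0x1000);
      return 0;
    }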


@@ -466,6 +466,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
                                       MachineBasicBlock &MBB) const {
   const RISCVRegisterInfo *RI = STI.getRegisterInfo();
   MachineFrameInfo &MFI = MF.getFrameInfo();
+  const RISCVInstrInfo *TII = STI.getInstrInfo();
   Register SPReg = getSPReg(STI);
   Register TPReg = getTPReg(STI);

@@ -505,26 +506,46 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
     RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
                   StackOffset::getFixed(-SPStackSize),
                   MachineInstr::FrameDestroy, getStackAlign());
-  if (TPStackSize)
+  if (TPStackSize) {
     RI->adjustReg(MBB, MBBI, DL, TPReg, TPReg,
                   StackOffset::getFixed(-TPStackSize),
                   MachineInstr::FrameDestroy, getStackAlign());
+    // Restore V32.
+    BuildMI(MBB, MBBI, DL, TII->get(RISCV::VMV_V_X),
+            RI->getPrivateMemoryBaseRegister(MF))
+        .addReg(TPReg);
+  }

   // Emit epilogue for shadow call stack.
   emitSCSEpilogue(MF, MBB, MBBI, DL);
 }

-uint64_t RISCVFrameLowering::getExtractedStackOffset(const MachineFunction &MF,
-                  unsigned FI, RISCVStackID::Value Stack) const {
+uint64_t RISCVFrameLowering::getStackOffset(const MachineFunction &MF,
+                                            int FI,
+                                            RISCVStackID::Value Stack) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   uint64_t StackSize = 0;
-  for (int I = FI + 1; I != MFI.getObjectIndexEnd(); I++) {
-    if (static_cast<unsigned>(MFI.getStackID(I)) != Stack) {
-      // Need to consider the alignment for different frame index
-      uint64_t Size = MFI.getObjectSize(I);
-      StackSize += Size;
+
+  // Because parameters spilled to the stack are not in the current TP stack,
+  // the offset in the current stack must not be accumulated from a negative
+  // FI.
+  for (int I = FI < 0 ? MFI.getObjectIndexBegin() : 0; I != FI + 1; I++) {
+    if (static_cast<unsigned>(MFI.getStackID(I)) == Stack) {
+      // Need to consider the alignment of each frame index.
+      Align Alignment =
+          MFI.getObjectAlign(I).value() <= 4 ? Align(4) : MFI.getObjectAlign(I);
+      StackSize += MFI.getObjectSize(I);
+      StackSize = alignTo(StackSize, Alignment);
     }
   }
+
+  // For parameters spilled to the stack, add the size of the current TP
+  // stack, because the parameters live on the caller's TP stack instead of
+  // the current one.
+  if (FI < 0 && !MF.getFunction().isVarArg())
+    StackSize += getStackSize(MF, RISCVStackID::VGPRSpill);
+
   return StackSize;
 }
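The rewritten loop accumulates object sizes and rounds the running total up to
each object's alignment, clamped to a 4-byte minimum. A self-contained sketch
of that accumulation; the sizes, alignments, and the local alignTo helper
(mirroring llvm::alignTo) are illustrative values, not taken from the patch:

    #include <cstdint>

    // Round Offset up to the next multiple of Alignment (a power of two).
    static uint64_t alignTo(uint64_t Offset, uint64_t Alignment) {
      return (Offset + Alignment - 1) & ~(Alignment - 1);
    }

    int main() {
      struct { uint64_t Size, Align; } Objects[] = {{4, 4}, {6, 4}, {16, 8}};
      uint64_t StackSize = 0;
      for (auto &O : Objects) {
        uint64_t A = O.Align <= 4 ? 4 : O.Align; // clamp to the 4-byte minimum
        StackSize += O.Size;
        StackSize = alignTo(StackSize, A);       // 4 -> 12 -> 32
      }
      return StackSize == 32 ? 0 : 1;
    }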
@@ -545,33 +566,16 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
           StackID == RISCVStackID::SGPRSpill ||
           StackID == RISCVStackID::VGPRSpill) &&
          "Unexpected stack ID for the frame object.");
-  uint8_t Stack = MFI.getStackID(FI);
-  StackOffset Offset =
-      StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea()
-                            - getExtractedStackOffset(MF, FI, RISCVStackID::Value(Stack))
-                            + MFI.getOffsetAdjustment());

   // Different stacks for sALU and vALU threads.
-  FrameReg = StackID == RISCVStackID::SGPRSpill ? RISCV::X2 : RISCV::X4;
-
-  if (CSI.size()) {
-    // For callee saved registers
-    MinCSFI = CSI[0].getFrameIdx();
-    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
-
-    if (FI >= MinCSFI && FI <= MaxCSFI) {
-      Offset -= StackOffset::getFixed(RVFI->getVarArgsSaveSize());
-      return Offset;
-    }
-  }
-
-  // TODO: This only saves sGPR CSRs, as we haven't defined vGPR CSRs
-  // within getNonLibcallCSI.
-  // if (FI >= MinCSFI && FI <= MaxCSFI) {
-  Offset -= StackOffset::getFixed(
-      getStackSize(const_cast<MachineFunction&>(MF),
-                   (RISCVStackID::Value)StackID));
-  return Offset;
+  if (StackID == RISCVStackID::VGPRSpill)
+    FrameReg = RISCV::X4;
+  else if (StackID == RISCVStackID::SGPRSpill)
+    FrameReg = RISCV::X2;
+  else
+    FrameReg = RISCV::X8;
+
+  return -StackOffset::getFixed(
+      getStackOffset(MF, FI, (RISCVStackID::Value)StackID));
 }
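The frame-register choice now covers three cases instead of two: VGPRSpill
objects are addressed off X4 (the per-thread TP stack), SGPRSpill objects off
X2 (the sALU stack), and everything else falls back to X8. A minimal sketch
with a placeholder enum standing in for RISCVStackID:

    enum class StackID { Default, SGPRSpill, VGPRSpill };

    // Mirrors the selection in getFrameIndexReference above.
    int frameRegFor(StackID ID) {
      if (ID == StackID::VGPRSpill)
        return 4; // RISCV::X4, per-thread (vALU) stack
      if (ID == StackID::SGPRSpill)
        return 2; // RISCV::X2, sALU stack
      return 8;   // RISCV::X8, frame-pointer fallback
    }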
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
@@ -632,7 +636,7 @@ static unsigned estimateFunctionSizeInBytes(const MachineFunction &MF,
 // by the frame pointer.
 // Let eliminateCallFramePseudoInstr preserve stack space for it.
 bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  return !MF.getFrameInfo().hasVarSizedObjects();
+  return false;
 }
// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions. // Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions.
@@ -644,6 +648,9 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
   // Kernel and normal functions have different stack pointers for Ventus GPGPU.
   Register SPReg = RISCV::X4; // MFI->isEntryFunction() ? RISCV::X2 : RISCV::X4;
   DebugLoc DL = MI->getDebugLoc();
+  Register TPReg = getTPReg(STI);
+  const RISCVInstrInfo *TII = STI.getInstrInfo();
+  const RISCVRegisterInfo &RI = *STI.getRegisterInfo();

   if (!hasReservedCallFrame(MF)) {
     // If space has not been reserved for a call frame, ADJCALLSTACKDOWN and

@@ -660,9 +667,19 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
       if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
         Amount = -Amount;

-      const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
       RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
                    MachineInstr::NoFlags, getStackAlign());
+
+      // The value of TP is re-assigned to V32 at the end of the callee,
+      // which is in effect the TP value after ADJCALLSTACKUP, so the TP
+      // value after ADJCALLSTACKDOWN must also be re-assigned to V32 to keep
+      // it consistent with the TP value that has not been internally
+      // adjusted (i.e. excluding the initial TP adjustment) within the
+      // current function.
+      if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN)
+        BuildMI(MBB, MI, DL, TII->get(RISCV::VMV_V_X),
+                RI.getPrivateMemoryBaseRegister(MF))
+            .addReg(TPReg);
     }
   }
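A toy simulation of the consistency argument in the comment above; the
concrete values and the upward adjustment direction are illustrative
assumptions (the test below shows "addi tp, tp, 128" at the same point):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t TP = 0x2000;  // per-thread stack pointer
      uint32_t V32 = TP;     // private-memory base, set up in the prologue

      TP += 16;              // ADJCALLSTACKDOWN reserves the call frame
      V32 = TP;              // this patch: re-sync V32 with the adjusted TP
      // The callee's epilogue re-assigns V32 from its own restored TP, which
      // equals the caller's adjusted TP, so V32 stays consistent either way.
      uint32_t V32AfterCallee = TP;
      assert(V32 == V32AfterCallee);

      TP -= 16;              // ADJCALLSTACKUP releases the call frame
      return 0;
    }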
@@ -707,19 +724,20 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const {
   return 0;
 }

-uint64_t RISCVFrameLowering::getStackSize(MachineFunction &MF,
+uint64_t RISCVFrameLowering::getStackSize(const MachineFunction &MF,
                                           RISCVStackID::Value ID) const {
-  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
   uint64_t StackSize = 0;
-  for (int I = MFI.getObjectIndexBegin(); I != MFI.getObjectIndexEnd(); I++) {
+  for (int I = 0; I != MFI.getObjectIndexEnd(); I++) {
     if (static_cast<unsigned>(MFI.getStackID(I)) == ID) {
-      // Need to consider the alignment for different frame index
-      uint64_t Size = ((MFI.getObjectSize(I) + 3) >> 2) * 4;
-      StackSize += Size;
+      Align Alignment = MFI.getObjectAlign(I).value() <= 4 ?
+          Align(4) : MFI.getObjectAlign(I);
+      StackSize += MFI.getObjectSize(I);
+      StackSize = alignTo(StackSize, Alignment);
     }
   }
   return StackSize;
 }
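Unlike the old rounding, which forced every object size up to a 4-byte
multiple and ignored larger alignments, the new code lets alignments above 4
take effect. A worked comparison for an assumed 4-byte object that requires
8-byte alignment:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Size = 4, Alignment = 8;
      uint64_t OldSize = ((Size + 3) >> 2) * 4;                    // 4: alignment lost
      uint64_t NewSize = (Size + Alignment - 1) & ~(Alignment - 1); // 8: honored
      printf("old=%llu new=%llu\n", (unsigned long long)OldSize,
             (unsigned long long)NewSize);
      return 0;
    }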


@@ -68,7 +68,7 @@ public:
   bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;

   /// Get the stack size for a given stack ID.
-  uint64_t getStackSize(MachineFunction &MF, RISCVStackID::Value ID) const;
+  uint64_t getStackSize(const MachineFunction &MF, RISCVStackID::Value ID) const;

   /// Frame Objects:
   /// fi#0: id=4 size=48, align=4, at location [SP+8]
@@ -77,7 +77,7 @@ public:
   /// As we can see, if we split the stack, the frame offset calculations
   /// need to change as well: when calculating a TP stack offset, we have to
   /// extract the 'SP' stack portion of the machine function frame.
-  uint64_t getExtractedStackOffset(const MachineFunction &MF, unsigned FI,
-                                   RISCVStackID::Value Stack) const;
+  uint64_t getStackOffset(const MachineFunction &MF, int FI,
+                          RISCVStackID::Value Stack) const;

   /// Before inserting prolog/epilog information, set the stack ID for each
   /// frame index.


@@ -11787,8 +11787,12 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
     // type, instead of the scalable vector type.
     ValVT = LocVT;
   }
-  int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(),
+
+  // Just align to 4 bytes, because parameters larger than 4 bytes are split
+  // into 4-byte parameters.
+  int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), 0,
                                  /*IsImmutable=*/true);
+  MFI.setObjectAlignment(FI, Align(4));
   // This is essential for calculating the stack size for VGPRSpill.
   MFI.setStackID(FI, RISCVStackID::VGPRSpill);
   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);

@@ -11982,6 +11986,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
       RegInfo.addLiveIn(ArgRegs[I], Reg);
       SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
       FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
+      MFI.setObjectAlignment(FI, Align(4));
       MFI.setStackID(FI, RISCVStackID::VGPRSpill);
       // MFI.setStackID(FI, RISCVStackID::VGPRSpill);
       SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

@@ -12151,6 +12156,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
   SDValue StackPtr;
+
+  // Get the stack-frame adjustment performed before the call.
+  uint64_t CurrentFrameSize = Chain->getConstantOperandVal(1);
   for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     SDValue ArgValue = OutVals[i];

@@ -12256,11 +12264,13 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
       StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X4, PtrVT);
       SDValue Address =
           DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
-                      DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+                      DAG.getIntPtrConstant(-((int)VA.getLocMemOffset()
+                                              + CurrentFrameSize), DL));

       // Emit the store.
       MemOpChains.push_back(
-          DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+          DAG.getStore(Chain, DL, ArgValue, Address,
+                       MachinePointerInfo(RISCVAS::PRIVATE_ADDRESS)));
     }
   }
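With the call frame no longer reserved, each outgoing stack argument is now
stored at a negative offset from the already-adjusted TP. A worked example of
the new address formula with assumed numbers (a 128-byte call frame and a
stack argument at offset 0):

    #include <cstdio>

    int main() {
      int CurrentFrameSize = 128; // read from the ADJCALLSTACKDOWN node
      int LocMemOffset = 0;       // assigned by the calling convention
      // The argument lands LocMemOffset + CurrentFrameSize bytes below the
      // adjusted stack top.
      int Offset = -(LocMemOffset + CurrentFrameSize);
      printf("store at %d(tp)\n", Offset); // store at -128(tp)
      return 0;
    }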


@@ -1463,6 +1463,12 @@ std::string RISCVInstrInfo::createMIROperandComment(
   return Comment;
 }

+int RISCVInstrInfo::getSPAdjust(const MachineInstr &MI) const {
+  // FIXME: We don't need this value now, but relevant modifications can be
+  // added here once the PrologueInsert stage is optimized in the future.
+  return 0;
+}
+
 // Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
 bool RISCV::isSEXT_W(const MachineInstr &MI) {
   return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&


@@ -202,6 +202,8 @@ public:
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;

+  int getSPAdjust(const MachineInstr &MI) const;
+
 protected:
   const RISCVSubtarget &STI;
 };


@@ -0,0 +1,347 @@
; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=VENTUS %s
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) vscale_range(1,2048)
define dso_local <16 x double> @func(<16 x double> noundef %x, <16 x double> noundef %y) local_unnamed_addr {
; VENTUS: vsw.v v33, -4(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v34, -8(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v35, -12(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v36, -16(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v37, -20(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v38, -24(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v39, -28(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v40, -32(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v41, -36(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v42, -40(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v43, -44(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v44, -48(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v45, -52(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v46, -56(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v47, -60(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v48, -64(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v49, -68(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v50, -72(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v51, -76(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v52, -80(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v53, -84(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v54, -88(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v55, -92(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v56, -96(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v57, -100(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v58, -104(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v59, -108(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v60, -112(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v61, -116(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v62, -120(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v63, -124(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v64, -128(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v65, -132(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v66, -136(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v67, -140(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v68, -144(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v69, -148(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v70, -152(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v71, -156(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v72, -160(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v73, -164(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v74, -168(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v75, -172(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v76, -176(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v77, -180(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v78, -184(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v79, -188(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v80, -192(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v81, -196(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v82, -200(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v83, -204(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v84, -208(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v85, -212(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v86, -216(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v87, -220(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v88, -224(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v89, -228(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v90, -232(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v91, -236(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v92, -240(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v93, -244(v32) # 4-byte Folded Spill
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v94, -248(v32) # 4-byte Folded Spill
entry:
%add = fadd <16 x double> %x, %y
ret <16 x double> %add
}
; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) vscale_range(1,2048)
define dso_local ventus_kernel void @test_fn(ptr addrspace(1) nocapture noundef readonly align 128 %x, ptr addrspace(1) nocapture noundef readonly align 128 %y, ptr addrspace(1) nocapture noundef writeonly align 128 %dst) {
; VENTUS: addi tp, tp, 128
; VENTUS-NEXT: li t0, 4
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 2
; VENTUS-NEXT: vmv.v.x v66, t0
; VENTUS-NEXT: regext zero, zero, 80
; VENTUS-NEXT: vsw.v v34, 0(v66)
; VENTUS-NEXT: li t0, 8
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v65, 0(v34)
; VENTUS-NEXT: li t0, 12
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 136
; VENTUS-NEXT: vsw.v v64, 0(v34)
; VENTUS-NEXT: li t0, 16
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v63, 0(v34)
; VENTUS-NEXT: li t0, 20
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v62, 0(v34)
; VENTUS-NEXT: li t0, 24
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v61, 0(v34)
; VENTUS-NEXT: li t0, 28
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v60, 0(v34)
; VENTUS-NEXT: li t0, 32
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v59, 0(v34)
; VENTUS-NEXT: li t0, 36
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v58, 0(v34)
; VENTUS-NEXT: li t0, 40
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v57, 0(v34)
; VENTUS-NEXT: li t0, 44
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v56, 0(v34)
; VENTUS-NEXT: li t0, 48
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v55, 0(v34)
; VENTUS-NEXT: li t0, 52
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v54, 0(v34)
; VENTUS-NEXT: li t0, 56
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v53, 0(v34)
; VENTUS-NEXT: li t0, 60
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v52, 0(v34)
; VENTUS-NEXT: li t0, 64
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v51, 0(v34)
; VENTUS-NEXT: li t0, 68
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v50, 0(v34)
; VENTUS-NEXT: li t0, 72
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v49, 0(v34)
; VENTUS-NEXT: li t0, 76
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v48, 0(v34)
; VENTUS-NEXT: li t0, 80
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v47, 0(v34)
; VENTUS-NEXT: li t0, 84
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v46, 0(v34)
; VENTUS-NEXT: li t0, 88
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v45, 0(v34)
; VENTUS-NEXT: li t0, 92
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v44, 0(v34)
; VENTUS-NEXT: li t0, 96
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v43, 0(v34)
; VENTUS-NEXT: li t0, 100
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v42, 0(v34)
; VENTUS-NEXT: li t0, 104
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v41, 0(v34)
; VENTUS-NEXT: li t0, 108
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v40, 0(v34)
; VENTUS-NEXT: li t0, 112
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v39, 0(v34)
; VENTUS-NEXT: li t0, 116
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v38, 0(v34)
; VENTUS-NEXT: li t0, 120
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v37, 0(v34)
; VENTUS-NEXT: li t0, 124
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v36, 0(v34)
; VENTUS-NEXT: li t0, 128
; VENTUS-NEXT: sub t0, tp, t0
; VENTUS-NEXT: regext zero, zero, 1
; VENTUS-NEXT: vmv.v.x v34, t0
; VENTUS-NEXT: regext zero, zero, 72
; VENTUS-NEXT: vsw.v v35, 0(v34)
; VENTUS-NEXT: call _Z3minDv16_dS_
; VENTUS-NEXT: addi tp, tp, -128
entry:
%call = call i32 @_Z13get_global_idj(i32 noundef 0)
%arrayidx = getelementptr inbounds <16 x double>, ptr addrspace(1) %x, i32 %call
%0 = load <16 x double>, ptr addrspace(1) %arrayidx, align 128
%arrayidx1 = getelementptr inbounds <16 x double>, ptr addrspace(1) %y, i32 %call
%1 = load <16 x double>, ptr addrspace(1) %arrayidx1, align 128
%call2 = call <16 x double> @_Z3minDv16_dS_(<16 x double> noundef %0, <16 x double> noundef %1)
%arrayidx3 = getelementptr inbounds <16 x double>, ptr addrspace(1) %dst, i32 %call
store <16 x double> %call2, ptr addrspace(1) %arrayidx3, align 128
ret void
}
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare dso_local i32 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #2
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare dso_local <16 x double> @_Z3minDv16_dS_(<16 x double> noundef, <16 x double> noundef) local_unnamed_addr #2