From d32d735ea4d954b77719a8067a1008787322290f Mon Sep 17 00:00:00 2001 From: zhoujingya Date: Fri, 24 Nov 2023 12:09:21 +0800 Subject: [PATCH] [VENTUS][fix] Remove instructions not supported by hardware The removed instructions are listed below: * float load/store instructions * vfmv instruction * "Single-Width Floating-Point/Integer Type-Convert Instructions" in the RISC-V manual --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 15 --------- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 23 +------------- .../Target/RISCV/RISCVMakeCompressible.cpp | 9 ++---- .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 8 +---- llvm/lib/Target/RISCV/VentusInstrInfoF.td | 24 +++++++------- llvm/lib/Target/RISCV/VentusInstrInfoV.td | 17 +++++----- .../CodeGen/RISCV/VentusGPGPU/addr-space.ll | 4 +-- .../CodeGen/RISCV/VentusGPGPU/addr-space2.ll | 14 ++++----- .../test/CodeGen/RISCV/VentusGPGPU/bitcast.ll | 6 ++-- llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll | 3 +- .../RISCV/VentusGPGPU/builtin-noverify.ll | 4 +-- .../test/CodeGen/RISCV/VentusGPGPU/builtin.ll | 8 ++--- .../RISCV/VentusGPGPU/calling-convention.ll | 29 +++++++++-------- llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll | 19 ------------ .../RISCV/VentusGPGPU/function-call.ll | 4 +-- .../RISCV/VentusGPGPU/resource-usage.ll | 15 ++++----- .../test/CodeGen/RISCV/VentusGPGPU/var-arg.ll | 28 ++++++++--------- .../CodeGen/RISCV/VentusGPGPU/vbranch-join.ll | 31 ++++++++++--------- 18 files changed, 98 insertions(+), 163 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4a9e56e6d596..8cccbea60444 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2650,15 +2650,6 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case RISCV::PseudoLD: emitLoadStoreSymbol(Inst, RISCV::LD, IDLoc, Out, /*HasTmpReg=*/false); return false; - case RISCV::PseudoFLH: - emitLoadStoreSymbol(Inst, 
RISCV::FLH, IDLoc, Out, /*HasTmpReg=*/true); - return false; - case RISCV::PseudoFLW: - emitLoadStoreSymbol(Inst, RISCV::FLW, IDLoc, Out, /*HasTmpReg=*/true); - return false; - case RISCV::PseudoFLD: - emitLoadStoreSymbol(Inst, RISCV::FLD, IDLoc, Out, /*HasTmpReg=*/true); - return false; case RISCV::PseudoSB: emitLoadStoreSymbol(Inst, RISCV::SB, IDLoc, Out, /*HasTmpReg=*/true); return false; @@ -2674,12 +2665,6 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case RISCV::PseudoFSH: emitLoadStoreSymbol(Inst, RISCV::FSH, IDLoc, Out, /*HasTmpReg=*/true); return false; - case RISCV::PseudoFSW: - emitLoadStoreSymbol(Inst, RISCV::FSW, IDLoc, Out, /*HasTmpReg=*/true); - return false; - case RISCV::PseudoFSD: - emitLoadStoreSymbol(Inst, RISCV::FSD, IDLoc, Out, /*HasTmpReg=*/true); - return false; case RISCV::PseudoAddTPRel: if (checkPseudoAddTPRel(Inst, Operands)) return true; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 77a902da1302..c3929b47530c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -75,12 +75,9 @@ unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, case RISCV::LBU: case RISCV::LH: case RISCV::LHU: - case RISCV::FLH: case RISCV::LW: - case RISCV::FLW: case RISCV::LWU: case RISCV::LD: - case RISCV::FLD: case RISCV::VLW: case RISCV::VLH: case RISCV::VLB: @@ -123,10 +120,7 @@ unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI, case RISCV::SB: case RISCV::SH: case RISCV::SW: - case RISCV::FSH: - case RISCV::FSW: case RISCV::SD: - case RISCV::FSD: case RISCV::VSW: case RISCV::VSH: break; @@ -184,10 +178,7 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // sGPRF32 -> vGPR move if (RISCV::GPRF32RegClass.contains(SrcReg) && RISCV::VGPRRegClass.contains(DstReg)) { - BuildMI(MBB, MBBI, DL, get(RISCV::VFMV_S_F), DstReg) - .addReg(DstReg, RegState::Undef) - .addReg(SrcReg, 
getKillRegState(KillSrc)); - return; + llvm_unreachable("Not supported by HW, use vmv.v.x instead."); } // Handle copy from csr @@ -238,12 +229,6 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::SW : RISCV::SD; - } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FSH; - } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FSW; - } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FSD; } else if (RISCV::VGPRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::VSW; } else @@ -281,12 +266,6 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::LW : RISCV::LD; - } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FLH; - } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FLW; - } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FLD; } else if (RISCV::VGPRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::VLW; } else diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index 42d2943c6787..a9b4e49af9e6 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -103,12 +103,9 @@ static unsigned log2LdstWidth(unsigned Opcode) { llvm_unreachable("Unexpected opcode"); case RISCV::LW: case RISCV::SW: - case RISCV::FLW: - case RISCV::FSW: return 2; case RISCV::LD: case RISCV::SD: - case RISCV::FLD: case RISCV::FSD: return 3; } @@ -147,8 +144,7 @@ static bool isCompressibleLoad(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) || - Opcode == RISCV::LD || Opcode == 
RISCV::FLD; + return Opcode == RISCV::LW || Opcode == RISCV::LD; } // Return true if MI is a store for which there exists a compressed version. @@ -156,8 +152,7 @@ static bool isCompressibleStore(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) || - Opcode == RISCV::SD || Opcode == RISCV::FSD; + return Opcode == RISCV::SW || Opcode == RISCV::SD; } // Find a single register and/or large offset which, if compressible, would diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index a57635abb7e0..2d05077663b3 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -355,16 +355,10 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, case RISCV::LHU: case RISCV::LWU: case RISCV::LD: - case RISCV::FLH: - case RISCV::FLW: - case RISCV::FLD: case RISCV::SB: case RISCV::SH: case RISCV::SW: - case RISCV::SD: - case RISCV::FSH: - case RISCV::FSW: - case RISCV::FSD: { + case RISCV::SD: { if (UseMI.getOperand(1).isFI()) return false; // Register defined by Lo should not be the value register. diff --git a/llvm/lib/Target/RISCV/VentusInstrInfoF.td b/llvm/lib/Target/RISCV/VentusInstrInfoF.td index fefe36189e99..46f3831f2ef3 100644 --- a/llvm/lib/Target/RISCV/VentusInstrInfoF.td +++ b/llvm/lib/Target/RISCV/VentusInstrInfoF.td @@ -341,11 +341,13 @@ class PseudoVFROUND //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZfinx] in { -def FLW : FPLoad_r<0b010, "flw", GPRF32, WriteFLD32>; -// Operands for stores are in the order srcreg, base, offset rather than -// reflecting the order these fields are specified in the instruction -// encoding. 
-def FSW : FPStore_r<0b010, "fsw", GPRF32, WriteFST32>; +/// Loads +def : Pat<(f32 (load (AddrRegImm (XLenVT GPR:$rs1), simm12:$imm12))), + (COPY_TO_REGCLASS (LW GPR:$rs1, simm12:$imm12), GPRF32)>; + +/// Stores +def : Pat<(store (f32 FPR32INX:$rs2), (AddrRegImm (XLenVT GPR:$rs1), simm12:$imm12)), + (SW (COPY_TO_REGCLASS FPR32INX:$rs2, GPR), GPR:$rs1, simm12:$imm12)>; } // Predicates = [HasStdExtZfinx] let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { @@ -450,8 +452,8 @@ defm : FPUnaryOpDynFrmAlias_m; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZfinx] in { -def : InstAlias<"flw $rd, (${rs1})", (FLW GPRF32:$rd, GPR:$rs1, 0), 0>; -def : InstAlias<"fsw $rs2, (${rs1})", (FSW GPRF32:$rs2, GPR:$rs1, 0), 0>; +// def : InstAlias<"flw $rd, (${rs1})", (FLW GPRF32:$rd, GPR:$rs1, 0), 0>; +// def : InstAlias<"fsw $rs2, (${rs1})", (FSW GPRF32:$rs2, GPR:$rs1, 0), 0>; def : InstAlias<"fmv.s $rd, $rs", (FSGNJ_S GPRF32:$rd, GPRF32:$rs, GPRF32:$rs)>; def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S GPRF32:$rd, GPRF32:$rs, GPRF32:$rs)>; @@ -636,13 +638,13 @@ defm Select_FPR32 : SelectCC_GPR_rrirr; def PseudoVFROUND_S : PseudoVFROUND; def PseudoFROUND_S : PseudoFROUND; -/// Loads +// /// Loads -defm : UniformLdPat; +// defm : UniformLdPat; -/// Stores +// /// Stores -defm : UniformStPat; +// defm : UniformStPat; } // Predicates = [HasStdExtZfinx] diff --git a/llvm/lib/Target/RISCV/VentusInstrInfoV.td b/llvm/lib/Target/RISCV/VentusInstrInfoV.td index 8f396aa0e95d..984352f02112 100644 --- a/llvm/lib/Target/RISCV/VentusInstrInfoV.td +++ b/llvm/lib/Target/RISCV/VentusInstrInfoV.td @@ -1134,8 +1134,9 @@ defm VFCVT_XU_F_V : VCVTI_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>; defm VFCVT_X_F_V : VCVTI_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>; } // Follow the way by RISCVInstrInfoF -defm VFCVT_RTZ_XU_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>; -defm VFCVT_RTZ_X_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.x.f.v", 
0b010010, 0b00111>; +// TODO: later support +// defm VFCVT_RTZ_XU_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>; +// defm VFCVT_RTZ_X_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>; let Uses = [FRM] in { defm VFCVT_F_XU_V : VCVTF_IV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>; defm VFCVT_F_X_V : VCVTF_IV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>; @@ -1336,10 +1337,10 @@ defm : PatFloatSetCC<[VGPR, GPRF32], [SETOLE, SETLE], VMFLE_VV>; defm : PatFloatSetCC<[VGPR, GPRF32], [SETOGT, SETGT], VMFGT_VF>; defm : PatFloatSetCC<[VGPR, GPRF32], [SETOGE, SETGE], VMFGE_VF>; -def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), - (VFCVT_RTZ_X_F_V (f32 VGPR:$rs1), $frm)>; -def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), - (VFCVT_RTZ_XU_F_V (f32 VGPR:$rs1), $frm)>; +// def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), +// (VFCVT_RTZ_X_F_V (f32 VGPR:$rs1), $frm)>; +// def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), +// (VFCVT_RTZ_XU_F_V (f32 VGPR:$rs1), $frm)>; def : PatFXConvert, [XLenVT, f32], VFCVT_X_F_V>; def : PatFXConvert, @@ -1495,6 +1496,6 @@ def : Pat<(XLenVT (DivergentBinFrag (XLenVT VGPR:$rs1), uimm12:$imm)), // There already has patterns defined in VentusInstrInfo.td let Predicates = [HasStdExtZfinx] in { -// def : Pat<(f32 (bitconvert (i32 GPR:$src))), (VMV_V_X GPR:$src)>; -def : Pat<(i32 (bitconvert GPRF32:$src)), (VFMV_V_F GPRF32:$src)>; +def : Pat<(f32 (bitconvert (i32 GPR:$src))), (VMV_V_X GPR:$src)>; +// def : Pat<(i32 (bitconvert GPRF32:$src)), (VFMV_V_F GPRF32:$src)>; } // Predicates = [HasStdExtZfinx] diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll index 41407866feb9..97f21e5f6ff1 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll @@ -11,7 +11,7 @@ define dso_local ventus_kernel void @func(ptr addrspace(1) nocapture noundef ali ; VENTUS-NEXT: 
.cfi_def_cfa_offset 4 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 4 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: lw t0, 0(a0) @@ -35,7 +35,7 @@ define dso_local ventus_kernel void @func(ptr addrspace(1) nocapture noundef ali ; VENTUS-NEXT: vlw12.v v2, 0(v1) ; VENTUS-NEXT: vadd.vv v0, v2, v0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -12 ; VENTUS-NEXT: addi tp, tp, -4 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll index eab09cbda1b1..a090733ba98a 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll @@ -13,7 +13,7 @@ define ventus_kernel void @foo(ptr addrspace(1) noundef align 4 %out) { ; VENTUS-NEXT: .cfi_def_cfa_offset 24 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 4 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: lw t0, 0(a0) @@ -58,8 +58,9 @@ define ventus_kernel void @foo(ptr addrspace(1) noundef align 4 %out) { ; VENTUS-NEXT: vadd.vv v0, v33, v0 ; VENTUS-NEXT: vsw12.v v1, 0(v0) ; VENTUS-NEXT: .LBB0_3: # %if.end -; VENTUS-NEXT: join -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: # Label of block must be emitted +; VENTUS-NEXT: join zero, zero, 0 +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: addi tp, tp, -24 ; VENTUS-NEXT: ret @@ -241,10 +242,9 @@ define dso_local ventus_kernel void @local_memmory1(ptr addrspace(3) nocapture n ; VENTUS-LABEL: local_memmory1: ; VENTUS: # %bb.0: # %entry ; 
VENTUS-NEXT: lw t0, 0(a0) -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v1, 0(v0) -; VENTUS-NEXT: vadd.vi v1, v1, 1 -; VENTUS-NEXT: vsw12.v v1, 0(v0) +; VENTUS-NEXT: lw t1, 0(t0) +; VENTUS-NEXT: addi t1, t1, 1 +; VENTUS-NEXT: sw t1, 0(t0) ; VENTUS-NEXT: ret entry: %0 = load i32, ptr addrspace(3) %b, align 4 diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll index 19a9281b4f5e..f9a21a444f04 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll @@ -6,12 +6,12 @@ define dso_local ventus_kernel void @bitcast(float noundef %a, ptr addrspace(5) ; VENTUS-LABEL: bitcast: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lw t0, 8(a0) -; VENTUS-NEXT: flw t1, 0(a0) +; VENTUS-NEXT: lw t1, 0(a0) ; VENTUS-NEXT: lw t2, 4(a0) -; VENTUS-NEXT: vfmv.s.f v0, t1 +; VENTUS-NEXT: vmv.v.x v0, t1 ; VENTUS-NEXT: vmv.v.x v1, t2 ; VENTUS-NEXT: vsw.v v0, 0(v1) -; VENTUS-NEXT: fsw t1, 0(t0) +; VENTUS-NEXT: sw t1, 0(t0) ; VENTUS-NEXT: ret entry: %conv = bitcast float %a to i32 diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll index c69ba586d903..26ad7ded1865 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll @@ -14,7 +14,8 @@ define i32 @foo(i32 noundef %cond, i32 noundef %a, i32 noundef %b, i32 noundef % ; VENTUS-NEXT: # %bb.1: ; VENTUS-NEXT: vrsub.vi v3, v3, 0 ; VENTUS-NEXT: .LBB0_2: # %entry -; VENTUS-NEXT: join +; VENTUS-NEXT: # Label of block must be emitted +; VENTUS-NEXT: join zero, zero, 0 ; VENTUS-NEXT: vmadd.vv v2, v1, v3 ; VENTUS-NEXT: vadd.vx v0, v2, zero ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll index f1537d70be48..bb3343f7c922 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll +++ 
b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll @@ -11,7 +11,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: .cfi_def_cfa_offset 8 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 8 ; VENTUS-NEXT: .cfi_offset v33.l, 4 ; VENTUS-NEXT: .cfi_offset v34.l, 0 @@ -30,7 +30,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: vlw12.v v2, 0(v0) ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll index 9ca6df0f8cbc..65e5569a56ea 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll @@ -7,7 +7,7 @@ define ventus_kernel void @foo_ker(ptr addrspace(1) nocapture noundef align 4 %A ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 12 ; VENTUS-NEXT: .cfi_def_cfa_offset 12 -; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 0(a0) ; VENTUS-NEXT: sw t0, -8(sp) # 4-byte Folded Spill @@ -24,7 +24,7 @@ define ventus_kernel void @foo_ker(ptr addrspace(1) nocapture noundef align 4 %A ; VENTUS-NEXT: vlw12.v v2, 0(v0) ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -12 ; VENTUS-NEXT: ret entry: @@ -47,7 +47,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: 
.cfi_def_cfa_offset 8 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 8 ; VENTUS-NEXT: .cfi_offset v33.l, 4 ; VENTUS-NEXT: .cfi_offset v34.l, 0 @@ -66,7 +66,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: vlw12.v v2, 0(v0) ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll index ab2ee139f621..453f7c4f2c1a 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll @@ -12,7 +12,7 @@ define dso_local ventus_kernel void @kernel_calling_convention(ptr addrspace(1) ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 16 ; VENTUS-NEXT: .cfi_def_cfa_offset 16 -; VENTUS-NEXT: sw ra, -16(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 4(a0) ; VENTUS-NEXT: sw t0, -12(sp) # 4-byte Folded Spill @@ -30,13 +30,12 @@ define dso_local ventus_kernel void @kernel_calling_convention(ptr addrspace(1) ; VENTUS-NEXT: vadd.vx v0, v0, t1 ; VENTUS-NEXT: vmv.v.x v1, s0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw t0, -12(sp) # 4-byte Folded Reload -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v1, 0(v0) -; VENTUS-NEXT: lw t0, 0(t2) -; VENTUS-NEXT: vadd.vx v1, v1, t0 -; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t0, 0(s0) +; VENTUS-NEXT: lw t2, 0(t2) +; VENTUS-NEXT: add t0, 
t2, t0 +; VENTUS-NEXT: sw t0, 0(s0) +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -16 ; VENTUS-NEXT: ret entry: @@ -82,16 +81,16 @@ entry: define dso_local i32 @non_kernel_calling_convention(ptr nocapture noundef readonly %a1, ptr nocapture noundef readonly %a2, ptr nocapture noundef readonly %a3, ptr nocapture noundef readonly %a4, ptr nocapture noundef readonly %a5, ptr nocapture noundef readonly %a6, ptr nocapture noundef readonly %a7, ptr nocapture noundef readonly %a8, ptr nocapture noundef readonly %a9, ptr nocapture noundef readonly %a10, ptr nocapture noundef readonly %a11, ptr nocapture noundef readonly %a12, ptr nocapture noundef readonly %a13, ptr nocapture noundef readonly %a14, ptr nocapture noundef readonly %a15, ptr nocapture noundef readonly %a16, ptr nocapture noundef readonly %a17, ptr nocapture noundef readonly %a18, ptr nocapture noundef readonly %a19, ptr nocapture noundef readonly %a20, ptr nocapture noundef readonly %a21, ptr nocapture noundef readonly %a22, ptr nocapture noundef readonly %a23, ptr nocapture noundef readonly %a24, ptr nocapture noundef readonly %a25, ptr nocapture noundef readonly %a26, ptr nocapture noundef readonly %a27, ptr nocapture noundef readonly %a28, ptr nocapture noundef readonly %a29, ptr nocapture noundef readonly %a30, ptr nocapture noundef readonly %a31, ptr nocapture noundef readonly %a32, ptr addrspace(3) nocapture noundef readonly %a33, ptr addrspace(5) nocapture noundef readonly %a34) local_unnamed_addr #2 { ; VENTUS-LABEL: non_kernel_calling_convention: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: addi tp, tp, 28 -; VENTUS-NEXT: .cfi_def_cfa_offset 28 +; VENTUS-NEXT: addi tp, tp, 16 +; VENTUS-NEXT: .cfi_def_cfa_offset 16 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp ; VENTUS-NEXT: .cfi_offset v33.l, 4 ; VENTUS-NEXT: .cfi_offset v34.l, 0 ; VENTUS-NEXT: regext zero, zero, 9 -; VENTUS-NEXT: vlw.v v33, -24(v32) +; VENTUS-NEXT: vlw.v v33, -12(v32) ; 
VENTUS-NEXT: regext zero, zero, 9 -; VENTUS-NEXT: vlw.v v34, -28(v32) +; VENTUS-NEXT: vlw.v v34, -16(v32) ; VENTUS-NEXT: vlw12.v v0, 0(v0) ; VENTUS-NEXT: vlw12.v v1, 0(v1) ; VENTUS-NEXT: vlw12.v v2, 0(v2) @@ -161,7 +160,7 @@ define dso_local i32 @non_kernel_calling_convention(ptr nocapture noundef readon ; VENTUS-NEXT: vadd.vv v0, v0, v1 ; VENTUS-NEXT: vadd.vv v0, v0, v2 ; VENTUS-NEXT: vadd.vv v0, v0, v3 -; VENTUS-NEXT: addi tp, tp, -28 +; VENTUS-NEXT: addi tp, tp, -16 ; VENTUS-NEXT: ret entry: %0 = load i32, ptr %a1, align 4 @@ -270,7 +269,7 @@ define dso_local i32 @test_add(ptr nocapture noundef readonly %a, ptr nocapture ; VENTUS-NEXT: .cfi_def_cfa_offset 8 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: vlw12.v v0, 0(v0) ; VENTUS-NEXT: vadd.vi v0, v0, 1 @@ -288,7 +287,7 @@ define dso_local i32 @test_add(ptr nocapture noundef readonly %a, ptr nocapture ; VENTUS-NEXT: regext zero, zero, 8 ; VENTUS-NEXT: vlw.v v1, -8(v32) ; VENTUS-NEXT: vadd.vv v0, v1, v0 -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll index b93930235c91..51fa6ec4d020 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll @@ -466,25 +466,6 @@ define float @fsgnjnx_v(float %a) nounwind { ret float %1 } -define i32 @fcvt_rtz_x_f_v(float %a) nounwind { -; VENTUS-LABEL: fcvt_rtz_x_f_v: -; VENTUS: # %bb.0: -; VENTUS-NEXT: vfcvt.rtz.x.f.v v0, v0 -; VENTUS-NEXT: ret - %1 = call float @llvm.trunc.f32(float %a) - %conv = fptosi float %1 to i32 - ret i32 %conv -} - -define i32 @fcvt_rtz_xu_f_v(float %x) { -; VENTUS-LABEL: fcvt_rtz_xu_f_v: -; VENTUS: 
# %bb.0: -; VENTUS-NEXT: vfcvt.rtz.xu.f.v v0, v0 -; VENTUS-NEXT: ret - %a = call float @llvm.trunc.f32(float %x) - %b = fptoui float %a to i32 - ret i32 %b -} @global_val = dso_local global float 0x3FF547AE20000000, align 4 declare float @llvm.sqrt.f32(float %Val) diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll index 8e0dca08af1c..40a944d2e4aa 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll @@ -25,7 +25,7 @@ define dso_local ventus_kernel void @foo(i32 noundef %a, i32 noundef %b, ptr add ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 8(a0) ; VENTUS-NEXT: sw t0, -4(sp) # 4-byte Folded Spill @@ -37,7 +37,7 @@ define dso_local ventus_kernel void @foo(i32 noundef %a, i32 noundef %b, ptr add ; VENTUS-NEXT: lw t0, -4(sp) # 4-byte Folded Reload ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll index 6cc3e1a9a89d..4705bce7008d 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll @@ -11,17 +11,14 @@ define dso_local ventus_kernel void @usage(ptr addrspace(1) nocapture noundef al ; VENTUS-LABEL: usage: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 4 -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: lw t0, 4(a0) ; VENTUS-NEXT: lw t1, 0(a0) -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: 
vlw12.v v0, 0(v0) -; VENTUS-NEXT: lw t0, 0(t1) -; VENTUS-NEXT: vadd.vx v0, v0, t0 -; VENTUS-NEXT: vmv.v.x v1, t1 -; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload -; VENTUS-NEXT: barrier x0, x0, 1 +; VENTUS-NEXT: lw t0, 0(t0) +; VENTUS-NEXT: lw t2, 0(t1) +; VENTUS-NEXT: add t0, t2, t0 +; VENTUS-NEXT: sw t0, 0(t1) +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll index 8520cc25bf11..1c23a2dcaa7f 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll @@ -13,23 +13,23 @@ target triple = "riscv32" define dso_local i32 @printf(ptr addrspace(2) noundef %fmt, ...) { ; VENTUS-LABEL: printf: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: addi tp, tp, 64 -; VENTUS-NEXT: .cfi_def_cfa_offset 64 +; VENTUS-NEXT: addi tp, tp, 40 +; VENTUS-NEXT: .cfi_def_cfa_offset 40 ; VENTUS-NEXT: vmv.v.x v8, tp -; VENTUS-NEXT: vsw.v v7, -60(v8) -; VENTUS-NEXT: vsw.v v6, -56(v8) -; VENTUS-NEXT: vsw.v v5, -52(v8) -; VENTUS-NEXT: vsw.v v4, -48(v8) -; VENTUS-NEXT: vsw.v v3, -44(v8) -; VENTUS-NEXT: vsw.v v2, -40(v8) -; VENTUS-NEXT: vsw.v v1, -36(v8) -; VENTUS-NEXT: addi t0, tp, -36 +; VENTUS-NEXT: vsw.v v7, -36(v8) +; VENTUS-NEXT: vsw.v v6, -32(v8) +; VENTUS-NEXT: vsw.v v5, -28(v8) +; VENTUS-NEXT: vsw.v v4, -24(v8) +; VENTUS-NEXT: vsw.v v3, -20(v8) +; VENTUS-NEXT: vsw.v v2, -16(v8) +; VENTUS-NEXT: vsw.v v1, -12(v8) +; VENTUS-NEXT: addi t0, tp, -12 ; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vsw.v v0, -36(v8) -; VENTUS-NEXT: addi t0, tp, -32 +; VENTUS-NEXT: vsw.v v0, -12(v8) +; VENTUS-NEXT: addi t0, tp, -8 ; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vsw.v v0, -36(v8) -; VENTUS-NEXT: addi tp, tp, -64 +; VENTUS-NEXT: vsw.v v0, -12(v8) +; VENTUS-NEXT: addi tp, tp, -40 ; VENTUS-NEXT: ret entry: %retval = alloca i32, align 4, addrspace(5) diff 
--git a/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll index 06d3e9d28165..224900905e84 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll @@ -8,7 +8,7 @@ define dso_local i32 @branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 4 ; VENTUS-NEXT: .cfi_def_cfa_offset 4 -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: vmv.v.x v0, zero ; VENTUS-NEXT: call _Z13get_global_idj @@ -37,7 +37,7 @@ define dso_local i32 @branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 ; VENTUS-NEXT: vadd.vx v0, v1, zero -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: ret entry: @@ -63,7 +63,7 @@ define dso_local ventus_kernel void @loop_branch(ptr addrspace(1) nocapture noun ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: sw a0, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero @@ -88,7 +88,7 @@ define dso_local ventus_kernel void @loop_branch(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: # =>This Inner Loop Header: Depth=1 ; VENTUS-NEXT: vlw12.v v4, 0(v3) ; VENTUS-NEXT: vadd.vv v2, v2, v4 -; VENTUS-NEXT: vadd.vi v0, v0, -1 +; VENTUS-NEXT: vsub12.vi v0, v0, 1 ; VENTUS-NEXT: vsw12.v v2, 0(v1) ; VENTUS-NEXT: .Lpcrel_hi3: ; VENTUS-NEXT: auipc t1, %pcrel_hi(.LBB1_3) @@ -97,7 +97,7 @@ define dso_local ventus_kernel void @loop_branch(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: .LBB1_3: # %for.cond.cleanup ; VENTUS-NEXT: # Label of 
block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: @@ -134,7 +134,7 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: .cfi_def_cfa_offset 4 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 4 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: vmv.v.x v0, zero @@ -169,6 +169,7 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: vblt v0, v33, .LBB2_5 ; VENTUS-NEXT: # %bb.3: # %if.then2 ; VENTUS-NEXT: li t0, 23 +; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: j .LBB2_6 ; VENTUS-NEXT: .LBB2_4: # %if.end7 ; VENTUS-NEXT: li t0, 4 @@ -177,14 +178,14 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: j .LBB2_7 ; VENTUS-NEXT: .LBB2_5: ; VENTUS-NEXT: li t0, 12 +; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: .LBB2_6: # %cleanup9 ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: .LBB2_7: # %cleanup9 ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -4 ; VENTUS-NEXT: ret @@ -218,7 +219,7 @@ define dso_local ventus_kernel void @double_loop(ptr addrspace(1) nocapture noun ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: sw a0, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x 
v0, zero @@ -249,7 +250,7 @@ define dso_local ventus_kernel void @double_loop(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: # => This Inner Loop Header: Depth=2 ; VENTUS-NEXT: vlw12.v v5, 0(v3) ; VENTUS-NEXT: vadd.vv v2, v2, v5 -; VENTUS-NEXT: vadd.vi v4, v4, -1 +; VENTUS-NEXT: vsub12.vi v4, v4, 1 ; VENTUS-NEXT: vsw12.v v2, 0(v1) ; VENTUS-NEXT: .Lpcrel_hi8: ; VENTUS-NEXT: auipc t1, %pcrel_hi(.LBB3_4) @@ -268,7 +269,7 @@ define dso_local ventus_kernel void @double_loop(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: .LBB3_5: # %for.cond.cleanup ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: @@ -312,7 +313,7 @@ define dso_local ventus_kernel void @loop_switch(ptr addrspace(1) nocapture noun ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: sw a0, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero @@ -374,7 +375,7 @@ define dso_local ventus_kernel void @loop_switch(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: .LBB4_9: # %for.cond.cleanup ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: @@ -426,7 +427,7 @@ define dso_local i32 @_Z13get_global_idj(i32 noundef %dim) local_unnamed_addr { ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 4 ; VENTUS-NEXT: .cfi_def_cfa_offset 4 -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: li t0, 2 ; VENTUS-NEXT: vmv.v.x v1, t0 @@ -461,7 +462,7 
@@ define dso_local i32 @_Z13get_global_idj(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: .LBB5_7: # %return ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: ret entry: