diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 23202937f18e..b85c1b40f6c2 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -300,7 +300,8 @@ getNonLibcallCSI(const MachineFunction &MF, // TODO: For now, we don't define VGPR callee saved registers, when we later // add VGPR callee saved register, remember to modify here if (FI >= 0 && (MFI.getStackID(FI) == RISCVStackID::Default || - MFI.getStackID(FI) == RISCVStackID::SGPRSpill)) + MFI.getStackID(FI) == RISCVStackID::SGPRSpill || + MFI.getStackID(FI) == RISCVStackID::VGPRSpill)) NonLibcallCSI.push_back(CS); } @@ -505,18 +506,23 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, emitSCSEpilogue(MF, MBB, MBBI, DL); } -uint64_t RISCVFrameLowering::getExtractedStackOffset(const MachineFunction &MF, - unsigned FI, RISCVStackID::Value Stack) const { +uint64_t RISCVFrameLowering::getStackOffset(const MachineFunction &MF, + unsigned FI, + RISCVStackID::Value Stack) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); uint64_t StackSize = 0; - for(int I = FI + 1; I != MFI.getObjectIndexEnd(); I++) { - if(static_cast(MFI.getStackID(I)) != Stack) { + for (int I = MFI.getObjectIndexBegin(); I != (int)FI + 1; I++) { + if (static_cast(MFI.getStackID(I)) == Stack) { // Need to consider the alignment for different frame index - uint64_t Size = MFI.getObjectSize(I); - StackSize += Size; + Align Alignment = + MFI.getObjectAlign(I).value() <= 4 ? Align(4) : MFI.getObjectAlign(I); + uint64_t AlignedSize = alignTo(MFI.getObjectSize(I), Alignment); + StackSize += AlignedSize; } } - return StackSize; + return alignTo(StackSize, MFI.getObjectAlign(FI).value() <= 4 + ? 
Align(4) + : MFI.getObjectAlign(FI)); } StackOffset @@ -536,33 +542,11 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, StackID == RISCVStackID::SGPRSpill || StackID == RISCVStackID::VGPRSpill) && "Unexpected stack ID for the frame object."); - uint8_t Stack = MFI.getStackID(FI); - StackOffset Offset = - StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() - -getExtractedStackOffset(MF, FI, RISCVStackID::Value(Stack)) - + MFI.getOffsetAdjustment()); - - // Different stacks for sALU and vALU threads. - FrameReg = StackID == RISCVStackID::SGPRSpill ? RISCV::X2 : RISCV::X4; - - if (CSI.size()) { - // For callee saved registers - MinCSFI = CSI[0].getFrameIdx(); - MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); - if (FI >= MinCSFI && FI <= MaxCSFI) { - Offset -= StackOffset::getFixed(RVFI->getVarArgsSaveSize()); - return Offset; - } - } - // TODO: This only saves sGPR CSRs, as we haven't define vGPR CSRs - // within getNonLibcallCSI. - // if (FI >= MinCSFI && FI <= MaxCSFI) { - Offset -= StackOffset::getFixed( - getStackSize(const_cast(MF), - (RISCVStackID::Value)StackID)); - return Offset; + FrameReg = StackID == RISCVStackID::VGPRSpill ? RISCV::X4 : RISCV::X2; + return -StackOffset::getFixed( + getStackOffset(MF, FI, (RISCVStackID::Value)StackID)); } void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, @@ -702,16 +686,17 @@ uint64_t RISCVFrameLowering::getStackSize(MachineFunction &MF, RISCVStackID::Value ID) const { MachineFrameInfo &MFI = MF.getFrameInfo(); uint64_t StackSize = 0; - + Align Alignment = Align(4); for(int I = MFI.getObjectIndexBegin(); I != MFI.getObjectIndexEnd(); I++) { if(static_cast(MFI.getStackID(I)) == ID) { - // Need to consider the alignment for different frame index - uint64_t Size = ((MFI.getObjectSize(I) + 3) >> 2) * 4; - StackSize += Size; + // FIXME: rounds to 4 bytes, but getStackOffset rounds to object alignment
+ StackSize += ((MFI.getObjectSize(I) + 3) >> 2) * 4; + // Get frame object largest alignment + Alignment = std::max(MFI.getObjectAlign(I), Alignment); } - } - return StackSize; + // FIXME: maybe this alignment is too simple? + return alignTo(StackSize, Alignment); } void RISCVFrameLowering::determineStackID(MachineFunction &MF) const { @@ -760,17 +745,17 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - // TODO: Have we allocated stack for vGPR spilling? if(Reg.id() < RISCV::V0 || Reg.id() > RISCV::V255) { MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::SGPRSpill); // FIXME: Right now, no vgpr callee saved register, maybe later needed TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(), RC, TRI); + } else { + assert(Reg.id() >= RISCV::V32 && Reg.id() <= RISCV::V255 && "TODO"); + MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::VGPRSpill); + TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(), + RC, TRI); } - // else { - // FIXME: Right now, no callee saved register for VGPR - // MF->getFrameInfo().setStackID(CS.getFrameIdx(), RISCVStackID::VGPRSpill); - // } } return true; @@ -798,8 +783,7 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : NonLibcallCSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - if(Reg.id() < RISCV::V0 || Reg.id() > RISCV::V255 ) - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index e921c2f33923..fd7094ed4669 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -70,14 +70,22 @@ 
public: /// Get stack size for different stack ID uint64_t getStackSize(MachineFunction &MF, RISCVStackID::Value ID) const; + + /// Calculate frame object's stack offset /// Frame Objects: /// fi#0: id=4 size=48, align=4, at location [SP+8] /// fi#1: id=1 size=4, align=4, at location [SP+4] \ /// fi#2: id=1 size=4, align=4, at location [SP] \ + /// fi#3: id=4 size=4, align=4, at location [SP+16] \ /// As we can see, if we split the stack, different frame offset calculation - /// need to be modified too, when calculate the TP stack offset, we need to - /// extract the stack offset of 'SP' in machine function frame - uint64_t getExtractedStackOffset(const MachineFunction &MF, unsigned FI, + /// need to be modified too. The basic routine is as follows: + /// 1st: Mark all the frame objects and give each one a stack identifier; in + /// Ventus, they will be: Default, VGPR, SGPR + /// + /// 2nd: Calculate the frame offset per stack identifier. Unlike the + /// traditional RISC-V stack frame offset calculation, we simplify this + /// procedure: we do not have to care about RVV, etc. + uint64_t getStackOffset(const MachineFunction &MF, unsigned FI, RISCVStackID::Value Stack) const; /// Before insert prolog/epilog information, set stack ID for each frame index diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll index 41407866feb9..330b59e20e23 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll @@ -12,12 +12,14 @@ define dso_local ventus_kernel void @func(ptr addrspace(1) nocapture noundef ali ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp ; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill -; VENTUS-NEXT: .cfi_offset ra, 4 +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v33, -4(v32) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: lw t0, 0(a0) -; VENTUS-NEXT: sw 
t0, -8(sp) # 4-byte Folded Spill -; VENTUS-NEXT: lw t0, 4(a0) ; VENTUS-NEXT: sw t0, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: lw t0, 4(a0) +; VENTUS-NEXT: sw t0, -8(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero ; VENTUS-NEXT: call _Z13get_global_idj ; VENTUS-NEXT: regext zero, zero, 1 @@ -25,17 +27,19 @@ define dso_local ventus_kernel void @func(ptr addrspace(1) nocapture noundef ali ; VENTUS-NEXT: vmv.v.x v0, zero ; VENTUS-NEXT: call _Z12get_local_idj ; VENTUS-NEXT: vsll.vi v0, v0, 2 -; VENTUS-NEXT: lw t1, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t1, -8(sp) # 4-byte Folded Reload ; VENTUS-NEXT: vadd.vx v0, v0, t1 ; VENTUS-NEXT: vlw12.v v0, 0(v0) ; VENTUS-NEXT: regext zero, zero, 64 ; VENTUS-NEXT: vsll.vi v1, v33, 2 -; VENTUS-NEXT: lw t0, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t0, -4(sp) # 4-byte Folded Reload ; VENTUS-NEXT: vadd.vx v1, v1, t0 ; VENTUS-NEXT: vlw12.v v2, 0(v1) ; VENTUS-NEXT: vadd.vv v0, v2, v0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) ; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v33, -4(v32) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -12 ; VENTUS-NEXT: addi tp, tp, -4 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll index eab09cbda1b1..cba0e1c494b3 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll @@ -14,16 +14,18 @@ define ventus_kernel void @foo(ptr addrspace(1) noundef align 4 %out) { ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp ; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill -; VENTUS-NEXT: .cfi_offset ra, 4 +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v33, -24(v32) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: lw t0, 0(a0) ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v33, t0 ; 
VENTUS-NEXT: lui t1, %hi(foo.b) ; VENTUS-NEXT: addi t2, t1, %lo(foo.b) -; VENTUS-NEXT: addi t1, tp, -24 +; VENTUS-NEXT: addi t1, tp, -20 ; VENTUS-NEXT: vmv.v.x v0, t1 -; VENTUS-NEXT: sw t2, 16(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw t2, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v1, t2 ; VENTUS-NEXT: vmv.v.x v2, t0 ; VENTUS-NEXT: call bar @@ -37,10 +39,10 @@ define ventus_kernel void @foo(ptr addrspace(1) noundef align 4 %out) { ; VENTUS-NEXT: vbltu v1, v0, .LBB0_2 ; VENTUS-NEXT: # %bb.1: # %if.then ; VENTUS-NEXT: vsll.vi v0, v0, 2 -; VENTUS-NEXT: addi t0, tp, -24 +; VENTUS-NEXT: addi t0, tp, -20 ; VENTUS-NEXT: vadd.vx v1, v0, t0 ; VENTUS-NEXT: vlw.v v1, 0(v1) -; VENTUS-NEXT: lw t1, 16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t1, -4(sp) # 4-byte Folded Reload ; VENTUS-NEXT: vadd.vx v2, v0, t1 ; VENTUS-NEXT: vlw12.v v2, 0(v2) ; VENTUS-NEXT: regext zero, zero, 64 @@ -58,8 +60,11 @@ define ventus_kernel void @foo(ptr addrspace(1) noundef align 4 %out) { ; VENTUS-NEXT: vadd.vv v0, v33, v0 ; VENTUS-NEXT: vsw12.v v1, 0(v0) ; VENTUS-NEXT: .LBB0_3: # %if.end -; VENTUS-NEXT: join +; VENTUS-NEXT: # Label of block must be emitted +; VENTUS-NEXT: join zero, zero, 0 ; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v33, -24(v32) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: addi tp, tp, -24 ; VENTUS-NEXT: ret @@ -241,10 +246,9 @@ define dso_local ventus_kernel void @local_memmory1(ptr addrspace(3) nocapture n ; VENTUS-LABEL: local_memmory1: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lw t0, 0(a0) -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v1, 0(v0) -; VENTUS-NEXT: vadd.vi v1, v1, 1 -; VENTUS-NEXT: vsw12.v v1, 0(v0) +; VENTUS-NEXT: lw t1, 0(t0) +; VENTUS-NEXT: addi t1, t1, 1 +; VENTUS-NEXT: sw t1, 0(t0) ; VENTUS-NEXT: ret entry: %0 = load i32, ptr addrspace(3) %b, align 4 diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll 
b/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll index 19a9281b4f5e..b30c267c9738 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll @@ -8,7 +8,7 @@ define dso_local ventus_kernel void @bitcast(float noundef %a, ptr addrspace(5) ; VENTUS-NEXT: lw t0, 8(a0) ; VENTUS-NEXT: flw t1, 0(a0) ; VENTUS-NEXT: lw t2, 4(a0) -; VENTUS-NEXT: vfmv.s.f v0, t1 +; VENTUS-NEXT: vmv.v.x v0, t1 ; VENTUS-NEXT: vmv.v.x v1, t2 ; VENTUS-NEXT: vsw.v v0, 0(v1) ; VENTUS-NEXT: fsw t1, 0(t0) diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll index c69ba586d903..26ad7ded1865 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll @@ -14,7 +14,8 @@ define i32 @foo(i32 noundef %cond, i32 noundef %a, i32 noundef %b, i32 noundef % ; VENTUS-NEXT: # %bb.1: ; VENTUS-NEXT: vrsub.vi v3, v3, 0 ; VENTUS-NEXT: .LBB0_2: # %entry -; VENTUS-NEXT: join +; VENTUS-NEXT: # Label of block must be emitted +; VENTUS-NEXT: join zero, zero, 0 ; VENTUS-NEXT: vmadd.vv v2, v1, v3 ; VENTUS-NEXT: vadd.vx v0, v2, zero ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll index f1537d70be48..28976f1c66aa 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll @@ -12,8 +12,12 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp ; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill -; VENTUS-NEXT: .cfi_offset ra, 8 -; VENTUS-NEXT: .cfi_offset v33.l, 4 +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v33, -4(v32) # 4-byte Folded Spill +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v34, -8(v32) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 0 +; VENTUS-NEXT: .cfi_offset 
v33.l, 0 ; VENTUS-NEXT: .cfi_offset v34.l, 0 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vadd.vx v33, v1, zero @@ -31,6 +35,10 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) ; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v33, -4(v32) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v34, -8(v32) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll index 9ca6df0f8cbc..99b30620e794 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll @@ -10,16 +10,16 @@ define ventus_kernel void @foo_ker(ptr addrspace(1) nocapture noundef align 4 %A ; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 0(a0) -; VENTUS-NEXT: sw t0, -8(sp) # 4-byte Folded Spill -; VENTUS-NEXT: lw t0, 4(a0) ; VENTUS-NEXT: sw t0, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: lw t0, 4(a0) +; VENTUS-NEXT: sw t0, -8(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero ; VENTUS-NEXT: call _Z13get_global_idj ; VENTUS-NEXT: vsll.vi v0, v0, 2 -; VENTUS-NEXT: lw t1, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t1, -8(sp) # 4-byte Folded Reload ; VENTUS-NEXT: vadd.vx v1, v0, t1 ; VENTUS-NEXT: vlw12.v v1, 0(v1) -; VENTUS-NEXT: lw t0, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t0, -4(sp) # 4-byte Folded Reload ; VENTUS-NEXT: vadd.vx v0, v0, t0 ; VENTUS-NEXT: vlw12.v v2, 0(v0) ; VENTUS-NEXT: vadd.vv v1, v2, v1 @@ -48,8 +48,12 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp ; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill -; 
VENTUS-NEXT: .cfi_offset ra, 8 -; VENTUS-NEXT: .cfi_offset v33.l, 4 +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v33, -4(v32) # 4-byte Folded Spill +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v34, -8(v32) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 0 +; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: .cfi_offset v34.l, 0 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vadd.vx v33, v1, zero @@ -67,6 +71,10 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) ; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v33, -4(v32) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v34, -8(v32) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll index ab2ee139f621..d44c2aee0cec 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll @@ -15,14 +15,14 @@ define dso_local ventus_kernel void @kernel_calling_convention(ptr addrspace(1) ; VENTUS-NEXT: sw ra, -16(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 4(a0) -; VENTUS-NEXT: sw t0, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw t0, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: lw t0, 8(a0) ; VENTUS-NEXT: sw t0, -8(sp) # 4-byte Folded Spill ; VENTUS-NEXT: lw t0, 0(a0) -; VENTUS-NEXT: sw t0, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw t0, -12(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero ; VENTUS-NEXT: call _Z13get_global_idj -; VENTUS-NEXT: lw s0, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -12(sp) # 4-byte Folded Reload ; VENTUS-NEXT: lw t0, 0(s0) ; VENTUS-NEXT: lw t2, -8(sp) # 
4-byte Folded Reload ; VENTUS-NEXT: lw t1, 0(t2) @@ -30,12 +30,11 @@ define dso_local ventus_kernel void @kernel_calling_convention(ptr addrspace(1) ; VENTUS-NEXT: vadd.vx v0, v0, t1 ; VENTUS-NEXT: vmv.v.x v1, s0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw t0, -12(sp) # 4-byte Folded Reload -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v1, 0(v0) -; VENTUS-NEXT: lw t0, 0(t2) -; VENTUS-NEXT: vadd.vx v1, v1, t0 -; VENTUS-NEXT: vsw12.v v1, 0(v0) +; VENTUS-NEXT: lw s0, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t0, 0(s0) +; VENTUS-NEXT: lw t2, 0(t2) +; VENTUS-NEXT: add t0, t2, t0 +; VENTUS-NEXT: sw t0, 0(s0) ; VENTUS-NEXT: lw ra, -16(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -16 ; VENTUS-NEXT: ret @@ -82,16 +81,14 @@ entry: define dso_local i32 @non_kernel_calling_convention(ptr nocapture noundef readonly %a1, ptr nocapture noundef readonly %a2, ptr nocapture noundef readonly %a3, ptr nocapture noundef readonly %a4, ptr nocapture noundef readonly %a5, ptr nocapture noundef readonly %a6, ptr nocapture noundef readonly %a7, ptr nocapture noundef readonly %a8, ptr nocapture noundef readonly %a9, ptr nocapture noundef readonly %a10, ptr nocapture noundef readonly %a11, ptr nocapture noundef readonly %a12, ptr nocapture noundef readonly %a13, ptr nocapture noundef readonly %a14, ptr nocapture noundef readonly %a15, ptr nocapture noundef readonly %a16, ptr nocapture noundef readonly %a17, ptr nocapture noundef readonly %a18, ptr nocapture noundef readonly %a19, ptr nocapture noundef readonly %a20, ptr nocapture noundef readonly %a21, ptr nocapture noundef readonly %a22, ptr nocapture noundef readonly %a23, ptr nocapture noundef readonly %a24, ptr nocapture noundef readonly %a25, ptr nocapture noundef readonly %a26, ptr nocapture noundef readonly %a27, ptr nocapture noundef readonly %a28, ptr nocapture noundef readonly %a29, ptr nocapture noundef readonly %a30, ptr nocapture noundef readonly %a31, ptr nocapture noundef readonly %a32, ptr 
addrspace(3) nocapture noundef readonly %a33, ptr addrspace(5) nocapture noundef readonly %a34) local_unnamed_addr #2 { ; VENTUS-LABEL: non_kernel_calling_convention: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: addi tp, tp, 28 -; VENTUS-NEXT: .cfi_def_cfa_offset 28 -; VENTUS-NEXT: regext zero, zero, 1 -; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: .cfi_offset v33.l, 4 -; VENTUS-NEXT: .cfi_offset v34.l, 0 +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v33, -24(v32) # 4-byte Folded Spill +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v34, -28(v32) # 4-byte Folded Spill ; VENTUS-NEXT: regext zero, zero, 9 -; VENTUS-NEXT: vlw.v v33, -24(v32) +; VENTUS-NEXT: vlw.v v33, -4(v32) ; VENTUS-NEXT: regext zero, zero, 9 -; VENTUS-NEXT: vlw.v v34, -28(v32) +; VENTUS-NEXT: vlw.v v34, -32(v32) ; VENTUS-NEXT: vlw12.v v0, 0(v0) ; VENTUS-NEXT: vlw12.v v1, 0(v1) ; VENTUS-NEXT: vlw12.v v2, 0(v2) @@ -161,7 +158,11 @@ define dso_local i32 @non_kernel_calling_convention(ptr nocapture noundef readon ; VENTUS-NEXT: vadd.vv v0, v0, v1 ; VENTUS-NEXT: vadd.vv v0, v0, v2 ; VENTUS-NEXT: vadd.vv v0, v0, v3 -; VENTUS-NEXT: addi tp, tp, -28 +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v33, -24(v32) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v34, -28(v32) # 4-byte Folded Reload +; VENTUS-NEXT: addi tp, tp, -16 ; VENTUS-NEXT: ret entry: %0 = load i32, ptr %a1, align 4 @@ -275,18 +276,18 @@ define dso_local i32 @test_add(ptr nocapture noundef readonly %a, ptr nocapture ; VENTUS-NEXT: vlw12.v v0, 0(v0) ; VENTUS-NEXT: vadd.vi v0, v0, 1 ; VENTUS-NEXT: regext zero, zero, 8 -; VENTUS-NEXT: vsw.v v0, -8(v32) +; VENTUS-NEXT: vsw.v v0, -4(v32) ; VENTUS-NEXT: vlw12.v v0, 0(v1) ; VENTUS-NEXT: vadd.vi v0, v0, 2 ; VENTUS-NEXT: regext zero, zero, 8 -; VENTUS-NEXT: vsw.v v0, -4(v32) -; VENTUS-NEXT: addi t0, tp, -8 -; VENTUS-NEXT: addi t1, tp, -4 +; VENTUS-NEXT: vsw.v v0, -8(v32) +; VENTUS-NEXT: addi t0, tp, -4 +; VENTUS-NEXT: addi t1, 
tp, -8 ; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: vmv.v.x v1, t1 ; VENTUS-NEXT: call add ; VENTUS-NEXT: regext zero, zero, 8 -; VENTUS-NEXT: vlw.v v1, -8(v32) +; VENTUS-NEXT: vlw.v v1, -4(v32) ; VENTUS-NEXT: vadd.vv v0, v1, v0 ; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/float-arith-zfinx.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/float-arith-zfinx.ll index 7dec2d48c627..f4d870408ea8 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/float-arith-zfinx.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/float-arith-zfinx.ll @@ -6,11 +6,11 @@ define dso_local ventus_kernel void @fadd(float noundef %c, float noundef %d, ptr addrspace(1) nocapture noundef writeonly align 4 %result) { ; VENTUS-LABEL: fadd: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: lw t0, 4(a0) -; VENTUS-NEXT: lw t1, 0(a0) +; VENTUS-NEXT: flw t0, 4(a0) +; VENTUS-NEXT: flw t1, 0(a0) ; VENTUS-NEXT: fadd.s t0, t1, t0 ; VENTUS-NEXT: lw t1, 8(a0) -; VENTUS-NEXT: sw t0, 0(t1) +; VENTUS-NEXT: fsw t0, 0(t1) ; VENTUS-NEXT: ret entry: %add1 = fadd float %c, %d @@ -21,11 +21,11 @@ entry: define dso_local ventus_kernel void @fsub(float noundef %c, float noundef %d, ptr addrspace(1) nocapture noundef writeonly align 4 %result) { ; VENTUS-LABEL: fsub: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: lw t0, 4(a0) -; VENTUS-NEXT: lw t1, 0(a0) +; VENTUS-NEXT: flw t0, 4(a0) +; VENTUS-NEXT: flw t1, 0(a0) ; VENTUS-NEXT: fsub.s t0, t1, t0 ; VENTUS-NEXT: lw t1, 8(a0) -; VENTUS-NEXT: sw t0, 0(t1) +; VENTUS-NEXT: fsw t0, 0(t1) ; VENTUS-NEXT: ret entry: %sub = fsub float %c, %d @@ -36,11 +36,11 @@ entry: define dso_local ventus_kernel void @fmul(float noundef %c, float noundef %d, ptr addrspace(1) nocapture noundef writeonly align 4 %result) { ; VENTUS-LABEL: fmul: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: lw t0, 4(a0) -; VENTUS-NEXT: lw t1, 0(a0) +; VENTUS-NEXT: flw t0, 4(a0) +; VENTUS-NEXT: flw t1, 0(a0) ; VENTUS-NEXT: fmul.s t0, t1, t0 ; VENTUS-NEXT: 
lw t1, 8(a0) -; VENTUS-NEXT: sw t0, 0(t1) +; VENTUS-NEXT: fsw t0, 0(t1) ; VENTUS-NEXT: ret entry: %mul = fmul float %c, %d @@ -51,11 +51,11 @@ entry: define dso_local ventus_kernel void @fdiv(float noundef %c, float noundef %d, ptr addrspace(1) nocapture noundef writeonly align 4 %result) { ; VENTUS-LABEL: fdiv: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: lw t0, 4(a0) -; VENTUS-NEXT: lw t1, 0(a0) +; VENTUS-NEXT: flw t0, 4(a0) +; VENTUS-NEXT: flw t1, 0(a0) ; VENTUS-NEXT: fdiv.s t0, t1, t0 ; VENTUS-NEXT: lw t1, 8(a0) -; VENTUS-NEXT: sw t0, 0(t1) +; VENTUS-NEXT: fsw t0, 0(t1) ; VENTUS-NEXT: ret entry: %div = fdiv float %c, %d @@ -66,12 +66,12 @@ entry: define dso_local ventus_kernel void @fmadd(float noundef %a, float noundef %b, float noundef %c, ptr addrspace(1) nocapture noundef writeonly align 4 %result) { ; VENTUS-LABEL: fmadd: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: lw t0, 8(a0) -; VENTUS-NEXT: lw t1, 4(a0) -; VENTUS-NEXT: lw t2, 0(a0) +; VENTUS-NEXT: flw t0, 8(a0) +; VENTUS-NEXT: flw t1, 4(a0) +; VENTUS-NEXT: flw t2, 0(a0) ; VENTUS-NEXT: fmadd.s t0, t2, t1, t0 ; VENTUS-NEXT: lw t1, 12(a0) -; VENTUS-NEXT: sw t0, 0(t1) +; VENTUS-NEXT: fsw t0, 0(t1) ; VENTUS-NEXT: ret entry: %div = call float @llvm.fma.f32(float %a, float %b, float %c) diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll index b93930235c91..71ce11fce029 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll @@ -16,7 +16,7 @@ define float @fadd_f(float noundef %a) { ; VENTUS-LABEL: fadd_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(global_val) -; VENTUS-NEXT: lw t0, %lo(global_val)(t0) +; VENTUS-NEXT: flw t0, %lo(global_val)(t0) ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfadd.vv v0, v0, v1 ; VENTUS-NEXT: ret @@ -40,7 +40,7 @@ define float @fsub_f(float noundef %a) { ; VENTUS-LABEL: fsub_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(global_val) -; VENTUS-NEXT: lw 
t0, %lo(global_val)(t0) +; VENTUS-NEXT: flw t0, %lo(global_val)(t0) ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfsub.vv v0, v0, v1 ; VENTUS-NEXT: ret @@ -64,7 +64,7 @@ define float @fmul_f(float noundef %a) { ; VENTUS-LABEL: fmul_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(global_val) -; VENTUS-NEXT: lw t0, %lo(global_val)(t0) +; VENTUS-NEXT: flw t0, %lo(global_val)(t0) ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfmul.vv v0, v0, v1 ; VENTUS-NEXT: ret @@ -88,7 +88,7 @@ define float @fdiv_f(float noundef %a, float noundef %b) { ; VENTUS-LABEL: fdiv_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(global_val) -; VENTUS-NEXT: lw t0, %lo(global_val)(t0) +; VENTUS-NEXT: flw t0, %lo(global_val)(t0) ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfdiv.vv v0, v0, v1 ; VENTUS-NEXT: ret @@ -102,7 +102,7 @@ define float @foo_constant(float noundef %a) { ; VENTUS-LABEL: foo_constant: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(.LCPI8_0) -; VENTUS-NEXT: lw t0, %lo(.LCPI8_0)(t0) +; VENTUS-NEXT: flw t0, %lo(.LCPI8_0)(t0) ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfmul.vv v0, v0, v1 ; VENTUS-NEXT: ret @@ -193,7 +193,7 @@ define dso_local float @fgt(float noundef %a) { ; VENTUS-LABEL: fgt: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(.LCPI14_0) -; VENTUS-NEXT: lw t0, %lo(.LCPI14_0)(t0) +; VENTUS-NEXT: flw t0, %lo(.LCPI14_0)(t0) ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vmflt.vv v0, v1, v0 ; VENTUS-NEXT: vsll.vi v0, v0, 2 @@ -217,7 +217,7 @@ define dso_local float @fge(float noundef %a) { ; VENTUS-LABEL: fge: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(.LCPI15_0) -; VENTUS-NEXT: lw t0, %lo(.LCPI15_0)(t0) +; VENTUS-NEXT: flw t0, %lo(.LCPI15_0)(t0) ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vmfle.vv v0, v1, v0 ; VENTUS-NEXT: vsll.vi v0, v0, 2 @@ -332,7 +332,7 @@ define dso_local float @fmadd_f(float noundef %a, float noundef %b, float nounde ; VENTUS-LABEL: fmadd_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, 
%hi(.LCPI24_0) -; VENTUS-NEXT: lw t0, %lo(.LCPI24_0)(t0) +; VENTUS-NEXT: flw t0, %lo(.LCPI24_0)(t0) ; VENTUS-NEXT: vadd.vx v0, v1, zero ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfmadd.vv v0, v1, v2 @@ -360,7 +360,7 @@ define dso_local float @fnmadd_f(float noundef %a, float noundef %b, float nound ; VENTUS-LABEL: fnmadd_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(.LCPI26_0) -; VENTUS-NEXT: lw t0, %lo(.LCPI26_0)(t0) +; VENTUS-NEXT: flw t0, %lo(.LCPI26_0)(t0) ; VENTUS-NEXT: vadd.vx v0, v1, zero ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfmsub.vv v0, v1, v2 @@ -388,7 +388,7 @@ define dso_local float @fmsub_f(float noundef %a, float noundef %b) local_unname ; VENTUS-LABEL: fmsub_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(.LCPI28_0) -; VENTUS-NEXT: lw t0, %lo(.LCPI28_0)(t0) +; VENTUS-NEXT: flw t0, %lo(.LCPI28_0)(t0) ; VENTUS-NEXT: vmv.v.x v2, t0 ; VENTUS-NEXT: vfmsub.vv v0, v2, v1 ; VENTUS-NEXT: ret @@ -415,7 +415,7 @@ define dso_local float @fnmsub_f(float noundef %a, float noundef %b, float nound ; VENTUS-LABEL: fnmsub_f: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lui t0, %hi(.LCPI30_0) -; VENTUS-NEXT: lw t0, %lo(.LCPI30_0)(t0) +; VENTUS-NEXT: flw t0, %lo(.LCPI30_0)(t0) ; VENTUS-NEXT: vadd.vx v0, v1, zero ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vfmadd.vv v0, v1, v2 diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll index 6cc3e1a9a89d..f461cb75e268 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll @@ -14,14 +14,11 @@ define dso_local ventus_kernel void @usage(ptr addrspace(1) nocapture noundef al ; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: lw t0, 4(a0) ; VENTUS-NEXT: lw t1, 0(a0) -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v0, 0(v0) -; VENTUS-NEXT: lw t0, 0(t1) -; VENTUS-NEXT: vadd.vx v0, v0, t0 -; VENTUS-NEXT: vmv.v.x v1, t1 -; VENTUS-NEXT: vsw12.v 
v0, 0(v1) +; VENTUS-NEXT: lw t0, 0(t0) +; VENTUS-NEXT: lw t2, 0(t1) +; VENTUS-NEXT: add t0, t2, t0 +; VENTUS-NEXT: sw t0, 0(t1) ; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload -; VENTUS-NEXT: barrier x0, x0, 1 ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll index 8520cc25bf11..95d5ef8ba0fd 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll @@ -13,23 +13,23 @@ target triple = "riscv32" define dso_local i32 @printf(ptr addrspace(2) noundef %fmt, ...) { ; VENTUS-LABEL: printf: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: addi tp, tp, 64 -; VENTUS-NEXT: .cfi_def_cfa_offset 64 +; VENTUS-NEXT: addi tp, tp, 48 +; VENTUS-NEXT: .cfi_def_cfa_offset 48 ; VENTUS-NEXT: vmv.v.x v8, tp -; VENTUS-NEXT: vsw.v v7, -60(v8) -; VENTUS-NEXT: vsw.v v6, -56(v8) -; VENTUS-NEXT: vsw.v v5, -52(v8) -; VENTUS-NEXT: vsw.v v4, -48(v8) -; VENTUS-NEXT: vsw.v v3, -44(v8) -; VENTUS-NEXT: vsw.v v2, -40(v8) -; VENTUS-NEXT: vsw.v v1, -36(v8) -; VENTUS-NEXT: addi t0, tp, -36 +; VENTUS-NEXT: vsw.v v7, -4(v8) +; VENTUS-NEXT: vsw.v v6, -16(v8) +; VENTUS-NEXT: vsw.v v5, -16(v8) +; VENTUS-NEXT: vsw.v v4, -32(v8) +; VENTUS-NEXT: vsw.v v3, -36(v8) +; VENTUS-NEXT: vsw.v v2, -48(v8) +; VENTUS-NEXT: vsw.v v1, -48(v8) +; VENTUS-NEXT: addi t0, tp, -60 ; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vsw.v v0, -36(v8) -; VENTUS-NEXT: addi t0, tp, -32 +; VENTUS-NEXT: vsw.v v0, -64(v8) +; VENTUS-NEXT: addi t0, tp, -56 ; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vsw.v v0, -36(v8) -; VENTUS-NEXT: addi tp, tp, -64 +; VENTUS-NEXT: vsw.v v0, -64(v8) +; VENTUS-NEXT: addi tp, tp, -48 ; VENTUS-NEXT: ret entry: %retval = alloca i32, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll index 06d3e9d28165..9bf22089f01a 100644 --- 
a/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll @@ -88,7 +88,7 @@ define dso_local ventus_kernel void @loop_branch(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: # =>This Inner Loop Header: Depth=1 ; VENTUS-NEXT: vlw12.v v4, 0(v3) ; VENTUS-NEXT: vadd.vv v2, v2, v4 -; VENTUS-NEXT: vadd.vi v0, v0, -1 +; VENTUS-NEXT: vsub12.vi v0, v0, 1 ; VENTUS-NEXT: vsw12.v v2, 0(v1) ; VENTUS-NEXT: .Lpcrel_hi3: ; VENTUS-NEXT: auipc t1, %pcrel_hi(.LBB1_3) @@ -135,7 +135,9 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp ; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill -; VENTUS-NEXT: .cfi_offset ra, 4 +; VENTUS-NEXT: regext zero, zero, 72 +; VENTUS-NEXT: vsw.v v33, -4(v32) # 4-byte Folded Spill +; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: vmv.v.x v0, zero ; VENTUS-NEXT: call _Z13get_global_idj @@ -169,6 +171,7 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: vblt v0, v33, .LBB2_5 ; VENTUS-NEXT: # %bb.3: # %if.then2 ; VENTUS-NEXT: li t0, 23 +; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: j .LBB2_6 ; VENTUS-NEXT: .LBB2_4: # %if.end7 ; VENTUS-NEXT: li t0, 4 @@ -177,14 +180,16 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: j .LBB2_7 ; VENTUS-NEXT: .LBB2_5: ; VENTUS-NEXT: li t0, 12 +; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: .LBB2_6: # %cleanup9 ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: .LBB2_7: # %cleanup9 ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 ; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: regext zero, zero, 9 +; VENTUS-NEXT: vlw.v v33, -4(v32) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -4 ; VENTUS-NEXT: 
ret @@ -249,7 +254,7 @@ define dso_local ventus_kernel void @double_loop(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: # => This Inner Loop Header: Depth=2 ; VENTUS-NEXT: vlw12.v v5, 0(v3) ; VENTUS-NEXT: vadd.vv v2, v2, v5 -; VENTUS-NEXT: vadd.vi v4, v4, -1 +; VENTUS-NEXT: vsub12.vi v4, v4, 1 ; VENTUS-NEXT: vsw12.v v2, 0(v1) ; VENTUS-NEXT: .Lpcrel_hi8: ; VENTUS-NEXT: auipc t1, %pcrel_hi(.LBB3_4)