diff --git a/libclc/generic/include/clc/clc.h b/libclc/generic/include/clc/clc.h index 0210aea50b18..5c0be955e0f4 100644 --- a/libclc/generic/include/clc/clc.h +++ b/libclc/generic/include/clc/clc.h @@ -142,6 +142,7 @@ #include #include #include +#include #include #include #include diff --git a/libclc/generic/include/clc/integer/ctz.h b/libclc/generic/include/clc/integer/ctz.h new file mode 100644 index 000000000000..948a9c5d51a3 --- /dev/null +++ b/libclc/generic/include/clc/integer/ctz.h @@ -0,0 +1,2 @@ +#define __CLC_BODY +#include \ No newline at end of file diff --git a/libclc/generic/include/clc/integer/ctz.inc b/libclc/generic/include/clc/integer/ctz.inc new file mode 100644 index 000000000000..585d149882af --- /dev/null +++ b/libclc/generic/include/clc/integer/ctz.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE ctz(__CLC_GENTYPE x); \ No newline at end of file diff --git a/libclc/generic/include/clc/integer/mad_hi.h b/libclc/generic/include/clc/integer/mad_hi.h index 863ce92d9f2d..c6f1181ecc67 100644 --- a/libclc/generic/include/clc/integer/mad_hi.h +++ b/libclc/generic/include/clc/integer/mad_hi.h @@ -1 +1,3 @@ -#define mad_hi(a, b, c) (mul_hi((a),(b))+(c)) +#define __CLC_BODY +#include +#undef __CLC_BODY diff --git a/libclc/generic/include/clc/integer/mad_hi.inc b/libclc/generic/include/clc/integer/mad_hi.inc new file mode 100644 index 000000000000..0443f1f30157 --- /dev/null +++ b/libclc/generic/include/clc/integer/mad_hi.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad_hi(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z); diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index ee2736b5fbc5..32a314f2f99b 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -67,9 +67,11 @@ integer/abs.cl integer/abs_diff.cl integer/add_sat.cl integer/clz.cl +integer/ctz.cl integer/hadd.cl integer/mad24.cl integer/mad_sat.cl +integer/mad_hi.cl integer/mul24.cl integer/mul_hi.cl integer/popcount.cl diff --git a/libclc/generic/lib/integer/ctz.cl b/libclc/generic/lib/integer/ctz.cl new file mode 100644 index 000000000000..2daa0c27d666 --- /dev/null +++ b/libclc/generic/lib/integer/ctz.cl @@ -0,0 +1,42 @@ +#include +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF char ctz(char x) { + return x ? ctz((ushort)(uchar)x) : 8; +} +_CLC_OVERLOAD _CLC_DEF uchar ctz(uchar x) { + return x ? ctz((ushort)x) : 8; +} + +_CLC_OVERLOAD _CLC_DEF short ctz(short x) { + return x ? __builtin_ctzs(x) : 16; +} + +_CLC_OVERLOAD _CLC_DEF ushort ctz(ushort x) { + return x ? __builtin_ctzs(x) : 16; +} + +_CLC_OVERLOAD _CLC_DEF int ctz(int x) { + return x ? __builtin_ctz(x) : 32; +} + +_CLC_OVERLOAD _CLC_DEF uint ctz(uint x) { + return x ? __builtin_ctz(x) : 32; +} + +_CLC_OVERLOAD _CLC_DEF long ctz(long x) { + return x ? __builtin_ctzl(x) : 64; +} + +_CLC_OVERLOAD _CLC_DEF ulong ctz(ulong x) { + return x ? __builtin_ctzl(x) : 64; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, ctz, char) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, ctz, uchar) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, ctz, short) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, ctz, ushort) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ctz, int) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, ctz, uint) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, ctz, long) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, ctz, ulong) diff --git a/libclc/generic/lib/integer/mad_hi.cl b/libclc/generic/lib/integer/mad_hi.cl new file mode 100644 index 000000000000..475635fda683 --- /dev/null +++ b/libclc/generic/lib/integer/mad_hi.cl @@ -0,0 +1,44 @@ +#include +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF char mad_hi(char x, char y, char z) { + return mul_hi(x, y) + z; +} + +_CLC_OVERLOAD _CLC_DEF uchar mad_hi(uchar x, uchar y, uchar z) { + return mul_hi(x, y) + z; +} + +_CLC_OVERLOAD _CLC_DEF short mad_hi(short x, short y, short z) { + return mul_hi(x, y) + z; +} + +_CLC_OVERLOAD _CLC_DEF ushort mad_hi(ushort x, ushort y, ushort z) { + return mul_hi(x, y) + z; +} + +_CLC_OVERLOAD _CLC_DEF int mad_hi(int x, int y, int z) { + return mul_hi(x, y) + z; +} + +_CLC_OVERLOAD _CLC_DEF uint mad_hi(uint x, uint y, uint z) { + return mul_hi(x, y) + z; +} + +_CLC_OVERLOAD _CLC_DEF long mad_hi(long x, long y, long z) { + return mul_hi(x, y) + z; +} + +_CLC_OVERLOAD _CLC_DEF ulong mad_hi(ulong x, ulong y, ulong z) { + return mul_hi(x, y) + z; +} + + +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, mad_hi, char, char, char) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, mad_hi, uchar, uchar, uchar) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, mad_hi, short, short, short) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, mad_hi, ushort, ushort, ushort) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, mad_hi, int, int, int) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, mad_hi, uint, uint, uint) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, mad_hi, long, long, long) +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, mad_hi, ulong, ulong, ulong) diff --git a/libclc/generic/lib/shared/clamp.inc b/libclc/generic/lib/shared/clamp.inc index c918f9c499e7..897283ac3c90 100644 --- a/libclc/generic/lib/shared/clamp.inc +++ b/libclc/generic/lib/shared/clamp.inc @@ -1,9 +1,9 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) { - return (x > z ? z : (x < y ? y : x)); + return (x >= z ? z : (x <= y ? y : x)); } #ifndef __CLC_SCALAR _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_SCALAR_GENTYPE y, __CLC_SCALAR_GENTYPE z) { - return (x > (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z : (x < (__CLC_GENTYPE)y ? (__CLC_GENTYPE)y : x)); + return (x >= (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z : (x <= (__CLC_GENTYPE)y ? (__CLC_GENTYPE)y : x)); } #endif diff --git a/libclc/riscv32/lib/compiler-rt/floatdidf.cl b/libclc/riscv32/lib/compiler-rt/floatdidf.cl index f77cf99ffc23..1631d4135765 100644 --- a/libclc/riscv32/lib/compiler-rt/floatdidf.cl +++ b/libclc/riscv32/lib/compiler-rt/floatdidf.cl @@ -22,32 +22,32 @@ double __floatdidf(di_int a) { const unsigned N = sizeof(di_int) * 8; const di_int s = a >> (N - 1); a = (a ^ s) - s; - int sd = N - __builtin_clzll(a); + int sd = N - clzl(a); int e = sd - 1; - if (sd > 53) { + if (sd > DBL_MANT_DIG) { switch (sd) { - case 53 + 1: + case DBL_MANT_DIG + 1: a <<= 1; break; - case 53 + 2: + case DBL_MANT_DIG + 2: break; default: - a = ((du_int)a >> (sd - (53 + 2))) | - ((a & ((du_int)(-1) >> ((N + 53 + 2) - sd))) != 0); + a = ((du_int)a >> (sd - (DBL_MANT_DIG + 2))) | + ((a & ((du_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); }; a |= (a & 4) != 0; ++a; a >>= 2; - if (a & ((du_int)1 << 53)) { + if (a & ((du_int)1 << DBL_MANT_DIG)) { a >>= 1; ++e; } } else { - a <<= (53 - sd); + a <<= (DBL_MANT_DIG - sd); } double_bits fb; fb.u.s.high = ((su_int)s & 0x80000000) | ((su_int)(e + 1023) << 20) | diff --git a/libclc/riscv32/lib/compiler-rt/floatdisf.cl b/libclc/riscv32/lib/compiler-rt/floatdisf.cl index 8cf423efedc9..c4c5848a3b9a 100644 --- a/libclc/riscv32/lib/compiler-rt/floatdisf.cl +++ b/libclc/riscv32/lib/compiler-rt/floatdisf.cl @@ -22,32 +22,32 @@ float __floatdisf(di_int a) { const unsigned N = sizeof(di_int) * 8; const di_int s = a >> (N - 1); a = (a ^ s) - s; - int sd = N - __builtin_clzll(a); + int sd = N - clz64(a); si_int e = sd - 1; - if (sd > 24) { + if (sd > FLT_MANT_DIG) { switch (sd) { - case 24 + 1: + case FLT_MANT_DIG + 1: a <<= 1; break; - case 24 + 2: + case FLT_MANT_DIG + 2: break; default: - a = ((du_int)a >> (sd - (24 + 2))) | - ((a & ((du_int)(-1) >> ((N + 24 + 2) - sd))) != 0); + a = ((du_int)a >> (sd - (FLT_MANT_DIG + 2))) | + ((a & ((du_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); }; a |= (a & 4) != 0; ++a; a >>= 2; - if (a & ((du_int)1 << 24)) { + if (a & ((du_int)1 << FLT_MANT_DIG)) { a >>= 1; ++e; } } else { - a <<= (24 - sd); + a <<= (FLT_MANT_DIG - sd); } float_bits fb; fb.u = diff --git a/libclc/riscv32/lib/compiler-rt/floatundidf.cl b/libclc/riscv32/lib/compiler-rt/floatundidf.cl index 3f713b823c7b..051cebdd936e 100644 --- a/libclc/riscv32/lib/compiler-rt/floatundidf.cl +++ b/libclc/riscv32/lib/compiler-rt/floatundidf.cl @@ -22,30 +22,30 @@ double __floatundidf(du_int a) { const unsigned N = sizeof(du_int) * 8; int sd = N - __builtin_clzll(a); int e = sd - 1; - if (sd > 53) { + if (sd > DBL_MANT_DIG) { switch (sd) { - case 53 + 1: + case DBL_MANT_DIG + 1: a <<= 1; break; - case 53 + 2: + case DBL_MANT_DIG + 2: break; default: - a = (a >> (sd - (53 + 2))) | - ((a & ((du_int)(-1) >> ((N + 53 + 2) - sd))) != 0); + a = (a >> (sd - (DBL_MANT_DIG + 2))) | + ((a & ((du_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); }; a |= (a & 4) != 0; ++a; a >>= 2; - if (a & ((du_int)1 << 53)) { + if (a & ((du_int)1 << DBL_MANT_DIG)) { a >>= 1; ++e; } } else { - a <<= (53 - sd); + a <<= (DBL_MANT_DIG - sd); } double_bits fb; fb.u.s.high = ((su_int)(e + 1023) << 20) | ((su_int)(a >> 32) & 0x000FFFFF); diff --git a/libclc/riscv32/lib/compiler-rt/floatundisf.cl b/libclc/riscv32/lib/compiler-rt/floatundisf.cl index f957637383d9..e72295471c52 100644 --- a/libclc/riscv32/lib/compiler-rt/floatundisf.cl +++ b/libclc/riscv32/lib/compiler-rt/floatundisf.cl @@ -17,18 +17,19 @@ // seee eeee emmm mmmm mmmm mmmm mmmm mmmm -typedef union { - int u; - float f; -} float_bits; +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#include "types.h" float __floatundisf(long a) { if (a == 0) return 0.0F; const unsigned N = sizeof(long) * 8; - int sd = N - __builtin_clzll(a); // number of significant digits + int sd = N - clz64(a); // number of significant digits int e = sd - 1; // 8 exponent - if (sd > 24) { + if (sd > FLT_MANT_DIG) { // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR // 12345678901234567890123456 @@ -37,27 +38,27 @@ float __floatundisf(long a) { // Q = bit FLT_MANT_DIG bits to the right of 1 // R = "or" of all bits to the right of Q switch (sd) { - case 24 + 1: + case FLT_MANT_DIG + 1: a <<= 1; break; - case 24 + 2: + case FLT_MANT_DIG + 2: break; default: - a = (a >> (sd - (24 + 2))) | - ((a & ((long)(-1) >> ((N + 24+ 2) - sd))) != 0); + a = (a >> (sd - (FLT_MANT_DIG + 2))) | + ((a & ((long)(-1) >> ((N + FLT_MANT_DIG+ 2) - sd))) != 0); }; // finish: a |= (a & 4) != 0; // Or P into R ++a; // round - this step may add a significant bit a >>= 2; // dump Q and R // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((long)1 << 24)) { + if (a & ((long)1 << FLT_MANT_DIG)) { a >>= 1; ++e; } // a is now rounded to FLT_MANT_DIG bits } else { - a <<= (24 - sd); + a <<= (FLT_MANT_DIG - sd); // a is now rounded to FLT_MANT_DIG bits } float_bits fb; @@ -65,3 +66,5 @@ float __floatundisf(long a) { ((unsigned)a & 0x007FFFFF); // mantissa return fb.f; } + +#endif diff --git a/libclc/riscv32/lib/compiler-rt/types.h b/libclc/riscv32/lib/compiler-rt/types.h index d3734eb27a93..9c2dc1994361 100644 --- a/libclc/riscv32/lib/compiler-rt/types.h +++ b/libclc/riscv32/lib/compiler-rt/types.h @@ -1,6 +1,8 @@ #ifndef TYPES_H #define TYPES_H +#include + typedef char char2 __attribute__((__ext_vector_type__(2))); typedef char char3 __attribute__((__ext_vector_type__(3))); typedef char char4 __attribute__((__ext_vector_type__(4))); @@ -525,4 +527,43 @@ do { \ (d) = __u.value; \ } while (0) + +static int __attribute__((noinline)) clzl(unsigned long x) +//static int inline clzl(unsigned long x) +{ + for (int i = 0; i != 64; ++i) + if ((x >> (63 - i)) & 1) + return i; + + return 0; +} + +static int ctz64(unsigned long x) +{ + int r = 63; + + x &= ~x + 1; + if (x & 0x00000000FFFFFFFF) r -= 32; + if (x & 0x0000FFFF0000FFFF) r -= 16; + if (x & 0x00FF00FF00FF00FF) r -= 8; + if (x & 0x0F0F0F0F0F0F0F0F) r -= 4; + if (x & 0x3333333333333333) r -= 2; + if (x & 0x5555555555555555) r -= 1; + + return r; +} + +static int clz64(unsigned long x) { + int r = 0; + + if ((x & 0xFFFFFFFF00000000) == 0) r += 32, x <<= 32; + if ((x & 0xFFFF000000000000) == 0) r += 16, x <<= 16; + if ((x & 0xFF00000000000000) == 0) r += 8, x <<= 8; + if ((x & 0xF000000000000000) == 0) r += 4, x <<= 4; + if ((x & 0xC000000000000000) == 0) r += 2, x <<= 2; + if ((x & 0x8000000000000000) == 0) r += 1, x <<= 1; + + return r; +} + #endif // TYPES_H diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4a9e56e6d596..8cccbea60444 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2650,15 +2650,6 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case RISCV::PseudoLD: emitLoadStoreSymbol(Inst, RISCV::LD, IDLoc, Out, /*HasTmpReg=*/false); return false; - case RISCV::PseudoFLH: - emitLoadStoreSymbol(Inst, RISCV::FLH, IDLoc, Out, /*HasTmpReg=*/true); - return false; - case RISCV::PseudoFLW: - emitLoadStoreSymbol(Inst, RISCV::FLW, IDLoc, Out, /*HasTmpReg=*/true); - return false; - case RISCV::PseudoFLD: - emitLoadStoreSymbol(Inst, RISCV::FLD, IDLoc, Out, /*HasTmpReg=*/true); - return false; case RISCV::PseudoSB: emitLoadStoreSymbol(Inst, RISCV::SB, IDLoc, Out, /*HasTmpReg=*/true); return false; @@ -2674,12 +2665,6 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case RISCV::PseudoFSH: emitLoadStoreSymbol(Inst, RISCV::FSH, IDLoc, Out, /*HasTmpReg=*/true); return false; - case RISCV::PseudoFSW: - emitLoadStoreSymbol(Inst, RISCV::FSW, IDLoc, Out, /*HasTmpReg=*/true); - return false; - case RISCV::PseudoFSD: - emitLoadStoreSymbol(Inst, RISCV::FSD, IDLoc, Out, /*HasTmpReg=*/true); - return false; case RISCV::PseudoAddTPRel: if (checkPseudoAddTPRel(Inst, Operands)) return true; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 8a7680f3dd7f..f80853d82f57 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -99,6 +99,9 @@ enum { // Check if this instruction meets the format of RVInstVV IsVVALUInstrShift = UsesMaskPolicyShift + 1, IsVVALUInstrMask = 1 << IsVVALUInstrShift, + + IsVOPIMM11Shift = IsVVALUInstrShift + 1, + IsVOPIMM11Mask = 1 << IsVOPIMM11Shift, }; // Match with the definitions in RISCVInstrFormats.td @@ -142,6 +145,10 @@ static inline bool isVVALUInstr(uint64_t TSFlags) { return TSFlags & IsVVALUInstrMask; } +static inline bool isVOPIMM11(uint64_t TSFlags) { + return TSFlags & IsVOPIMM11Mask; +} + /// \returns true if tail agnostic is enforced for the instruction. static inline bool doesForceTailAgnostic(uint64_t TSFlags) { return TSFlags & ForceTailAgnosticMask; diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 88d09ea19dc5..101569d01438 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -117,7 +117,8 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) { LowerHWASAN_CHECK_MEMACCESS(*MI); return; } - + if (MI->getOpcode() == RISCV::PseudoVXOR_VI_IMM11) + return; if (!lowerRISCVMachineInstrToMCInst(MI, TmpInst, *this)) EmitToStreamer(*OutStreamer, TmpInst); } @@ -197,8 +198,8 @@ bool RISCVAsmPrinter::runOnMachineFunction(MachineFunction &MF) { NewSTI.setFeatureBits(MF.getSubtarget().getFeatureBits()); MCSTI = &NewSTI; STI = &MF.getSubtarget(); - auto *CurrentProgramInfo = const_cast( - STI->getVentusProgramInfo()); + auto *CurrentProgramInfo = + const_cast(STI->getVentusProgramInfo()); if (MF.getInfo()->isEntryFunction()) { MCSectionELF *ResourceSection = OutContext.getELFSection( ".ventus.resource", ELF::SHT_PROGBITS, ELF::SHF_WRITE); diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index a5adbcd20f42..7c8d14a34004 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -53,6 +53,8 @@ private: bool expandCompareSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandVIIMM11(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); }; char RISCVExpandPseudo::ID = 0; @@ -85,6 +87,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, // RISCVInstrInfo::getInstSizeInBytes expects that the total size of the // expanded instructions for each pseudo is correct in the Size field of the // tablegen definition for the pseudo. + if (RISCVII::isVOPIMM11(MBBI->getDesc().TSFlags)) + return expandVIIMM11(MBB, MBBI); + switch (MBBI->getOpcode()) { case RISCV::PseudoCCMOVGPR: return expandCCOp(MBB, MBBI, NextMBBI); @@ -100,6 +105,101 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, return false; } +bool RISCVExpandPseudo::expandVIIMM11(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + const TargetRegisterInfo *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + const MCInstrDesc *MCID = nullptr; + + switch (MBBI->getOpcode()) { + default: + llvm_unreachable("Please add IMM11 Pseudo case here!"); + case RISCV::PseudoVOR_VI_IMM11: + MCID = &TII->get(RISCV::VOR_VI); + break; + case RISCV::PseudoVXOR_VI_IMM11: + MCID = &TII->get(RISCV::VXOR_VI); + break; + case RISCV::PseudoVRSUB_VI_IMM11: + MCID = &TII->get(RISCV::VRSUB_VI); + break; + case RISCV::PseudoVAND_VI_IMM11: + MCID = &TII->get(RISCV::VAND_VI); + break; + case RISCV::PseudoVMSNE_VI_IMM11: + MCID = &TII->get(RISCV::VMSNE_VI); + break; + case RISCV::PseudoVMSEQ_VI_IMM11: + MCID = &TII->get(RISCV::VMSEQ_VI); + break; + } + + assert(MCID && "Unexpected opcode"); + MBBI->setDesc(*MCID); + + int64_t Imm = 0; + signed LowImm = 0; + signed HighImm = 0; + signed Offsets = 0; + signed TmpImm = 0; + + for (unsigned i = 0; i < MBBI->getNumOperands(); ++i) { + MachineOperand &Op = MBBI->getOperand(i); + + if (Op.isImm()) { + Imm = Op.getImm(); + assert((Imm <= 1023 && Imm >= -1024) && "imm not in Imm11 range!"); + + if (Imm >= 0) { + Imm &= 0b01111111111; + } + else { + Imm = -Imm; + Imm = ~Imm + 1; + Imm &= 0b01111111111; + Imm |= 0b10000000000; + } + + LowImm = Imm & 0b00000011111; + TmpImm = ~LowImm + 1; + TmpImm &= 0b01111; + LowImm = (LowImm & 0b10000) ? (TmpImm ? -TmpImm : -15) : LowImm; + + HighImm = (Imm & 0b11111100000) >> 5; + TmpImm = ~HighImm + 1; + TmpImm &= 0b011111; + HighImm = (HighImm & 0b100000) ? (TmpImm ? -TmpImm : -31) : HighImm; + + Op.ChangeToImmediate(LowImm); + + continue; + } + + if (!Op.isReg() || MBBI->getDesc().getOperandConstraint(i, MCOI::TIED_TO) != -1) + continue; + + // deal with register numbers larger than 32. + if (Op.isReg() && + MBBI->getDesc().getOperandConstraint(i, MCOI::TIED_TO) == -1) { + uint16_t RegEncodingValue = TRI->getEncodingValue(Op.getReg()); + if (RegEncodingValue > 31) { + int Pos = MBBI->getDesc().getOperandConstraint(i, MCOI::CUSTOM); + assert(Pos != -1 && "Out of range[0, 31] register operand custom " + "constraint that must be present."); + assert(Pos != 1 && Pos != 3 && "Unexpected Pos!"); + Offsets |= (RegEncodingValue >> 5 & 0x7) << (3 * (Pos == 2 ? 1 : Pos)); + } + } + } + + DebugLoc DL = MBBI->getDebugLoc(); + // Create instruction to expand imm5 or register basic offset as imm * 32. + BuildMI(MBB, MBBI, DL, TII->get(RISCV::REGEXTI), RISCV::X0) + .addReg(RISCV::X0) + .addImm((HighImm << 6) | Offsets); + + return true; +} + bool RISCVExpandPseudo::expandBarrier(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 574c52a4395c..f6ef1418cab4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -7459,6 +7459,46 @@ SDValue RISCVTargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset)); } +SDValue RISCVTargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL, + EVT VT) const { + return Op.getValueType().bitsLE(VT) ? + DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : + DAG.getNode(ISD::FP_ROUND, DL, VT, Op, + DAG.getTargetConstant(0, DL, MVT::i32)); +} + +SDValue RISCVTargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Val, + bool Signed, + const ISD::InputArg *Arg) const { + // First, if it is a widened vector, narrow it. + if (VT.isVector() && + VT.getVectorNumElements() != MemVT.getVectorNumElements()) { + EVT NarrowedVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), + VT.getVectorNumElements()); + Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val, + DAG.getConstant(0, SL, MVT::i32)); + } + + // Then convert the vector elements or scalar value. + if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && + VT.bitsLT(MemVT)) { + unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; + Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); + } + + if (MemVT.isFloatingPoint()) + Val = getFPExtOrFPRound(DAG, Val, SL, VT); + else if (Signed) + Val = DAG.getSExtOrTrunc(Val, SL, VT); + else + Val = DAG.getZExtOrTrunc(Val, SL, VT); + + return Val; +} SDValue RISCVTargetLowering::lowerKernargMemParameter( SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, @@ -7489,7 +7529,7 @@ SDValue RISCVTargetLowering::lowerKernargMemParameter( SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); // TODO: Support vector and half type. - //ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); + ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL); } @@ -7499,8 +7539,9 @@ SDValue RISCVTargetLowering::lowerKernargMemParameter( MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); - // SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); - return DAG.getMergeValues({ Load, Load.getValue(1) }, SL); + SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); + // return DAG.getMergeValues({ Load, Load.getValue(1) }, SL); + return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); } // Returns the opcode of the target-specific SDNode that implements the 32-bit @@ -11582,15 +11623,20 @@ void RISCVTargetLowering::analyzeFormalArgumentsCompute(MachineFunction &MF, const bool IsByRef = Arg.hasByRefAttr(); Type *BaseArgTy = Arg.getType(); Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy; + + uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy); + IntegerType *ArgIntTy = IntegerType::get(Ctx, 32); + bool IsSmall = (AllocSize < 4); + Align Alignment = DL.getValueOrABITypeAlignment( - IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy); + IsByRef ? Arg.getParamAlign() : std::nullopt, IsSmall ? ArgIntTy : MemArgTy); ArgOffset = alignTo(ArgOffset, Alignment); SmallVector ValueVTs; SmallVector Offsets; ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); - ArgOffset += DL.getTypeAllocSize(MemArgTy); + ArgOffset += AllocSize; for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 71ffa4a2b0f0..0965b7020ed1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -722,6 +722,13 @@ private: SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const; SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; + SDValue getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, + const SDLoc &DL, + EVT VT) const; + SDValue convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Val, + bool Signed, + const ISD::InputArg *Arg) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, uint64_t Offset, Align Alignment, diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index 4dc11c06b02b..b352b8d379ee 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -208,6 +208,9 @@ class RVInst vGPR move if (RISCV::GPRF32RegClass.contains(SrcReg) && RISCV::VGPRRegClass.contains(DstReg)) { - BuildMI(MBB, MBBI, DL, get(RISCV::VFMV_S_F), DstReg) - .addReg(DstReg, RegState::Undef) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; + llvm_unreachable("Not supported by HW, use vmv.v.x instead."); } // Handle copy from csr @@ -238,10 +229,6 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::SW : RISCV::SD; - } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FSH; - } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FSD; } else if (RISCV::VGPRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::VSW; } else @@ -279,10 +266,6 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::LW : RISCV::LD; - } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FLH; - } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { - Opcode = RISCV::FLD; } else if (RISCV::VGPRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::VLW; } else diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index 42d2943c6787..a9b4e49af9e6 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -103,12 +103,9 @@ static unsigned log2LdstWidth(unsigned Opcode) { llvm_unreachable("Unexpected opcode"); case RISCV::LW: case RISCV::SW: - case RISCV::FLW: - case RISCV::FSW: return 2; case RISCV::LD: case RISCV::SD: - case RISCV::FLD: case RISCV::FSD: return 3; } @@ -147,8 +144,7 @@ static bool isCompressibleLoad(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) || - Opcode == RISCV::LD || Opcode == RISCV::FLD; + return Opcode == RISCV::LW || Opcode == RISCV::LD; } // Return true if MI is a store for which there exists a compressed version. @@ -156,8 +152,7 @@ static bool isCompressibleStore(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) || - Opcode == RISCV::SD || Opcode == RISCV::FSD; + return Opcode == RISCV::SW || Opcode == RISCV::SD; } // Find a single register and/or large offset which, if compressible, would diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index a57635abb7e0..2d05077663b3 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -355,16 +355,10 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, case RISCV::LHU: case RISCV::LWU: case RISCV::LD: - case RISCV::FLH: - case RISCV::FLW: - case RISCV::FLD: case RISCV::SB: case RISCV::SH: case RISCV::SW: - case RISCV::SD: - case RISCV::FSH: - case RISCV::FSW: - case RISCV::FSD: { + case RISCV::SD: { if (UseMI.getOperand(1).isFI()) return false; // Register defined by Lo should not be the value register. diff --git a/llvm/lib/Target/RISCV/VentusInstrFormatsV.td b/llvm/lib/Target/RISCV/VentusInstrFormatsV.td index c2ba617a1077..c27993047e41 100644 --- a/llvm/lib/Target/RISCV/VentusInstrFormatsV.td +++ b/llvm/lib/Target/RISCV/VentusInstrFormatsV.td @@ -62,6 +62,27 @@ class RVInstSetVLi let Opcode = OPC_OP_V.Value; } +// VENTUS VOP VI instructions with 11 bits immediate number +class PseudoVOPVIIMM11 : + Pseudo<(outs VGPR:$vd), (ins VGPR:$vs2, simm11:$imm11), + [(set (XLenVT VGPR:$vd), (Op (XLenVT VGPR:$vs2), simm11:$imm11))]> { + // let usesCustomInserter = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let IsVOPIMM11 = 1; +} + +class PseudoVOPIVIMM11 : + Pseudo<(outs VGPR:$vd), (ins VGPR:$vs2, simm11:$imm11), + [(set (XLenVT VGPR:$vd), (Op simm11:$imm11, (XLenVT VGPR:$vs2)))]> { + // let usesCustomInserter = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let IsVOPIMM11 = 1; +} + class RVInstVV funct6, RISCVVFormat opv, dag outs, dag ins, string opcodestr, string argstr> : RVInst { diff --git a/llvm/lib/Target/RISCV/VentusInstrInfoF.td b/llvm/lib/Target/RISCV/VentusInstrInfoF.td index fefe36189e99..46f3831f2ef3 100644 --- a/llvm/lib/Target/RISCV/VentusInstrInfoF.td +++ b/llvm/lib/Target/RISCV/VentusInstrInfoF.td @@ -341,11 +341,13 @@ class PseudoVFROUND //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZfinx] in { -def FLW : FPLoad_r<0b010, "flw", GPRF32, WriteFLD32>; -// Operands for stores are in the order srcreg, base, offset rather than -// reflecting the order these fields are specified in the instruction -// encoding. -def FSW : FPStore_r<0b010, "fsw", GPRF32, WriteFST32>; +/// Loads +def : Pat<(f32 (load (AddrRegImm (XLenVT GPR:$rs1), simm12:$imm12))), + (COPY_TO_REGCLASS (LW GPR:$rs1, simm12:$imm12), GPRF32)>; + +/// Stores +def : Pat<(store (f32 FPR32INX:$rs2), (AddrRegImm (XLenVT GPR:$rs1), simm12:$imm12)), + (SW (COPY_TO_REGCLASS FPR32INX:$rs2, GPR), GPR:$rs1, simm12:$imm12)>; } // Predicates = [HasStdExtZfinx] let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { @@ -450,8 +452,8 @@ defm : FPUnaryOpDynFrmAlias_m; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZfinx] in { -def : InstAlias<"flw $rd, (${rs1})", (FLW GPRF32:$rd, GPR:$rs1, 0), 0>; -def : InstAlias<"fsw $rs2, (${rs1})", (FSW GPRF32:$rs2, GPR:$rs1, 0), 0>; +// def : InstAlias<"flw $rd, (${rs1})", (FLW GPRF32:$rd, GPR:$rs1, 0), 0>; +// def : InstAlias<"fsw $rs2, (${rs1})", (FSW GPRF32:$rs2, GPR:$rs1, 0), 0>; def : InstAlias<"fmv.s $rd, $rs", (FSGNJ_S GPRF32:$rd, GPRF32:$rs, GPRF32:$rs)>; def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S GPRF32:$rd, GPRF32:$rs, GPRF32:$rs)>; @@ -636,13 +638,13 @@ defm Select_FPR32 : SelectCC_GPR_rrirr; def PseudoVFROUND_S : PseudoVFROUND; def PseudoFROUND_S : PseudoFROUND; -/// Loads +// /// Loads -defm : UniformLdPat; +// defm : UniformLdPat; -/// Stores +// /// Stores -defm : UniformStPat; +// defm : UniformStPat; } // Predicates = [HasStdExtZfinx] diff --git a/llvm/lib/Target/RISCV/VentusInstrInfoV.td b/llvm/lib/Target/RISCV/VentusInstrInfoV.td index 8f396aa0e95d..959a2945b250 100644 --- a/llvm/lib/Target/RISCV/VentusInstrInfoV.td +++ b/llvm/lib/Target/RISCV/VentusInstrInfoV.td @@ -128,8 +128,8 @@ multiclass SleOpePatVXIBin Ops, list Insts, // Setcc pattern for interger operations // FIXME: this pattern class can substitude the multiclass above -// class PatIntSetCC Ty, CondCode Cond, RVInst Inst> -// : Pat<(setcc (XLenVT Ty[0]:$rs1), (XLenVT Ty[1]:$rs2), Cond), (i32 (Inst Ty[0]:$rs1, Ty[1]:$rs2))>; +class PatIntSetCC Ty, CondCode Cond, RVInst Inst> + : Pat<(setcc (XLenVT Ty[0]:$rs1), (XLenVT Ty[1]:$rs2), Cond), (i32 (Inst Ty[0]:$rs1, Ty[1]:$rs2))>; // Setcc pattern for float operations multiclass PatFloatSetCC Ty, list Conds, RVInst Inst> { @@ -1134,8 +1134,9 @@ defm VFCVT_XU_F_V : VCVTI_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>; defm VFCVT_X_F_V : VCVTI_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>; } // Follow the way by RISCVInstrInfoF -defm VFCVT_RTZ_XU_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>; -defm VFCVT_RTZ_X_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>; +// TODO: later support +// defm VFCVT_RTZ_XU_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>; +// defm VFCVT_RTZ_X_F_V : VCVTI_FV_VS2_FRM<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>; let Uses = [FRM] in { defm VFCVT_F_XU_V : VCVTF_IV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>; defm VFCVT_F_X_V : VCVTF_IV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>; @@ -1275,16 +1276,16 @@ def : DivergentNonPriStPat; def : DivergentNonPriStPat; // FIXME: check this review: https://reviews.llvm.org/D131729#inline-1269307 -// def : PatIntSetCC<[VGPR, VGPR], SETLE, VMSLE_VV>; -// def : PatIntSetCC<[VGPR, GPR], SETLE, VMSLE_VX>; -// def : PatIntSetCC<[VGPR, uimm5], SETLE, VMSLE_VI>; -// def : PatIntSetCC<[VGPR, GPR], SETGT, VMSGT_VX>; -// def : PatIntSetCC<[VGPR, uimm5], SETGT, VMSGT_VI>; -// def : PatIntSetCC<[VGPR, VGPR], SETULE, VMSLEU_VV>; -// def : PatIntSetCC<[VGPR, GPR], SETULE, VMSLEU_VX>; -// def : PatIntSetCC<[VGPR, uimm5], SETULE, VMSLEU_VI>; -// def : PatIntSetCC<[VGPR, GPR], SETUGT, VMSGTU_VX>; -// def : PatIntSetCC<[VGPR, uimm5], SETUGT, VMSGTU_VI>; +def : PatIntSetCC<[VGPR, VGPR], SETLE, VMSLE_VV>; +def : PatIntSetCC<[VGPR, GPR], SETLE, VMSLE_VX>; +def : PatIntSetCC<[VGPR, uimm5], SETLE, VMSLE_VI>; +def : PatIntSetCC<[VGPR, GPR], SETGT, VMSGT_VX>; +def : PatIntSetCC<[VGPR, uimm5], SETGT, VMSGT_VI>; +def : PatIntSetCC<[VGPR, VGPR], SETULE, VMSLEU_VV>; +def : PatIntSetCC<[VGPR, GPR], SETULE, VMSLEU_VX>; +def : PatIntSetCC<[VGPR, uimm5], SETULE, VMSLEU_VI>; +def : PatIntSetCC<[VGPR, GPR], SETUGT, VMSGTU_VX>; +def : PatIntSetCC<[VGPR, uimm5], SETUGT, VMSGTU_VI>; defm : PatVXIBin, [VMIN_VV, VMIN_VX]>; defm : PatVXIBin, [VMINU_VV, VMINU_VX]>; @@ -1316,12 +1317,6 @@ def : Pat<(sub GPR:$rs1, VGPR:$rs2), (VRSUB_VX VGPR:$rs2, GPR:$rs1)>; def : Pat<(XLenVT (sub simm5:$imm, (XLenVT VGPR:$rs1))), (VRSUB_VI VGPR:$rs1, simm5:$imm)>; -// For now, some instructions are aliaed to other instructions -defm : SleOpePatVXIBin<[DivergentBinFrag, DivergentBinFrag], - [VMSLE_VV, VMSLE_VX, VMSGT_VI], simm5>; -defm : SleOpePatVXIBin<[DivergentBinFrag, DivergentBinFrag], - [VMSLEU_VV, VMSLEU_VX, VMSGTU_VI], uimm5>; - defm : PatFloatSetCC<[VGPR, VGPR], [SETOEQ, SETEQ], VMFEQ_VV>; defm : PatFloatSetCC<[VGPR, GPRF32], [SETOEQ, SETEQ], VMFEQ_VF>; @@ -1336,10 +1331,10 @@ defm : PatFloatSetCC<[VGPR, GPRF32], [SETOLE, SETLE], VMFLE_VV>; defm : PatFloatSetCC<[VGPR, GPRF32], [SETOGT, SETGT], VMFGT_VF>; defm : PatFloatSetCC<[VGPR, GPRF32], [SETOGE, SETGE], VMFGE_VF>; -def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), - (VFCVT_RTZ_X_F_V (f32 VGPR:$rs1), $frm)>; -def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), - (VFCVT_RTZ_XU_F_V (f32 VGPR:$rs1), $frm)>; +// def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), +// (VFCVT_RTZ_X_F_V (f32 VGPR:$rs1), $frm)>; +// def : Pat<(i32 (DivergentBinFrag (f32 VGPR:$rs1), timm:$frm)), +// (VFCVT_RTZ_XU_F_V (f32 VGPR:$rs1), $frm)>; def : PatFXConvert, [XLenVT, f32], VFCVT_X_F_V>; def : PatFXConvert, @@ -1485,6 +1480,14 @@ foreach n = [1] in { // TODO: Will enable this after non divergent execution path are implemented. // include "VentusInstrInfoVPseudos.td" +// TODO: add other instructions +def PseudoVOR_VI_IMM11 : PseudoVOPVIIMM11; +def PseudoVXOR_VI_IMM11 : PseudoVOPVIIMM11; +def PseudoVRSUB_VI_IMM11 : PseudoVOPIVIMM11; +def PseudoVAND_VI_IMM11 : PseudoVOPVIIMM11; +def PseudoVMSNE_VI_IMM11 : PseudoVOPVIIMM11; +def PseudoVMSEQ_VI_IMM11 : PseudoVOPVIIMM11; + //===----------------------------------------------------------------------===// // Ventus vALU divergent extended execution patterns //===----------------------------------------------------------------------===// @@ -1495,6 +1498,6 @@ def : Pat<(XLenVT (DivergentBinFrag (XLenVT VGPR:$rs1), uimm12:$imm)), // There already has patterns defined in VentusInstrInfo.td let Predicates = [HasStdExtZfinx] in { -// def : Pat<(f32 (bitconvert (i32 GPR:$src))), (VMV_V_X GPR:$src)>; -def : Pat<(i32 (bitconvert GPRF32:$src)), (VFMV_V_F GPRF32:$src)>; +def : Pat<(f32 (bitconvert (i32 GPR:$src))), (VMV_V_X GPR:$src)>; +// def : Pat<(i32 (bitconvert GPRF32:$src)), (VFMV_V_F GPRF32:$src)>; } // Predicates = [HasStdExtZfinx] diff --git a/llvm/lib/Target/RISCV/VentusRegextInsertion.cpp b/llvm/lib/Target/RISCV/VentusRegextInsertion.cpp index 9129b5eeafab..6ec382741e28 100644 --- a/llvm/lib/Target/RISCV/VentusRegextInsertion.cpp +++ b/llvm/lib/Target/RISCV/VentusRegextInsertion.cpp @@ -75,6 +75,9 @@ bool VentusRegextInsertion::insertRegext(MachineBasicBlock &MBB, MachineInstr &MI) { bool hasOverflow = false; + if (MI.isPseudo()) + return false; + // 3 bits encoding for each rd, rs1, rs2, rs3, total 12 bits. // Each 3 bit can encode 0~7 which stands for base register offset 0~7 * 32. unsigned Offsets = 0; diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll index 41407866feb9..97f21e5f6ff1 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space.ll @@ -11,7 +11,7 @@ define dso_local ventus_kernel void @func(ptr addrspace(1) nocapture noundef ali ; VENTUS-NEXT: .cfi_def_cfa_offset 4 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 4 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: lw t0, 0(a0) @@ -35,7 +35,7 @@ define dso_local ventus_kernel void @func(ptr addrspace(1) nocapture noundef ali ; VENTUS-NEXT: vlw12.v v2, 0(v1) ; VENTUS-NEXT: vadd.vv v0, v2, v0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -12 ; VENTUS-NEXT: addi tp, tp, -4 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll index eab09cbda1b1..a090733ba98a 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/addr-space2.ll @@ -13,7 +13,7 @@ define ventus_kernel void @foo(ptr addrspace(1) noundef align 4 %out) { ; VENTUS-NEXT: .cfi_def_cfa_offset 24 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 4 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: lw t0, 0(a0) @@ -58,8 +58,9 @@ define ventus_kernel void @foo(ptr addrspace(1) noundef align 4 %out) { ; VENTUS-NEXT: vadd.vv v0, v33, v0 ; VENTUS-NEXT: vsw12.v v1, 0(v0) ; VENTUS-NEXT: .LBB0_3: # %if.end -; VENTUS-NEXT: join -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: # Label of block must be emitted +; VENTUS-NEXT: join zero, zero, 0 +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: addi tp, tp, -24 ; VENTUS-NEXT: ret @@ -241,10 +242,9 @@ define dso_local ventus_kernel void @local_memmory1(ptr addrspace(3) nocapture n ; VENTUS-LABEL: local_memmory1: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lw t0, 0(a0) -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v1, 0(v0) -; VENTUS-NEXT: vadd.vi v1, v1, 1 -; VENTUS-NEXT: vsw12.v v1, 0(v0) +; VENTUS-NEXT: lw t1, 0(t0) +; VENTUS-NEXT: addi t1, t1, 1 +; VENTUS-NEXT: sw t1, 0(t0) ; VENTUS-NEXT: ret entry: %0 = load i32, ptr addrspace(3) %b, align 4 diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll index 19a9281b4f5e..f9a21a444f04 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/bitcast.ll @@ -6,12 +6,12 @@ define dso_local ventus_kernel void @bitcast(float noundef %a, ptr addrspace(5) ; VENTUS-LABEL: bitcast: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: lw t0, 8(a0) -; VENTUS-NEXT: flw t1, 0(a0) +; VENTUS-NEXT: lw t1, 0(a0) ; VENTUS-NEXT: lw t2, 4(a0) -; VENTUS-NEXT: vfmv.s.f v0, t1 +; VENTUS-NEXT: vmv.v.x v0, t1 ; VENTUS-NEXT: vmv.v.x v1, t2 ; VENTUS-NEXT: vsw.v v0, 0(v1) -; VENTUS-NEXT: fsw t1, 0(t0) +; VENTUS-NEXT: sw t1, 0(t0) ; VENTUS-NEXT: ret entry: %conv = bitcast float %a to i32 diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll index c69ba586d903..26ad7ded1865 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/branch.ll @@ -14,7 +14,8 @@ define i32 @foo(i32 noundef %cond, i32 noundef %a, i32 noundef %b, i32 noundef % ; VENTUS-NEXT: # %bb.1: ; VENTUS-NEXT: vrsub.vi v3, v3, 0 ; VENTUS-NEXT: .LBB0_2: # %entry -; VENTUS-NEXT: join +; VENTUS-NEXT: # Label of block must be emitted +; VENTUS-NEXT: join zero, zero, 0 ; VENTUS-NEXT: vmadd.vv v2, v1, v3 ; VENTUS-NEXT: vadd.vx v0, v2, zero ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll index f1537d70be48..bb3343f7c922 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin-noverify.ll @@ -11,7 +11,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: .cfi_def_cfa_offset 8 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 8 ; VENTUS-NEXT: .cfi_offset v33.l, 4 ; VENTUS-NEXT: .cfi_offset v34.l, 0 @@ -30,7 +30,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: vlw12.v v2, 0(v0) ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll index 9ca6df0f8cbc..65e5569a56ea 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/builtin.ll @@ -7,7 +7,7 @@ define ventus_kernel void @foo_ker(ptr addrspace(1) nocapture noundef align 4 %A ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 12 ; VENTUS-NEXT: .cfi_def_cfa_offset 12 -; VENTUS-NEXT: sw ra, -12(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 0(a0) ; VENTUS-NEXT: sw t0, -8(sp) # 4-byte Folded Spill @@ -24,7 +24,7 @@ define ventus_kernel void @foo_ker(ptr addrspace(1) nocapture noundef align 4 %A ; VENTUS-NEXT: vlw12.v v2, 0(v0) ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -12 ; VENTUS-NEXT: ret entry: @@ -47,7 +47,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: .cfi_def_cfa_offset 8 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 8 ; VENTUS-NEXT: .cfi_offset v33.l, 4 ; VENTUS-NEXT: .cfi_offset v34.l, 0 @@ -66,7 +66,7 @@ define dso_local void @foo_fun(ptr addrspace(1) nocapture noundef %A, ptr addrsp ; VENTUS-NEXT: vlw12.v v2, 0(v0) ; VENTUS-NEXT: vadd.vv v1, v2, v1 ; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll index ab2ee139f621..453f7c4f2c1a 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/calling-convention.ll @@ -12,7 +12,7 @@ define dso_local ventus_kernel void @kernel_calling_convention(ptr addrspace(1) ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 16 ; VENTUS-NEXT: .cfi_def_cfa_offset 16 -; VENTUS-NEXT: sw ra, -16(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 4(a0) ; VENTUS-NEXT: sw t0, -12(sp) # 4-byte Folded Spill @@ -30,13 +30,12 @@ define dso_local ventus_kernel void @kernel_calling_convention(ptr addrspace(1) ; VENTUS-NEXT: vadd.vx v0, v0, t1 ; VENTUS-NEXT: vmv.v.x v1, s0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw t0, -12(sp) # 4-byte Folded Reload -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v1, 0(v0) -; VENTUS-NEXT: lw t0, 0(t2) -; VENTUS-NEXT: vadd.vx v1, v1, t0 -; VENTUS-NEXT: vsw12.v v1, 0(v0) -; VENTUS-NEXT: lw ra, -16(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw s0, -12(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw t0, 0(s0) +; VENTUS-NEXT: lw t2, 0(t2) +; VENTUS-NEXT: add t0, t2, t0 +; VENTUS-NEXT: sw t0, 0(s0) +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -16 ; VENTUS-NEXT: ret entry: @@ -82,16 +81,16 @@ entry: define dso_local i32 @non_kernel_calling_convention(ptr nocapture noundef readonly %a1, ptr nocapture noundef readonly %a2, ptr nocapture noundef readonly %a3, ptr nocapture noundef readonly %a4, ptr nocapture noundef readonly %a5, ptr nocapture noundef readonly %a6, ptr nocapture noundef readonly %a7, ptr nocapture noundef readonly %a8, ptr nocapture noundef readonly %a9, ptr nocapture noundef readonly %a10, ptr nocapture noundef readonly %a11, ptr nocapture noundef readonly %a12, ptr nocapture noundef readonly %a13, ptr nocapture noundef readonly %a14, ptr nocapture noundef readonly %a15, ptr nocapture noundef readonly %a16, ptr nocapture noundef readonly %a17, ptr nocapture noundef readonly %a18, ptr nocapture noundef readonly %a19, ptr nocapture noundef readonly %a20, ptr nocapture noundef readonly %a21, ptr nocapture noundef readonly %a22, ptr nocapture noundef readonly %a23, ptr nocapture noundef readonly %a24, ptr nocapture noundef readonly %a25, ptr nocapture noundef readonly %a26, ptr nocapture noundef readonly %a27, ptr nocapture noundef readonly %a28, ptr nocapture noundef readonly %a29, ptr nocapture noundef readonly %a30, ptr nocapture noundef readonly %a31, ptr nocapture noundef readonly %a32, ptr addrspace(3) nocapture noundef readonly %a33, ptr addrspace(5) nocapture noundef readonly %a34) local_unnamed_addr #2 { ; VENTUS-LABEL: non_kernel_calling_convention: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: addi tp, tp, 28 -; VENTUS-NEXT: .cfi_def_cfa_offset 28 +; VENTUS-NEXT: addi tp, tp, 16 +; VENTUS-NEXT: .cfi_def_cfa_offset 16 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp ; VENTUS-NEXT: .cfi_offset v33.l, 4 ; VENTUS-NEXT: .cfi_offset v34.l, 0 ; VENTUS-NEXT: regext zero, zero, 9 -; VENTUS-NEXT: vlw.v v33, -24(v32) +; VENTUS-NEXT: vlw.v v33, -12(v32) ; VENTUS-NEXT: regext zero, zero, 9 -; VENTUS-NEXT: vlw.v v34, -28(v32) +; VENTUS-NEXT: vlw.v v34, -16(v32) ; VENTUS-NEXT: vlw12.v v0, 0(v0) ; VENTUS-NEXT: vlw12.v v1, 0(v1) ; VENTUS-NEXT: vlw12.v v2, 0(v2) @@ -161,7 +160,7 @@ define dso_local i32 @non_kernel_calling_convention(ptr nocapture noundef readon ; VENTUS-NEXT: vadd.vv v0, v0, v1 ; VENTUS-NEXT: vadd.vv v0, v0, v2 ; VENTUS-NEXT: vadd.vv v0, v0, v3 -; VENTUS-NEXT: addi tp, tp, -28 +; VENTUS-NEXT: addi tp, tp, -16 ; VENTUS-NEXT: ret entry: %0 = load i32, ptr %a1, align 4 @@ -270,7 +269,7 @@ define dso_local i32 @test_add(ptr nocapture noundef readonly %a, ptr nocapture ; VENTUS-NEXT: .cfi_def_cfa_offset 8 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: vlw12.v v0, 0(v0) ; VENTUS-NEXT: vadd.vi v0, v0, 1 @@ -288,7 +287,7 @@ define dso_local i32 @test_add(ptr nocapture noundef readonly %a, ptr nocapture ; VENTUS-NEXT: regext zero, zero, 8 ; VENTUS-NEXT: vlw.v v1, -8(v32) ; VENTUS-NEXT: vadd.vv v0, v1, v0 -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -8 ; VENTUS-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll index b93930235c91..51fa6ec4d020 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/float.ll @@ -466,25 +466,6 @@ define float @fsgnjnx_v(float %a) nounwind { ret float %1 } -define i32 @fcvt_rtz_x_f_v(float %a) nounwind { -; VENTUS-LABEL: fcvt_rtz_x_f_v: -; VENTUS: # %bb.0: -; VENTUS-NEXT: vfcvt.rtz.x.f.v v0, v0 -; VENTUS-NEXT: ret - %1 = call float @llvm.trunc.f32(float %a) - %conv = fptosi float %1 to i32 - ret i32 %conv -} - -define i32 @fcvt_rtz_xu_f_v(float %x) { -; VENTUS-LABEL: fcvt_rtz_xu_f_v: -; VENTUS: # %bb.0: -; VENTUS-NEXT: vfcvt.rtz.xu.f.v v0, v0 -; VENTUS-NEXT: ret - %a = call float @llvm.trunc.f32(float %x) - %b = fptoui float %a to i32 - ret i32 %b -} @global_val = dso_local global float 0x3FF547AE20000000, align 4 declare float @llvm.sqrt.f32(float %Val) diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll index 8e0dca08af1c..40a944d2e4aa 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/function-call.ll @@ -25,7 +25,7 @@ define dso_local ventus_kernel void @foo(i32 noundef %a, i32 noundef %b, ptr add ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: lw t0, 8(a0) ; VENTUS-NEXT: sw t0, -4(sp) # 4-byte Folded Spill @@ -37,7 +37,7 @@ define dso_local ventus_kernel void @foo(i32 noundef %a, i32 noundef %b, ptr add ; VENTUS-NEXT: lw t0, -4(sp) # 4-byte Folded Reload ; VENTUS-NEXT: vmv.v.x v1, t0 ; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/parameter-vector-struct-types.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/parameter-vector-struct-types.ll new file mode 100644 index 000000000000..175e968314fd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/parameter-vector-struct-types.ll @@ -0,0 +1,168 @@ +; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +%struct.MyStruct = type { i32, i8, i64 } + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) vscale_range(1,2048) +; Here we foucus on kernel struct argument +define dso_local ventus_kernel void @test_kernel1(i8 noundef %c, %struct.MyStruct %st.coerce, i8 noundef %uc, ptr addrspace(1) nocapture noundef writeonly align 4 %result) { +entry: + ; CHECK: lb t0, 0(a0) + ; CHECK: lbu t1, 24(a0) + ; CHECK: lw t2, 28(a0) + ; CHECK: lw s0, 8(a0) + %st.coerce.fca.0.extract = extractvalue %struct.MyStruct %st.coerce, 0 + %conv = sitofp i8 %c to float + store float %conv, ptr addrspace(1) %result, align 4 + %conv1 = sitofp i32 %st.coerce.fca.0.extract to float + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %result, i32 1 + store float %conv1, ptr addrspace(1) %arrayidx2, align 4 + %conv3 = uitofp i8 %uc to float + %arrayidx4 = getelementptr inbounds float, ptr addrspace(1) %result, i32 2 + store float %conv3, ptr addrspace(1) %arrayidx4, align 4 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) vscale_range(1,2048) +; Here we foucus on scalar argument +define dso_local ventus_kernel void @test_kernel2(i8 noundef %c, i8 noundef %uc, i16 noundef %s, i16 noundef %us, i32 noundef %i, i32 noundef %ui, float noundef %f, ptr addrspace(1) nocapture noundef writeonly align 4 %result) { +entry: + ; CHECK: flw t0, 24(a0) + ; CHECK: lw t1, 20(a0) + ; CHECK: lw t2, 16(a0) + ; CHECK: lhu s0, 12(a0) + ; CHECK: lh s1, 8(a0) + ; CHECK: lb a1, 0(a0) + ; CHECK: lbu a2, 4(a0) + ; CHECK: lw a0, 28(a0) + %conv = sitofp i8 %c to float + store float %conv, ptr addrspace(1) %result, align 4 + %conv1 = uitofp i8 %uc to float + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %result, i32 1 + store float %conv1, ptr addrspace(1) %arrayidx2, align 4 + %conv3 = sitofp i16 %s to float + %arrayidx4 = getelementptr inbounds float, ptr addrspace(1) %result, i32 2 + store float %conv3, ptr addrspace(1) %arrayidx4, align 4 + %conv5 = uitofp i16 %us to float + %arrayidx6 = getelementptr inbounds float, ptr addrspace(1) %result, i32 3 + store float %conv5, ptr addrspace(1) %arrayidx6, align 4 + %conv7 = sitofp i32 %i to float + %arrayidx8 = getelementptr inbounds float, ptr addrspace(1) %result, i32 4 + store float %conv7, ptr addrspace(1) %arrayidx8, align 4 + %conv9 = uitofp i32 %ui to float + %arrayidx10 = getelementptr inbounds float, ptr addrspace(1) %result, i32 5 + store float %conv9, ptr addrspace(1) %arrayidx10, align 4 + %arrayidx11 = getelementptr inbounds float, ptr addrspace(1) %result, i32 6 + store float %f, ptr addrspace(1) %arrayidx11, align 4 + ret void +} + +; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn memory(argmem: write) vscale_range(1,2048) +; Here we foucus on vector argument +define dso_local ventus_kernel void @test_kernel3(<2 x i8> noundef %c, <2 x i8> noundef %uc, <2 x i16> noundef %s, <2 x i16> noundef %us, <2 x i32> noundef %i, <2 x i32> noundef %ui, <2 x float> noundef %f, ptr addrspace(1) nocapture noundef writeonly align 8 %result) { +entry: + ;CHECK: flw t0, 36(a0) + ;CHECK: flw t0, 32(a0) + ;CHECK: lw t0, 28(a0) + ;CHECK: lw t0, 24(a0) + ;CHECK: lw t0, 20(a0) + ;CHECK: lw t0, 16(a0) + ;CHECK: lhu t0, 14(a0) + ;CHECK: lhu t0, 12(a0) + ;CHECK: lhu t0, 10(a0) + ;CHECK: lhu t0, 8(a0) + ;CHECK: lbu t0, 5(a0) + ;CHECK: lbu t0, 4(a0) + ;CHECK: lbu t0, 1(a0) + ;CHECK: lbu t1, 0(a0) + ;CHECK: lw t2, 40(a0) + %call = call <2 x float> @_Z14convert_float2Dv2_c(<2 x i8> noundef %c) + store <2 x float> %call, ptr addrspace(1) %result, align 8 + %call1 = call <2 x float> @_Z14convert_float2Dv2_h(<2 x i8> noundef %uc) + %arrayidx2 = getelementptr inbounds <2 x float>, ptr addrspace(1) %result, i32 1 + store <2 x float> %call1, ptr addrspace(1) %arrayidx2, align 8 + %call3 = call <2 x float> @_Z14convert_float2Dv2_s(<2 x i16> noundef %s) + %arrayidx4 = getelementptr inbounds <2 x float>, ptr addrspace(1) %result, i32 2 + store <2 x float> %call3, ptr addrspace(1) %arrayidx4, align 8 + %call5 = call <2 x float> @_Z14convert_float2Dv2_t(<2 x i16> noundef %us) + %arrayidx6 = getelementptr inbounds <2 x float>, ptr addrspace(1) %result, i32 3 + store <2 x float> %call5, ptr addrspace(1) %arrayidx6, align 8 + %call7 = call <2 x float> @_Z14convert_float2Dv2_i(<2 x i32> noundef %i) + %arrayidx8 = getelementptr inbounds <2 x float>, ptr addrspace(1) %result, i32 4 + store <2 x float> %call7, ptr addrspace(1) %arrayidx8, align 8 + %call9 = call <2 x float> @_Z14convert_float2Dv2_j(<2 x i32> noundef %ui) + %arrayidx10 = getelementptr inbounds <2 x float>, ptr addrspace(1) %result, i32 5 + store <2 x float> %call9, ptr addrspace(1) %arrayidx10, align 8 + %call11 = call <2 x float> @_Z14convert_float2Dv2_f(<2 x float> noundef %f) + %arrayidx12 = getelementptr inbounds <2 x float>, ptr addrspace(1) %result, i32 6 + store <2 x float> %call11, ptr addrspace(1) %arrayidx12, align 8 + ret void +} + +declare dso_local <2 x float> @_Z14convert_float2Dv2_c(<2 x i8> noundef) +declare dso_local <2 x float> @_Z14convert_float2Dv2_h(<2 x i8> noundef) +declare dso_local <2 x float> @_Z14convert_float2Dv2_s(<2 x i16> noundef) +declare dso_local <2 x float> @_Z14convert_float2Dv2_t(<2 x i16> noundef) +declare dso_local <2 x float> @_Z14convert_float2Dv2_i(<2 x i32> noundef) +declare dso_local <2 x float> @_Z14convert_float2Dv2_j(<2 x i32> noundef) +declare dso_local <2 x float> @_Z14convert_float2Dv2_f(<2 x float> noundef) + +; Function Attrs: convergent mustprogress nofree norecurse nounwind willreturn memory(argmem: write) vscale_range(1,2048) +; Here we foucus on vector argument +define dso_local ventus_kernel void @test_kernel4(<4 x i8> noundef %c, <4 x i8> noundef %uc, <4 x i16> noundef %s, <4 x i16> noundef %us, <4 x i32> noundef %i, <4 x i32> noundef %ui, <4 x float> noundef %f, ptr addrspace(1) nocapture noundef writeonly align 16 %result) { +entry: + ;CHECK: flw t0, 76(a0) + ;CHECK: flw t0, 72(a0) + ;CHECK: flw t0, 68(a0) + ;CHECK: flw t0, 64(a0) + ;CHECK: lw t0, 60(a0) + ;CHECK: lw t0, 56(a0) + ;CHECK: lw t0, 52(a0) + ;CHECK: lw t0, 48(a0) + ;CHECK: lw t0, 44(a0) + ;CHECK: lw t0, 40(a0) + ;CHECK: lw t0, 36(a0) + ;CHECK: lw t0, 32(a0) + ;CHECK: lhu t0, 22(a0) + ;CHECK: lhu t0, 20(a0) + ;CHECK: lhu t0, 18(a0) + ;CHECK: lhu t0, 16(a0) + ;CHECK: lhu t0, 14(a0) + ;CHECK: lhu t0, 12(a0) + ;CHECK: lhu t0, 10(a0) + ;CHECK: lhu t0, 8(a0) + ;CHECK: lw t0, 4(a0) + ;CHECK: lbu t1, 4(a0) + ;CHECK: lw t0, 80(a0) + ;CHECK: lw t0, 0(a0) + ;CHECK: lbu t1, 0(a0) + %call = call <4 x float> @_Z14convert_float4Dv4_c(<4 x i8> noundef %c) + store <4 x float> %call, ptr addrspace(1) %result, align 16 + %call1 = call <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> noundef %uc) + %arrayidx2 = getelementptr inbounds <4 x float>, ptr addrspace(1) %result, i32 1 + store <4 x float> %call1, ptr addrspace(1) %arrayidx2, align 16 + %call3 = call <4 x float> @_Z14convert_float4Dv4_s(<4 x i16> noundef %s) + %arrayidx4 = getelementptr inbounds <4 x float>, ptr addrspace(1) %result, i32 2 + store <4 x float> %call3, ptr addrspace(1) %arrayidx4, align 16 + %call5 = call <4 x float> @_Z14convert_float4Dv4_t(<4 x i16> noundef %us) + %arrayidx6 = getelementptr inbounds <4 x float>, ptr addrspace(1) %result, i32 3 + store <4 x float> %call5, ptr addrspace(1) %arrayidx6, align 16 + %call7 = call <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> noundef %i) + %arrayidx8 = getelementptr inbounds <4 x float>, ptr addrspace(1) %result, i32 4 + store <4 x float> %call7, ptr addrspace(1) %arrayidx8, align 16 + %call9 = call <4 x float> @_Z14convert_float4Dv4_j(<4 x i32> noundef %ui) + %arrayidx10 = getelementptr inbounds <4 x float>, ptr addrspace(1) %result, i32 5 + store <4 x float> %call9, ptr addrspace(1) %arrayidx10, align 16 + %call11 = call <4 x float> @_Z14convert_float4Dv4_f(<4 x float> noundef %f) + %arrayidx12 = getelementptr inbounds <4 x float>, ptr addrspace(1) %result, i32 6 + store <4 x float> %call11, ptr addrspace(1) %arrayidx12, align 16 + ret void +} + +declare dso_local <4 x float> @_Z14convert_float4Dv4_c(<4 x i8> noundef) +declare dso_local <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> noundef) +declare dso_local <4 x float> @_Z14convert_float4Dv4_s(<4 x i16> noundef) +declare dso_local <4 x float> @_Z14convert_float4Dv4_t(<4 x i16> noundef) +declare dso_local <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> noundef) +declare dso_local <4 x float> @_Z14convert_float4Dv4_j(<4 x i32> noundef) +declare dso_local <4 x float> @_Z14convert_float4Dv4_f(<4 x float> noundef) diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/regexti.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/regexti.ll new file mode 100644 index 000000000000..7e9981e39782 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/regexti.ll @@ -0,0 +1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mcpu=ventus-gpgpu -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define dso_local i32 @regexti1(i32 noundef %a) { +; CHECK-LABEL: regexti1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, 768 +; CHECK-NEXT: vor.vi v0, v0, 15 +; CHECK-NEXT: ret +entry: + %res = or i32 %a, 399 + ret i32 %res +} + +define dso_local i32 @regexti2(i32 noundef %a) { +; CHECK-LABEL: regexti2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, -832 +; CHECK-NEXT: vor.vi v0, v0, -15 +; CHECK-NEXT: ret +entry: + %res = or i32 %a, -399 + ret i32 %res +} + +define dso_local i32 @regexti1_1(i32 noundef %a) { +; CHECK-LABEL: regexti1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, 0 +; CHECK-NEXT: vor.vi v0, v0, -15 +; CHECK-NEXT: ret +entry: + %res = or i32 %a, 17 + ret i32 %res +} + +define dso_local i32 @regexti2_1(i32 noundef %a) { +; CHECK-LABEL: regexti2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, -64 +; CHECK-NEXT: vor.vi v0, v0, 15 +; CHECK-NEXT: ret +entry: + %res = or i32 %a, -17 + ret i32 %res +} + +define dso_local i32 @regexti1_2(i32 noundef %a) { +; CHECK-LABEL: regexti1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vor.vi v0, v0, 13 +; CHECK-NEXT: ret +entry: + %res = or i32 %a, 13 + ret i32 %res +} + +define dso_local i32 @regexti2_2(i32 noundef %a) { +; CHECK-LABEL: regexti2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vor.vi v0, v0, -13 +; CHECK-NEXT: ret +entry: + %res = or i32 %a, -13 + ret i32 %res +} + +define dso_local i32 @regexti3(i32 noundef %a) { +; CHECK-LABEL: regexti3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, 448 +; CHECK-NEXT: vnot.v v0, v0 +; CHECK-NEXT: ret +entry: + %res = xor i32 %a, 255 + ret i32 %res +} + +define dso_local i32 @regexti4(i32 noundef %a) { +; CHECK-LABEL: regexti4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, -832 +; CHECK-NEXT: vxor.vi v0, v0, -15 +; CHECK-NEXT: ret +entry: + %res = xor i32 %a, -399 + ret i32 %res +} + +define dso_local i32 @regexti5(i32 noundef %a) { +; CHECK-LABEL: regexti5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, 768 +; CHECK-NEXT: vrsub.vi v0, v0, 15 +; CHECK-NEXT: ret +entry: + %res = sub i32 399, %a + ret i32 %res +} + +define dso_local i32 @regexti6(i32 noundef %a) { +; CHECK-LABEL: regexti6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, -832 +; CHECK-NEXT: vrsub.vi v0, v0, -15 +; CHECK-NEXT: ret +entry: + %res = sub i32 -399, %a + ret i32 %res +} + +define dso_local i32 @regexti7(i32 noundef %a) { +; CHECK-LABEL: regexti7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, 768 +; CHECK-NEXT: vand.vi v0, v0, 15 +; CHECK-NEXT: ret +entry: + %res = and i32 %a, 399 + ret i32 %res +} + +define dso_local i32 @regexti8(i32 noundef %a) { +; CHECK-LABEL: regexti8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, -832 +; CHECK-NEXT: vand.vi v0, v0, -15 +; CHECK-NEXT: ret +entry: + %res = and i32 %a, -399 + ret i32 %res +} + +define dso_local i1 @regexti9(i32 noundef %a) { +; CHECK-LABEL: regexti9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, 768 +; CHECK-NEXT: vmseq.vi v0, v0, 15 +; CHECK-NEXT: ret +entry: + %res = icmp eq i32 %a, 399 + ret i1 %res +} + +define dso_local i1 @regexti10(i32 noundef %a) { +; CHECK-LABEL: regexti10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, -832 +; CHECK-NEXT: vmseq.vi v0, v0, -15 +; CHECK-NEXT: ret +entry: + %res = icmp eq i32 %a, -399 + ret i1 %res +} + +define dso_local i1 @regexti11(i32 noundef %a) { +; CHECK-LABEL: regexti11: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, 768 +; CHECK-NEXT: vmsne.vi v0, v0, 15 +; CHECK-NEXT: ret +entry: + %res = icmp ne i32 %a, 399 + ret i1 %res +} + +define dso_local i1 @regexti12(i32 noundef %a) { +; CHECK-LABEL: regexti12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: regexti zero, zero, -832 +; CHECK-NEXT: vmsne.vi v0, v0, -15 +; CHECK-NEXT: ret +entry: + %res = icmp ne i32 %a, -399 + ret i1 %res +} + +define dso_local ventus_kernel void @regexti13(ptr addrspace(1) nocapture + noundef align 4 %A, ptr addrspace(3) nocapture noundef align 4 %B) { +; CHECK-LABEL: regexti13: +; CHECK: # %bb.0: # %entry +; CHECK: regexti zero, zero, 769 +; CHECK-NEXT: vand.vi v33, v0, 15 + +entry: + %call = tail call i32 @_Z13get_global_idj(i32 noundef 0) + %calland = and i32 %call, 399 + %call1 = tail call i32 @_Z12get_local_idj(i32 noundef 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %B, i32 %call1 + %0 = load i32, ptr addrspace(3) %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %A, i32 %calland + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + store i32 %add, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +declare dso_local i32 @_Z13get_global_idj(i32 noundef) +declare dso_local i32 @_Z12get_local_idj(i32 noundef) \ No newline at end of file diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll index 6cc3e1a9a89d..4705bce7008d 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/resource-usage.ll @@ -11,17 +11,14 @@ define dso_local ventus_kernel void @usage(ptr addrspace(1) nocapture noundef al ; VENTUS-LABEL: usage: ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 4 -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: lw t0, 4(a0) ; VENTUS-NEXT: lw t1, 0(a0) -; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vlw12.v v0, 0(v0) -; VENTUS-NEXT: lw t0, 0(t1) -; VENTUS-NEXT: vadd.vx v0, v0, t0 -; VENTUS-NEXT: vmv.v.x v1, t1 -; VENTUS-NEXT: vsw12.v v0, 0(v1) -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload -; VENTUS-NEXT: barrier x0, x0, 1 +; VENTUS-NEXT: lw t0, 0(t0) +; VENTUS-NEXT: lw t2, 0(t1) +; VENTUS-NEXT: add t0, t2, t0 +; VENTUS-NEXT: sw t0, 0(t1) +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll index 8520cc25bf11..1c23a2dcaa7f 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/var-arg.ll @@ -13,23 +13,23 @@ target triple = "riscv32" define dso_local i32 @printf(ptr addrspace(2) noundef %fmt, ...) { ; VENTUS-LABEL: printf: ; VENTUS: # %bb.0: # %entry -; VENTUS-NEXT: addi tp, tp, 64 -; VENTUS-NEXT: .cfi_def_cfa_offset 64 +; VENTUS-NEXT: addi tp, tp, 40 +; VENTUS-NEXT: .cfi_def_cfa_offset 40 ; VENTUS-NEXT: vmv.v.x v8, tp -; VENTUS-NEXT: vsw.v v7, -60(v8) -; VENTUS-NEXT: vsw.v v6, -56(v8) -; VENTUS-NEXT: vsw.v v5, -52(v8) -; VENTUS-NEXT: vsw.v v4, -48(v8) -; VENTUS-NEXT: vsw.v v3, -44(v8) -; VENTUS-NEXT: vsw.v v2, -40(v8) -; VENTUS-NEXT: vsw.v v1, -36(v8) -; VENTUS-NEXT: addi t0, tp, -36 +; VENTUS-NEXT: vsw.v v7, -36(v8) +; VENTUS-NEXT: vsw.v v6, -32(v8) +; VENTUS-NEXT: vsw.v v5, -28(v8) +; VENTUS-NEXT: vsw.v v4, -24(v8) +; VENTUS-NEXT: vsw.v v3, -20(v8) +; VENTUS-NEXT: vsw.v v2, -16(v8) +; VENTUS-NEXT: vsw.v v1, -12(v8) +; VENTUS-NEXT: addi t0, tp, -12 ; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vsw.v v0, -36(v8) -; VENTUS-NEXT: addi t0, tp, -32 +; VENTUS-NEXT: vsw.v v0, -12(v8) +; VENTUS-NEXT: addi t0, tp, -8 ; VENTUS-NEXT: vmv.v.x v0, t0 -; VENTUS-NEXT: vsw.v v0, -36(v8) -; VENTUS-NEXT: addi tp, tp, -64 +; VENTUS-NEXT: vsw.v v0, -12(v8) +; VENTUS-NEXT: addi tp, tp, -40 ; VENTUS-NEXT: ret entry: %retval = alloca i32, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll index 06d3e9d28165..224900905e84 100644 --- a/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/vbranch-join.ll @@ -8,7 +8,7 @@ define dso_local i32 @branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 4 ; VENTUS-NEXT: .cfi_def_cfa_offset 4 -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: vmv.v.x v0, zero ; VENTUS-NEXT: call _Z13get_global_idj @@ -37,7 +37,7 @@ define dso_local i32 @branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 ; VENTUS-NEXT: vadd.vx v0, v1, zero -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: ret entry: @@ -63,7 +63,7 @@ define dso_local ventus_kernel void @loop_branch(ptr addrspace(1) nocapture noun ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: sw a0, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero @@ -88,7 +88,7 @@ define dso_local ventus_kernel void @loop_branch(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: # =>This Inner Loop Header: Depth=1 ; VENTUS-NEXT: vlw12.v v4, 0(v3) ; VENTUS-NEXT: vadd.vv v2, v2, v4 -; VENTUS-NEXT: vadd.vi v0, v0, -1 +; VENTUS-NEXT: vsub12.vi v0, v0, 1 ; VENTUS-NEXT: vsw12.v v2, 0(v1) ; VENTUS-NEXT: .Lpcrel_hi3: ; VENTUS-NEXT: auipc t1, %pcrel_hi(.LBB1_3) @@ -97,7 +97,7 @@ define dso_local ventus_kernel void @loop_branch(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: .LBB1_3: # %for.cond.cleanup ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: @@ -134,7 +134,7 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: .cfi_def_cfa_offset 4 ; VENTUS-NEXT: regext zero, zero, 1 ; VENTUS-NEXT: vmv.v.x v32, tp -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 4 ; VENTUS-NEXT: .cfi_offset v33.l, 0 ; VENTUS-NEXT: vmv.v.x v0, zero @@ -169,6 +169,7 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: vblt v0, v33, .LBB2_5 ; VENTUS-NEXT: # %bb.3: # %if.then2 ; VENTUS-NEXT: li t0, 23 +; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: j .LBB2_6 ; VENTUS-NEXT: .LBB2_4: # %if.end7 ; VENTUS-NEXT: li t0, 4 @@ -177,14 +178,14 @@ define dso_local i32 @branch_in_branch(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: j .LBB2_7 ; VENTUS-NEXT: .LBB2_5: ; VENTUS-NEXT: li t0, 12 +; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: .LBB2_6: # %cleanup9 ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: vmv.v.x v0, t0 ; VENTUS-NEXT: .LBB2_7: # %cleanup9 ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: addi tp, tp, -4 ; VENTUS-NEXT: ret @@ -218,7 +219,7 @@ define dso_local ventus_kernel void @double_loop(ptr addrspace(1) nocapture noun ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: sw a0, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero @@ -249,7 +250,7 @@ define dso_local ventus_kernel void @double_loop(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: # => This Inner Loop Header: Depth=2 ; VENTUS-NEXT: vlw12.v v5, 0(v3) ; VENTUS-NEXT: vadd.vv v2, v2, v5 -; VENTUS-NEXT: vadd.vi v4, v4, -1 +; VENTUS-NEXT: vsub12.vi v4, v4, 1 ; VENTUS-NEXT: vsw12.v v2, 0(v1) ; VENTUS-NEXT: .Lpcrel_hi8: ; VENTUS-NEXT: auipc t1, %pcrel_hi(.LBB3_4) @@ -268,7 +269,7 @@ define dso_local ventus_kernel void @double_loop(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: .LBB3_5: # %for.cond.cleanup ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: @@ -312,7 +313,7 @@ define dso_local ventus_kernel void @loop_switch(ptr addrspace(1) nocapture noun ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 8 ; VENTUS-NEXT: .cfi_def_cfa_offset 8 -; VENTUS-NEXT: sw ra, -8(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: sw a0, -4(sp) # 4-byte Folded Spill ; VENTUS-NEXT: vmv.v.x v0, zero @@ -374,7 +375,7 @@ define dso_local ventus_kernel void @loop_switch(ptr addrspace(1) nocapture noun ; VENTUS-NEXT: .LBB4_9: # %for.cond.cleanup ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -8(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -8 ; VENTUS-NEXT: ret entry: @@ -426,7 +427,7 @@ define dso_local i32 @_Z13get_global_idj(i32 noundef %dim) local_unnamed_addr { ; VENTUS: # %bb.0: # %entry ; VENTUS-NEXT: addi sp, sp, 4 ; VENTUS-NEXT: .cfi_def_cfa_offset 4 -; VENTUS-NEXT: sw ra, -4(sp) # 4-byte Folded Spill +; VENTUS-NEXT: sw ra, 0(sp) # 4-byte Folded Spill ; VENTUS-NEXT: .cfi_offset ra, 0 ; VENTUS-NEXT: li t0, 2 ; VENTUS-NEXT: vmv.v.x v1, t0 @@ -461,7 +462,7 @@ define dso_local i32 @_Z13get_global_idj(i32 noundef %dim) local_unnamed_addr { ; VENTUS-NEXT: .LBB5_7: # %return ; VENTUS-NEXT: # Label of block must be emitted ; VENTUS-NEXT: join zero, zero, 0 -; VENTUS-NEXT: lw ra, -4(sp) # 4-byte Folded Reload +; VENTUS-NEXT: lw ra, 0(sp) # 4-byte Folded Reload ; VENTUS-NEXT: addi sp, sp, -4 ; VENTUS-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/VentusGPGPU/vecargtest.cl b/llvm/test/CodeGen/RISCV/VentusGPGPU/vecargtest.cl new file mode 100644 index 000000000000..f40c5702f7bf --- /dev/null +++ b/llvm/test/CodeGen/RISCV/VentusGPGPU/vecargtest.cl @@ -0,0 +1,19 @@ +// RUN: clang -target riscv32 -mcpu=ventus-gpgpu < %s \ +// RUN: | FileCheck -check-prefix=VENTUS %s + +__kernel void test_kernel( +char2 c, uchar2 uc, short2 s, ushort2 us, int2 i, uint2 ui, float2 f, +__global float2 *result) +{ + // there is no FileCheck now + result[0] = convert_float2(c); + result[1] = convert_float2(uc); + result[2] = convert_float2(s); + result[3] = convert_float2(us); + result[4] = convert_float2(i); + result[5] = convert_float2(ui); + result[6] = f; +} + + +