[AArch64] Reassociate integer extending reductions to pairwise addition.
Given an (integer) vecreduce, we know the order of the inputs does not matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x)))) into UADDV(UADDLP(x)). This can also happen through an extra add, where we transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))) into UADDV(add(y, UADDLP(x))). This makes sure the same thing happens for signed cases too, which requires adding a new SADDLP node. Differential Revision: https://reviews.llvm.org/D118107
This commit is contained in:
parent
eaef54f213
commit
31373fb88a
|
@ -2254,6 +2254,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
|
||||
MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
|
||||
MAKE_CASE(AArch64ISD::INDEX_VECTOR)
|
||||
MAKE_CASE(AArch64ISD::SADDLP)
|
||||
MAKE_CASE(AArch64ISD::UADDLP)
|
||||
MAKE_CASE(AArch64ISD::CALL_RVMARKER)
|
||||
MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
|
||||
|
@ -4378,8 +4379,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|||
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
|
||||
Op.getOperand(2));
|
||||
}
|
||||
case Intrinsic::aarch64_neon_saddlp:
|
||||
case Intrinsic::aarch64_neon_uaddlp: {
|
||||
unsigned Opcode = AArch64ISD::UADDLP;
|
||||
unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
|
||||
? AArch64ISD::UADDLP
|
||||
: AArch64ISD::SADDLP;
|
||||
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
|
||||
}
|
||||
case Intrinsic::aarch64_neon_sdot:
|
||||
|
@ -13196,6 +13200,61 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
|
|||
return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
|
||||
}
|
||||
|
||||
// Given an (integer) vecreduce, we know the order of the inputs does not
|
||||
// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
|
||||
// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
|
||||
// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
|
||||
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
|
||||
auto DetectAddExtract = [&](SDValue A) {
|
||||
// Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
|
||||
// UADDLP(x) if found.
|
||||
if (A.getOpcode() != ISD::ADD)
|
||||
return SDValue();
|
||||
EVT VT = A.getValueType();
|
||||
SDValue Op0 = A.getOperand(0);
|
||||
SDValue Op1 = A.getOperand(1);
|
||||
if (Op0.getOpcode() != Op0.getOpcode() ||
|
||||
(Op0.getOpcode() != ISD::ZERO_EXTEND &&
|
||||
Op0.getOpcode() != ISD::SIGN_EXTEND))
|
||||
return SDValue();
|
||||
SDValue Ext0 = Op0.getOperand(0);
|
||||
SDValue Ext1 = Op1.getOperand(0);
|
||||
if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
|
||||
Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
|
||||
Ext0.getOperand(0) != Ext1.getOperand(0))
|
||||
return SDValue();
|
||||
// Check that the type is twice the add types, and the extract are from
|
||||
// upper/lower parts of the same source.
|
||||
if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
|
||||
VT.getVectorNumElements() * 2)
|
||||
return SDValue();
|
||||
if ((Ext0.getConstantOperandVal(1) != 0 &&
|
||||
Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
|
||||
(Ext1.getConstantOperandVal(1) != 0 &&
|
||||
Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
|
||||
return SDValue();
|
||||
unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
|
||||
: AArch64ISD::SADDLP;
|
||||
return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
|
||||
};
|
||||
|
||||
SDValue A = N->getOperand(0);
|
||||
if (SDValue R = DetectAddExtract(A))
|
||||
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
|
||||
if (A.getOpcode() == ISD::ADD) {
|
||||
if (SDValue R = DetectAddExtract(A.getOperand(0)))
|
||||
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
|
||||
DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
|
||||
A.getOperand(1)));
|
||||
if (SDValue R = DetectAddExtract(A.getOperand(1)))
|
||||
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
|
||||
DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
|
||||
A.getOperand(0)));
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
||||
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const AArch64Subtarget *Subtarget) {
|
||||
|
@ -14722,7 +14781,7 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
|
|||
}
|
||||
|
||||
// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
|
||||
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
|
||||
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
|
||||
EVT VT = N->getValueType(0);
|
||||
// Only scalar integer and vector types.
|
||||
if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
|
||||
|
@ -14838,7 +14897,7 @@ static SDValue performAddSubCombine(SDNode *N,
|
|||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
SelectionDAG &DAG) {
|
||||
// Try to change sum of two reductions.
|
||||
if (SDValue Val = performUADDVCombine(N, DAG))
|
||||
if (SDValue Val = performAddUADDVCombine(N, DAG))
|
||||
return Val;
|
||||
if (SDValue Val = performAddDotCombine(N, DAG))
|
||||
return Val;
|
||||
|
@ -17805,6 +17864,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
|
|||
return performExtractVectorEltCombine(N, DAG);
|
||||
case ISD::VECREDUCE_ADD:
|
||||
return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
|
||||
case AArch64ISD::UADDV:
|
||||
return performUADDVCombine(N, DAG);
|
||||
case ISD::INTRINSIC_VOID:
|
||||
case ISD::INTRINSIC_W_CHAIN:
|
||||
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
|
||||
|
|
|
@ -240,7 +240,8 @@ enum NodeType : unsigned {
|
|||
SRHADD,
|
||||
URHADD,
|
||||
|
||||
// Unsigned Add Long Pairwise
|
||||
// Add Long Pairwise
|
||||
SADDLP,
|
||||
UADDLP,
|
||||
|
||||
// udot/sdot instructions
|
||||
|
|
|
@ -643,9 +643,13 @@ def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
|
|||
(int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
|
||||
|
||||
def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>;
|
||||
def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>;
|
||||
def AArch64uaddlp : PatFrags<(ops node:$src),
|
||||
[(AArch64uaddlp_n node:$src),
|
||||
(int_aarch64_neon_uaddlp node:$src)]>;
|
||||
def AArch64saddlp : PatFrags<(ops node:$src),
|
||||
[(AArch64saddlp_n node:$src),
|
||||
(int_aarch64_neon_saddlp node:$src)]>;
|
||||
|
||||
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
|
||||
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
|
||||
|
@ -4312,8 +4316,8 @@ defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
|
|||
defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
|
||||
defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
|
||||
defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
|
||||
BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
|
||||
defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
|
||||
BinOpFrag<(add node:$LHS, (AArch64saddlp node:$RHS))> >;
|
||||
defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", AArch64saddlp>;
|
||||
defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
|
||||
defm SHLL : SIMDVectorLShiftLongBySizeBHS;
|
||||
defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
|
||||
|
|
|
@ -65,9 +65,7 @@ define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias
|
|||
; CHECK-NEXT: ldr d0, [x0]
|
||||
; CHECK-NEXT: ldr d1, [x1]
|
||||
; CHECK-NEXT: uabdl v0.8h, v0.8b, v1.8b
|
||||
; CHECK-NEXT: ushll v1.4s, v0.4h, #0
|
||||
; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-NEXT: addv s0, v0.4s
|
||||
; CHECK-NEXT: uaddlv s0, v0.8h
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
|
|
|
@ -198,9 +198,7 @@ define i16 @uabd16b_rdx(<16 x i8>* %a, <16 x i8>* %b) {
|
|||
; CHECK-NEXT: ldr q0, [x0]
|
||||
; CHECK-NEXT: ldr q1, [x1]
|
||||
; CHECK-NEXT: uabd.16b v0, v0, v1
|
||||
; CHECK-NEXT: ushll.8h v1, v0, #0
|
||||
; CHECK-NEXT: uaddw2.8h v0, v1, v0
|
||||
; CHECK-NEXT: addv.8h h0, v0
|
||||
; CHECK-NEXT: uaddlv.16b h0, v0
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
%aload = load <16 x i8>, <16 x i8>* %a, align 1
|
||||
|
@ -261,9 +259,7 @@ define i32 @uabd8h_rdx(<8 x i16>* %a, <8 x i16>* %b) {
|
|||
; CHECK-NEXT: ldr q0, [x0]
|
||||
; CHECK-NEXT: ldr q1, [x1]
|
||||
; CHECK-NEXT: uabd.8h v0, v0, v1
|
||||
; CHECK-NEXT: ushll.4s v1, v0, #0
|
||||
; CHECK-NEXT: uaddw2.4s v0, v1, v0
|
||||
; CHECK-NEXT: addv.4s s0, v0
|
||||
; CHECK-NEXT: uaddlv.8h s0, v0
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
%aload = load <8 x i16>, <8 x i16>* %a, align 1
|
||||
|
@ -282,9 +278,7 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
|
|||
; CHECK-LABEL: sabd8h_rdx:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: sabd.8h v0, v0, v1
|
||||
; CHECK-NEXT: ushll.4s v1, v0, #0
|
||||
; CHECK-NEXT: uaddw2.4s v0, v1, v0
|
||||
; CHECK-NEXT: addv.4s s0, v0
|
||||
; CHECK-NEXT: uaddlv.8h s0, v0
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
%aext = sext <8 x i16> %a to <8 x i32>
|
||||
|
@ -338,9 +332,7 @@ define i64 @uabd4s_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
|
|||
; CHECK-NEXT: ldr q0, [x0]
|
||||
; CHECK-NEXT: ldr q1, [x1]
|
||||
; CHECK-NEXT: uabd.4s v0, v0, v1
|
||||
; CHECK-NEXT: ushll.2d v1, v0, #0
|
||||
; CHECK-NEXT: uaddw2.2d v0, v1, v0
|
||||
; CHECK-NEXT: addp.2d d0, v0
|
||||
; CHECK-NEXT: uaddlv.4s d0, v0
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
%aload = load <4 x i32>, <4 x i32>* %a, align 1
|
||||
|
@ -359,9 +351,7 @@ define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) {
|
|||
; CHECK-LABEL: sabd4s_rdx:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: sabd.4s v0, v0, v1
|
||||
; CHECK-NEXT: ushll.2d v1, v0, #0
|
||||
; CHECK-NEXT: uaddw2.2d v0, v1, v0
|
||||
; CHECK-NEXT: addp.2d d0, v0
|
||||
; CHECK-NEXT: uaddlv.4s d0, v0
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
%aext = sext <4 x i32> %a to <4 x i64>
|
||||
|
|
|
@ -16,9 +16,7 @@ entry:
|
|||
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
|
||||
; CHECK-LABEL: add_v4i32_v4i64_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: uaddlv d0, v0.4s
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
|
@ -30,8 +28,7 @@ entry:
|
|||
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
|
||||
; CHECK-LABEL: add_v4i32_v4i64_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: saddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -70,9 +67,7 @@ entry:
|
|||
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
|
||||
; CHECK-LABEL: add_v8i16_v8i32_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v1.4s, v0.4h, #0
|
||||
; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-NEXT: addv s0, v0.4s
|
||||
; CHECK-NEXT: uaddlv s0, v0.8h
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
|
@ -84,8 +79,7 @@ entry:
|
|||
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
|
||||
; CHECK-LABEL: add_v8i16_v8i32_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v1.4s, v0.4h, #0
|
||||
; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-NEXT: saddlp v0.4s, v0.8h
|
||||
; CHECK-NEXT: addv s0, v0.4s
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -170,9 +164,7 @@ define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
|
|||
; CHECK-LABEL: add_v4i16_v4i64_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: ushll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: uaddlv d0, v0.4s
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
|
@ -185,8 +177,7 @@ define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
|
|||
; CHECK-LABEL: add_v4i16_v4i64_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: sshll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: saddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -282,9 +273,7 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
|
|||
; CHECK-BASE-LABEL: add_v8i8_v8i32_zext:
|
||||
; CHECK-BASE: // %bb.0: // %entry
|
||||
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
|
||||
; CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0
|
||||
; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-BASE-NEXT: uaddlv s0, v0.8h
|
||||
; CHECK-BASE-NEXT: fmov w0, s0
|
||||
; CHECK-BASE-NEXT: ret
|
||||
;
|
||||
|
@ -306,8 +295,7 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
|
|||
; CHECK-BASE-LABEL: add_v8i8_v8i32_sext:
|
||||
; CHECK-BASE: // %bb.0: // %entry
|
||||
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
|
||||
; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0
|
||||
; CHECK-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-BASE-NEXT: fmov w0, s0
|
||||
; CHECK-BASE-NEXT: ret
|
||||
|
@ -358,8 +346,7 @@ entry:
|
|||
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
|
||||
; CHECK-LABEL: add_v16i8_v16i16_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v1.8h, v0.8b, #0
|
||||
; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
|
||||
; CHECK-NEXT: uaddlp v0.8h, v0.16b
|
||||
; CHECK-NEXT: addv h0, v0.8h
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -372,8 +359,7 @@ entry:
|
|||
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
|
||||
; CHECK-LABEL: add_v16i8_v16i16_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v1.8h, v0.8b, #0
|
||||
; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b
|
||||
; CHECK-NEXT: saddlp v0.8h, v0.16b
|
||||
; CHECK-NEXT: addv h0, v0.8h
|
||||
; CHECK-NEXT: smov w0, v0.h[0]
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -511,9 +497,7 @@ define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
|
|||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: bic v0.4h, #255, lsl #8
|
||||
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: ushll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: uaddlv d0, v0.4s
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
|
@ -598,9 +582,7 @@ entry:
|
|||
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
|
||||
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: uaddlv d0, v0.4s
|
||||
; CHECK-NEXT: fmov x8, d0
|
||||
; CHECK-NEXT: add x0, x8, x0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -614,8 +596,7 @@ entry:
|
|||
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
|
||||
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: saddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x8, d0
|
||||
; CHECK-NEXT: add x0, x8, x0
|
||||
|
@ -660,9 +641,7 @@ entry:
|
|||
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
|
||||
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v1.4s, v0.4h, #0
|
||||
; CHECK-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-NEXT: addv s0, v0.4s
|
||||
; CHECK-NEXT: uaddlv s0, v0.8h
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: add w0, w8, w0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -676,8 +655,7 @@ entry:
|
|||
define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
|
||||
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v1.4s, v0.4h, #0
|
||||
; CHECK-NEXT: saddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-NEXT: saddlp v0.4s, v0.8h
|
||||
; CHECK-NEXT: addv s0, v0.4s
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: add w0, w8, w0
|
||||
|
@ -775,9 +753,7 @@ define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
|
|||
; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: ushll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: uaddlv d0, v0.4s
|
||||
; CHECK-NEXT: fmov x8, d0
|
||||
; CHECK-NEXT: add x0, x8, x0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -792,8 +768,7 @@ define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
|
|||
; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: sshll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: saddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x8, d0
|
||||
; CHECK-NEXT: add x0, x8, x0
|
||||
|
@ -901,9 +876,7 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
|
|||
; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_zext:
|
||||
; CHECK-BASE: // %bb.0: // %entry
|
||||
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
|
||||
; CHECK-BASE-NEXT: ushll v1.4s, v0.4h, #0
|
||||
; CHECK-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-BASE-NEXT: uaddlv s0, v0.8h
|
||||
; CHECK-BASE-NEXT: fmov w8, s0
|
||||
; CHECK-BASE-NEXT: add w0, w8, w0
|
||||
; CHECK-BASE-NEXT: ret
|
||||
|
@ -928,8 +901,7 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
|
|||
; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_sext:
|
||||
; CHECK-BASE: // %bb.0: // %entry
|
||||
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
|
||||
; CHECK-BASE-NEXT: sshll v1.4s, v0.4h, #0
|
||||
; CHECK-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-BASE-NEXT: fmov w8, s0
|
||||
; CHECK-BASE-NEXT: add w0, w8, w0
|
||||
|
@ -987,9 +959,7 @@ entry:
|
|||
define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
|
||||
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v1.8h, v0.8b, #0
|
||||
; CHECK-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
|
||||
; CHECK-NEXT: addv h0, v0.8h
|
||||
; CHECK-NEXT: uaddlv h0, v0.16b
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: add w8, w8, w0
|
||||
; CHECK-NEXT: and w0, w8, #0xffff
|
||||
|
@ -1004,8 +974,7 @@ entry:
|
|||
define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
|
||||
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v1.8h, v0.8b, #0
|
||||
; CHECK-NEXT: saddw2 v0.8h, v1.8h, v0.16b
|
||||
; CHECK-NEXT: saddlp v0.8h, v0.16b
|
||||
; CHECK-NEXT: addv h0, v0.8h
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: add w8, w8, w0
|
||||
|
@ -1163,9 +1132,7 @@ define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
|
|||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: bic v0.4h, #255, lsl #8
|
||||
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: ushll v1.2d, v0.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: uaddlv d0, v0.4s
|
||||
; CHECK-NEXT: fmov x8, d0
|
||||
; CHECK-NEXT: add x0, x8, x0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -1261,11 +1228,8 @@ entry:
|
|||
define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: add_pair_v4i32_v4i64_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v2.2d, v0.2s, #0
|
||||
; CHECK-NEXT: ushll v3.2d, v1.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
|
||||
; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
|
||||
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
|
||||
; CHECK-NEXT: uaddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: uadalp v0.2d, v1.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -1281,11 +1245,8 @@ entry:
|
|||
define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: add_pair_v4i32_v4i64_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v2.2d, v0.2s, #0
|
||||
; CHECK-NEXT: sshll v3.2d, v1.2s, #0
|
||||
; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s
|
||||
; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s
|
||||
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
|
||||
; CHECK-NEXT: saddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: sadalp v0.2d, v1.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -1333,11 +1294,8 @@ entry:
|
|||
define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
|
||||
; CHECK-LABEL: add_pair_v8i16_v8i32_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v2.4s, v0.4h, #0
|
||||
; CHECK-NEXT: ushll v3.4s, v1.4h, #0
|
||||
; CHECK-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
|
||||
; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
|
||||
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: uaddlp v0.4s, v0.8h
|
||||
; CHECK-NEXT: uadalp v0.4s, v1.8h
|
||||
; CHECK-NEXT: addv s0, v0.4s
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -1353,11 +1311,8 @@ entry:
|
|||
define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
|
||||
; CHECK-LABEL: add_pair_v8i16_v8i32_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v2.4s, v0.4h, #0
|
||||
; CHECK-NEXT: sshll v3.4s, v1.4h, #0
|
||||
; CHECK-NEXT: saddw2 v0.4s, v2.4s, v0.8h
|
||||
; CHECK-NEXT: saddw2 v1.4s, v3.4s, v1.8h
|
||||
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: saddlp v0.4s, v0.8h
|
||||
; CHECK-NEXT: sadalp v0.4s, v1.8h
|
||||
; CHECK-NEXT: addv s0, v0.4s
|
||||
; CHECK-NEXT: fmov w0, s0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -1476,11 +1431,8 @@ define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
|
|||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
|
||||
; CHECK-NEXT: ushll v2.2d, v0.2s, #0
|
||||
; CHECK-NEXT: ushll v3.2d, v1.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
|
||||
; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
|
||||
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
|
||||
; CHECK-NEXT: uaddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: uadalp v0.2d, v1.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -1498,11 +1450,8 @@ define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
|
|||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: sshll v1.4s, v1.4h, #0
|
||||
; CHECK-NEXT: sshll v2.2d, v0.2s, #0
|
||||
; CHECK-NEXT: sshll v3.2d, v1.2s, #0
|
||||
; CHECK-NEXT: saddw2 v0.2d, v2.2d, v0.4s
|
||||
; CHECK-NEXT: saddw2 v1.2d, v3.2d, v1.4s
|
||||
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
|
||||
; CHECK-NEXT: saddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: sadalp v0.2d, v1.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
|
@ -1632,11 +1581,8 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
|
|||
; CHECK-BASE: // %bb.0: // %entry
|
||||
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
|
||||
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
|
||||
; CHECK-BASE-NEXT: ushll v2.4s, v0.4h, #0
|
||||
; CHECK-BASE-NEXT: ushll v3.4s, v1.4h, #0
|
||||
; CHECK-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
|
||||
; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
|
||||
; CHECK-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-BASE-NEXT: fmov w0, s0
|
||||
; CHECK-BASE-NEXT: ret
|
||||
|
@ -1664,11 +1610,8 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
|
|||
; CHECK-BASE: // %bb.0: // %entry
|
||||
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
|
||||
; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
|
||||
; CHECK-BASE-NEXT: sshll v2.4s, v0.4h, #0
|
||||
; CHECK-BASE-NEXT: sshll v3.4s, v1.4h, #0
|
||||
; CHECK-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h
|
||||
; CHECK-BASE-NEXT: add v0.4s, v0.4s, v1.4s
|
||||
; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
|
||||
; CHECK-BASE-NEXT: sadalp v0.4s, v1.8h
|
||||
; CHECK-BASE-NEXT: addv s0, v0.4s
|
||||
; CHECK-BASE-NEXT: fmov w0, s0
|
||||
; CHECK-BASE-NEXT: ret
|
||||
|
@ -1733,12 +1676,8 @@ entry:
|
|||
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
|
||||
; CHECK-LABEL: add_pair_v16i8_v16i16_zext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
|
||||
; CHECK-NEXT: ushll v3.8h, v1.8b, #0
|
||||
; CHECK-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
|
||||
; CHECK-NEXT: uaddw2 v1.8h, v3.8h, v1.16b
|
||||
; CHECK-NEXT: addv h0, v0.8h
|
||||
; CHECK-NEXT: addv h1, v1.8h
|
||||
; CHECK-NEXT: uaddlv h0, v0.16b
|
||||
; CHECK-NEXT: uaddlv h1, v1.16b
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: fmov w9, s1
|
||||
; CHECK-NEXT: add w8, w8, w9
|
||||
|
@ -1756,10 +1695,8 @@ entry:
|
|||
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
|
||||
; CHECK-LABEL: add_pair_v16i8_v16i16_sext:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: sshll v2.8h, v0.8b, #0
|
||||
; CHECK-NEXT: sshll v3.8h, v1.8b, #0
|
||||
; CHECK-NEXT: saddw2 v0.8h, v2.8h, v0.16b
|
||||
; CHECK-NEXT: saddw2 v1.8h, v3.8h, v1.16b
|
||||
; CHECK-NEXT: saddlp v0.8h, v0.16b
|
||||
; CHECK-NEXT: saddlp v1.8h, v1.16b
|
||||
; CHECK-NEXT: addv h0, v0.8h
|
||||
; CHECK-NEXT: addv h1, v1.8h
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
|
@ -1982,11 +1919,8 @@ define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
|
|||
; CHECK-NEXT: bic v1.4h, #255, lsl #8
|
||||
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
|
||||
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
|
||||
; CHECK-NEXT: ushll v2.2d, v0.2s, #0
|
||||
; CHECK-NEXT: ushll v3.2d, v1.2s, #0
|
||||
; CHECK-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
|
||||
; CHECK-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
|
||||
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
|
||||
; CHECK-NEXT: uaddlp v0.2d, v0.4s
|
||||
; CHECK-NEXT: uadalp v0.2d, v1.4s
|
||||
; CHECK-NEXT: addp d0, v0.2d
|
||||
; CHECK-NEXT: fmov x0, d0
|
||||
; CHECK-NEXT: ret
|
||||
|
|
Loading…
Reference in New Issue