[AArch64][SVE] Add DAG-Combine to push bitcasts from floating point loads after DUPLANE128

This patch lowers
  duplane128(insert_subvector(undef, bitcast(op(128bitsubvec)), 0), 0)
to
  bitcast(duplane128(insert_subvector(undef, op(128bitsubvec), 0), 0)).

This enables floating-point loads to match patterns added in
https://reviews.llvm.org/D130010
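
For example, in the dupq_ld1rqd_f64 test updated below, a <2 x double> load is
inserted into an undef scalable vector and then duplicated across all 128-bit
lanes. With the bitcast pushed after the DUPLANE128, the load and duplicate
now select a single predicated LD1RQ (before/after codegen taken from the
updated test checks):

  ldr q0, [x0]     ->   ptrue p0.d
  mov z0.q, q0          ld1rqd { z0.d }, p0/z, [x0]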

Differential Revision: https://reviews.llvm.org/D130013
Commit: cd3d7bf15d (parent e0fbd990c9)
Author: Matt Devereau
Date:   2022-07-18 14:39:35 +00:00

2 changed files with 45 additions and 8 deletions

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -19256,6 +19256,41 @@ static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
 }
 
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue Insert = N->getOperand(0);
+  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  if (!Insert.getOperand(0).isUndef())
+    return SDValue();
+
+  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
+  uint64_t IdxDupLane = N->getConstantOperandVal(1);
+  if (IdxInsert != IdxDupLane)
+    return SDValue();
+
+  SDValue Bitcast = Insert.getOperand(1);
+  if (Bitcast.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  SDValue Subvec = Bitcast.getOperand(0);
+  EVT SubvecVT = Subvec.getValueType();
+  if (!SubvecVT.is128BitVector())
+    return SDValue();
+  EVT NewSubvecVT =
+      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
+
+  SDLoc DL(N);
+  SDValue NewInsert =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
+                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
+  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
+                                      NewInsert, N->getOperand(1));
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -19342,6 +19377,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
     return performDUPCombine(N, DCI);
+  case AArch64ISD::DUPLANE128:
+    return performDupLane128Combine(N, DAG);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:


@@ -726,8 +726,8 @@ define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(double* %valp)
 define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
 ; CHECK-LABEL: dupq_ld1rqd_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <2 x double>, <2 x double>* %a
   %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
@@ -738,8 +738,8 @@ define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
 define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
 ; CHECK-LABEL: dupq_ld1rqw_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <4 x float>, <4 x float>* %a
   %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
@@ -750,8 +750,8 @@ define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
 define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
 ; CHECK-LABEL: dupq_ld1rqh_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x half>, <8 x half>* %a
   %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
@@ -762,8 +762,8 @@ define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
 define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16(<8 x bfloat>* %a) #0 {
 ; CHECK-LABEL: dupq_ld1rqh_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x bfloat>, <8 x bfloat>* %a
   %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)