[AArch64][SVE] Add DAG-Combine to push bitcasts from floating point loads after DUPLANE128

This patch lowers
  duplane128(insert_subvector(undef, bitcast(op(128bitsubvec)), 0), 0)
to
  bitcast(duplane128(insert_subvector(undef, op(128bitsubvec), 0), 0)).

This enables floating-point loads to match patterns added in
https://reviews.llvm.org/D130010
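
For example, in the dupq_ld1rqd_f64 test updated below, a <2 x double> load is
inserted into an undef scalable vector and then duplicated across all 128-bit
lanes. With the bitcast pushed after the DUPLANE128, the load and duplicate
now select a single predicated LD1RQ (before/after codegen taken from the
updated test checks):

  ldr q0, [x0]     ->   ptrue p0.d
  mov z0.q, q0          ld1rqd { z0.d }, p0/z, [x0]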

Differential Revision: https://reviews.llvm.org/D130013
Commit: cd3d7bf15d (parent e0fbd990c9)
Author: Matt Devereau
Date:   2022-07-18 14:39:35 +00:00

2 changed files with 45 additions and 8 deletions

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -19256,6 +19256,41 @@ static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
 }
 
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue Insert = N->getOperand(0);
+  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  if (!Insert.getOperand(0).isUndef())
+    return SDValue();
+
+  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
+  uint64_t IdxDupLane = N->getConstantOperandVal(1);
+  if (IdxInsert != IdxDupLane)
+    return SDValue();
+
+  SDValue Bitcast = Insert.getOperand(1);
+  if (Bitcast.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  SDValue Subvec = Bitcast.getOperand(0);
+  EVT SubvecVT = Subvec.getValueType();
+  if (!SubvecVT.is128BitVector())
+    return SDValue();
+  EVT NewSubvecVT =
+      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
+
+  SDLoc DL(N);
+  SDValue NewInsert =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
+                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
+  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
+                                      NewInsert, N->getOperand(1));
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -19342,6 +19377,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
     return performDUPCombine(N, DCI);
+  case AArch64ISD::DUPLANE128:
+    return performDupLane128Combine(N, DAG);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:


@@ -726,8 +726,8 @@ define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(double* %valp)
 define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
 ; CHECK-LABEL: dupq_ld1rqd_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <2 x double>, <2 x double>* %a
   %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
@@ -738,8 +738,8 @@ define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
 define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
 ; CHECK-LABEL: dupq_ld1rqw_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <4 x float>, <4 x float>* %a
   %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
@@ -750,8 +750,8 @@ define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
 define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
 ; CHECK-LABEL: dupq_ld1rqh_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x half>, <8 x half>* %a
   %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
@@ -762,8 +762,8 @@ define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
 define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16(<8 x bfloat>* %a) #0 {
 ; CHECK-LABEL: dupq_ld1rqh_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x bfloat>, <8 x bfloat>* %a
   %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)