[ARM] Simplify address calculation for NEON load/store

The patch attempts to optimize a sequence of SIMD loads from the same
base pointer:

    %0 = getelementptr float, float* base, i32 4
    %1 = bitcast float* %0 to <4 x float>*
    %2 = load <4 x float>, <4 x float>* %1
    ...
    %n1 = getelementptr float, float* base, i32 N
    %n2 = bitcast float* %n1 to <4 x float>*
    %n3 = load <4 x float>, <4 x float>* %n2

For AArch64 the compiler generates a sequence of LDR Qt, [Xn, #imm]
instructions. However, 32-bit NEON VLD1/VST1 lack an [Rn, #imm] addressing
mode, so the address has to be computed before every load/store:

    add r2, r0, #32
    add r0, r0, #16
    vld1.32 {d18, d19}, [r2]
    vld1.32 {d22, d23}, [r0]

This can be improved by computing the address only for the first load, and
then using the post-indexed form of VLD1/VST1 to access the rest:

    add r0, r0, #16
    vld1.32 {d18, d19}, [r0]!
    vld1.32 {d22, d23}, [r0]

In order to do that, the patch adds more patterns to DAGCombine:

  - (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1
    and inc2 are constants.

  - (or ptr inc) is now recognized as a pointer increment if ptr is
    sufficiently aligned.
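
The second pattern rests on a simple bit-level fact: if the pointer is
aligned to a power of two and the constant increment is smaller than that
alignment, the two values share no set bits, so OR-ing them is the same as
adding them. A minimal standalone sketch of this fact (not code from the
patch; the constants are made up for illustration):

    // Why (or ptr inc) may be treated as (add ptr inc) for an aligned ptr.
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Align = 128;   // e.g. an over-aligned stack object
      const uint32_t Base = 0x8000; // assumed to be a multiple of Align
      for (uint32_t Inc = 16; Inc < Align; Inc += 16) {
        assert((Base & Inc) == 0);          // no bits in common
        assert((Base | Inc) == Base + Inc); // OR behaves exactly like ADD
      }
      return 0;
    }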

In addition to that, we now search for all possible base updates and
then pick the best one.
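
As a rough illustration of how the best update is chosen (mirroring the
matching and sorting logic in the patch below), here is a standalone
sketch; the helper name pickBaseUpdate and the sample increments are made
up. A candidate whose constant increment equals the memory access size can
use the post-indexed "[rN]!" form and is preferred; the remaining
candidates are tried smallest-increment first, which puts register
(non-constant) updates ahead and avoids breaking up a strided sequence.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Candidate {
      unsigned ConstInc; // 0 stands for a non-constant (register) increment
    };

    // Hypothetical helper mirroring the "pick the best base update" idea.
    static void pickBaseUpdate(std::vector<Candidate> Updates,
                               unsigned AccessBytes) {
      // Pass 1: prefer an increment equal to the access size (vld1 [rN]!).
      for (const Candidate &C : Updates)
        if (C.ConstInc == AccessBytes) {
          std::printf("fold as post-indexed [rN]! (+%u)\n", C.ConstInc);
          return;
        }
      // Pass 2: try the rest, smallest constant increments first.
      std::sort(Updates.begin(), Updates.end(),
                [](const Candidate &L, const Candidate &R) {
                  return L.ConstInc < R.ConstInc;
                });
      if (!Updates.empty())
        std::printf("fold as [rN], rM (+%u)\n", Updates.front().ConstInc);
    }

    int main() {
      pickBaseUpdate({{48}, {0}, {16}, {32}}, /*AccessBytes=*/16); // +16
      pickBaseUpdate({{24}, {48}}, /*AccessBytes=*/16);            // +24
      return 0;
    }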

Differential Revision: https://reviews.llvm.org/D108988
Andrew Savonichev 2021-09-08 18:19:57 +03:00
parent 88487662f7
commit dc8a41de34
12 changed files with 1119 additions and 620 deletions

@ -15244,6 +15244,390 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
DAG.getUNDEF(VT), NewMask);
}
/// Load/store instruction that can be merged with a base address
/// update
struct BaseUpdateTarget {
SDNode *N;
bool isIntrinsic;
bool isStore;
unsigned AddrOpIdx;
};
struct BaseUpdateUser {
/// Instruction that updates a pointer
SDNode *N;
/// Pointer increment operand
SDValue Inc;
/// Pointer increment value if it is a constant, or 0 otherwise
unsigned ConstInc;
};
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
struct BaseUpdateUser &User,
bool SimpleConstIncOnly,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDNode *N = Target.N;
MemSDNode *MemN = cast<MemSDNode>(N);
SDLoc dl(N);
// Find the new opcode for the updating load/store.
bool isLoadOp = true;
bool isLaneOp = false;
// Workaround for vst1x and vld1x intrinsics which do not have alignment
// as an operand.
bool hasAlignment = true;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
if (Target.isIntrinsic) {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default:
llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::arm_neon_vld1:
NewOpc = ARMISD::VLD1_UPD;
NumVecs = 1;
break;
case Intrinsic::arm_neon_vld2:
NewOpc = ARMISD::VLD2_UPD;
NumVecs = 2;
break;
case Intrinsic::arm_neon_vld3:
NewOpc = ARMISD::VLD3_UPD;
NumVecs = 3;
break;
case Intrinsic::arm_neon_vld4:
NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4;
break;
case Intrinsic::arm_neon_vld1x2:
NewOpc = ARMISD::VLD1x2_UPD;
NumVecs = 2;
hasAlignment = false;
break;
case Intrinsic::arm_neon_vld1x3:
NewOpc = ARMISD::VLD1x3_UPD;
NumVecs = 3;
hasAlignment = false;
break;
case Intrinsic::arm_neon_vld1x4:
NewOpc = ARMISD::VLD1x4_UPD;
NumVecs = 4;
hasAlignment = false;
break;
case Intrinsic::arm_neon_vld2dup:
NewOpc = ARMISD::VLD2DUP_UPD;
NumVecs = 2;
break;
case Intrinsic::arm_neon_vld3dup:
NewOpc = ARMISD::VLD3DUP_UPD;
NumVecs = 3;
break;
case Intrinsic::arm_neon_vld4dup:
NewOpc = ARMISD::VLD4DUP_UPD;
NumVecs = 4;
break;
case Intrinsic::arm_neon_vld2lane:
NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2;
isLaneOp = true;
break;
case Intrinsic::arm_neon_vld3lane:
NewOpc = ARMISD::VLD3LN_UPD;
NumVecs = 3;
isLaneOp = true;
break;
case Intrinsic::arm_neon_vld4lane:
NewOpc = ARMISD::VLD4LN_UPD;
NumVecs = 4;
isLaneOp = true;
break;
case Intrinsic::arm_neon_vst1:
NewOpc = ARMISD::VST1_UPD;
NumVecs = 1;
isLoadOp = false;
break;
case Intrinsic::arm_neon_vst2:
NewOpc = ARMISD::VST2_UPD;
NumVecs = 2;
isLoadOp = false;
break;
case Intrinsic::arm_neon_vst3:
NewOpc = ARMISD::VST3_UPD;
NumVecs = 3;
isLoadOp = false;
break;
case Intrinsic::arm_neon_vst4:
NewOpc = ARMISD::VST4_UPD;
NumVecs = 4;
isLoadOp = false;
break;
case Intrinsic::arm_neon_vst2lane:
NewOpc = ARMISD::VST2LN_UPD;
NumVecs = 2;
isLoadOp = false;
isLaneOp = true;
break;
case Intrinsic::arm_neon_vst3lane:
NewOpc = ARMISD::VST3LN_UPD;
NumVecs = 3;
isLoadOp = false;
isLaneOp = true;
break;
case Intrinsic::arm_neon_vst4lane:
NewOpc = ARMISD::VST4LN_UPD;
NumVecs = 4;
isLoadOp = false;
isLaneOp = true;
break;
case Intrinsic::arm_neon_vst1x2:
NewOpc = ARMISD::VST1x2_UPD;
NumVecs = 2;
isLoadOp = false;
hasAlignment = false;
break;
case Intrinsic::arm_neon_vst1x3:
NewOpc = ARMISD::VST1x3_UPD;
NumVecs = 3;
isLoadOp = false;
hasAlignment = false;
break;
case Intrinsic::arm_neon_vst1x4:
NewOpc = ARMISD::VST1x4_UPD;
NumVecs = 4;
isLoadOp = false;
hasAlignment = false;
break;
}
} else {
isLaneOp = true;
switch (N->getOpcode()) {
default:
llvm_unreachable("unexpected opcode for Neon base update");
case ARMISD::VLD1DUP:
NewOpc = ARMISD::VLD1DUP_UPD;
NumVecs = 1;
break;
case ARMISD::VLD2DUP:
NewOpc = ARMISD::VLD2DUP_UPD;
NumVecs = 2;
break;
case ARMISD::VLD3DUP:
NewOpc = ARMISD::VLD3DUP_UPD;
NumVecs = 3;
break;
case ARMISD::VLD4DUP:
NewOpc = ARMISD::VLD4DUP_UPD;
NumVecs = 4;
break;
case ISD::LOAD:
NewOpc = ARMISD::VLD1_UPD;
NumVecs = 1;
isLaneOp = false;
break;
case ISD::STORE:
NewOpc = ARMISD::VST1_UPD;
NumVecs = 1;
isLaneOp = false;
isLoadOp = false;
break;
}
}
// Find the size of memory referenced by the load/store.
EVT VecTy;
if (isLoadOp) {
VecTy = N->getValueType(0);
} else if (Target.isIntrinsic) {
VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
} else {
assert(Target.isStore &&
"Node has to be a load, a store, or an intrinsic!");
VecTy = N->getOperand(1).getValueType();
}
bool isVLDDUPOp =
NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
if (isLaneOp || isVLDDUPOp)
NumBytes /= VecTy.getVectorNumElements();
if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
// VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
// separate instructions that make it harder to use a non-constant update.
return false;
}
if (SimpleConstIncOnly && User.ConstInc != NumBytes)
return false;
// OK, we found an ADD we can fold into the base update.
// Now, create a _UPD node, taking care of not breaking alignment.
EVT AlignedVecTy = VecTy;
unsigned Alignment = MemN->getAlignment();
// If this is a less-than-standard-aligned load/store, change the type to
// match the standard alignment.
// The alignment is overlooked when selecting _UPD variants; and it's
// easier to introduce bitcasts here than fix that.
// There are 3 ways to get to this base-update combine:
// - intrinsics: they are assumed to be properly aligned (to the standard
// alignment of the memory type), so we don't need to do anything.
// - ARMISD::VLDx nodes: they are only generated from the aforementioned
// intrinsics, so, likewise, there's nothing to do.
// - generic load/store instructions: the alignment is specified as an
// explicit operand, rather than implicitly as the standard alignment
// of the memory type (like the intrinsics). We need to change the
// memory type to match the explicit alignment. That way, we don't
// generate non-standard-aligned ARMISD::VLDx nodes.
if (isa<LSBaseSDNode>(N)) {
if (Alignment == 0)
Alignment = 1;
if (Alignment < VecTy.getScalarSizeInBits() / 8) {
MVT EltTy = MVT::getIntegerVT(Alignment * 8);
assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
assert(!isLaneOp && "Unexpected generic load/store lane.");
unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
}
// Don't set an explicit alignment on regular load/stores that we want
// to transform to VLD/VST 1_UPD nodes.
// This matches the behavior of regular load/stores, which only get an
// explicit alignment if the MMO alignment is larger than the standard
// alignment of the memory type.
// Intrinsics, however, always get an explicit alignment, set to the
// alignment of the MMO.
Alignment = 1;
}
// Create the new updating load/store node.
// First, create an SDVTList for the new updating node's results.
EVT Tys[6];
unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = AlignedVecTy;
Tys[n++] = MVT::i32;
Tys[n] = MVT::Other;
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
// Then, gather the new node's operands.
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // incoming chain
Ops.push_back(N->getOperand(Target.AddrOpIdx));
Ops.push_back(User.Inc);
if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
// Try to match the intrinsic's signature
Ops.push_back(StN->getValue());
} else {
// Loads (and of course intrinsics) match the intrinsics' signature,
// so just add all but the alignment operand.
unsigned LastOperand =
hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
Ops.push_back(N->getOperand(i));
}
// For all node types, the alignment operand is always the last one.
Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
// If this is a non-standard-aligned STORE, the penultimate operand is the
// stored value. Bitcast it to the aligned type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
SDValue &StVal = Ops[Ops.size() - 2];
StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
}
EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
MemN->getMemOperand());
// Update the uses.
SmallVector<SDValue, 5> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i)
NewResults.push_back(SDValue(UpdN.getNode(), i));
// If this is a non-standard-aligned LOAD, the first result is the loaded
// value. Bitcast it to the expected result type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
SDValue &LdVal = NewResults[0];
LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
}
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
return true;
}
// If (opcode ptr inc) is an ADD-like instruction, return the
// increment value. Otherwise return 0.
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
SDValue Inc, const SelectionDAG &DAG) {
ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
if (!CInc)
return 0;
switch (Opcode) {
case ARMISD::VLD1_UPD:
case ISD::ADD:
return CInc->getZExtValue();
case ISD::OR: {
if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
// (OR ptr inc) is the same as (ADD ptr inc)
return CInc->getZExtValue();
}
return 0;
}
default:
return 0;
}
}
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
switch (N->getOpcode()) {
case ISD::ADD:
case ISD::OR: {
if (isa<ConstantSDNode>(N->getOperand(1))) {
*Ptr = N->getOperand(0);
*CInc = N->getOperand(1);
return true;
}
return false;
}
case ARMISD::VLD1_UPD: {
if (isa<ConstantSDNode>(N->getOperand(2))) {
*Ptr = N->getOperand(1);
*CInc = N->getOperand(2);
return true;
}
return false;
}
default:
return false;
}
}
static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
// Check that the add is independent of the load/store.
// Otherwise, folding it would create a cycle. Search through Addr
// as well, since the User may not be a direct user of Addr and
// only share a base pointer.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Worklist.push_back(N);
Worklist.push_back(User);
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
return false;
return true;
}
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
@ -15251,237 +15635,89 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
const bool isStore = N->getOpcode() == ISD::STORE;
const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
SDValue Addr = N->getOperand(AddrOpIdx);
MemSDNode *MemN = cast<MemSDNode>(N);
SDLoc dl(N);
SmallVector<BaseUpdateUser, 8> BaseUpdates;
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD ||
UI.getUse().getResNo() != Addr.getResNo())
if (UI.getUse().getResNo() != Addr.getResNo() ||
User->getNumOperands() != 2)
continue;
// Check that the add is independent of the load/store. Otherwise, folding
// it would create a cycle. We can avoid searching through Addr as it's a
// predecessor to both.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Visited.insert(Addr.getNode());
Worklist.push_back(N);
Worklist.push_back(User);
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
continue;
SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
unsigned ConstInc =
getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
// Find the new opcode for the updating load/store.
bool isLoadOp = true;
bool isLaneOp = false;
// Workaround for vst1x and vld1x intrinsics which do not have alignment
// as an operand.
bool hasAlignment = true;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
if (isIntrinsic) {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default: llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
NumVecs = 1; break;
case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
NumVecs = 2; break;
case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD;
NumVecs = 2; hasAlignment = false; break;
case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD;
NumVecs = 3; hasAlignment = false; break;
case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
NumVecs = 4; hasAlignment = false; break;
case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
NumVecs = 2; break;
case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
NumVecs = 4; break;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
NumVecs = 3; isLaneOp = true; break;
case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
NumVecs = 4; isLaneOp = true; break;
case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
NumVecs = 1; isLoadOp = false; break;
case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
NumVecs = 2; isLoadOp = false; break;
case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
NumVecs = 3; isLoadOp = false; break;
case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
NumVecs = 4; isLoadOp = false; break;
case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD;
NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD;
NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD;
NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
}
} else {
isLaneOp = true;
switch (N->getOpcode()) {
default: llvm_unreachable("unexpected opcode for Neon base update");
case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
NumVecs = 1; isLaneOp = false; break;
case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
}
if (ConstInc || User->getOpcode() == ISD::ADD)
BaseUpdates.push_back({User, Inc, ConstInc});
}
// If the address is a constant pointer increment itself, find
// another constant increment that has the same base operand
SDValue Base;
SDValue CInc;
if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
unsigned Offset =
getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
User->getNumOperands() != 2)
continue;
SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
unsigned UserOffset =
getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
if (!UserOffset || UserOffset <= Offset)
continue;
unsigned NewConstInc = UserOffset - Offset;
SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
BaseUpdates.push_back({User, NewInc, NewConstInc});
}
}
// Find the size of memory referenced by the load/store.
EVT VecTy;
if (isLoadOp) {
VecTy = N->getValueType(0);
} else if (isIntrinsic) {
VecTy = N->getOperand(AddrOpIdx+1).getValueType();
} else {
assert(isStore && "Node has to be a load, a store, or an intrinsic!");
VecTy = N->getOperand(1).getValueType();
}
bool isVLDDUPOp =
NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
if (isLaneOp || isVLDDUPOp)
NumBytes /= VecTy.getVectorNumElements();
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
// VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
// separate instructions that make it harder to use a non-constant update.
// Try to fold the load/store with an update that matches memory
// access size. This should work well for sequential loads.
//
// Filter out invalid updates as well.
unsigned NumValidUpd = BaseUpdates.size();
for (unsigned I = 0; I < NumValidUpd;) {
BaseUpdateUser &User = BaseUpdates[I];
if (!isValidBaseUpdate(N, User.N)) {
--NumValidUpd;
std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
continue;
}
// OK, we found an ADD we can fold into the base update.
// Now, create a _UPD node, taking care of not breaking alignment.
if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
return SDValue();
++I;
}
BaseUpdates.resize(NumValidUpd);
EVT AlignedVecTy = VecTy;
unsigned Alignment = MemN->getAlignment();
// If this is a less-than-standard-aligned load/store, change the type to
// match the standard alignment.
// The alignment is overlooked when selecting _UPD variants; and it's
// easier to introduce bitcasts here than fix that.
// There are 3 ways to get to this base-update combine:
// - intrinsics: they are assumed to be properly aligned (to the standard
// alignment of the memory type), so we don't need to do anything.
// - ARMISD::VLDx nodes: they are only generated from the aforementioned
// intrinsics, so, likewise, there's nothing to do.
// - generic load/store instructions: the alignment is specified as an
// explicit operand, rather than implicitly as the standard alignment
// of the memory type (like the intrinsics). We need to change the
// memory type to match the explicit alignment. That way, we don't
// generate non-standard-aligned ARMISD::VLDx nodes.
if (isa<LSBaseSDNode>(N)) {
if (Alignment == 0)
Alignment = 1;
if (Alignment < VecTy.getScalarSizeInBits() / 8) {
MVT EltTy = MVT::getIntegerVT(Alignment * 8);
assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
assert(!isLaneOp && "Unexpected generic load/store lane.");
unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
}
// Don't set an explicit alignment on regular load/stores that we want
// to transform to VLD/VST 1_UPD nodes.
// This matches the behavior of regular load/stores, which only get an
// explicit alignment if the MMO alignment is larger than the standard
// alignment of the memory type.
// Intrinsics, however, always get an explicit alignment, set to the
// alignment of the MMO.
Alignment = 1;
}
// Create the new updating load/store node.
// First, create an SDVTList for the new updating node's results.
EVT Tys[6];
unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = AlignedVecTy;
Tys[n++] = MVT::i32;
Tys[n] = MVT::Other;
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
// Then, gather the new node's operands.
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // incoming chain
Ops.push_back(N->getOperand(AddrOpIdx));
Ops.push_back(Inc);
if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
// Try to match the intrinsic's signature
Ops.push_back(StN->getValue());
} else {
// Loads (and of course intrinsics) match the intrinsics' signature,
// so just add all but the alignment operand.
unsigned LastOperand =
hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
Ops.push_back(N->getOperand(i));
}
// For all node types, the alignment operand is always the last one.
Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
// If this is a non-standard-aligned STORE, the penultimate operand is the
// stored value. Bitcast it to the aligned type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
SDValue &StVal = Ops[Ops.size()-2];
StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
}
EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
MemN->getMemOperand());
// Update the uses.
SmallVector<SDValue, 5> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i)
NewResults.push_back(SDValue(UpdN.getNode(), i));
// If this is a non-standard-aligned LOAD, the first result is the loaded
// value. Bitcast it to the expected result type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
SDValue &LdVal = NewResults[0];
LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
}
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
break;
// Try to fold with other users. Non-constant updates are considered
// first, and constant updates are sorted to not break a sequence of
// strided accesses (if there is any).
std::sort(BaseUpdates.begin(), BaseUpdates.end(),
[](BaseUpdateUser &LHS, BaseUpdateUser &RHS) {
return LHS.ConstInc < RHS.ConstInc;
});
for (BaseUpdateUser &User : BaseUpdates) {
if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
return SDValue();
}
return SDValue();
}

@ -3,80 +3,57 @@
; rdar://12713765
; When realign-stack is set to false, make sure we are not creating stack
; objects that are assumed to be 64-byte aligned.
@T3_retval = common global <16 x float> zeroinitializer, align 16
define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" {
entry:
; CHECK-LABEL: test1:
; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: add r[[R3:[0-9]+]], r[[R1]], #32
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R3:[0-9]+]], r[[R1]], #48
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: mov r[[R2:[0-9]+]], sp
; CHECK: add r[[R3:[0-9]+]], r[[R2]], #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R4:[0-9]+]], r[[R2]], #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
; CHECK: mov r[[R5:[0-9]+]], r[[R2]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp
; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
entry:
%retval = alloca <16 x float>, align 64
%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
store <16 x float> %0, <16 x float>* %retval
%1 = load <16 x float>, <16 x float>* %retval
store <16 x float> %1, <16 x float>* %agg.result, align 16
%a1 = bitcast <16 x float>* %retval to float*
%a2 = getelementptr inbounds float, float* %a1, i64 8
%a3 = bitcast float* %a2 to <4 x float>*
%b1 = bitcast <16 x float>* %agg.result to float*
%b2 = getelementptr inbounds float, float* %b1, i64 8
%b3 = bitcast float* %b2 to <4 x float>*
%0 = load <4 x float>, <4 x float>* %a3, align 16
%1 = load <4 x float>, <4 x float>* %b3, align 16
store <4 x float> %0, <4 x float>* %b3, align 16
store <4 x float> %1, <4 x float>* %a3, align 16
ret void
}
define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp {
entry:
; CHECK-LABEL: test2:
; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: mov r[[R1:[0-9]+]], sp
; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #16
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: mov r[[R3:[0-9]+]], #32
; CHECK: mov r[[R9:[0-9]+]], r[[R1]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128], r[[R3]]
; CHECK: mov r[[R3:[0-9]+]], r[[R9]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[ALIGNED:[0-9]+]], sp
; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
entry:
%retval = alloca <16 x float>, align 64
%a1 = bitcast <16 x float>* %retval to float*
%a2 = getelementptr inbounds float, float* %a1, i64 8
%a3 = bitcast float* %a2 to <4 x float>*
%b1 = bitcast <16 x float>* %agg.result to float*
%b2 = getelementptr inbounds float, float* %b1, i64 8
%b3 = bitcast float* %b2 to <4 x float>*
%retval = alloca <16 x float>, align 64
%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
store <16 x float> %0, <16 x float>* %retval
%1 = load <16 x float>, <16 x float>* %retval
store <16 x float> %1, <16 x float>* %agg.result, align 16
%0 = load <4 x float>, <4 x float>* %a3, align 16
%1 = load <4 x float>, <4 x float>* %b3, align 16
store <4 x float> %0, <4 x float>* %b3, align 16
store <4 x float> %1, <4 x float>* %a3, align 16
ret void
}

@ -0,0 +1,325 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -o - < %s | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv8-unknown-linux-gnueabihf"
define <4 x float> @test(float* %A) {
; CHECK-LABEL: test:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
%X.ptr = bitcast float* %A to <4 x float>*
%X = load <4 x float>, <4 x float>* %X.ptr, align 4
%Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
%Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
%Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
define <4 x float> @test_stride(float* %A) {
; CHECK-LABEL: test_stride:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r1, #24
; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
%X.ptr = bitcast float* %A to <4 x float>*
%X = load <4 x float>, <4 x float>* %X.ptr, align 4
%Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds float, float* %A, i32 12
%Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
%Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
define <4 x float> @test_stride_mixed(float* %A) {
; CHECK-LABEL: test_stride_mixed:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r1, #24
; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
%X.ptr = bitcast float* %A to <4 x float>*
%X = load <4 x float>, <4 x float>* %X.ptr, align 4
%Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds float, float* %A, i32 10
%Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
%Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
; Refrain from using multiple stride registers
define <4 x float> @test_stride_noop(float* %A) {
; CHECK-LABEL: test_stride_noop:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r1, #24
; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT: mov r1, #32
; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
%X.ptr = bitcast float* %A to <4 x float>*
%X = load <4 x float>, <4 x float>* %X.ptr, align 4
%Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14
%Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
%Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
define <4 x float> @test_positive_initial_offset(float* %A) {
; CHECK-LABEL: test_positive_initial_offset:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, #32
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
%X.ptr.elt = getelementptr inbounds float, float* %A, i32 8
%X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
%X = load <4 x float>, <4 x float>* %X.ptr, align 4
%Y.ptr.elt = getelementptr inbounds float, float* %A, i32 12
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds float, float* %A, i32 16
%Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
%Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
define <4 x float> @test_negative_initial_offset(float* %A) {
; CHECK-LABEL: test_negative_initial_offset:
; CHECK: @ %bb.0:
; CHECK-NEXT: sub r0, r0, #64
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
%X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16
%X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
%X = load <4 x float>, <4 x float>* %X.ptr, align 4
%Y.ptr.elt = getelementptr inbounds float, float* %A, i32 -12
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds float, float* %A, i32 -8
%Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
%Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
@global_float_array = external global [128 x float], align 4
define <4 x float> @test_global() {
; CHECK-LABEL: test_global:
; CHECK: @ %bb.0:
; CHECK-NEXT: movw r0, :lower16:global_float_array
; CHECK-NEXT: movt r0, :upper16:global_float_array
; CHECK-NEXT: add r0, r0, #32
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
%X = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*), align 4
%Y = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 12) to <4 x float>*), align 4
%Z = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 16) to <4 x float>*), align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
define <4 x float> @test_stack() {
; Use huge alignment to test that ADD would not be converted to OR
; CHECK-LABEL: test_stack:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r10, r11, lr}
; CHECK-NEXT: push {r4, r10, r11, lr}
; CHECK-NEXT: .setfp r11, sp, #8
; CHECK-NEXT: add r11, sp, #8
; CHECK-NEXT: .pad #240
; CHECK-NEXT: sub sp, sp, #240
; CHECK-NEXT: bfc sp, #0, #7
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl external_function
; CHECK-NEXT: vld1.32 {d16, d17}, [r4:128]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r4:128]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: sub sp, r11, #8
; CHECK-NEXT: pop {r4, r10, r11, pc}
%array = alloca [32 x float], align 128
%arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0
call void @external_function(float* %arraydecay)
%X.ptr = bitcast [32 x float]* %array to <4 x float>*
%X = load <4 x float>, <4 x float>* %X.ptr, align 4
%Y.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 4
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 8
%Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
%Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
%tmp.sum = fadd <4 x float> %X, %Y
%sum = fadd <4 x float> %tmp.sum, %Z
ret <4 x float> %sum
}
define <2 x double> @test_double(double* %A) {
; CHECK-LABEL: test_double:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, #64
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f64 d20, d17, d19
; CHECK-NEXT: vadd.f64 d16, d16, d18
; CHECK-NEXT: vld1.64 {d22, d23}, [r0]
; CHECK-NEXT: vadd.f64 d1, d20, d23
; CHECK-NEXT: vadd.f64 d0, d16, d22
; CHECK-NEXT: bx lr
%X.ptr.elt = getelementptr inbounds double, double* %A, i32 8
%X.ptr = bitcast double* %X.ptr.elt to <2 x double>*
%X = load <2 x double>, <2 x double>* %X.ptr, align 8
%Y.ptr.elt = getelementptr inbounds double, double* %A, i32 10
%Y.ptr = bitcast double* %Y.ptr.elt to <2 x double>*
%Y = load <2 x double>, <2 x double>* %Y.ptr, align 8
%Z.ptr.elt = getelementptr inbounds double, double* %A, i32 12
%Z.ptr = bitcast double* %Z.ptr.elt to <2 x double>*
%Z = load <2 x double>, <2 x double>* %Z.ptr, align 8
%tmp.sum = fadd <2 x double> %X, %Y
%sum = fadd <2 x double> %tmp.sum, %Z
ret <2 x double> %sum
}
define void @test_various_instructions(float* %A) {
; CHECK-LABEL: test_various_instructions:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vst1.32 {d16, d17}, [r0]
; CHECK-NEXT: bx lr
%X.ptr = bitcast float* %A to i8*
%X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1)
%Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
%Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
%Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
%Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
%Z.ptr = bitcast float* %Z.ptr.elt to i8*
%Z = fadd <4 x float> %X, %Y
tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %Z.ptr, <4 x float> %Z, i32 4)
ret void
}
define void @test_lsr_geps(float* %a, float* %b, i32 %n) {
; CHECK-LABEL: test_lsr_geps:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB10_1: @ %for.body.preheader
; CHECK-NEXT: mov r12, #0
; CHECK-NEXT: .LBB10_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add r3, r0, r12
; CHECK-NEXT: subs r2, r2, #1
; CHECK-NEXT: vld1.32 {d16, d17}, [r3]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r3]!
; CHECK-NEXT: vld1.32 {d20, d21}, [r3]!
; CHECK-NEXT: vld1.32 {d22, d23}, [r3]
; CHECK-NEXT: add r3, r1, r12
; CHECK-NEXT: add r12, r12, #64
; CHECK-NEXT: vst1.32 {d16, d17}, [r3]!
; CHECK-NEXT: vst1.32 {d18, d19}, [r3]!
; CHECK-NEXT: vst1.32 {d20, d21}, [r3]!
; CHECK-NEXT: vst1.32 {d22, d23}, [r3]
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
%cmp61 = icmp sgt i32 %n, 0
br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader:
br label %for.body
for.cond.cleanup:
ret void
for.body:
%lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ]
%lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
%0 = bitcast float* %a to i8*
%1 = bitcast float* %b to i8*
%uglygep19 = getelementptr i8, i8* %0, i32 %lsr.iv1
%uglygep1920 = bitcast i8* %uglygep19 to <4 x float>*
%2 = load <4 x float>, <4 x float>* %uglygep1920, align 4
%uglygep16 = getelementptr i8, i8* %0, i32 %lsr.iv1
%uglygep1617 = bitcast i8* %uglygep16 to <4 x float>*
%scevgep18 = getelementptr <4 x float>, <4 x float>* %uglygep1617, i32 1
%3 = load <4 x float>, <4 x float>* %scevgep18, align 4
%uglygep13 = getelementptr i8, i8* %0, i32 %lsr.iv1
%uglygep1314 = bitcast i8* %uglygep13 to <4 x float>*
%scevgep15 = getelementptr <4 x float>, <4 x float>* %uglygep1314, i32 2
%4 = load <4 x float>, <4 x float>* %scevgep15, align 4
%uglygep10 = getelementptr i8, i8* %0, i32 %lsr.iv1
%uglygep1011 = bitcast i8* %uglygep10 to <4 x float>*
%scevgep12 = getelementptr <4 x float>, <4 x float>* %uglygep1011, i32 3
%5 = load <4 x float>, <4 x float>* %scevgep12, align 4
%uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv1
tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %uglygep8, <4 x float> %2, i32 4)
%uglygep6 = getelementptr i8, i8* %1, i32 %lsr.iv1
%scevgep7 = getelementptr i8, i8* %uglygep6, i32 16
tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep7, <4 x float> %3, i32 4)
%uglygep4 = getelementptr i8, i8* %1, i32 %lsr.iv1
%scevgep5 = getelementptr i8, i8* %uglygep4, i32 32
tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep5, <4 x float> %4, i32 4)
%uglygep = getelementptr i8, i8* %1, i32 %lsr.iv1
%scevgep = getelementptr i8, i8* %uglygep, i32 48
tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep, <4 x float> %5, i32 4)
%lsr.iv.next = add i32 %lsr.iv, -1
%lsr.iv.next2 = add nuw i32 %lsr.iv1, 64
%exitcond.not = icmp eq i32 %lsr.iv.next, 0
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
declare void @external_function(float*)
declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind argmemonly

@ -83,16 +83,16 @@ define void @test(double, float, i16, <4 x half>, <8 x half>) {
; SOFT: @ %bb.0: @ %entry
; SOFT-NEXT: push {r11, lr}
; SOFT-NEXT: sub sp, sp, #32
; SOFT-NEXT: vldr d16, [sp, #40]
; SOFT-NEXT: mov r12, #16
; SOFT-NEXT: vabs.f16 d16, d16
; SOFT-NEXT: mov lr, sp
; SOFT-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFT-NEXT: add r12, sp, #48
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vabs.f16 q8, q8
; SOFT-NEXT: str r3, [sp, #8]
; SOFT-NEXT: vst1.64 {d16, d17}, [lr]
; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
; SOFT-NEXT: mov r12, sp
; SOFT-NEXT: vldr d16, [sp, #40]
; SOFT-NEXT: vabs.f16 d16, d16
; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
; SOFT-NEXT: str r3, [r12]
; SOFT-NEXT: bl use
; SOFT-NEXT: add sp, sp, #32
; SOFT-NEXT: pop {r11, pc}
@ -105,26 +105,26 @@ define void @test(double, float, i16, <4 x half>, <8 x half>) {
;
; SOFTEB-LABEL: test:
; SOFTEB: @ %bb.0: @ %entry
; SOFTEB-NEXT: .save {r11, lr}
; SOFTEB-NEXT: push {r11, lr}
; SOFTEB-NEXT: .save {r4, lr}
; SOFTEB-NEXT: push {r4, lr}
; SOFTEB-NEXT: .pad #32
; SOFTEB-NEXT: sub sp, sp, #32
; SOFTEB-NEXT: vldr d16, [sp, #40]
; SOFTEB-NEXT: mov r12, #16
; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: str r3, [sp, #8]
; SOFTEB-NEXT: add r4, sp, #48
; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 d16, d16
; SOFTEB-NEXT: vabs.f16 d16, d16
; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFTEB-NEXT: add r12, sp, #48
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: vst1.16 {d16}, [lr:64]!
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r4]
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: str r3, [lr]
; SOFTEB-NEXT: vabs.f16 q8, q8
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr]
; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: bl use
; SOFTEB-NEXT: add sp, sp, #32
; SOFTEB-NEXT: pop {r11, pc}
; SOFTEB-NEXT: pop {r4, pc}
;
; HARDEB-LABEL: test:
; HARDEB: @ %bb.0: @ %entry
@ -148,20 +148,20 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFT-NEXT: push {r11, lr}
; SOFT-NEXT: sub sp, sp, #32
; SOFT-NEXT: add r12, sp, #80
; SOFT-NEXT: mov lr, sp
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
; SOFT-NEXT: add r12, sp, #48
; SOFT-NEXT: vabs.f16 q8, q8
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: add r12, sp, #64
; SOFT-NEXT: str r3, [sp, #8]
; SOFT-NEXT: vadd.f16 q8, q8, q9
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: mov r12, #16
; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vmul.f16 q8, q9, q8
; SOFT-NEXT: vldr d18, [sp, #40]
; SOFT-NEXT: vst1.16 {d18}, [lr:64], r12
; SOFT-NEXT: vst1.64 {d16, d17}, [lr]
; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
; SOFT-NEXT: mov r12, sp
; SOFT-NEXT: vldr d16, [sp, #40]
; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
; SOFT-NEXT: str r3, [r12]
; SOFT-NEXT: bl use
; SOFT-NEXT: add sp, sp, #32
; SOFT-NEXT: pop {r11, pc}
@ -181,13 +181,8 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFTEB-NEXT: push {r11, lr}
; SOFTEB-NEXT: .pad #32
; SOFTEB-NEXT: sub sp, sp, #32
; SOFTEB-NEXT: vldr d16, [sp, #40]
; SOFTEB-NEXT: mov r12, #16
; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: str r3, [sp, #8]
; SOFTEB-NEXT: vrev64.16 d16, d16
; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFTEB-NEXT: add r12, sp, #80
; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: add r12, sp, #48
; SOFTEB-NEXT: vrev64.16 q8, q8
@ -197,10 +192,15 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vadd.f16 q8, q8, q9
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vmul.f16 q8, q9, q8
; SOFTEB-NEXT: vldr d18, [sp, #40]
; SOFTEB-NEXT: vrev64.16 d18, d18
; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]!
; SOFTEB-NEXT: str r3, [lr]
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr]
; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: bl use
; SOFTEB-NEXT: add sp, sp, #32
; SOFTEB-NEXT: pop {r11, pc}

@ -26,20 +26,18 @@ define <32 x i8> @test_consume_arg([9 x double], <32 x i8> %vec) {
define void @test_produce_arg() {
; CHECK-LABEL: test_produce_arg:
; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #32
; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]
; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #16
; CHECK-V7K: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]!
; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]
; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #24
; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #8
; CHECK-AAPCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
; CHECK-APCS: add r[[BASE:[0-9]+]], sp, #60
; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
; CHECK-APCS: mov r[[R4:[0-9]+]], sp
; CHECK-APCS: mov r[[BASE:[0-9]+]], sp
; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #76
; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #60
; CHECK-APCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
call <32 x i8> @test_consume_arg([9 x double] undef, <32 x i8> zeroinitializer)

@ -44,11 +44,10 @@ entry:
define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t2:
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]!
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: movs [[INC:r[0-9]+]], #32
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
; CHECK: movw [[REG2:r[0-9]+]], #16716
; CHECK: movt [[REG2:r[0-9]+]], #72
; CHECK: str [[REG2]], [r0]

@ -10,18 +10,17 @@ define void @test() {
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: vmov.i32 q8, #0x0
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: mov.w r1, #-1
; CHECK-NEXT: vmov.i32 q8, #0x0
; CHECK-NEXT: movs r2, #15
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: strd r1, r1, [sp, #8]
; CHECK-NEXT: strd r1, r1, [sp]
; CHECK-NEXT: str r1, [sp, #16]
; CHECK-NEXT: vst1.64 {d16, d17}, [r3], r2
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: str r2, [r3]
; CHECK-NEXT: vst1.64 {d16, d17}, [r2]!
; CHECK-NEXT: str r1, [r2]
; CHECK-NEXT: str r1, [sp, #20]
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: str.w r1, [sp, #15]
; CHECK-NEXT: bl callee
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop {r7, pc}

@ -76,13 +76,14 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]
; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]
@ -93,8 +94,6 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]
; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
}
@ -170,13 +169,14 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]
; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]
; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]
@ -187,7 +187,6 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]]
; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]]
}

@ -253,9 +253,8 @@ define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
}
; CHECK-LABEL: test_silly_load:
; CHECK: vldr d{{[0-9]+}}, [r0, #16]
; CHECK: movs r1, #24
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]!
; CHECK: vld1.8 {d{{[0-9]+}}}, [r0:64]!
; CHECK: ldr {{r[0-9]+}}, [r0]
define void @test_silly_load(<28 x i8>* %addr) {

@ -216,15 +216,14 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
; CHECK-LABEL: test_multisource:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d18, [r0, #32]
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: vorr d22, d18, d18
; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
; CHECK-NEXT: vldr d19, [r0, #48]
; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
; CHECK-NEXT: vzip.16 d22, d19
; CHECK-NEXT: vtrn.16 q8, q10
; CHECK-NEXT: vext.16 d18, d18, d22, #2
; CHECK-NEXT: vld1.16 {d16, d17}, [r0:128]!
; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT: vld1.16 {d20, d21}, [r0:128]!
; CHECK-NEXT: vorr d23, d20, d20
; CHECK-NEXT: vldr d22, [r0]
; CHECK-NEXT: vzip.16 d23, d22
; CHECK-NEXT: vtrn.16 q8, q9
; CHECK-NEXT: vext.16 d18, d20, d23, #2
; CHECK-NEXT: vext.16 d16, d18, d16, #2
; CHECK-NEXT: vext.16 d16, d16, d16, #2
; CHECK-NEXT: vmov r0, r1, d16

@ -134,106 +134,97 @@ define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
%T1_19* %blend, %T0_19* %storeaddr) {
; CHECK-LABEL: func_blend19:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: add r2, r1, #48
; CHECK-NEXT: mov r8, #0
; CHECK-NEXT: vld1.64 {d16, d17}, [r2:128]
; CHECK-NEXT: add r2, r0, #48
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vld1.64 {d28, d29}, [r1:128]!
; CHECK-NEXT: mov lr, #0
; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
; CHECK-NEXT: vmov r2, r12, d16
; CHECK-NEXT: vmov r6, r7, d17
; CHECK-NEXT: vmov r4, r5, d18
; CHECK-NEXT: subs r2, r4, r2
; CHECK-NEXT: sbcs r2, r5, r12
; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]!
; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]!
; CHECK-NEXT: vld1.64 {d24, d25}, [r0:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]!
; CHECK-NEXT: vld1.64 {d26, d27}, [r0:128]!
; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
; CHECK-NEXT: vmov r0, r12, d16
; CHECK-NEXT: vmov r1, r2, d18
; CHECK-NEXT: subs r0, r1, r0
; CHECK-NEXT: vmov r1, r4, d25
; CHECK-NEXT: sbcs r0, r2, r12
; CHECK-NEXT: mov r12, #0
; CHECK-NEXT: vmov r2, r4, d19
; CHECK-NEXT: vmov r2, r0, d21
; CHECK-NEXT: movlt r12, #1
; CHECK-NEXT: cmp r12, #0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: mvnne r12, #0
; CHECK-NEXT: vld1.64 {d24, d25}, [r5:128]!
; CHECK-NEXT: vld1.64 {d20, d21}, [r5:128]
; CHECK-NEXT: subs r2, r2, r6
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: add r0, r0, #32
; CHECK-NEXT: vld1.64 {d26, d27}, [r2:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r2:128]
; CHECK-NEXT: sbcs r2, r4, r7
; CHECK-NEXT: vmov r4, r5, d21
; CHECK-NEXT: movlt r8, #1
; CHECK-NEXT: vmov r6, r7, d23
; CHECK-NEXT: cmp r8, #0
; CHECK-NEXT: mvnne r8, #0
; CHECK-NEXT: vld1.64 {d28, d29}, [r0:128]
; CHECK-NEXT: add r0, r1, #32
; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]
; CHECK-NEXT: vmov r0, r1, d20
; CHECK-NEXT: vdup.32 d7, r8
; CHECK-NEXT: vdup.32 d6, r12
; CHECK-NEXT: subs r4, r6, r4
; CHECK-NEXT: sbcs r4, r7, r5
; CHECK-NEXT: vmov r5, r6, d24
; CHECK-NEXT: vmov r7, r2, d26
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d5, r4
; CHECK-NEXT: subs r5, r7, r5
; CHECK-NEXT: sbcs r2, r2, r6
; CHECK-NEXT: vmov r7, r6, d27
; CHECK-NEXT: vmov r2, r9, d25
; CHECK-NEXT: mov r5, #0
; CHECK-NEXT: movlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: mvnne r5, #0
; CHECK-NEXT: subs r2, r7, r2
; CHECK-NEXT: sbcs r2, r6, r9
; CHECK-NEXT: vmov r6, r7, d22
; CHECK-NEXT: subs r1, r1, r2
; CHECK-NEXT: sbcs r0, r4, r0
; CHECK-NEXT: vmov r2, r4, d26
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d1, r0
; CHECK-NEXT: vmov r0, r1, d22
; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: sbcs r0, r4, r1
; CHECK-NEXT: vmov r4, r5, d31
; CHECK-NEXT: vmov r0, r1, d29
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d1, r2
; CHECK-NEXT: vdup.32 d0, r5
; CHECK-NEXT: vbit q12, q13, q0
; CHECK-NEXT: subs r0, r6, r0
; CHECK-NEXT: vmov r2, r6, d28
; CHECK-NEXT: sbcs r0, r7, r1
; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: vmov r0, r1, d30
; CHECK-NEXT: movlt r7, #1
; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: vmov r2, r5, d29
; CHECK-NEXT: sbcs r0, r6, r1
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: vmov r4, r5, d30
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d3, r0
; CHECK-NEXT: vmov r0, r1, d28
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: vmov r4, r5, d24
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d2, r0
; CHECK-NEXT: vmov r0, r1, d20
; CHECK-NEXT: vbit q14, q15, q1
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: vmov r1, r4, d17
; CHECK-NEXT: vmov r5, r6, d19
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d0, r0
; CHECK-NEXT: vbit q10, q12, q0
; CHECK-NEXT: subs r1, r5, r1
; CHECK-NEXT: sbcs r1, r6, r4
; CHECK-NEXT: vmov r4, r5, d27
; CHECK-NEXT: vmov r0, r1, d23
; CHECK-NEXT: mov r6, #0
; CHECK-NEXT: vmov r0, r1, d31
; CHECK-NEXT: movlt r6, #1
; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: movlt lr, #1
; CHECK-NEXT: cmp lr, #0
; CHECK-NEXT: mvnne lr, #0
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: vdup.32 d31, lr
; CHECK-NEXT: mvnne r6, #0
; CHECK-NEXT: vdup.32 d3, lr
; CHECK-NEXT: vdup.32 d2, r6
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: vorr q13, q1, q1
; CHECK-NEXT: mvnne r7, #0
; CHECK-NEXT: vdup.32 d4, r7
; CHECK-NEXT: add r0, r3, #32
; CHECK-NEXT: vbsl q13, q14, q15
; CHECK-NEXT: vbit q10, q11, q2
; CHECK-NEXT: vbit q8, q9, q3
; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]
; CHECK-NEXT: add r0, r3, #48
; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: vdup.32 d30, r2
; CHECK-NEXT: vdup.32 d3, r6
; CHECK-NEXT: vbit q11, q13, q15
; CHECK-NEXT: vdup.32 d2, r12
; CHECK-NEXT: vst1.64 {d28, d29}, [r3:128]!
; CHECK-NEXT: vbit q8, q9, q1
; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]!
; CHECK-NEXT: vst1.64 {d22, d23}, [r3:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]
; CHECK-NEXT: pop {r4, r5, r6, lr}
; CHECK-NEXT: mov pc, lr
%v0 = load %T0_19, %T0_19* %loadaddr
%v1 = load %T0_19, %T0_19* %loadaddr2
@ -251,213 +242,198 @@ define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
%T1_20* %blend, %T0_20* %storeaddr) {
; CHECK-LABEL: func_blend20:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, sp, #4
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, sp, #8
; CHECK-NEXT: add r9, r1, #64
; CHECK-NEXT: mov r2, #32
; CHECK-NEXT: add r8, r0, #64
; CHECK-NEXT: vld1.64 {d16, d17}, [r9:128], r2
; CHECK-NEXT: mov r10, r1
; CHECK-NEXT: mov r11, r0
; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128], r2
; CHECK-NEXT: vmov r7, r5, d17
; CHECK-NEXT: vmov r6, r2, d19
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vld1.64 {d22, d23}, [r10:128]!
; CHECK-NEXT: subs r7, r6, r7
; CHECK-NEXT: sbcs r2, r2, r5
; CHECK-NEXT: vmov r5, r6, d16
; CHECK-NEXT: vmov r7, r4, d18
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d21, r2
; CHECK-NEXT: mov r8, r1
; CHECK-NEXT: mov lr, r0
; CHECK-NEXT: vld1.64 {d16, d17}, [r8:128]!
; CHECK-NEXT: add r9, r0, #64
; CHECK-NEXT: add r10, r1, #64
; CHECK-NEXT: mov r12, #0
; CHECK-NEXT: vld1.64 {d22, d23}, [lr:128]!
; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128]!
; CHECK-NEXT: vld1.64 {d20, d21}, [lr:128]!
; CHECK-NEXT: vmov r6, r4, d19
; CHECK-NEXT: vmov r5, r7, d21
; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]!
; CHECK-NEXT: vld1.64 {d6, d7}, [r10:128]!
; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]!
; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]!
; CHECK-NEXT: subs r6, r5, r6
; CHECK-NEXT: sbcs r4, r7, r4
; CHECK-NEXT: vmov r5, r6, d18
; CHECK-NEXT: vmov r7, r2, d20
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d31, r4
; CHECK-NEXT: subs r5, r7, r5
; CHECK-NEXT: sbcs r4, r4, r6
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d20, r4
; CHECK-NEXT: vmov r2, r4, d23
; CHECK-NEXT: vbit q8, q9, q10
; CHECK-NEXT: vld1.64 {d18, d19}, [r11:128]!
; CHECK-NEXT: vmov r7, r5, d19
; CHECK-NEXT: subs r2, r7, r2
; CHECK-NEXT: sbcs r2, r5, r4
; CHECK-NEXT: vmov r5, r7, d18
; CHECK-NEXT: sbcs r2, r2, r6
; CHECK-NEXT: vmov r4, r5, d3
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d21, r2
; CHECK-NEXT: vmov r2, r4, d22
; CHECK-NEXT: subs r2, r5, r2
; CHECK-NEXT: sbcs r2, r7, r4
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d20, r2
; CHECK-NEXT: add r2, r0, #48
; CHECK-NEXT: vbif q9, q11, q10
; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128]
; CHECK-NEXT: add r2, r1, #48
; CHECK-NEXT: vld1.64 {d2, d3}, [r2:128]
; CHECK-NEXT: vmov r5, r7, d30
; CHECK-NEXT: vmov r2, r4, d2
; CHECK-NEXT: vld1.64 {d26, d27}, [r11:128]
; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]
; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r9:128]
; CHECK-NEXT: vld1.64 {d20, d21}, [r8:128]!
; CHECK-NEXT: vmov r11, r10, d21
; CHECK-NEXT: subs r2, r5, r2
; CHECK-NEXT: sbcs r2, r7, r4
; CHECK-NEXT: vmov r7, r6, d31
; CHECK-NEXT: vmov r2, r5, d3
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: subs r2, r7, r2
; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: sbcs r2, r6, r5
; CHECK-NEXT: vmov r6, r5, d27
; CHECK-NEXT: vmov r2, r9, d1
; CHECK-NEXT: movlt r7, #1
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: mvnne r7, #0
; CHECK-NEXT: vdup.32 d7, r7
; CHECK-NEXT: vdup.32 d6, r4
; CHECK-NEXT: subs r2, r6, r2
; CHECK-NEXT: sbcs r2, r5, r9
; CHECK-NEXT: vmov r6, r5, d26
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d5, r2
; CHECK-NEXT: vmov r2, r9, d0
; CHECK-NEXT: subs r2, r6, r2
; CHECK-NEXT: sbcs r2, r5, r9
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d4, r2
; CHECK-NEXT: add r2, r1, #32
; CHECK-NEXT: vld1.64 {d28, d29}, [r2:128]
; CHECK-NEXT: add r2, r0, #32
; CHECK-NEXT: vbif q13, q0, q2
; CHECK-NEXT: add r1, r1, #80
; CHECK-NEXT: vld1.64 {d0, d1}, [r2:128]
; CHECK-NEXT: vmov r4, r5, d28
; CHECK-NEXT: vbif q15, q1, q3
; CHECK-NEXT: add r0, r0, #80
; CHECK-NEXT: vmov r2, r6, d0
; CHECK-NEXT: vld1.64 {d2, d3}, [r8:128]
; CHECK-NEXT: vmov r9, r8, d25
; CHECK-NEXT: vld1.64 {d8, d9}, [r0:128]
; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128]
; CHECK-NEXT: vmov r3, r12, d8
; CHECK-NEXT: subs r2, r2, r4
; CHECK-NEXT: sbcs r2, r6, r5
; CHECK-NEXT: vmov r4, r5, d29
; CHECK-NEXT: vmov r6, r7, d1
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: subs r4, r6, r4
; CHECK-NEXT: sbcs r4, r7, r5
; CHECK-NEXT: vmov r5, r6, d2
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d5, r4
; CHECK-NEXT: vdup.32 d4, r2
; CHECK-NEXT: vmov r2, r4, d22
; CHECK-NEXT: vbit q14, q0, q2
; CHECK-NEXT: subs r2, r5, r2
; CHECK-NEXT: sbcs r2, r6, r4
; CHECK-NEXT: vmov r4, r5, d24
; CHECK-NEXT: vmov r6, r7, d20
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: subs r1, r6, r4
; CHECK-NEXT: vmov r0, r6, d9
; CHECK-NEXT: sbcs r1, r7, r5
; CHECK-NEXT: vmov r4, r5, d7
; CHECK-NEXT: mov r1, #0
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
; CHECK-NEXT: subs r0, r0, r4
; CHECK-NEXT: vmov r7, r4, d23
; CHECK-NEXT: sbcs r0, r6, r5
; CHECK-NEXT: vmov r5, lr, d6
; CHECK-NEXT: vdup.32 d30, r2
; CHECK-NEXT: vmov r0, r2, d1
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d2
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d9, r0
; CHECK-NEXT: vmov r0, r2, d0
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d5
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d8, r0
; CHECK-NEXT: vmov r0, r2, d7
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d11, r0
; CHECK-NEXT: vmov r0, r6, d3
; CHECK-NEXT: subs r0, r0, r7
; CHECK-NEXT: sbcs r0, r6, r4
; CHECK-NEXT: vmov r0, r2, d6
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d23
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: subs r4, r11, r9
; CHECK-NEXT: sbcs r4, r10, r8
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d10, r0
; CHECK-NEXT: vmov r0, r2, d17
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d22
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d25, r0
; CHECK-NEXT: vmov r0, r2, d16
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d24, r0
; CHECK-NEXT: vorr q13, q12, q12
; CHECK-NEXT: vbsl q13, q11, q8
; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
; CHECK-NEXT: vorr q8, q5, q5
; CHECK-NEXT: vld1.64 {d28, d29}, [r10:128]!
; CHECK-NEXT: vbsl q8, q2, q3
; CHECK-NEXT: vld1.64 {d6, d7}, [r8:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r8:128]
; CHECK-NEXT: vld1.64 {d4, d5}, [lr:128]!
; CHECK-NEXT: vbif q10, q9, q15
; CHECK-NEXT: vorr q9, q4, q4
; CHECK-NEXT: vmov r0, r2, d22
; CHECK-NEXT: vbsl q9, q1, q0
; CHECK-NEXT: vld1.64 {d30, d31}, [lr:128]
; CHECK-NEXT: mov lr, #0
; CHECK-NEXT: vmov r7, r5, d30
; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128]
; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128]
; CHECK-NEXT: subs r0, r7, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r5, r4, d24
; CHECK-NEXT: vmov r0, r7, d28
; CHECK-NEXT: movlt lr, #1
; CHECK-NEXT: cmp lr, #0
; CHECK-NEXT: mvnne lr, #0
; CHECK-NEXT: subs r0, r5, r0
; CHECK-NEXT: sbcs r0, r4, r7
; CHECK-NEXT: vmov r7, r5, d29
; CHECK-NEXT: vmov r4, r6, d25
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: subs r7, r4, r7
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: sbcs r7, r6, r5
; CHECK-NEXT: vmov r5, r1, d31
; CHECK-NEXT: vmov r7, r6, d23
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: subs r3, r3, r5
; CHECK-NEXT: sbcs r3, r12, lr
; CHECK-NEXT: mov r3, #0
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: mvnne r3, #0
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d10, r3
; CHECK-NEXT: vdup.32 d1, r4
; CHECK-NEXT: vorr q2, q5, q5
; CHECK-NEXT: vdup.32 d0, r1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: vbsl q2, q4, q3
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vbif q10, q12, q0
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vdup.32 d7, r0
; CHECK-NEXT: add r0, r1, #80
; CHECK-NEXT: vdup.32 d6, r2
; CHECK-NEXT: vbit q11, q1, q3
; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]
; CHECK-NEXT: add r0, r1, #32
; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128]
; CHECK-NEXT: add r0, r1, #48
; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128]
; CHECK-NEXT: add r0, r1, #64
; CHECK-NEXT: vst1.64 {d18, d19}, [r1:128]!
; CHECK-NEXT: vst1.64 {d26, d27}, [r1:128]
; CHECK-NEXT: mov r1, #32
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128], r1
; CHECK-NEXT: subs r7, r5, r7
; CHECK-NEXT: mov r5, #0
; CHECK-NEXT: sbcs r1, r1, r6
; CHECK-NEXT: vmov r6, r2, d5
; CHECK-NEXT: vmov r1, r7, d7
; CHECK-NEXT: movlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: mvnne r5, #0
; CHECK-NEXT: subs r1, r6, r1
; CHECK-NEXT: sbcs r1, r2, r7
; CHECK-NEXT: vmov r6, r7, d4
; CHECK-NEXT: mov r1, #0
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
; CHECK-NEXT: vdup.32 d9, r1
; CHECK-NEXT: vmov r1, r2, d6
; CHECK-NEXT: subs r1, r6, r1
; CHECK-NEXT: sbcs r1, r7, r2
; CHECK-NEXT: vmov r6, r7, d0
; CHECK-NEXT: mov r1, #0
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
; CHECK-NEXT: vdup.32 d8, r1
; CHECK-NEXT: vmov r1, r2, d2
; CHECK-NEXT: vbif q2, q3, q4
; CHECK-NEXT: vdup.32 d7, r5
; CHECK-NEXT: vdup.32 d9, r4
; CHECK-NEXT: vmov r4, r5, d1
; CHECK-NEXT: vdup.32 d8, r0
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]!
; CHECK-NEXT: vbif q12, q14, q4
; CHECK-NEXT: vdup.32 d6, lr
; CHECK-NEXT: vbit q11, q15, q3
; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]!
; CHECK-NEXT: subs r1, r6, r1
; CHECK-NEXT: mov r6, #0
; CHECK-NEXT: sbcs r1, r7, r2
; CHECK-NEXT: vmov r1, r2, d3
; CHECK-NEXT: movlt r6, #1
; CHECK-NEXT: subs r1, r4, r1
; CHECK-NEXT: sbcs r1, r5, r2
; CHECK-NEXT: movlt r12, #1
; CHECK-NEXT: cmp r12, #0
; CHECK-NEXT: mvnne r12, #0
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: vdup.32 d27, r12
; CHECK-NEXT: mvnne r6, #0
; CHECK-NEXT: vdup.32 d26, r6
; CHECK-NEXT: vorr q10, q13, q13
; CHECK-NEXT: vbsl q10, q0, q1
; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]!
; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128]
; CHECK-NEXT: add sp, sp, #8
; CHECK-NEXT: add r0, r3, #64
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]!
; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128]!
; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, sp, #4
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: mov pc, lr
%v0 = load %T0_20, %T0_20* %loadaddr
%v1 = load %T0_20, %T0_20* %loadaddr2


@ -198,21 +198,13 @@ for.end: ; preds = %for.body
; @testNeon is an important example of the need for ivchains.
;
; Currently we have two extra add.w's that keep the store address
; live past the next increment because ISEL is unfortunately undoing
; the store chain. ISEL also fails to convert all but one of the stores to
; post-increment addressing. However, the loads should use
; post-increment addressing, no add's or add.w's beyond the three
; mentioned. Most importantly, there should be no spills or reloads!
; Loads and stores should use post-increment addressing, no add's or add.w's.
; Most importantly, there should be no spills or reloads!
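;
; A minimal sketch (not checked by this test) of the post-increment forms we
; expect here, assuming a hypothetical load base in r0, store base in r2, and
; a register stride in r1:
;
;   vld1.8 {d0, d1}, [r0], r1   @ load 16 bytes, then advance the base by the stride
;   vst1.8 {d0, d1}, [r2], r1   @ store 16 bytes, then advance the base by the stride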
;
; A9: testNeon:
; A9: %.lr.ph
; A9: add.w r
; A9-NOT: lsl.w
; A9-NOT: {{ldr|str|adds|add r}}
; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}}
; A9: add.w r
; A9-NOT: {{ldr|str|adds|add r}}
; A9-NOT: add.w r
; A9: bne
define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {