[ARM] Simplify address calculation for NEON load/store
The patch attempts to optimize a sequence of SIMD loads from the same
base pointer:

    %0 = gep float*, float* base, i32 4
    %1 = bitcast float* %0 to <4 x float>*
    %2 = load <4 x float>, <4 x float>* %1
    ...
    %n1 = gep float*, float* base, i32 N
    %n2 = bitcast float* %n1 to <4 x float>*
    %n3 = load <4 x float>, <4 x float>* %n2

For AArch64 the compiler generates a sequence of LDR Qt, [Xn, #16].
However, 32-bit NEON VLD1/VST1 lack the [Wn, #imm] addressing mode, so
the address is computed before every ld/st instruction:

    add r2, r0, #32
    add r0, r0, #16
    vld1.32 {d18, d19}, [r2]
    vld1.32 {d22, d23}, [r0]

This can be improved by computing the address for the first load, and
then using a post-indexed form of VLD1/VST1 to load the rest:

    add r0, r0, #16
    vld1.32 {d18, d19}, [r0]!
    vld1.32 {d22, d23}, [r0]

In order to do that, the patch adds more patterns to DAGCombine:

- (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1 and
  inc2 are constants.
- (or ptr inc) is now recognized as a pointer increment if ptr is
  sufficiently aligned.

In addition to that, we now search for all possible base updates and
then pick the best one.

Differential Revision: https://reviews.llvm.org/D108988
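As a standalone illustration of the second pattern (a hypothetical sketch, not code from the patch): when the base pointer is sufficiently aligned, an OR of a small constant cannot carry into the aligned bits, so it computes the same address as an ADD. That equivalence is what lets the combine treat (or ptr inc) as a pointer increment. Assuming a 32-byte-aligned buffer:

    #include <cassert>
    #include <cstdint>

    int main() {
      alignas(32) float buf[16]; // the low 5 bits of &buf are zero
      std::uintptr_t base = reinterpret_cast<std::uintptr_t>(buf);
      // For any increment smaller than the alignment, OR and ADD agree,
      // so (or ptr 16) addresses the same element as (add ptr 16).
      assert((base | 16) == (base + 16));
      return 0;
    }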
parent 88487662f7
commit dc8a41de34
@@ -15244,6 +15244,390 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
                              DAG.getUNDEF(VT), NewMask);
}

/// Load/store instruction that can be merged with a base address
/// update
struct BaseUpdateTarget {
  SDNode *N;
  bool isIntrinsic;
  bool isStore;
  unsigned AddrOpIdx;
};

struct BaseUpdateUser {
  /// Instruction that updates a pointer
  SDNode *N;
  /// Pointer increment operand
  SDValue Inc;
  /// Pointer increment value if it is a constant, or 0 otherwise
  unsigned ConstInc;
};

static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
                                 struct BaseUpdateUser &User,
                                 bool SimpleConstIncOnly,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDNode *N = Target.N;
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Find the new opcode for the updating load/store.
  bool isLoadOp = true;
  bool isLaneOp = false;
  // Workaround for vst1x and vld1x intrinsics which do not have alignment
  // as an operand.
  bool hasAlignment = true;
  unsigned NewOpc = 0;
  unsigned NumVecs = 0;
  if (Target.isIntrinsic) {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for Neon base update");
    case Intrinsic::arm_neon_vld1:
      NewOpc = ARMISD::VLD1_UPD;
      NumVecs = 1;
      break;
    case Intrinsic::arm_neon_vld2:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_neon_vld3:
      NewOpc = ARMISD::VLD3_UPD;
      NumVecs = 3;
      break;
    case Intrinsic::arm_neon_vld4:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_neon_vld1x2:
      NewOpc = ARMISD::VLD1x2_UPD;
      NumVecs = 2;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld1x3:
      NewOpc = ARMISD::VLD1x3_UPD;
      NumVecs = 3;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld1x4:
      NewOpc = ARMISD::VLD1x4_UPD;
      NumVecs = 4;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld2dup:
      NewOpc = ARMISD::VLD2DUP_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_neon_vld3dup:
      NewOpc = ARMISD::VLD3DUP_UPD;
      NumVecs = 3;
      break;
    case Intrinsic::arm_neon_vld4dup:
      NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_neon_vld2lane:
      NewOpc = ARMISD::VLD2LN_UPD;
      NumVecs = 2;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vld3lane:
      NewOpc = ARMISD::VLD3LN_UPD;
      NumVecs = 3;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vld4lane:
      NewOpc = ARMISD::VLD4LN_UPD;
      NumVecs = 4;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst1:
      NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst2:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst3:
      NewOpc = ARMISD::VST3_UPD;
      NumVecs = 3;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst4:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst2lane:
      NewOpc = ARMISD::VST2LN_UPD;
      NumVecs = 2;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst3lane:
      NewOpc = ARMISD::VST3LN_UPD;
      NumVecs = 3;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst4lane:
      NewOpc = ARMISD::VST4LN_UPD;
      NumVecs = 4;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst1x2:
      NewOpc = ARMISD::VST1x2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vst1x3:
      NewOpc = ARMISD::VST1x3_UPD;
      NumVecs = 3;
      isLoadOp = false;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vst1x4:
      NewOpc = ARMISD::VST1x4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      hasAlignment = false;
      break;
    }
  } else {
    isLaneOp = true;
    switch (N->getOpcode()) {
    default:
      llvm_unreachable("unexpected opcode for Neon base update");
    case ARMISD::VLD1DUP:
      NewOpc = ARMISD::VLD1DUP_UPD;
      NumVecs = 1;
      break;
    case ARMISD::VLD2DUP:
      NewOpc = ARMISD::VLD2DUP_UPD;
      NumVecs = 2;
      break;
    case ARMISD::VLD3DUP:
      NewOpc = ARMISD::VLD3DUP_UPD;
      NumVecs = 3;
      break;
    case ARMISD::VLD4DUP:
      NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4;
      break;
    case ISD::LOAD:
      NewOpc = ARMISD::VLD1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      break;
    case ISD::STORE:
      NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      isLoadOp = false;
      break;
    }
  }

  // Find the size of memory referenced by the load/store.
  EVT VecTy;
  if (isLoadOp) {
    VecTy = N->getValueType(0);
  } else if (Target.isIntrinsic) {
    VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
  } else {
    assert(Target.isStore &&
           "Node has to be a load, a store, or an intrinsic!");
    VecTy = N->getOperand(1).getValueType();
  }

  bool isVLDDUPOp =
      NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
      NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;

  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  if (isLaneOp || isVLDDUPOp)
    NumBytes /= VecTy.getVectorNumElements();

  if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
    // separate instructions that make it harder to use a non-constant update.
    return false;
  }

  if (SimpleConstIncOnly && User.ConstInc != NumBytes)
    return false;

  // OK, we found an ADD we can fold into the base update.
  // Now, create a _UPD node, taking care of not breaking alignment.

  EVT AlignedVecTy = VecTy;
  unsigned Alignment = MemN->getAlignment();

  // If this is a less-than-standard-aligned load/store, change the type to
  // match the standard alignment.
  // The alignment is overlooked when selecting _UPD variants; and it's
  // easier to introduce bitcasts here than fix that.
  // There are 3 ways to get to this base-update combine:
  // - intrinsics: they are assumed to be properly aligned (to the standard
  //   alignment of the memory type), so we don't need to do anything.
  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
  //   intrinsics, so, likewise, there's nothing to do.
  // - generic load/store instructions: the alignment is specified as an
  //   explicit operand, rather than implicitly as the standard alignment
  //   of the memory type (like the intrinsics). We need to change the
  //   memory type to match the explicit alignment. That way, we don't
  //   generate non-standard-aligned ARMISD::VLDx nodes.
  if (isa<LSBaseSDNode>(N)) {
    if (Alignment == 0)
      Alignment = 1;
    if (Alignment < VecTy.getScalarSizeInBits() / 8) {
      MVT EltTy = MVT::getIntegerVT(Alignment * 8);
      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
      assert(!isLaneOp && "Unexpected generic load/store lane.");
      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
    }
    // Don't set an explicit alignment on regular load/stores that we want
    // to transform to VLD/VST 1_UPD nodes.
    // This matches the behavior of regular load/stores, which only get an
    // explicit alignment if the MMO alignment is larger than the standard
    // alignment of the memory type.
    // Intrinsics, however, always get an explicit alignment, set to the
    // alignment of the MMO.
    Alignment = 1;
  }

  // Create the new updating load/store node.
  // First, create an SDVTList for the new updating node's results.
  EVT Tys[6];
  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  unsigned n;
  for (n = 0; n < NumResultVecs; ++n)
    Tys[n] = AlignedVecTy;
  Tys[n++] = MVT::i32;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

  // Then, gather the new node's operands.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(N->getOperand(0)); // incoming chain
  Ops.push_back(N->getOperand(Target.AddrOpIdx));
  Ops.push_back(User.Inc);

  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
    // Try to match the intrinsic's signature
    Ops.push_back(StN->getValue());
  } else {
    // Loads (and of course intrinsics) match the intrinsics' signature,
    // so just add all but the alignment operand.
    unsigned LastOperand =
        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
    for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
      Ops.push_back(N->getOperand(i));
  }

  // For all node types, the alignment operand is always the last one.
  Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

  // If this is a non-standard-aligned STORE, the penultimate operand is the
  // stored value. Bitcast it to the aligned type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
    SDValue &StVal = Ops[Ops.size() - 2];
    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
  }

  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                         MemN->getMemOperand());

  // Update the uses.
  SmallVector<SDValue, 5> NewResults;
  for (unsigned i = 0; i < NumResultVecs; ++i)
    NewResults.push_back(SDValue(UpdN.getNode(), i));

  // If this is a non-standard-aligned LOAD, the first result is the loaded
  // value. Bitcast it to the expected result type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
    SDValue &LdVal = NewResults[0];
    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
  }

  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
  DCI.CombineTo(N, NewResults);
  DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));

  return true;
}

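// Hypothetical usage of the helper above (values for illustration only):
// for a 16-byte VLD1 whose address has users (add ptr, 16) and (add ptr, 48),
// a call with SimpleConstIncOnly == true succeeds only for the exact-size
// increment 16, which selects the post-indexed form "vld1.32 {...}, [r0]!".
// With SimpleConstIncOnly == false the 48-byte update can also be folded;
// it is then materialized as a register increment, along the lines of
// "mov r1, #48; vld1.32 {...}, [r0], r1".
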
// If (opcode ptr inc) is an ADD-like instruction, return the
// increment value. Otherwise return 0.
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
                                         SDValue Inc, const SelectionDAG &DAG) {
  ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
  if (!CInc)
    return 0;

  switch (Opcode) {
  case ARMISD::VLD1_UPD:
  case ISD::ADD:
    return CInc->getZExtValue();
  case ISD::OR: {
    if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
      // (OR ptr inc) is the same as (ADD ptr inc)
      return CInc->getZExtValue();
    }
    return 0;
  }
  default:
    return 0;
  }
}

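// Illustration of the contract above (hypothetical operands, not from the
// patch): with Inc = constant 16, ISD::ADD (and ARMISD::VLD1_UPD) yield 16;
// ISD::OR yields 16 only when known-bits analysis proves Ptr and Inc share
// no set bits (e.g. Ptr is 32-byte aligned), so the OR cannot carry and acts
// as an ADD. Any other opcode, or a non-constant Inc, yields 0.
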
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::OR: {
    if (isa<ConstantSDNode>(N->getOperand(1))) {
      *Ptr = N->getOperand(0);
      *CInc = N->getOperand(1);
      return true;
    }
    return false;
  }
  case ARMISD::VLD1_UPD: {
    if (isa<ConstantSDNode>(N->getOperand(2))) {
      *Ptr = N->getOperand(1);
      *CInc = N->getOperand(2);
      return true;
    }
    return false;
  }
  default:
    return false;
  }
}

static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
  // Check that the add is independent of the load/store.
  // Otherwise, folding it would create a cycle. Search through Addr
  // as well, since the User may not be a direct user of Addr and
  // only share a base pointer.
  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(N);
  Worklist.push_back(User);
  if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
      SDNode::hasPredecessorHelper(User, Visited, Worklist))
    return false;
  return true;
}

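// Example of the cycle this rejects (hypothetical): if the increment is
// computed from the loaded value, e.g. User = (add (load Addr), C), then N
// is a predecessor of User, and folding the update into the load would make
// the combined node depend on its own result. The hasPredecessorHelper walks
// above also search through Addr's other users, since User may only share a
// base pointer with N rather than use Addr directly.
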
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.

@@ -15251,237 +15635,89 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};

  SDValue Addr = N->getOperand(AddrOpIdx);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  SmallVector<BaseUpdateUser, 8> BaseUpdates;

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
    if (UI.getUse().getResNo() != Addr.getResNo() ||
        User->getNumOperands() != 2)
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;
    SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
    unsigned ConstInc =
        getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    bool isLaneOp = false;
    // Workaround for vst1x and vld1x intrinsics which do not have alignment
    // as an operand.
    bool hasAlignment = true;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    if (isIntrinsic) {
      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      switch (IntNo) {
      default: llvm_unreachable("unexpected intrinsic for Neon base update");
      case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; break;
      case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD;
        NumVecs = 2; hasAlignment = false; break;
      case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD;
        NumVecs = 3; hasAlignment = false; break;
      case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
        NumVecs = 4; hasAlignment = false; break;
      case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
        NumVecs = 2; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
        NumVecs = 3; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
        NumVecs = 4; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
        NumVecs = 2; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
        NumVecs = 3; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
        NumVecs = 4; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD;
        NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
      case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD;
        NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
      case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD;
        NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
      }
    } else {
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; isLaneOp = false; break;
      case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
      }
    if (ConstInc || User->getOpcode() == ISD::ADD)
      BaseUpdates.push_back({User, Inc, ConstInc});
  }

  // If the address is a constant pointer increment itself, find
  // another constant increment that has the same base operand
  SDValue Base;
  SDValue CInc;
  if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
    unsigned Offset =
        getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
    for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
         UI != UE; ++UI) {

      SDNode *User = *UI;
      if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
          User->getNumOperands() != 2)
        continue;

      SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
      unsigned UserOffset =
          getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);

      if (!UserOffset || UserOffset <= Offset)
        continue;

      unsigned NewConstInc = UserOffset - Offset;
      SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
      BaseUpdates.push_back({User, NewInc, NewConstInc});
    }
  }

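  // Worked example for the re-basing above (hypothetical values): suppose
  // Addr = (add %base, 16), so Offset == 16, and %base has another user
  // (add %base, 48), so UserOffset == 48. Relative to Addr that user advances
  // the pointer by 48 - 16 = 32 bytes, so the loop records a BaseUpdateUser
  // with a fresh constant Inc of 32, which TryCombineBaseUpdate can fold into
  // a post-indexed VLD1/VST1.
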
  // Find the size of memory referenced by the load/store.
  EVT VecTy;
  if (isLoadOp) {
    VecTy = N->getValueType(0);
  } else if (isIntrinsic) {
    VecTy = N->getOperand(AddrOpIdx+1).getValueType();
  } else {
    assert(isStore && "Node has to be a load, a store, or an intrinsic!");
    VecTy = N->getOperand(1).getValueType();
  }

  bool isVLDDUPOp =
      NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
      NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;

  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  if (isLaneOp || isVLDDUPOp)
    NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
  // Try to fold the load/store with an update that matches memory
  // access size. This should work well for sequential loads.
  //
  // Filter out invalid updates as well.
  unsigned NumValidUpd = BaseUpdates.size();
  for (unsigned I = 0; I < NumValidUpd;) {
    BaseUpdateUser &User = BaseUpdates[I];
    if (!isValidBaseUpdate(N, User.N)) {
      --NumValidUpd;
      std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.
    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
      return SDValue();
    ++I;
  }
  BaseUpdates.resize(NumValidUpd);

  EVT AlignedVecTy = VecTy;
  unsigned Alignment = MemN->getAlignment();

  // If this is a less-than-standard-aligned load/store, change the type to
  // match the standard alignment.
  // The alignment is overlooked when selecting _UPD variants; and it's
  // easier to introduce bitcasts here than fix that.
  // There are 3 ways to get to this base-update combine:
  // - intrinsics: they are assumed to be properly aligned (to the standard
  //   alignment of the memory type), so we don't need to do anything.
  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
  //   intrinsics, so, likewise, there's nothing to do.
  // - generic load/store instructions: the alignment is specified as an
  //   explicit operand, rather than implicitly as the standard alignment
  //   of the memory type (like the intrinsics). We need to change the
  //   memory type to match the explicit alignment. That way, we don't
  //   generate non-standard-aligned ARMISD::VLDx nodes.
  if (isa<LSBaseSDNode>(N)) {
    if (Alignment == 0)
      Alignment = 1;
    if (Alignment < VecTy.getScalarSizeInBits() / 8) {
      MVT EltTy = MVT::getIntegerVT(Alignment * 8);
      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
      assert(!isLaneOp && "Unexpected generic load/store lane.");
      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
    }
    // Don't set an explicit alignment on regular load/stores that we want
    // to transform to VLD/VST 1_UPD nodes.
    // This matches the behavior of regular load/stores, which only get an
    // explicit alignment if the MMO alignment is larger than the standard
    // alignment of the memory type.
    // Intrinsics, however, always get an explicit alignment, set to the
    // alignment of the MMO.
    Alignment = 1;
  }

  // Create the new updating load/store node.
  // First, create an SDVTList for the new updating node's results.
  EVT Tys[6];
  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  unsigned n;
  for (n = 0; n < NumResultVecs; ++n)
    Tys[n] = AlignedVecTy;
  Tys[n++] = MVT::i32;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

  // Then, gather the new node's operands.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(N->getOperand(0)); // incoming chain
  Ops.push_back(N->getOperand(AddrOpIdx));
  Ops.push_back(Inc);

  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
    // Try to match the intrinsic's signature
    Ops.push_back(StN->getValue());
  } else {
    // Loads (and of course intrinsics) match the intrinsics' signature,
    // so just add all but the alignment operand.
    unsigned LastOperand =
        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
    for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
      Ops.push_back(N->getOperand(i));
  }

  // For all node types, the alignment operand is always the last one.
  Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

  // If this is a non-standard-aligned STORE, the penultimate operand is the
  // stored value. Bitcast it to the aligned type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
    SDValue &StVal = Ops[Ops.size()-2];
    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
  }

  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                         MemN->getMemOperand());

  // Update the uses.
  SmallVector<SDValue, 5> NewResults;
  for (unsigned i = 0; i < NumResultVecs; ++i)
    NewResults.push_back(SDValue(UpdN.getNode(), i));

  // If this is a non-standard-aligned LOAD, the first result is the loaded
  // value. Bitcast it to the expected result type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
    SDValue &LdVal = NewResults[0];
    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
  }

  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
  DCI.CombineTo(N, NewResults);
  DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  // Try to fold with other users. Non-constant updates are considered
  // first, and constant updates are sorted to not break a sequence of
  // strided accesses (if there is any).
  std::sort(BaseUpdates.begin(), BaseUpdates.end(),
            [](BaseUpdateUser &LHS, BaseUpdateUser &RHS) {
              return LHS.ConstInc < RHS.ConstInc;
            });
  for (BaseUpdateUser &User : BaseUpdates) {
    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
      return SDValue();
  }
  return SDValue();
}

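// Illustration of the selection order above (hypothetical candidates): for
// a 16-byte VLD1 with BaseUpdates holding ConstInc values {48, 0, 32}
// (0 marks a non-constant stride), the first pass accepts only an exact
// match of NumBytes == 16 and would emit "vld1.32 {d16, d17}, [r0]!".
// Failing that, the sort places the non-constant update first, then 32,
// then 48, so a sequence of strided accesses keeps a consistent increment.
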
@@ -3,80 +3,57 @@
; rdar://12713765
; When realign-stack is set to false, make sure we are not creating stack
; objects that are assumed to be 64-byte aligned.
@T3_retval = common global <16 x float> zeroinitializer, align 16

define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" {
entry:
; CHECK-LABEL: test1:
; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: add r[[R3:[0-9]+]], r[[R1]], #32
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R3:[0-9]+]], r[[R1]], #48
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: mov r[[R2:[0-9]+]], sp
; CHECK: add r[[R3:[0-9]+]], r[[R2]], #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R4:[0-9]+]], r[[R2]], #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
; CHECK: mov r[[R5:[0-9]+]], r[[R2]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp
; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
entry:
  %retval = alloca <16 x float>, align 64
  %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
  %1 = load <16 x float>, <16 x float>* %retval
  store <16 x float> %1, <16 x float>* %agg.result, align 16
  %a1 = bitcast <16 x float>* %retval to float*
  %a2 = getelementptr inbounds float, float* %a1, i64 8
  %a3 = bitcast float* %a2 to <4 x float>*

  %b1 = bitcast <16 x float>* %agg.result to float*
  %b2 = getelementptr inbounds float, float* %b1, i64 8
  %b3 = bitcast float* %b2 to <4 x float>*

  %0 = load <4 x float>, <4 x float>* %a3, align 16
  %1 = load <4 x float>, <4 x float>* %b3, align 16
  store <4 x float> %0, <4 x float>* %b3, align 16
  store <4 x float> %1, <4 x float>* %a3, align 16
  ret void
}

define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp {
entry:
; CHECK-LABEL: test2:
; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: mov r[[R1:[0-9]+]], sp
; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #16
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: mov r[[R3:[0-9]+]], #32
; CHECK: mov r[[R9:[0-9]+]], r[[R1]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128], r[[R3]]
; CHECK: mov r[[R3:[0-9]+]], r[[R9]]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #48
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: add r[[R1:[0-9]+]], r0, #32
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
; CHECK: mov r[[ALIGNED:[0-9]+]], sp
; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32
; CHECK: add r[[PTR]], r[[PTR]], #32
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
entry:
  %retval = alloca <16 x float>, align 64
  %a1 = bitcast <16 x float>* %retval to float*
  %a2 = getelementptr inbounds float, float* %a1, i64 8
  %a3 = bitcast float* %a2 to <4 x float>*

  %b1 = bitcast <16 x float>* %agg.result to float*
  %b2 = getelementptr inbounds float, float* %b1, i64 8
  %b3 = bitcast float* %b2 to <4 x float>*

  %retval = alloca <16 x float>, align 64
  %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
  %1 = load <16 x float>, <16 x float>* %retval
  store <16 x float> %1, <16 x float>* %agg.result, align 16
  %0 = load <4 x float>, <4 x float>* %a3, align 16
  %1 = load <4 x float>, <4 x float>* %b3, align 16
  store <4 x float> %0, <4 x float>* %b3, align 16
  store <4 x float> %1, <4 x float>* %a3, align 16
  ret void
}

@@ -0,0 +1,325 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -o - < %s | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv8-unknown-linux-gnueabihf"

define <4 x float> @test(float* %A) {
; CHECK-LABEL: test:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
  %X.ptr = bitcast float* %A to <4 x float>*
  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

define <4 x float> @test_stride(float* %A) {
; CHECK-LABEL: test_stride:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r1, #24
; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
  %X.ptr = bitcast float* %A to <4 x float>*
  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 12
  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

define <4 x float> @test_stride_mixed(float* %A) {
; CHECK-LABEL: test_stride_mixed:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r1, #24
; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
  %X.ptr = bitcast float* %A to <4 x float>*
  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 10
  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

; Refrain from using multiple stride registers
define <4 x float> @test_stride_noop(float* %A) {
; CHECK-LABEL: test_stride_noop:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r1, #24
; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT: mov r1, #32
; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
  %X.ptr = bitcast float* %A to <4 x float>*
  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14
  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

define <4 x float> @test_positive_initial_offset(float* %A) {
; CHECK-LABEL: test_positive_initial_offset:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, #32
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
  %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8
  %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 12
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 16
  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

define <4 x float> @test_negative_initial_offset(float* %A) {
; CHECK-LABEL: test_negative_initial_offset:
; CHECK: @ %bb.0:
; CHECK-NEXT: sub r0, r0, #64
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
  %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16
  %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 -12
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 -8
  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

@global_float_array = external global [128 x float], align 4
define <4 x float> @test_global() {
; CHECK-LABEL: test_global:
; CHECK: @ %bb.0:
; CHECK-NEXT: movw r0, :lower16:global_float_array
; CHECK-NEXT: movt r0, :upper16:global_float_array
; CHECK-NEXT: add r0, r0, #32
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: bx lr
  %X = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*), align 4
  %Y = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 12) to <4 x float>*), align 4
  %Z = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 16) to <4 x float>*), align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

define <4 x float> @test_stack() {
; Use huge alignment to test that ADD would not be converted to OR
; CHECK-LABEL: test_stack:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r10, r11, lr}
; CHECK-NEXT: push {r4, r10, r11, lr}
; CHECK-NEXT: .setfp r11, sp, #8
; CHECK-NEXT: add r11, sp, #8
; CHECK-NEXT: .pad #240
; CHECK-NEXT: sub sp, sp, #240
; CHECK-NEXT: bfc sp, #0, #7
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl external_function
; CHECK-NEXT: vld1.32 {d16, d17}, [r4:128]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r4:128]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128]
; CHECK-NEXT: vadd.f32 q0, q8, q9
; CHECK-NEXT: sub sp, r11, #8
; CHECK-NEXT: pop {r4, r10, r11, pc}
  %array = alloca [32 x float], align 128
  %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0
  call void @external_function(float* %arraydecay)
  %X.ptr = bitcast [32 x float]* %array to <4 x float>*
  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
  %Y.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 4
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 8
  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

define <2 x double> @test_double(double* %A) {
; CHECK-LABEL: test_double:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, #64
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f64 d20, d17, d19
; CHECK-NEXT: vadd.f64 d16, d16, d18
; CHECK-NEXT: vld1.64 {d22, d23}, [r0]
; CHECK-NEXT: vadd.f64 d1, d20, d23
; CHECK-NEXT: vadd.f64 d0, d16, d22
; CHECK-NEXT: bx lr
  %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8
  %X.ptr = bitcast double* %X.ptr.elt to <2 x double>*
  %X = load <2 x double>, <2 x double>* %X.ptr, align 8
  %Y.ptr.elt = getelementptr inbounds double, double* %A, i32 10
  %Y.ptr = bitcast double* %Y.ptr.elt to <2 x double>*
  %Y = load <2 x double>, <2 x double>* %Y.ptr, align 8
  %Z.ptr.elt = getelementptr inbounds double, double* %A, i32 12
  %Z.ptr = bitcast double* %Z.ptr.elt to <2 x double>*
  %Z = load <2 x double>, <2 x double>* %Z.ptr, align 8
  %tmp.sum = fadd <2 x double> %X, %Y
  %sum = fadd <2 x double> %tmp.sum, %Z
  ret <2 x double> %sum
}

define void @test_various_instructions(float* %A) {
; CHECK-LABEL: test_various_instructions:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT: vadd.f32 q8, q8, q9
; CHECK-NEXT: vst1.32 {d16, d17}, [r0]
; CHECK-NEXT: bx lr
  %X.ptr = bitcast float* %A to i8*
  %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1)
  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
  %Z.ptr = bitcast float* %Z.ptr.elt to i8*
  %Z = fadd <4 x float> %X, %Y
  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %Z.ptr, <4 x float> %Z, i32 4)
  ret void
}

define void @test_lsr_geps(float* %a, float* %b, i32 %n) {
; CHECK-LABEL: test_lsr_geps:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB10_1: @ %for.body.preheader
; CHECK-NEXT: mov r12, #0
; CHECK-NEXT: .LBB10_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add r3, r0, r12
; CHECK-NEXT: subs r2, r2, #1
; CHECK-NEXT: vld1.32 {d16, d17}, [r3]!
; CHECK-NEXT: vld1.32 {d18, d19}, [r3]!
; CHECK-NEXT: vld1.32 {d20, d21}, [r3]!
; CHECK-NEXT: vld1.32 {d22, d23}, [r3]
; CHECK-NEXT: add r3, r1, r12
; CHECK-NEXT: add r12, r12, #64
; CHECK-NEXT: vst1.32 {d16, d17}, [r3]!
; CHECK-NEXT: vst1.32 {d18, d19}, [r3]!
; CHECK-NEXT: vst1.32 {d20, d21}, [r3]!
; CHECK-NEXT: vst1.32 {d22, d23}, [r3]
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %cmp61 = icmp sgt i32 %n, 0
  br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ]
  %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
  %0 = bitcast float* %a to i8*
  %1 = bitcast float* %b to i8*
  %uglygep19 = getelementptr i8, i8* %0, i32 %lsr.iv1
  %uglygep1920 = bitcast i8* %uglygep19 to <4 x float>*
  %2 = load <4 x float>, <4 x float>* %uglygep1920, align 4
  %uglygep16 = getelementptr i8, i8* %0, i32 %lsr.iv1
  %uglygep1617 = bitcast i8* %uglygep16 to <4 x float>*
  %scevgep18 = getelementptr <4 x float>, <4 x float>* %uglygep1617, i32 1
  %3 = load <4 x float>, <4 x float>* %scevgep18, align 4
  %uglygep13 = getelementptr i8, i8* %0, i32 %lsr.iv1
  %uglygep1314 = bitcast i8* %uglygep13 to <4 x float>*
  %scevgep15 = getelementptr <4 x float>, <4 x float>* %uglygep1314, i32 2
  %4 = load <4 x float>, <4 x float>* %scevgep15, align 4
  %uglygep10 = getelementptr i8, i8* %0, i32 %lsr.iv1
  %uglygep1011 = bitcast i8* %uglygep10 to <4 x float>*
  %scevgep12 = getelementptr <4 x float>, <4 x float>* %uglygep1011, i32 3
  %5 = load <4 x float>, <4 x float>* %scevgep12, align 4
  %uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv1
  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %uglygep8, <4 x float> %2, i32 4)
  %uglygep6 = getelementptr i8, i8* %1, i32 %lsr.iv1
  %scevgep7 = getelementptr i8, i8* %uglygep6, i32 16
  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep7, <4 x float> %3, i32 4)
  %uglygep4 = getelementptr i8, i8* %1, i32 %lsr.iv1
  %scevgep5 = getelementptr i8, i8* %uglygep4, i32 32
  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep5, <4 x float> %4, i32 4)
  %uglygep = getelementptr i8, i8* %1, i32 %lsr.iv1
  %scevgep = getelementptr i8, i8* %uglygep, i32 48
  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep, <4 x float> %5, i32 4)
  %lsr.iv.next = add i32 %lsr.iv, -1
  %lsr.iv.next2 = add nuw i32 %lsr.iv1, 64
  %exitcond.not = icmp eq i32 %lsr.iv.next, 0
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

declare void @external_function(float*)
declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind argmemonly

@@ -83,16 +83,16 @@ define void @test(double, float, i16, <4 x half>, <8 x half>) {
; SOFT: @ %bb.0: @ %entry
; SOFT-NEXT: push {r11, lr}
; SOFT-NEXT: sub sp, sp, #32
; SOFT-NEXT: vldr d16, [sp, #40]
; SOFT-NEXT: mov r12, #16
; SOFT-NEXT: vabs.f16 d16, d16
; SOFT-NEXT: mov lr, sp
; SOFT-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFT-NEXT: add r12, sp, #48
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vabs.f16 q8, q8
; SOFT-NEXT: str r3, [sp, #8]
; SOFT-NEXT: vst1.64 {d16, d17}, [lr]
; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
; SOFT-NEXT: mov r12, sp
; SOFT-NEXT: vldr d16, [sp, #40]
; SOFT-NEXT: vabs.f16 d16, d16
; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
; SOFT-NEXT: str r3, [r12]
; SOFT-NEXT: bl use
; SOFT-NEXT: add sp, sp, #32
; SOFT-NEXT: pop {r11, pc}

@@ -105,26 +105,26 @@ define void @test(double, float, i16, <4 x half>, <8 x half>) {
;
; SOFTEB-LABEL: test:
; SOFTEB: @ %bb.0: @ %entry
; SOFTEB-NEXT: .save {r11, lr}
; SOFTEB-NEXT: push {r11, lr}
; SOFTEB-NEXT: .save {r4, lr}
; SOFTEB-NEXT: push {r4, lr}
; SOFTEB-NEXT: .pad #32
; SOFTEB-NEXT: sub sp, sp, #32
; SOFTEB-NEXT: vldr d16, [sp, #40]
; SOFTEB-NEXT: mov r12, #16
; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: str r3, [sp, #8]
; SOFTEB-NEXT: add r4, sp, #48
; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 d16, d16
; SOFTEB-NEXT: vabs.f16 d16, d16
; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFTEB-NEXT: add r12, sp, #48
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: vst1.16 {d16}, [lr:64]!
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r4]
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: str r3, [lr]
; SOFTEB-NEXT: vabs.f16 q8, q8
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr]
; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: bl use
; SOFTEB-NEXT: add sp, sp, #32
; SOFTEB-NEXT: pop {r11, pc}
; SOFTEB-NEXT: pop {r4, pc}
;
; HARDEB-LABEL: test:
; HARDEB: @ %bb.0: @ %entry

@@ -148,20 +148,20 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFT-NEXT: push {r11, lr}
; SOFT-NEXT: sub sp, sp, #32
; SOFT-NEXT: add r12, sp, #80
; SOFT-NEXT: mov lr, sp
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
; SOFT-NEXT: add r12, sp, #48
; SOFT-NEXT: vabs.f16 q8, q8
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: add r12, sp, #64
; SOFT-NEXT: str r3, [sp, #8]
; SOFT-NEXT: vadd.f16 q8, q8, q9
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: mov r12, #16
; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vmul.f16 q8, q9, q8
; SOFT-NEXT: vldr d18, [sp, #40]
; SOFT-NEXT: vst1.16 {d18}, [lr:64], r12
; SOFT-NEXT: vst1.64 {d16, d17}, [lr]
; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
; SOFT-NEXT: mov r12, sp
; SOFT-NEXT: vldr d16, [sp, #40]
; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
; SOFT-NEXT: str r3, [r12]
; SOFT-NEXT: bl use
; SOFT-NEXT: add sp, sp, #32
; SOFT-NEXT: pop {r11, pc}

@@ -181,13 +181,8 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFTEB-NEXT: push {r11, lr}
; SOFTEB-NEXT: .pad #32
; SOFTEB-NEXT: sub sp, sp, #32
; SOFTEB-NEXT: vldr d16, [sp, #40]
; SOFTEB-NEXT: mov r12, #16
; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: str r3, [sp, #8]
; SOFTEB-NEXT: vrev64.16 d16, d16
; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFTEB-NEXT: add r12, sp, #80
; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: add r12, sp, #48
; SOFTEB-NEXT: vrev64.16 q8, q8

@@ -197,10 +192,15 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vadd.f16 q8, q8, q9
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vmul.f16 q8, q9, q8
; SOFTEB-NEXT: vldr d18, [sp, #40]
; SOFTEB-NEXT: vrev64.16 d18, d18
; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]!
; SOFTEB-NEXT: str r3, [lr]
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr]
; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: bl use
; SOFTEB-NEXT: add sp, sp, #32
; SOFTEB-NEXT: pop {r11, pc}

@@ -26,20 +26,18 @@ define <32 x i8> @test_consume_arg([9 x double], <32 x i8> %vec) {
define void @test_produce_arg() {
; CHECK-LABEL: test_produce_arg:

; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #32
; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]
; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #16
; CHECK-V7K: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]!
; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]

; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #24
; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #8
; CHECK-AAPCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]

; CHECK-APCS: add r[[BASE:[0-9]+]], sp, #60
; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
; CHECK-APCS: mov r[[R4:[0-9]+]], sp
; CHECK-APCS: mov r[[BASE:[0-9]+]], sp
; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #76
; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #60
; CHECK-APCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]

  call <32 x i8> @test_consume_arg([9 x double] undef, <32 x i8> zeroinitializer)

@ -44,11 +44,10 @@ entry:
|
|||
define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t2:
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]!
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: movs [[INC:r[0-9]+]], #32
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
; CHECK: movw [[REG2:r[0-9]+]], #16716
; CHECK: movt [[REG2:r[0-9]+]], #72
; CHECK: str [[REG2]], [r0]

@@ -10,18 +10,17 @@ define void @test() {
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: vmov.i32 q8, #0x0
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: mov.w r1, #-1
; CHECK-NEXT: vmov.i32 q8, #0x0
; CHECK-NEXT: movs r2, #15
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: strd r1, r1, [sp, #8]
; CHECK-NEXT: strd r1, r1, [sp]
; CHECK-NEXT: str r1, [sp, #16]
; CHECK-NEXT: vst1.64 {d16, d17}, [r3], r2
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: str r2, [r3]
; CHECK-NEXT: vst1.64 {d16, d17}, [r2]!
; CHECK-NEXT: str r1, [r2]
; CHECK-NEXT: str r1, [sp, #20]
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: str.w r1, [sp, #15]
; CHECK-NEXT: bl callee
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop {r7, pc}

@@ -76,13 +76,14 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]

; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}

; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]

; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]

@@ -93,8 +94,6 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]

; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}

; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
}

@@ -170,13 +169,14 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]

; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}

; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]

; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]

@@ -187,7 +187,6 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]]

; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]]
}

@@ -253,9 +253,8 @@ define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
}

; CHECK-LABEL: test_silly_load:
; CHECK: vldr d{{[0-9]+}}, [r0, #16]
; CHECK: movs r1, #24
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]!
; CHECK: vld1.8 {d{{[0-9]+}}}, [r0:64]!
; CHECK: ldr {{r[0-9]+}}, [r0]

define void @test_silly_load(<28 x i8>* %addr) {

@@ -216,15 +216,14 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
; CHECK-LABEL: test_multisource:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d18, [r0, #32]
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: vorr d22, d18, d18
; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
; CHECK-NEXT: vldr d19, [r0, #48]
; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
; CHECK-NEXT: vzip.16 d22, d19
; CHECK-NEXT: vtrn.16 q8, q10
; CHECK-NEXT: vext.16 d18, d18, d22, #2
; CHECK-NEXT: vld1.16 {d16, d17}, [r0:128]!
; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT: vld1.16 {d20, d21}, [r0:128]!
; CHECK-NEXT: vorr d23, d20, d20
; CHECK-NEXT: vldr d22, [r0]
; CHECK-NEXT: vzip.16 d23, d22
; CHECK-NEXT: vtrn.16 q8, q9
; CHECK-NEXT: vext.16 d18, d20, d23, #2
; CHECK-NEXT: vext.16 d16, d18, d16, #2
; CHECK-NEXT: vext.16 d16, d16, d16, #2
; CHECK-NEXT: vmov r0, r1, d16

@@ -134,106 +134,97 @@ define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
%T1_19* %blend, %T0_19* %storeaddr) {
; CHECK-LABEL: func_blend19:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: add r2, r1, #48
; CHECK-NEXT: mov r8, #0
; CHECK-NEXT: vld1.64 {d16, d17}, [r2:128]
; CHECK-NEXT: add r2, r0, #48
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vld1.64 {d28, d29}, [r1:128]!
; CHECK-NEXT: mov lr, #0
; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
; CHECK-NEXT: vmov r2, r12, d16
; CHECK-NEXT: vmov r6, r7, d17
; CHECK-NEXT: vmov r4, r5, d18
; CHECK-NEXT: subs r2, r4, r2
; CHECK-NEXT: sbcs r2, r5, r12
; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]!
; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]!
; CHECK-NEXT: vld1.64 {d24, d25}, [r0:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]!
; CHECK-NEXT: vld1.64 {d26, d27}, [r0:128]!
; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
; CHECK-NEXT: vmov r0, r12, d16
; CHECK-NEXT: vmov r1, r2, d18
; CHECK-NEXT: subs r0, r1, r0
; CHECK-NEXT: vmov r1, r4, d25
; CHECK-NEXT: sbcs r0, r2, r12
; CHECK-NEXT: mov r12, #0
; CHECK-NEXT: vmov r2, r4, d19
; CHECK-NEXT: vmov r2, r0, d21
; CHECK-NEXT: movlt r12, #1
; CHECK-NEXT: cmp r12, #0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: mvnne r12, #0
; CHECK-NEXT: vld1.64 {d24, d25}, [r5:128]!
; CHECK-NEXT: vld1.64 {d20, d21}, [r5:128]
; CHECK-NEXT: subs r2, r2, r6
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: add r0, r0, #32
; CHECK-NEXT: vld1.64 {d26, d27}, [r2:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r2:128]
; CHECK-NEXT: sbcs r2, r4, r7
; CHECK-NEXT: vmov r4, r5, d21
; CHECK-NEXT: movlt r8, #1
; CHECK-NEXT: vmov r6, r7, d23
; CHECK-NEXT: cmp r8, #0
; CHECK-NEXT: mvnne r8, #0
; CHECK-NEXT: vld1.64 {d28, d29}, [r0:128]
; CHECK-NEXT: add r0, r1, #32
; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]
; CHECK-NEXT: vmov r0, r1, d20
; CHECK-NEXT: vdup.32 d7, r8
; CHECK-NEXT: vdup.32 d6, r12
; CHECK-NEXT: subs r4, r6, r4
; CHECK-NEXT: sbcs r4, r7, r5
; CHECK-NEXT: vmov r5, r6, d24
; CHECK-NEXT: vmov r7, r2, d26
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d5, r4
; CHECK-NEXT: subs r5, r7, r5
; CHECK-NEXT: sbcs r2, r2, r6
; CHECK-NEXT: vmov r7, r6, d27
; CHECK-NEXT: vmov r2, r9, d25
; CHECK-NEXT: mov r5, #0
; CHECK-NEXT: movlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: mvnne r5, #0
; CHECK-NEXT: subs r2, r7, r2
; CHECK-NEXT: sbcs r2, r6, r9
; CHECK-NEXT: vmov r6, r7, d22
; CHECK-NEXT: subs r1, r1, r2
; CHECK-NEXT: sbcs r0, r4, r0
; CHECK-NEXT: vmov r2, r4, d26
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d1, r0
; CHECK-NEXT: vmov r0, r1, d22
; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: sbcs r0, r4, r1
; CHECK-NEXT: vmov r4, r5, d31
; CHECK-NEXT: vmov r0, r1, d29
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d1, r2
; CHECK-NEXT: vdup.32 d0, r5
; CHECK-NEXT: vbit q12, q13, q0
; CHECK-NEXT: subs r0, r6, r0
; CHECK-NEXT: vmov r2, r6, d28
; CHECK-NEXT: sbcs r0, r7, r1
; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: vmov r0, r1, d30
; CHECK-NEXT: movlt r7, #1
; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: vmov r2, r5, d29
; CHECK-NEXT: sbcs r0, r6, r1
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: vmov r4, r5, d30
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d3, r0
; CHECK-NEXT: vmov r0, r1, d28
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: vmov r4, r5, d24
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d2, r0
; CHECK-NEXT: vmov r0, r1, d20
; CHECK-NEXT: vbit q14, q15, q1
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: vmov r1, r4, d17
; CHECK-NEXT: vmov r5, r6, d19
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d0, r0
; CHECK-NEXT: vbit q10, q12, q0
; CHECK-NEXT: subs r1, r5, r1
; CHECK-NEXT: sbcs r1, r6, r4
; CHECK-NEXT: vmov r4, r5, d27
; CHECK-NEXT: vmov r0, r1, d23
; CHECK-NEXT: mov r6, #0
; CHECK-NEXT: vmov r0, r1, d31
; CHECK-NEXT: movlt r6, #1
; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: movlt lr, #1
; CHECK-NEXT: cmp lr, #0
; CHECK-NEXT: mvnne lr, #0
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: vdup.32 d31, lr
; CHECK-NEXT: mvnne r6, #0
; CHECK-NEXT: vdup.32 d3, lr
; CHECK-NEXT: vdup.32 d2, r6
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: vorr q13, q1, q1
; CHECK-NEXT: mvnne r7, #0
; CHECK-NEXT: vdup.32 d4, r7
; CHECK-NEXT: add r0, r3, #32
; CHECK-NEXT: vbsl q13, q14, q15
; CHECK-NEXT: vbit q10, q11, q2
; CHECK-NEXT: vbit q8, q9, q3
; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]
; CHECK-NEXT: add r0, r3, #48
; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: vdup.32 d30, r2
; CHECK-NEXT: vdup.32 d3, r6
; CHECK-NEXT: vbit q11, q13, q15
; CHECK-NEXT: vdup.32 d2, r12
; CHECK-NEXT: vst1.64 {d28, d29}, [r3:128]!
; CHECK-NEXT: vbit q8, q9, q1
; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]!
; CHECK-NEXT: vst1.64 {d22, d23}, [r3:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]
; CHECK-NEXT: pop {r4, r5, r6, lr}
; CHECK-NEXT: mov pc, lr
%v0 = load %T0_19, %T0_19* %loadaddr
%v1 = load %T0_19, %T0_19* %loadaddr2

@@ -251,213 +242,198 @@ define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
%T1_20* %blend, %T0_20* %storeaddr) {
; CHECK-LABEL: func_blend20:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, sp, #4
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, sp, #8
; CHECK-NEXT: add r9, r1, #64
; CHECK-NEXT: mov r2, #32
; CHECK-NEXT: add r8, r0, #64
; CHECK-NEXT: vld1.64 {d16, d17}, [r9:128], r2
; CHECK-NEXT: mov r10, r1
; CHECK-NEXT: mov r11, r0
; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128], r2
; CHECK-NEXT: vmov r7, r5, d17
; CHECK-NEXT: vmov r6, r2, d19
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vld1.64 {d22, d23}, [r10:128]!
; CHECK-NEXT: subs r7, r6, r7
; CHECK-NEXT: sbcs r2, r2, r5
; CHECK-NEXT: vmov r5, r6, d16
; CHECK-NEXT: vmov r7, r4, d18
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d21, r2
; CHECK-NEXT: mov r8, r1
; CHECK-NEXT: mov lr, r0
; CHECK-NEXT: vld1.64 {d16, d17}, [r8:128]!
; CHECK-NEXT: add r9, r0, #64
; CHECK-NEXT: add r10, r1, #64
; CHECK-NEXT: mov r12, #0
; CHECK-NEXT: vld1.64 {d22, d23}, [lr:128]!
; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128]!
; CHECK-NEXT: vld1.64 {d20, d21}, [lr:128]!
; CHECK-NEXT: vmov r6, r4, d19
; CHECK-NEXT: vmov r5, r7, d21
; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]!
; CHECK-NEXT: vld1.64 {d6, d7}, [r10:128]!
; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]!
; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]!
; CHECK-NEXT: subs r6, r5, r6
; CHECK-NEXT: sbcs r4, r7, r4
; CHECK-NEXT: vmov r5, r6, d18
; CHECK-NEXT: vmov r7, r2, d20
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d31, r4
; CHECK-NEXT: subs r5, r7, r5
; CHECK-NEXT: sbcs r4, r4, r6
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d20, r4
; CHECK-NEXT: vmov r2, r4, d23
; CHECK-NEXT: vbit q8, q9, q10
; CHECK-NEXT: vld1.64 {d18, d19}, [r11:128]!
; CHECK-NEXT: vmov r7, r5, d19
; CHECK-NEXT: subs r2, r7, r2
; CHECK-NEXT: sbcs r2, r5, r4
; CHECK-NEXT: vmov r5, r7, d18
; CHECK-NEXT: sbcs r2, r2, r6
; CHECK-NEXT: vmov r4, r5, d3
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d21, r2
; CHECK-NEXT: vmov r2, r4, d22
; CHECK-NEXT: subs r2, r5, r2
; CHECK-NEXT: sbcs r2, r7, r4
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d20, r2
; CHECK-NEXT: add r2, r0, #48
; CHECK-NEXT: vbif q9, q11, q10
; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128]
; CHECK-NEXT: add r2, r1, #48
; CHECK-NEXT: vld1.64 {d2, d3}, [r2:128]
; CHECK-NEXT: vmov r5, r7, d30
; CHECK-NEXT: vmov r2, r4, d2
; CHECK-NEXT: vld1.64 {d26, d27}, [r11:128]
; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]
; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r9:128]
; CHECK-NEXT: vld1.64 {d20, d21}, [r8:128]!
; CHECK-NEXT: vmov r11, r10, d21
; CHECK-NEXT: subs r2, r5, r2
; CHECK-NEXT: sbcs r2, r7, r4
; CHECK-NEXT: vmov r7, r6, d31
; CHECK-NEXT: vmov r2, r5, d3
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: subs r2, r7, r2
; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: sbcs r2, r6, r5
; CHECK-NEXT: vmov r6, r5, d27
; CHECK-NEXT: vmov r2, r9, d1
; CHECK-NEXT: movlt r7, #1
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: mvnne r7, #0
; CHECK-NEXT: vdup.32 d7, r7
; CHECK-NEXT: vdup.32 d6, r4
; CHECK-NEXT: subs r2, r6, r2
; CHECK-NEXT: sbcs r2, r5, r9
; CHECK-NEXT: vmov r6, r5, d26
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d5, r2
; CHECK-NEXT: vmov r2, r9, d0
; CHECK-NEXT: subs r2, r6, r2
; CHECK-NEXT: sbcs r2, r5, r9
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: vdup.32 d4, r2
; CHECK-NEXT: add r2, r1, #32
; CHECK-NEXT: vld1.64 {d28, d29}, [r2:128]
; CHECK-NEXT: add r2, r0, #32
; CHECK-NEXT: vbif q13, q0, q2
; CHECK-NEXT: add r1, r1, #80
; CHECK-NEXT: vld1.64 {d0, d1}, [r2:128]
; CHECK-NEXT: vmov r4, r5, d28
; CHECK-NEXT: vbif q15, q1, q3
; CHECK-NEXT: add r0, r0, #80
; CHECK-NEXT: vmov r2, r6, d0
; CHECK-NEXT: vld1.64 {d2, d3}, [r8:128]
; CHECK-NEXT: vmov r9, r8, d25
; CHECK-NEXT: vld1.64 {d8, d9}, [r0:128]
; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128]
; CHECK-NEXT: vmov r3, r12, d8
; CHECK-NEXT: subs r2, r2, r4
; CHECK-NEXT: sbcs r2, r6, r5
; CHECK-NEXT: vmov r4, r5, d29
; CHECK-NEXT: vmov r6, r7, d1
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: subs r4, r6, r4
; CHECK-NEXT: sbcs r4, r7, r5
; CHECK-NEXT: vmov r5, r6, d2
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d5, r4
; CHECK-NEXT: vdup.32 d4, r2
; CHECK-NEXT: vmov r2, r4, d22
; CHECK-NEXT: vbit q14, q0, q2
; CHECK-NEXT: subs r2, r5, r2
; CHECK-NEXT: sbcs r2, r6, r4
; CHECK-NEXT: vmov r4, r5, d24
; CHECK-NEXT: vmov r6, r7, d20
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: subs r1, r6, r4
; CHECK-NEXT: vmov r0, r6, d9
; CHECK-NEXT: sbcs r1, r7, r5
; CHECK-NEXT: vmov r4, r5, d7
; CHECK-NEXT: mov r1, #0
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
; CHECK-NEXT: subs r0, r0, r4
; CHECK-NEXT: vmov r7, r4, d23
; CHECK-NEXT: sbcs r0, r6, r5
; CHECK-NEXT: vmov r5, lr, d6
; CHECK-NEXT: vdup.32 d30, r2
; CHECK-NEXT: vmov r0, r2, d1
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d2
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d9, r0
; CHECK-NEXT: vmov r0, r2, d0
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d5
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d8, r0
; CHECK-NEXT: vmov r0, r2, d7
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d11, r0
; CHECK-NEXT: vmov r0, r6, d3
; CHECK-NEXT: subs r0, r0, r7
; CHECK-NEXT: sbcs r0, r6, r4
; CHECK-NEXT: vmov r0, r2, d6
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d23
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: subs r4, r11, r9
; CHECK-NEXT: sbcs r4, r10, r8
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d10, r0
; CHECK-NEXT: vmov r0, r2, d17
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r4, r5, d22
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d25, r0
; CHECK-NEXT: vmov r0, r2, d16
; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d24, r0
; CHECK-NEXT: vorr q13, q12, q12
; CHECK-NEXT: vbsl q13, q11, q8
; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
; CHECK-NEXT: vorr q8, q5, q5
; CHECK-NEXT: vld1.64 {d28, d29}, [r10:128]!
; CHECK-NEXT: vbsl q8, q2, q3
; CHECK-NEXT: vld1.64 {d6, d7}, [r8:128]!
; CHECK-NEXT: vld1.64 {d22, d23}, [r8:128]
; CHECK-NEXT: vld1.64 {d4, d5}, [lr:128]!
; CHECK-NEXT: vbif q10, q9, q15
; CHECK-NEXT: vorr q9, q4, q4
; CHECK-NEXT: vmov r0, r2, d22
; CHECK-NEXT: vbsl q9, q1, q0
; CHECK-NEXT: vld1.64 {d30, d31}, [lr:128]
; CHECK-NEXT: mov lr, #0
; CHECK-NEXT: vmov r7, r5, d30
; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128]
; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128]
; CHECK-NEXT: subs r0, r7, r0
; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: vmov r5, r4, d24
; CHECK-NEXT: vmov r0, r7, d28
; CHECK-NEXT: movlt lr, #1
; CHECK-NEXT: cmp lr, #0
; CHECK-NEXT: mvnne lr, #0
; CHECK-NEXT: subs r0, r5, r0
; CHECK-NEXT: sbcs r0, r4, r7
; CHECK-NEXT: vmov r7, r5, d29
; CHECK-NEXT: vmov r4, r6, d25
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: subs r7, r4, r7
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: sbcs r7, r6, r5
; CHECK-NEXT: vmov r5, r1, d31
; CHECK-NEXT: vmov r7, r6, d23
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: subs r3, r3, r5
; CHECK-NEXT: sbcs r3, r12, lr
; CHECK-NEXT: mov r3, #0
; CHECK-NEXT: movlt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: mvnne r3, #0
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d10, r3
; CHECK-NEXT: vdup.32 d1, r4
; CHECK-NEXT: vorr q2, q5, q5
; CHECK-NEXT: vdup.32 d0, r1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: vbsl q2, q4, q3
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vbif q10, q12, q0
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vdup.32 d7, r0
; CHECK-NEXT: add r0, r1, #80
; CHECK-NEXT: vdup.32 d6, r2
; CHECK-NEXT: vbit q11, q1, q3
; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]
; CHECK-NEXT: add r0, r1, #32
; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128]
; CHECK-NEXT: add r0, r1, #48
; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128]
; CHECK-NEXT: add r0, r1, #64
; CHECK-NEXT: vst1.64 {d18, d19}, [r1:128]!
; CHECK-NEXT: vst1.64 {d26, d27}, [r1:128]
; CHECK-NEXT: mov r1, #32
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128], r1
; CHECK-NEXT: subs r7, r5, r7
; CHECK-NEXT: mov r5, #0
; CHECK-NEXT: sbcs r1, r1, r6
; CHECK-NEXT: vmov r6, r2, d5
; CHECK-NEXT: vmov r1, r7, d7
; CHECK-NEXT: movlt r5, #1
; CHECK-NEXT: cmp r5, #0
; CHECK-NEXT: mvnne r5, #0
; CHECK-NEXT: subs r1, r6, r1
; CHECK-NEXT: sbcs r1, r2, r7
; CHECK-NEXT: vmov r6, r7, d4
; CHECK-NEXT: mov r1, #0
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
; CHECK-NEXT: vdup.32 d9, r1
; CHECK-NEXT: vmov r1, r2, d6
; CHECK-NEXT: subs r1, r6, r1
; CHECK-NEXT: sbcs r1, r7, r2
; CHECK-NEXT: vmov r6, r7, d0
; CHECK-NEXT: mov r1, #0
; CHECK-NEXT: movlt r1, #1
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: mvnne r1, #0
; CHECK-NEXT: vdup.32 d8, r1
; CHECK-NEXT: vmov r1, r2, d2
; CHECK-NEXT: vbif q2, q3, q4
; CHECK-NEXT: vdup.32 d7, r5
; CHECK-NEXT: vdup.32 d9, r4
; CHECK-NEXT: vmov r4, r5, d1
; CHECK-NEXT: vdup.32 d8, r0
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]!
; CHECK-NEXT: vbif q12, q14, q4
; CHECK-NEXT: vdup.32 d6, lr
; CHECK-NEXT: vbit q11, q15, q3
; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]!
; CHECK-NEXT: subs r1, r6, r1
; CHECK-NEXT: mov r6, #0
; CHECK-NEXT: sbcs r1, r7, r2
; CHECK-NEXT: vmov r1, r2, d3
; CHECK-NEXT: movlt r6, #1
; CHECK-NEXT: subs r1, r4, r1
; CHECK-NEXT: sbcs r1, r5, r2
; CHECK-NEXT: movlt r12, #1
; CHECK-NEXT: cmp r12, #0
; CHECK-NEXT: mvnne r12, #0
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: vdup.32 d27, r12
; CHECK-NEXT: mvnne r6, #0
; CHECK-NEXT: vdup.32 d26, r6
; CHECK-NEXT: vorr q10, q13, q13
; CHECK-NEXT: vbsl q10, q0, q1
; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]!
; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128]
; CHECK-NEXT: add sp, sp, #8
; CHECK-NEXT: add r0, r3, #64
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]!
; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128]!
; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, sp, #4
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: mov pc, lr
%v0 = load %T0_20, %T0_20* %loadaddr
%v1 = load %T0_20, %T0_20* %loadaddr2

@@ -198,21 +198,13 @@ for.end: ; preds = %for.body

; @testNeon is an important example of the need for ivchains.
;
; Currently we have two extra add.w's that keep the store address
; live past the next increment because ISEL is unfortunately undoing
; the store chain. ISEL also fails to convert all but one of the stores to
; post-increment addressing. However, the loads should use
; post-increment addressing, no add's or add.w's beyond the three
; mentioned. Most importantly, there should be no spills or reloads!
; Loads and stores should use post-increment addressing, no add's or add.w's.
; Most importantly, there should be no spills or reloads!
;
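; As a brief illustration (a hedged sketch, not part of this test's checked
; output): post-increment addressing folds the pointer update into the
; memory access itself, e.g.
;   vld1.8 {d0, d1}, [r0]!      @ load 16 bytes, then r0 += 16
;   vst1.8 {d0, d1}, [r0], r2   @ store 16 bytes, then r0 += r2
; so no separate add/add.w is needed to advance the base register between
; accesses, which is what the checks below verify.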
; A9: testNeon:
; A9: %.lr.ph
; A9: add.w r
; A9-NOT: lsl.w
; A9-NOT: {{ldr|str|adds|add r}}
; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}}
; A9: add.w r
; A9-NOT: {{ldr|str|adds|add r}}
; A9-NOT: add.w r
; A9: bne
define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {