[SelectionDAG] computeKnownBits - support constant pool values from target

This patch adds the overridable TargetLowering::getTargetConstantFromLoad function, which allows targets to return the constant value loaded by a LoadSDNode - only X86 makes use of this so far, but everything should be in place for other targets.
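
As a rough sketch of what such an override can look like (illustrative only, not part of this patch; the helper name is made up, and the real X86 implementation further down in this diff also peeks through X86ISD::Wrapper address nodes), a target whose constant pool loads use a plain ConstantPoolSDNode base address could recover the underlying IR constant like this:

#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/Constants.h"
using namespace llvm;

// Hypothetical helper: return the IR Constant behind a load whose base
// address is a plain (non-machine) constant-pool node, or null otherwise.
static const Constant *getConstantFromSimpleCPLoad(LoadSDNode *LD) {
  SDValue Ptr = LD->getBasePtr();
  if (auto *CP = dyn_cast<ConstantPoolSDNode>(Ptr))
    if (!CP->isMachineConstantPoolEntry())
      return CP->getConstVal();
  return nullptr;
}

A target's getTargetConstantFromLoad override would simply forward to a helper of this shape, returning nullptr (the default) for any address pattern it does not recognise.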

computeKnownBits then uses this function to improve codegen, notably vector code after legalization.
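
The interesting case is a plain (non-extending) load of a vector constant: the new ISD::LOAD handling intersects the bits of every demanded element, so even a non-splat constant still yields useful common known bits. A minimal standalone sketch of that merging step, using the same APInt/KnownBits utilities as the patch (the helper name and element values are illustrative):

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Intersect the bits of a list of constant element values, mirroring the
// per-element loop added to computeKnownBits for constant pool loads.
static KnownBits commonKnownBits(ArrayRef<uint64_t> Elts, unsigned BitWidth) {
  KnownBits Known(BitWidth);
  Known.One.setAllBits();
  Known.Zero.setAllBits();
  for (uint64_t E : Elts) {
    APInt Value(BitWidth, E);
    Known.One &= Value;    // Known one only if the bit is set in every element.
    Known.Zero &= ~Value;  // Known zero only if the bit is clear in every element.
  }
  return Known;
}

For example, commonKnownBits({0x0F, 0x1F, 0x3F}, 8) reports the top two bits as known zero and the low four bits as known one - the kind of information that lets demanded-bits folds drop redundant masking, as seen in the and/pand removals in the test diffs below.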

A future commit will do the same for ComputeNumSignBits, but computeKnownBits sees the bigger benefit.

This required a couple of fixes:
* SimplifyDemandedBits must early-out for getTargetConstantFromLoad cases to prevent infinite loops of constant regeneration (similar to what we already do for BUILD_VECTOR).
* Fix a DAGCombiner::visitTRUNCATE issue where we had trunc(shl(v8i32),v8i16) <-> shl(trunc(v8i16),v8i32) infinite loops after legalization on AVX512 targets (a sketch of the tightened legality check follows this list).
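
Both loops have the same shape: one transform creates a node that another transform (or the target's lowering) immediately converts back. For the truncate case, the guard now requires the narrow shift to be natively Legal rather than merely Custom, since a Custom narrow shift can be turned back into the wide form and recreate the node we just replaced. A hedged sketch of that check (illustrative free function, not the patch itself):

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// Only allow trunc (shl x, K) -> shl (trunc x), K when the narrow SHL is
// truly legal; a merely Custom narrow shift could be lowered back to the wide
// form and ping-pong with the reverse combine.
static bool canNarrowShl(const TargetLowering &TLI, EVT NarrowVT,
                         bool LegalOperations) {
  return !LegalOperations || TLI.isOperationLegal(ISD::SHL, NarrowVT);
}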

Differential Revision: https://reviews.llvm.org/D61887

llvm-svn: 361620
Simon Pilgrim 2019-05-24 10:03:11 +00:00
parent 980f760515
commit 95b8d9bbf8
17 changed files with 1201 additions and 1825 deletions


@ -3119,6 +3119,10 @@ public:
TargetLoweringOpt &TLO,
unsigned Depth = 0) const;
/// This method returns the constant pool value that will be loaded by LD.
/// NOTE: You must check for implicit extensions of the constant by LD.
virtual const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const;
/// If \p SNaN is false, \returns true if \p Op is known to never be any
/// NaN. If \p SNaN is true, returns if \p Op is known to never be a signaling
/// NaN.


@ -10110,7 +10110,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
(!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
SDValue Amt = N0.getOperand(1);
KnownBits Known = DAG.computeKnownBits(Amt);


@ -2886,8 +2886,59 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op);
// If this is a ZEXTLoad and we are looking at the loaded value.
if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
const Constant *Cst = TLI->getTargetConstantFromLoad(LD);
if (ISD::isNON_EXTLoad(LD) && Cst) {
// Determine any common known bits from the loaded constant pool value.
Type *CstTy = Cst->getType();
if ((NumElts * BitWidth) == CstTy->getPrimitiveSizeInBits()) {
// If its a vector splat, then we can (quickly) reuse the scalar path.
// NOTE: We assume all elements match and none are UNDEF.
if (CstTy->isVectorTy()) {
if (const Constant *Splat = Cst->getSplatValue()) {
Cst = Splat;
CstTy = Cst->getType();
}
}
// TODO - do we need to handle different bitwidths?
if (CstTy->isVectorTy() && BitWidth == CstTy->getScalarSizeInBits()) {
// Iterate across all vector elements finding common known bits.
Known.One.setAllBits();
Known.Zero.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
if (Constant *Elt = Cst->getAggregateElement(i)) {
if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
const APInt &Value = CInt->getValue();
Known.One &= Value;
Known.Zero &= ~Value;
continue;
}
if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
APInt Value = CFP->getValueAPF().bitcastToAPInt();
Known.One &= Value;
Known.Zero &= ~Value;
continue;
}
}
Known.One.clearAllBits();
Known.Zero.clearAllBits();
break;
}
} else if (BitWidth == CstTy->getPrimitiveSizeInBits()) {
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
const APInt &Value = CInt->getValue();
Known.One = Value;
Known.Zero = ~Value;
} else if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
APInt Value = CFP->getValueAPF().bitcastToAPInt();
Known.One = Value;
Known.Zero = ~Value;
}
}
}
} else if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
// If this is a ZEXTLoad and we are looking at the loaded value.
EVT VT = LD->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero.setBitsFrom(MemBits);


@ -659,6 +659,14 @@ bool TargetLowering::SimplifyDemandedBits(
Known.Zero &= Known2.Zero;
}
return false; // Don't fall through, will infinitely loop.
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op);
if (getTargetConstantFromLoad(LD)) {
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
return false; // Don't fall through, will infinitely loop.
}
break;
}
case ISD::INSERT_VECTOR_ELT: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
@ -2314,6 +2322,10 @@ bool TargetLowering::SimplifyDemandedBitsForTargetNode(
return false;
}
const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode*) const {
return nullptr;
}
bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,


@ -5731,10 +5731,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
auto *Load = dyn_cast<LoadSDNode>(Op);
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
if (!Load)
return nullptr;
@ -5750,6 +5747,17 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
return CNode->getConstVal();
}
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
}
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
assert(LD && "Unexpected null LoadSDNode");
return getTargetConstantFromNode(LD);
}
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,


@ -908,6 +908,8 @@ namespace llvm {
TargetLoweringOpt &TLO,
unsigned Depth) const override;
const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
SDValue unwrapAddress(SDValue N) const override;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;


@ -940,9 +940,8 @@ define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
; AVX512-LABEL: test46:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc1]
; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3]
; AVX512-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A]
; AVX512-NEXT: ## fixup A - offset: 4, value: LCPI47_0-4, kind: reloc_riprel_4byte
; AVX512-NEXT: retq ## encoding: [0xc3]


@ -61,31 +61,25 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; X64-NEXT: packuswb %xmm2, %xmm1
; X64-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X64-NEXT: movdqa %xmm1, %xmm2
; X64-NEXT: pand %xmm0, %xmm2
; X64-NEXT: psllw $4, %xmm2
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; X64-NEXT: packuswb %xmm2, %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psllw $4, %xmm1
; X64-NEXT: pand {{.*}}(%rip), %xmm1
; X64-NEXT: psrlw $4, %xmm1
; X64-NEXT: pand %xmm0, %xmm1
; X64-NEXT: pandn %xmm2, %xmm0
; X64-NEXT: psrlw $4, %xmm0
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X64-NEXT: pand %xmm0, %xmm1
; X64-NEXT: psllw $2, %xmm1
; X64-NEXT: pand {{.*}}(%rip), %xmm1
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: psrlw $2, %xmm0
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X64-NEXT: pand %xmm0, %xmm1
; X64-NEXT: paddb %xmm1, %xmm1
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: psrlw $1, %xmm0
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: psrlq $48, %xmm0
; X64-NEXT: retq


@ -47,31 +47,25 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
; X86-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; X86-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; X86-NEXT: packuswb %xmm2, %xmm1
; X86-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X86-NEXT: movdqa %xmm1, %xmm2
; X86-NEXT: pand %xmm0, %xmm2
; X86-NEXT: psllw $4, %xmm2
; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; X86-NEXT: packuswb %xmm2, %xmm0
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psllw $4, %xmm1
; X86-NEXT: pand {{\.LCPI.*}}, %xmm1
; X86-NEXT: psrlw $4, %xmm1
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: pandn %xmm2, %xmm0
; X86-NEXT: psrlw $4, %xmm0
; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: psllw $2, %xmm1
; X86-NEXT: pand {{\.LCPI.*}}, %xmm1
; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: psrlw $2, %xmm0
; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: paddb %xmm1, %xmm1
; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: psrlw $1, %xmm0
; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl


@ -268,16 +268,11 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl1:
; SSE2: # %bb.0:
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pslld $29, %xmm2
; SSE2-NEXT: pslld $28, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: pslld $30, %xmm0
; SSE2-NEXT: xorpd %xmm2, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl1:


@ -1309,7 +1309,6 @@ define i1 @allones_v32i8_and1(<32 x i8> %arg) {
; AVX1-NEXT: vpmovmskb %xmm1, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %ecx
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
@ -1368,7 +1367,6 @@ define i1 @allzeros_v32i8_and1(<32 x i8> %arg) {
; AVX1-NEXT: vpmovmskb %xmm1, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %ecx
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
@ -1432,8 +1430,6 @@ define i1 @allones_v64i8_and1(<64 x i8> %arg) {
; AVX1-LABEL: allones_v64i8_and1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovmskb %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
@ -1441,7 +1437,6 @@ define i1 @allones_v64i8_and1(<64 x i8> %arg) {
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
@ -1518,8 +1513,6 @@ define i1 @allzeros_v64i8_and1(<64 x i8> %arg) {
; AVX1-LABEL: allzeros_v64i8_and1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovmskb %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
@ -1527,7 +1520,6 @@ define i1 @allzeros_v64i8_and1(<64 x i8> %arg) {
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
@ -2728,7 +2720,6 @@ define i1 @allones_v32i8_and4(<32 x i8> %arg) {
; AVX1-NEXT: vpmovmskb %xmm1, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %ecx
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
@ -2787,7 +2778,6 @@ define i1 @allzeros_v32i8_and4(<32 x i8> %arg) {
; AVX1-NEXT: vpmovmskb %xmm1, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %ecx
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
@ -2851,8 +2841,6 @@ define i1 @allones_v64i8_and4(<64 x i8> %arg) {
; AVX1-LABEL: allones_v64i8_and4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovmskb %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
@ -2860,7 +2848,6 @@ define i1 @allones_v64i8_and4(<64 x i8> %arg) {
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
@ -2937,8 +2924,6 @@ define i1 @allzeros_v64i8_and4(<64 x i8> %arg) {
; AVX1-LABEL: allzeros_v64i8_and4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovmskb %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
@ -2946,7 +2931,6 @@ define i1 @allzeros_v64i8_and4(<64 x i8> %arg) {
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0

File diff suppressed because it is too large


@ -414,10 +414,9 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -339,9 +339,7 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
@ -505,9 +503,7 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; X32-SSE-LABEL: var_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psraw $15, %xmm0
@ -1122,11 +1118,9 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
@ -1287,11 +1281,9 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; X32-SSE-NEXT: pand %xmm0, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psraw $15, %xmm0


@ -1660,9 +1660,7 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]