[TargetLowering] Expand the last stage of i16 popcnt using shift+add+and instead of mul+shift.
If we used a multiply, it would be by 0x0101, which is 1 more than a power of 2. On some targets that multiply would later be expanded to shl+add. By avoiding the multiply earlier, we can generate better code.

Note that PowerPC doesn't do the shl+add expansion of multiply, so one of the tests increased in instruction count.

This is limited to scalars because it almost always increased the number of instructions in vector tests.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D125638
This commit is contained in:
parent
e6fc8454be
commit
1c4880a2d3
|
@ -7716,6 +7716,18 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
|
|||
if (Len <= 8)
|
||||
return Op;
|
||||
|
||||
// Avoid the multiply if we only have 2 bytes to add.
|
||||
// TODO: Only doing this for scalars because vectors weren't as obviously
|
||||
// improved.
|
||||
if (Len == 16 && !VT.isVector()) {
|
||||
// v = (v + (v >> 8)) & 0x00FF;
|
||||
return DAG.getNode(ISD::AND, dl, VT,
|
||||
DAG.getNode(ISD::ADD, dl, VT, Op,
|
||||
DAG.getNode(ISD::SRL, dl, VT, Op,
|
||||
DAG.getConstant(8, dl, ShVT))),
|
||||
DAG.getConstant(0xFF, dl, VT));
|
||||
}
|
||||
|
||||
// v = (v * 0x01010101...) >> (Len - 8)
|
||||
SDValue Mask01 =
|
||||
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
|
||||
|
|
|
@ -23,9 +23,9 @@ define i16 @zpop_i8_i16(i8 %x) {
|
|||
; SLOW-NEXT: add 3, 4, 3
|
||||
; SLOW-NEXT: srwi 4, 3, 4
|
||||
; SLOW-NEXT: add 3, 3, 4
|
||||
; SLOW-NEXT: andi. 3, 3, 3855
|
||||
; SLOW-NEXT: mulli 3, 3, 257
|
||||
; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
|
||||
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
|
||||
; SLOW-NEXT: clrlwi 3, 3, 28
|
||||
; SLOW-NEXT: add 3, 3, 4
|
||||
; SLOW-NEXT: blr
|
||||
%z = zext i8 %x to i16
|
||||
%pop = tail call i16 @llvm.ctpop.i16(i16 %z)
|
||||
|
@ -172,9 +172,10 @@ define i32 @popz_i16_32(i16 %x) {
|
|||
; SLOW-NEXT: add 3, 4, 3
|
||||
; SLOW-NEXT: srwi 4, 3, 4
|
||||
; SLOW-NEXT: add 3, 3, 4
|
||||
; SLOW-NEXT: andi. 3, 3, 3855
|
||||
; SLOW-NEXT: mulli 3, 3, 257
|
||||
; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
|
||||
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
|
||||
; SLOW-NEXT: clrlwi 3, 3, 28
|
||||
; SLOW-NEXT: add 3, 3, 4
|
||||
; SLOW-NEXT: clrldi 3, 3, 32
|
||||
; SLOW-NEXT: blr
|
||||
%pop = tail call i16 @llvm.ctpop.i16(i16 %x)
|
||||
%z = zext i16 %pop to i32
|
||||
|
@ -276,9 +277,9 @@ define i64 @popa_i16_i64(i16 %x) {
|
|||
; SLOW-NEXT: add 3, 4, 3
|
||||
; SLOW-NEXT: srwi 4, 3, 4
|
||||
; SLOW-NEXT: add 3, 3, 4
|
||||
; SLOW-NEXT: andi. 3, 3, 3855
|
||||
; SLOW-NEXT: mulli 3, 3, 257
|
||||
; SLOW-NEXT: srwi 3, 3, 8
|
||||
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
|
||||
; SLOW-NEXT: clrlwi 3, 3, 28
|
||||
; SLOW-NEXT: add 3, 3, 4
|
||||
; SLOW-NEXT: rlwinm 3, 3, 0, 27, 27
|
||||
; SLOW-NEXT: blr
|
||||
%pop = call i16 @llvm.ctpop.i16(i16 %x)
|
||||
|
|
|
@ -110,13 +110,10 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
|
|||
; RV32_NOZBB-NEXT: add a0, a2, a0
|
||||
; RV32_NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV32_NOZBB-NEXT: add a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: lui a1, 1
|
||||
; RV32_NOZBB-NEXT: addi a1, a1, -241
|
||||
; RV32_NOZBB-NEXT: and a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: slli a1, a0, 8
|
||||
; RV32_NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 20
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 28
|
||||
; RV32_NOZBB-NEXT: add a0, a1, a0
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 19
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 27
|
||||
; RV32_NOZBB-NEXT: ret
|
||||
; RV32_NOZBB-NEXT: .LBB1_2:
|
||||
; RV32_NOZBB-NEXT: li a0, 16
|
||||
|
@ -143,14 +140,11 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
|
|||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: add a0, a2, a0
|
||||
; RV64NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV64NOZBB-NEXT: add a0, a0, a1
|
||||
; RV64NOZBB-NEXT: lui a1, 1
|
||||
; RV64NOZBB-NEXT: addiw a1, a1, -241
|
||||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: slliw a1, a0, 8
|
||||
; RV64NOZBB-NEXT: addw a0, a1, a0
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 51
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 59
|
||||
; RV64NOZBB-NEXT: addw a0, a0, a1
|
||||
; RV64NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 52
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 60
|
||||
; RV64NOZBB-NEXT: add a0, a1, a0
|
||||
; RV64NOZBB-NEXT: ret
|
||||
; RV64NOZBB-NEXT: .LBB1_2:
|
||||
; RV64NOZBB-NEXT: li a0, 16
|
||||
|
@ -606,13 +600,10 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
|
|||
; RV32_NOZBB-NEXT: add a0, a2, a0
|
||||
; RV32_NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV32_NOZBB-NEXT: add a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: lui a1, 1
|
||||
; RV32_NOZBB-NEXT: addi a1, a1, -241
|
||||
; RV32_NOZBB-NEXT: and a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: slli a1, a0, 8
|
||||
; RV32_NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 20
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 28
|
||||
; RV32_NOZBB-NEXT: add a0, a1, a0
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 19
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 27
|
||||
; RV32_NOZBB-NEXT: ret
|
||||
;
|
||||
; RV64NOZBB-LABEL: test_cttz_i16_zero_undef:
|
||||
|
@ -632,14 +623,11 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
|
|||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: add a0, a2, a0
|
||||
; RV64NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV64NOZBB-NEXT: add a0, a0, a1
|
||||
; RV64NOZBB-NEXT: lui a1, 1
|
||||
; RV64NOZBB-NEXT: addiw a1, a1, -241
|
||||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: slliw a1, a0, 8
|
||||
; RV64NOZBB-NEXT: addw a0, a1, a0
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 51
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 59
|
||||
; RV64NOZBB-NEXT: addw a0, a0, a1
|
||||
; RV64NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 52
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 60
|
||||
; RV64NOZBB-NEXT: add a0, a1, a0
|
||||
; RV64NOZBB-NEXT: ret
|
||||
;
|
||||
; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
|
||||
|
@ -1096,13 +1084,10 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
|
|||
; RV32_NOZBB-NEXT: add a0, a2, a0
|
||||
; RV32_NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV32_NOZBB-NEXT: add a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: lui a1, 1
|
||||
; RV32_NOZBB-NEXT: addi a1, a1, -241
|
||||
; RV32_NOZBB-NEXT: and a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: slli a1, a0, 8
|
||||
; RV32_NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 20
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 28
|
||||
; RV32_NOZBB-NEXT: add a0, a1, a0
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 19
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 27
|
||||
; RV32_NOZBB-NEXT: ret
|
||||
; RV32_NOZBB-NEXT: .LBB9_2:
|
||||
; RV32_NOZBB-NEXT: li a0, 16
|
||||
|
@ -1138,14 +1123,11 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
|
|||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: add a0, a2, a0
|
||||
; RV64NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV64NOZBB-NEXT: add a0, a0, a1
|
||||
; RV64NOZBB-NEXT: lui a1, 1
|
||||
; RV64NOZBB-NEXT: addiw a1, a1, -241
|
||||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: slliw a1, a0, 8
|
||||
; RV64NOZBB-NEXT: addw a0, a1, a0
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 51
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 59
|
||||
; RV64NOZBB-NEXT: addw a0, a0, a1
|
||||
; RV64NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 52
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 60
|
||||
; RV64NOZBB-NEXT: add a0, a1, a0
|
||||
; RV64NOZBB-NEXT: ret
|
||||
; RV64NOZBB-NEXT: .LBB9_2:
|
||||
; RV64NOZBB-NEXT: li a0, 16
|
||||
|
@ -1713,13 +1695,10 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
|
|||
; RV32_NOZBB-NEXT: add a0, a2, a0
|
||||
; RV32_NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV32_NOZBB-NEXT: add a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: lui a1, 1
|
||||
; RV32_NOZBB-NEXT: addi a1, a1, -241
|
||||
; RV32_NOZBB-NEXT: and a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: slli a1, a0, 8
|
||||
; RV32_NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 20
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 28
|
||||
; RV32_NOZBB-NEXT: add a0, a1, a0
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 19
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 27
|
||||
; RV32_NOZBB-NEXT: ret
|
||||
;
|
||||
; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
|
||||
|
@ -1749,14 +1728,11 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
|
|||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: add a0, a2, a0
|
||||
; RV64NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV64NOZBB-NEXT: add a0, a0, a1
|
||||
; RV64NOZBB-NEXT: lui a1, 1
|
||||
; RV64NOZBB-NEXT: addiw a1, a1, -241
|
||||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: slliw a1, a0, 8
|
||||
; RV64NOZBB-NEXT: addw a0, a1, a0
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 51
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 59
|
||||
; RV64NOZBB-NEXT: addw a0, a0, a1
|
||||
; RV64NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 52
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 60
|
||||
; RV64NOZBB-NEXT: add a0, a1, a0
|
||||
; RV64NOZBB-NEXT: ret
|
||||
;
|
||||
; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
|
||||
|
@ -2251,13 +2227,10 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
|
|||
; RV32_NOZBB-NEXT: add a0, a2, a0
|
||||
; RV32_NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV32_NOZBB-NEXT: add a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: lui a1, 1
|
||||
; RV32_NOZBB-NEXT: addi a1, a1, -241
|
||||
; RV32_NOZBB-NEXT: and a0, a0, a1
|
||||
; RV32_NOZBB-NEXT: slli a1, a0, 8
|
||||
; RV32_NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 20
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 28
|
||||
; RV32_NOZBB-NEXT: add a0, a1, a0
|
||||
; RV32_NOZBB-NEXT: slli a0, a0, 19
|
||||
; RV32_NOZBB-NEXT: srli a0, a0, 27
|
||||
; RV32_NOZBB-NEXT: ret
|
||||
;
|
||||
; RV64NOZBB-LABEL: test_ctpop_i16:
|
||||
|
@ -2274,14 +2247,11 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
|
|||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: add a0, a2, a0
|
||||
; RV64NOZBB-NEXT: srli a1, a0, 4
|
||||
; RV64NOZBB-NEXT: add a0, a0, a1
|
||||
; RV64NOZBB-NEXT: lui a1, 1
|
||||
; RV64NOZBB-NEXT: addiw a1, a1, -241
|
||||
; RV64NOZBB-NEXT: and a0, a0, a1
|
||||
; RV64NOZBB-NEXT: slliw a1, a0, 8
|
||||
; RV64NOZBB-NEXT: addw a0, a1, a0
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 51
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 59
|
||||
; RV64NOZBB-NEXT: addw a0, a0, a1
|
||||
; RV64NOZBB-NEXT: andi a1, a0, 15
|
||||
; RV64NOZBB-NEXT: slli a0, a0, 52
|
||||
; RV64NOZBB-NEXT: srli a0, a0, 60
|
||||
; RV64NOZBB-NEXT: add a0, a1, a0
|
||||
; RV64NOZBB-NEXT: ret
|
||||
;
|
||||
; RV32ZBB-LABEL: test_ctpop_i16:
|
||||
|
|
|
@ -64,9 +64,8 @@ define i1 @canonical_parity_noncanonical_pred(<16 x i1> %x) {
|
|||
; NOPOPCNT-NEXT: addl %eax, %ecx
|
||||
; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F
|
||||
; NOPOPCNT-NEXT: movl %ecx, %eax
|
||||
; NOPOPCNT-NEXT: shll $8, %eax
|
||||
; NOPOPCNT-NEXT: addl %ecx, %eax
|
||||
; NOPOPCNT-NEXT: shrl $8, %eax
|
||||
; NOPOPCNT-NEXT: addl %ecx, %eax
|
||||
; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax
|
||||
; NOPOPCNT-NEXT: retq
|
||||
;
|
||||
|
|
|
@ -77,9 +77,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
|
|||
; X86-NEXT: addl %eax, %ecx
|
||||
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
|
||||
; X86-NEXT: movl %ecx, %eax
|
||||
; X86-NEXT: shll $8, %eax
|
||||
; X86-NEXT: shrl $8, %eax
|
||||
; X86-NEXT: addl %ecx, %eax
|
||||
; X86-NEXT: movzbl %ah, %eax
|
||||
; X86-NEXT: movzbl %al, %eax
|
||||
; X86-NEXT: # kill: def $ax killed $ax killed $eax
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
|
@ -99,9 +99,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
|
|||
; X64-NEXT: addl %edi, %eax
|
||||
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
|
||||
; X64-NEXT: movl %eax, %ecx
|
||||
; X64-NEXT: shll $8, %ecx
|
||||
; X64-NEXT: shrl $8, %ecx
|
||||
; X64-NEXT: addl %eax, %ecx
|
||||
; X64-NEXT: movzbl %ch, %eax
|
||||
; X64-NEXT: movzbl %cl, %eax
|
||||
; X64-NEXT: # kill: def $ax killed $ax killed $eax
|
||||
; X64-NEXT: retq
|
||||
;
|
||||
|
@ -1540,9 +1540,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
|
|||
; X86-NEXT: addl %eax, %ecx
|
||||
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
|
||||
; X86-NEXT: movl %ecx, %eax
|
||||
; X86-NEXT: shll $8, %eax
|
||||
; X86-NEXT: shrl $8, %eax
|
||||
; X86-NEXT: addl %ecx, %eax
|
||||
; X86-NEXT: movzbl %ah, %eax
|
||||
; X86-NEXT: movzbl %al, %eax
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: popcount_i16_zext:
|
||||
|
@ -1561,9 +1561,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
|
|||
; X64-NEXT: addl %edi, %eax
|
||||
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
|
||||
; X64-NEXT: movl %eax, %ecx
|
||||
; X64-NEXT: shll $8, %ecx
|
||||
; X64-NEXT: shrl $8, %ecx
|
||||
; X64-NEXT: addl %eax, %ecx
|
||||
; X64-NEXT: movzbl %ch, %eax
|
||||
; X64-NEXT: movzbl %cl, %eax
|
||||
; X64-NEXT: retq
|
||||
;
|
||||
; X86-POPCNT-LABEL: popcount_i16_zext:
|
||||
|
|
Loading…
Reference in New Issue