[TargetLowering] Expand the last stage of i16 popcnt using shift+add+and instead of mul+shift.

If we used a multiply, it would be by 0x0101, which is 1 more than a power
of 2. On some targets we would expand this to shl+add. By avoiding the
multiply earlier, we can generate better code.

Note that PowerPC doesn't do the shl+add expansion of multiply, so one of
the tests increased in instruction count.

This is limited to scalars because it almost always increased the number of
instructions in vector tests.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D125638
This commit is contained in:
Craig Topper 2022-05-16 09:27:43 -07:00
parent e6fc8454be
commit 1c4880a2d3
5 changed files with 71 additions and 89 deletions

View File

@ -7716,6 +7716,18 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
if (Len <= 8)
return Op;
// Avoid the multiply if we only have 2 bytes to add.
// TODO: Only doing this for scalars because vectors weren't as obviously
// improved.
if (Len == 16 && !VT.isVector()) {
// v = (v + (v >> 8)) & 0x00FF;
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::ADD, dl, VT, Op,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(8, dl, ShVT))),
DAG.getConstant(0xFF, dl, VT));
}
// v = (v * 0x01010101...) >> (Len - 8)
SDValue Mask01 =
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);

View File

@ -23,9 +23,9 @@ define i16 @zpop_i8_i16(i8 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: andi. 3, 3, 3855
; SLOW-NEXT: mulli 3, 3, 257
; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
; SLOW-NEXT: clrlwi 3, 3, 28
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: blr
%z = zext i8 %x to i16
%pop = tail call i16 @llvm.ctpop.i16(i16 %z)
@ -172,9 +172,10 @@ define i32 @popz_i16_32(i16 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: andi. 3, 3, 3855
; SLOW-NEXT: mulli 3, 3, 257
; SLOW-NEXT: rlwinm 3, 3, 24, 24, 31
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
; SLOW-NEXT: clrlwi 3, 3, 28
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: clrldi 3, 3, 32
; SLOW-NEXT: blr
%pop = tail call i16 @llvm.ctpop.i16(i16 %x)
%z = zext i16 %pop to i32
@ -276,9 +277,9 @@ define i64 @popa_i16_i64(i16 %x) {
; SLOW-NEXT: add 3, 4, 3
; SLOW-NEXT: srwi 4, 3, 4
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: andi. 3, 3, 3855
; SLOW-NEXT: mulli 3, 3, 257
; SLOW-NEXT: srwi 3, 3, 8
; SLOW-NEXT: rlwinm 4, 3, 24, 28, 31
; SLOW-NEXT: clrlwi 3, 3, 28
; SLOW-NEXT: add 3, 3, 4
; SLOW-NEXT: rlwinm 3, 3, 0, 27, 27
; SLOW-NEXT: blr
%pop = call i16 @llvm.ctpop.i16(i16 %x)

View File

@ -110,13 +110,10 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
; RV32_NOZBB-NEXT: .LBB1_2:
; RV32_NOZBB-NEXT: li a0, 16
@ -143,14 +140,11 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
; RV64NOZBB-NEXT: .LBB1_2:
; RV64NOZBB-NEXT: li a0, 16
@ -606,13 +600,10 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_cttz_i16_zero_undef:
@ -632,14 +623,11 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
@ -1096,13 +1084,10 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
; RV32_NOZBB-NEXT: .LBB9_2:
; RV32_NOZBB-NEXT: li a0, 16
@ -1138,14 +1123,11 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
; RV64NOZBB-NEXT: .LBB9_2:
; RV64NOZBB-NEXT: li a0, 16
@ -1713,13 +1695,10 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
@ -1749,14 +1728,11 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
@ -2251,13 +2227,10 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; RV32_NOZBB-NEXT: add a0, a2, a0
; RV32_NOZBB-NEXT: srli a1, a0, 4
; RV32_NOZBB-NEXT: add a0, a0, a1
; RV32_NOZBB-NEXT: lui a1, 1
; RV32_NOZBB-NEXT: addi a1, a1, -241
; RV32_NOZBB-NEXT: and a0, a0, a1
; RV32_NOZBB-NEXT: slli a1, a0, 8
; RV32_NOZBB-NEXT: andi a1, a0, 15
; RV32_NOZBB-NEXT: slli a0, a0, 20
; RV32_NOZBB-NEXT: srli a0, a0, 28
; RV32_NOZBB-NEXT: add a0, a1, a0
; RV32_NOZBB-NEXT: slli a0, a0, 19
; RV32_NOZBB-NEXT: srli a0, a0, 27
; RV32_NOZBB-NEXT: ret
;
; RV64NOZBB-LABEL: test_ctpop_i16:
@ -2274,14 +2247,11 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: add a0, a2, a0
; RV64NOZBB-NEXT: srli a1, a0, 4
; RV64NOZBB-NEXT: add a0, a0, a1
; RV64NOZBB-NEXT: lui a1, 1
; RV64NOZBB-NEXT: addiw a1, a1, -241
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: slliw a1, a0, 8
; RV64NOZBB-NEXT: addw a0, a1, a0
; RV64NOZBB-NEXT: slli a0, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 59
; RV64NOZBB-NEXT: addw a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 15
; RV64NOZBB-NEXT: slli a0, a0, 52
; RV64NOZBB-NEXT: srli a0, a0, 60
; RV64NOZBB-NEXT: add a0, a1, a0
; RV64NOZBB-NEXT: ret
;
; RV32ZBB-LABEL: test_ctpop_i16:

View File

@ -64,9 +64,8 @@ define i1 @canonical_parity_noncanonical_pred(<16 x i1> %x) {
; NOPOPCNT-NEXT: addl %eax, %ecx
; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F
; NOPOPCNT-NEXT: movl %ecx, %eax
; NOPOPCNT-NEXT: shll $8, %eax
; NOPOPCNT-NEXT: addl %ecx, %eax
; NOPOPCNT-NEXT: shrl $8, %eax
; NOPOPCNT-NEXT: addl %ecx, %eax
; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax
; NOPOPCNT-NEXT: retq
;

View File

@ -77,9 +77,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movzbl %ah, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@ -99,9 +99,9 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shll $8, %ecx
; X64-NEXT: shrl $8, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: movzbl %ch, %eax
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
@ -1540,9 +1540,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: shrl $8, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movzbl %ah, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: popcount_i16_zext:
@ -1561,9 +1561,9 @@ define i32 @popcount_i16_zext(i16 zeroext %x) {
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shll $8, %ecx
; X64-NEXT: shrl $8, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: movzbl %ch, %eax
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
;
; X86-POPCNT-LABEL: popcount_i16_zext: