[InstCombine] fold mul with masked low bit operand to trunc+select

Alive2 proof: https://alive2.llvm.org/ce/z/o7rQ5q

The proof shows an extra instruction in some cases, but that is caused by an existing canonicalization of trunc -> and+icmp.

Codegen should be better for any target where a multiply is more costly than the simplest ALU op.

This ends up producing the requested x86 asm from issue #55618, but it's not the same IR: we are still missing a canonicalization from the negate+mask pattern to the trunc+select created here.
2022-06-05 17:55:09 -04:00 · 2022-06-05 17:55:09 -04:00 · 3f33d67d8a
parent abb21b54bc
commit 3f33d67d8a
6 changed files with 34 additions and 25 deletions
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@ -390,6 +390,12 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
    return SelectInst::Create(IsNeg, Y, ConstantInt::getNullValue(Ty));
  }
+  // (and X, 1) * Y --> (trunc X) ? Y : 0
+  if (match(&I, m_c_BinOp(m_OneUse(m_And(m_Value(X), m_One())), m_Value(Y)))) {
+    Value *Tr = Builder.CreateTrunc(X, CmpInst::makeCmpResultType(Ty));
+    return SelectInst::Create(Tr, Y, ConstantInt::getNullValue(Ty));
+  }
  // ((ashr X, 31) | 1) * X --> abs(X)
  // X * ((ashr X, 31) | 1) --> abs(X)
  if (match(&I, m_c_BinOp(m_Or(m_AShr(m_Value(X),
--- a/llvm/test/Transforms/InstCombine/icmp-mul-and.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul-and.ll
@ -267,10 +267,11 @@ define i1 @pr51551_neg1(i32 %x, i32 %y) {
 define i1 @pr51551_neg2(i32 %x, i32 %y) {
 ; CHECK-LABEL: @pr51551_neg2(
-; CHECK-NEXT:    [[T0:%.*]] = and i32 [[Y:%.*]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[MUL]], 7
+; CHECK-NEXT:    [[X_OP:%.*]] = and i32 [[X:%.*]], 7
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X_OP]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = select i1 [[DOTNOT]], i1 true, i1 [[CMP1]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
  %t0 = and i32 %y, -7
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@ -749,16 +749,11 @@ define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
  ret i1 %r
 }
-; negative test - no leading zeros for 's'
+; no leading zeros for 's', but we reduce this with other transforms
-; TODO: If analysis was generalized for sign bits, we could reduce this to false.
 define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
 ; CHECK-LABEL: @mul_of_bool_no_lz_other_op(
-; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 1
+; CHECK-NEXT:    ret i1 false
-; CHECK-NEXT:    [[S:%.*]] = sext i8 [[Y:%.*]] to i32
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[B]], [[S]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[M]], 127
-; CHECK-NEXT:    ret i1 [[R]]
 ;
  %b = and i32 %x, 1
  %s = sext i8 %y to i32
--- a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll
+++ b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll
@ -80,8 +80,8 @@ define <4 x i32> @combine_mul_self_demandedbits_vector(<4 x i32> %x) {
 define i8 @one_demanded_bit(i8 %x) {
 ; CHECK-LABEL: @one_demanded_bit(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i8 [[X:%.*]], 6
+; CHECK-NEXT:    [[M:%.*]] = shl i8 [[X:%.*]], 6
-; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP1]], -65
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[M]], -65
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
  %m = mul i8 %x, 192  ; 0b1100_0000
@ -91,8 +91,8 @@ define i8 @one_demanded_bit(i8 %x) {
 define <2 x i8> @one_demanded_bit_splat(<2 x i8> %x) {
 ; CHECK-LABEL: @one_demanded_bit_splat(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i8> [[X:%.*]], <i8 5, i8 5>
+; CHECK-NEXT:    [[M:%.*]] = shl <2 x i8> [[X:%.*]], <i8 5, i8 5>
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[TMP1]], <i8 32, i8 32>
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[M]], <i8 32, i8 32>
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
  %m = mul <2 x i8> %x, <i8 160, i8 160> ; 0b1010_0000
@ -201,9 +201,10 @@ define i64 @scalar_mul_bit_x0_y0_uses(i64 %x, i64 %y) {
 ; Negative test
 define i64 @scalar_mul_bit_x0_y1(i64 %x, i64 %y) {
 ; CHECK-LABEL: @scalar_mul_bit_x0_y1(
-; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[X:%.*]], 1
 ; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[Y:%.*]], 2
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[AND1]], [[AND2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[DOTNOT]], i64 0, i64 [[AND2]]
 ; CHECK-NEXT:    ret i64 [[MUL]]
 ;
  %and1 = and i64 %x, 1
@ -214,9 +215,10 @@ define i64 @scalar_mul_bit_x0_y1(i64 %x, i64 %y) {
 define i64 @scalar_mul_bit_x0_yC(i64 %x, i64 %y, i64 %c) {
 ; CHECK-LABEL: @scalar_mul_bit_x0_yC(
-; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[X:%.*]], 1
 ; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[Y:%.*]], [[C:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i64 [[AND1]], [[AND2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[DOTNOT]], i64 0, i64 [[AND2]]
 ; CHECK-NEXT:    ret i64 [[MUL]]
 ;
  %and1 = and i64 %x, 1
--- a/llvm/test/Transforms/InstCombine/mul.ll
+++ b/llvm/test/Transforms/InstCombine/mul.ll
@ -466,8 +466,9 @@ define <2 x i32> @signbit_mul_vec_commute(<2 x i32> %a, <2 x i32> %b) {
 define i32 @lowbit_mul(i32 %a, i32 %b) {
 ; CHECK-LABEL: @lowbit_mul(
-; CHECK-NEXT:    [[D:%.*]] = and i32 [[A:%.*]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 1
-; CHECK-NEXT:    [[E:%.*]] = mul nuw i32 [[D]], [[B:%.*]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[E:%.*]] = select i1 [[DOTNOT]], i32 0, i32 [[B:%.*]]
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
  %d = and i32 %a, 1
@ -480,8 +481,8 @@ define i32 @lowbit_mul(i32 %a, i32 %b) {
 define <2 x i17> @lowbit_mul_commute(<2 x i17> %a, <2 x i17> %p) {
 ; CHECK-LABEL: @lowbit_mul_commute(
 ; CHECK-NEXT:    [[B:%.*]] = xor <2 x i17> [[P:%.*]], <i17 42, i17 43>
-; CHECK-NEXT:    [[D:%.*]] = and <2 x i17> [[A:%.*]], <i17 1, i17 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i17> [[A:%.*]] to <2 x i1>
-; CHECK-NEXT:    [[E:%.*]] = mul nuw <2 x i17> [[B]], [[D]]
+; CHECK-NEXT:    [[E:%.*]] = select <2 x i1> [[TMP1]], <2 x i17> [[B]], <2 x i17> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i17> [[E]]
 ;
  %b = xor <2 x i17> %p, <i17 42, i17 43> ; thwart complexity-based canonicalization
@ -490,6 +491,8 @@ define <2 x i17> @lowbit_mul_commute(<2 x i17> %a, <2 x i17> %p) {
  ret <2 x i17> %e
 }
+; negative test - extra use
 define i32 @lowbit_mul_use(i32 %a, i32 %b) {
 ; CHECK-LABEL: @lowbit_mul_use(
 ; CHECK-NEXT:    [[D:%.*]] = and i32 [[A:%.*]], 1
@ -503,6 +506,8 @@ define i32 @lowbit_mul_use(i32 %a, i32 %b) {
  ret i32 %e
 }
+; negative test - wrong mask
 define i32 @not_lowbit_mul(i32 %a, i32 %b) {
 ; CHECK-LABEL: @not_lowbit_mul(
 ; CHECK-NEXT:    [[D:%.*]] = and i32 [[A:%.*]], 2
--- a/llvm/test/Transforms/InstCombine/or.ll
+++ b/llvm/test/Transforms/InstCombine/or.ll
@ -1499,8 +1499,8 @@ define i32 @mul_no_common_bits_const_op(i32 %p) {
 define <2 x i12> @mul_no_common_bits_commute(<2 x i12> %p) {
 ; CHECK-LABEL: @mul_no_common_bits_commute(
-; CHECK-NEXT:    [[X:%.*]] = and <2 x i12> [[P:%.*]], <i12 1, i12 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i12> [[P:%.*]] to <2 x i1>
-; CHECK-NEXT:    [[R:%.*]] = mul nuw nsw <2 x i12> [[X]], <i12 15, i12 17>
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[TMP1]], <2 x i12> <i12 15, i12 17>, <2 x i12> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i12> [[R]]
 ;
  %x = and <2 x i12> %p, <i12 1, i12 1>