[X86] Promote i8/i16 CTTZ (BSF) instructions and remove speculation branch
This patch adds a Type operand to the TLI isCheapToSpeculateCttz/isCheapToSpeculateCtlz callbacks, allowing targets to decide whether speculation branches should be removed on a type-by-type/legality basis.

For X86, this allows CTTZ speculation for i8/i16 types that will lower to promoted i32 BSF instructions, by masking the operand above the MSB (we already do something similar for i8/i16 TZCNT). This required a minor tweak to CTTZ lowering: if the source operand is known never to be zero (e.g. due to the promotion masking), we can remove the CMOV handling of the zero-source case.

Although BSF isn't very fast, most CPUs from the last 20 years handle it reasonably well, apart from some annoying passthrough EFLAGS dependencies. Additionally, now that we emit 'REP BSF' in most cases, we are tending towards assuming it will most likely be executed as a TZCNT instruction on any semi-modern CPU.

Differential Revision: https://reviews.llvm.org/D132520
parent 172fe1706d
commit f9de13232f
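For readers unfamiliar with the masking trick the message describes, here is a short standalone C++ sketch (not part of the patch; the function name is illustrative and __builtin_ctz stands in for the BSF/TZCNT instruction). ORing in the bit just above an i16 value's MSB guarantees the promoted 32-bit input is never zero, so the count is safe to speculate, and the sentinel bit also yields exactly the defined cttz result (16) when the original value is zero:

#include <cassert>
#include <cstdint>

// cttz for i16 via promotion to i32: OR in bit 16 so BSF never sees zero.
static unsigned cttz_i16_promoted(uint16_t X) {
  // When X == 0 the only set bit is the sentinel, so the result is 16,
  // matching llvm.cttz.i16(0, false). No zero-test branch or CMOV needed.
  return static_cast<unsigned>(__builtin_ctz(static_cast<uint32_t>(X) | 0x10000u));
}

int main() {
  assert(cttz_i16_promoted(0x0001) == 0);
  assert(cttz_i16_promoted(0x8000) == 15);
  assert(cttz_i16_promoted(0x0000) == 16); // zero input hits the sentinel bit
  return 0;
}

The i8 case is identical with 0x100 as the sentinel, as the test diffs below show.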
@@ -1474,13 +1474,13 @@ public:
       break;
     case Intrinsic::cttz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
         return TargetTransformInfo::TCC_Basic;
       break;

     case Intrinsic::ctlz:
       // FIXME: If necessary, this should go in target-specific overrides.
-      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
+      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
         return TargetTransformInfo::TCC_Basic;
       break;

@@ -621,12 +621,12 @@ public:
   }

   /// Return true if it is cheap to speculate a call to intrinsic cttz.
-  virtual bool isCheapToSpeculateCttz() const {
+  virtual bool isCheapToSpeculateCttz(Type *Ty) const {
     return false;
   }

   /// Return true if it is cheap to speculate a call to intrinsic ctlz.
-  virtual bool isCheapToSpeculateCtlz() const {
+  virtual bool isCheapToSpeculateCtlz(Type *Ty) const {
     return false;
   }

@@ -2045,13 +2045,13 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
     return false;

   // If it's cheap to speculate, there's nothing to do.
+  Type *Ty = CountZeros->getType();
   auto IntrinsicID = CountZeros->getIntrinsicID();
-  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
-      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
+  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||
+      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
     return false;

   // Only handle legal scalar cases. Anything else requires too much work.
-  Type *Ty = CountZeros->getType();
   unsigned SizeInBits = Ty->getScalarSizeInBits();
   if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
     return false;
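For context, the transform this guard controls can be pictured in plain C++. The sketch below is illustrative only (the pass rewrites IR, and the function name is made up): when the target reports that cttz is not cheap to speculate, despeculateCountZeros gives the count a zero-test branch so a zero input never reaches the underlying BSF-style instruction.

#include <cstdint>

// What the despeculated form computes for a 32-bit cttz with defined
// zero semantics: branch around the count when the input is zero.
unsigned cttz_despeculated(uint32_t X) {
  if (X == 0)
    return 32;                                       // defined result for zero
  return static_cast<unsigned>(__builtin_ctz(X));    // X is known nonzero here
}

With the new Type operand, X86 can now answer "cheap to speculate" for scalar i8/i16, so this branch is no longer emitted for those widths.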
@@ -742,11 +742,11 @@ public:
     return true;
   }

-  bool isCheapToSpeculateCttz() const override {
+  bool isCheapToSpeculateCttz(Type *) const override {
     return true;
   }

-  bool isCheapToSpeculateCtlz() const override {
+  bool isCheapToSpeculateCtlz(Type *) const override {
     return true;
   }

@@ -692,11 +692,11 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
 // profitable with the expansion for 64-bit since it's generally good to
 // speculate things.
-bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return true;
 }

-bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return true;
 }

@@ -193,8 +193,8 @@ public:
                                       unsigned NumElem,
                                       unsigned AS) const override;
   bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
-  bool isCheapToSpeculateCttz() const override;
-  bool isCheapToSpeculateCtlz() const override;
+  bool isCheapToSpeculateCttz(Type *Ty) const override;
+  bool isCheapToSpeculateCtlz(Type *Ty) const override;

   bool isSDNodeAlwaysUniform(const SDNode *N) const override;
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
@@ -21157,11 +21157,11 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
   return false;
 }

-bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return Subtarget->hasV6T2Ops();
 }

-bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget->hasV6T2Ops();
 }

@@ -679,8 +679,8 @@ class VectorType;
       return (MemVT.getSizeInBits() <= 32);
     }

-    bool isCheapToSpeculateCttz() const override;
-    bool isCheapToSpeculateCtlz() const override;
+    bool isCheapToSpeculateCttz(Type *Ty) const override;
+    bool isCheapToSpeculateCtlz(Type *Ty) const override;

     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
       return VT.isScalarInteger();
@@ -129,8 +129,8 @@ public:
   bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
   bool isTruncateFree(EVT VT1, EVT VT2) const override;

-  bool isCheapToSpeculateCttz() const override { return true; }
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCttz(Type *) const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   bool isCtlzFast() const override { return true; }

   bool hasBitTest(SDValue X, SDValue Y) const override;
@@ -1172,11 +1172,11 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
   return SDValue();
 }

-bool MipsTargetLowering::isCheapToSpeculateCttz() const {
+bool MipsTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return Subtarget.hasMips32();
 }

-bool MipsTargetLowering::isCheapToSpeculateCtlz() const {
+bool MipsTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget.hasMips32();
 }

@@ -283,8 +283,8 @@ class TargetRegisterClass;
     EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                             ISD::NodeType) const override;

-    bool isCheapToSpeculateCttz() const override;
-    bool isCheapToSpeculateCtlz() const override;
+    bool isCheapToSpeculateCttz(Type *Ty) const override;
+    bool isCheapToSpeculateCtlz(Type *Ty) const override;
     bool hasBitTest(SDValue X, SDValue Y) const override;
     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                            CombineLevel Level) const override;
@@ -559,7 +559,7 @@ public:
   // x == 0 is not undefined behavior) into a branch that checks whether x is 0
   // and avoids calling ctlz in that case. We have a dedicated ctlz
   // instruction, so we say that ctlz is cheap to speculate.
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *Ty) const override { return true; }

   AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override {
     return AtomicExpansionKind::None;
@@ -790,11 +790,11 @@ namespace llvm {
       return MVT::i32;
     }

-    bool isCheapToSpeculateCttz() const override {
+    bool isCheapToSpeculateCttz(Type *Ty) const override {
       return true;
     }

-    bool isCheapToSpeculateCtlz() const override {
+    bool isCheapToSpeculateCtlz(Type *Ty) const override {
       return true;
     }

@@ -1155,11 +1155,11 @@ bool RISCVTargetLowering::signExtendConstant(const ConstantInt *CI) const {
   return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32);
 }

-bool RISCVTargetLowering::isCheapToSpeculateCttz() const {
+bool RISCVTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return Subtarget.hasStdExtZbb();
 }

-bool RISCVTargetLowering::isCheapToSpeculateCtlz() const {
+bool RISCVTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget.hasStdExtZbb();
 }

@@ -370,8 +370,8 @@ public:
   bool isZExtFree(SDValue Val, EVT VT2) const override;
   bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
   bool signExtendConstant(const ConstantInt *CI) const override;
-  bool isCheapToSpeculateCttz() const override;
-  bool isCheapToSpeculateCtlz() const override;
+  bool isCheapToSpeculateCttz(Type *Ty) const override;
+  bool isCheapToSpeculateCtlz(Type *Ty) const override;
   bool hasAndNotCompare(SDValue Y) const override;
   bool hasBitTest(SDValue X, SDValue Y) const override;
   bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
@@ -423,7 +423,7 @@ public:
       return 1;
     return TargetLowering::getNumRegisters(Context, VT);
   }
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   bool preferZeroCompareBranch() const override { return true; }
   bool hasBitPreservingFPLogic(EVT VT) const override {
     EVT ScVT = VT.getScalarType();
@@ -236,7 +236,7 @@ public:
   // VE doesn't have rem.
   bool hasStandaloneRem(EVT) const override { return false; }
   // VE LDZ instruction returns 64 if the input is zero.
-  bool isCheapToSpeculateCtlz() const override { return true; }
+  bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   // VE LDZ instruction is fast.
   bool isCtlzFast() const override { return true; }
   // VE has NND instruction.
@@ -751,12 +751,12 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }

-bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const {
+bool WebAssemblyTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   // Assume ctz is a relatively cheap operation.
   return true;
 }

-bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const {
+bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   // Assume clz is a relatively cheap operation.
   return true;
 }
@@ -65,8 +65,8 @@ private:
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
-  bool isCheapToSpeculateCttz() const override;
-  bool isCheapToSpeculateCtlz() const override;
+  bool isCheapToSpeculateCttz(Type *Ty) const override;
+  bool isCheapToSpeculateCtlz(Type *Ty) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS,
                              Instruction *I = nullptr) const override;
@@ -5820,12 +5820,13 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
   return VT.isSimple() || !isOperationExpand(Opcode, VT);
 }

-bool X86TargetLowering::isCheapToSpeculateCttz() const {
-  // Speculate cttz only if we can directly use TZCNT.
-  return Subtarget.hasBMI();
+bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
+  // Speculate cttz only if we can directly use TZCNT or can promote to i32.
+  return Subtarget.hasBMI() ||
+         (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
 }

-bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   // Speculate ctlz only if we can directly use LZCNT.
   return Subtarget.hasLZCNT();
 }
@@ -28877,6 +28878,10 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
   Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);

+  // If src is known never zero we can skip the CMOV.
+  if (DAG.isKnownNeverZero(N0))
+    return Op;
+
   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
                    DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
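The effect of the new early-out can be sketched as follows (illustrative C++, not the actual SelectionDAG code; __builtin_ctz stands in for the X86ISD::BSF node, and the function name is made up):

#include <cstdint>

// Illustrative model of LowerCTTZ: if the operand is provably nonzero
// (e.g. a promoted i8/i16 value with its sentinel bit ORed in), the CMOV
// that would substitute NumBits on a zero input can be dropped entirely.
unsigned lower_cttz_model(uint32_t Src, unsigned NumBits, bool KnownNeverZero) {
  if (KnownNeverZero) // caller guarantees Src != 0 when this is set
    return static_cast<unsigned>(__builtin_ctz(Src));  // BSF alone, no CMOV
  // Otherwise model BSF + CMOV: ZF set on a zero input selects NumBits.
  return Src == 0 ? NumBits : static_cast<unsigned>(__builtin_ctz(Src));
}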
@@ -1033,9 +1033,9 @@ namespace llvm {
     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                           const MachineFunction &MF) const override;

-    bool isCheapToSpeculateCttz() const override;
+    bool isCheapToSpeculateCttz(Type *Ty) const override;

-    bool isCheapToSpeculateCtlz() const override;
+    bool isCheapToSpeculateCtlz(Type *Ty) const override;

     bool isCtlzFast() const override;

@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=-bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,NOBMI
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,BMI
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse4.2 | FileCheck %s -check-prefixes=BMI,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx | FileCheck %s -check-prefixes=BMI,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=BMI,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512f | FileCheck %s -check-prefixes=BMI,AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512vpopcntdq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512VPOPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bitalg | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BITALG
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=-bmi,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE2,NOBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE2,BMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse4.2 | FileCheck %s -check-prefixes=CHECK,BMI,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx | FileCheck %s -check-prefixes=CHECK,BMI,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=CHECK,BMI,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512f | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512vpopcntdq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512VPOPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bitalg | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BITALG

 ; Verify the cost of scalar trailing zero count instructions.

@@ -70,52 +70,36 @@ define i32 @var_cttz_i32u(i32 %a) {
 }

 define i16 @var_cttz_i16(i16 %a) {
-; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
-;
-; BMI-LABEL: 'var_cttz_i16'
-; BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
-; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
+; CHECK-LABEL: 'var_cttz_i16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
   %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 0)
   ret i16 %cttz
 }

 define i16 @var_cttz_i16u(i16 %a) {
-; NOBMI-LABEL: 'var_cttz_i16u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
-;
-; BMI-LABEL: 'var_cttz_i16u'
-; BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
-; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
+; CHECK-LABEL: 'var_cttz_i16u'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
 ;
   %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 1)
   ret i16 %cttz
 }

 define i8 @var_cttz_i8(i8 %a) {
-; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
-;
-; BMI-LABEL: 'var_cttz_i8'
-; BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
-; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
+; CHECK-LABEL: 'var_cttz_i8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
   %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 0)
   ret i8 %cttz
 }

 define i8 @var_cttz_i8u(i8 %a) {
-; NOBMI-LABEL: 'var_cttz_i8u'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
-;
-; BMI-LABEL: 'var_cttz_i8u'
-; BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
-; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
+; CHECK-LABEL: 'var_cttz_i8u'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
 ;
   %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 1)
   ret i8 %cttz
@@ -510,34 +510,20 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
   ret i64 %tmp1
 }

-; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+; Promote i8 cttz to i32 and mask bit8 to prevent (slow) zero-src bsf case.
 define i8 @cttz_i8_zero_test(i8 %n) {
 ; X86-LABEL: cttz_i8_zero_test:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    je .LBB12_1
-; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB12_1:
-; X86-NEXT:    movb $8, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
 ;
 ; X64-LABEL: cttz_i8_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testb %dil, %dil
-; X64-NEXT:    je .LBB12_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB12_1:
-; X64-NEXT:    movb $8, %al
+; X64-NEXT:    orl $256, %edi # imm = 0x100
+; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
 ;
@@ -559,34 +545,22 @@ define i8 @cttz_i8_zero_test(i8 %n) {
   ret i8 %tmp1
 }

-; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+; Promote i16 cttz to i32 and mask bit16 to prevent (slow) zero-src bsf case.
 define i16 @cttz_i16_zero_test(i16 %n) {
 ; X86-LABEL: cttz_i16_zero_test:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    je .LBB13_1
-; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    movl $65536, %eax # imm = 0x10000
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB13_1:
-; X86-NEXT:    movw $16, %ax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
 ;
 ; X64-LABEL: cttz_i16_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testw %di, %di
-; X64-NEXT:    je .LBB13_1
-; X64-NEXT:  # %bb.2: # %cond.false
+; X64-NEXT:    orl $65536, %edi # imm = 0x10000
 ; X64-NEXT:    rep bsfl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB13_1:
-; X64-NEXT:    movw $16, %ax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: cttz_i16_zero_test:
 ; X86-CLZ:       # %bb.0: