From fdec50182d85ec0b8518af3baae37ae28b102f1c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 18 Aug 2022 11:55:23 +0100
Subject: [PATCH] [CostModel] Replace getUserCost with getInstructionCost *
 Replace getUserCost with getInstructionCost, covering all cost kinds. *
 Remove getInstructionLatency, it's not implemented by any backends, and we
 should fold the functionality into getUserCost (now getInstructionCost) to
 make it easier for targets to handle the cost kinds with their existing cost
 callbacks.

Original Patch by @samparker (Sam Parker)

Differential Revision: https://reviews.llvm.org/D79483
---
 .../llvm/Analysis/TargetTransformInfo.h       |  69 ++---
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  49 ++--
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   7 -
 llvm/lib/Analysis/CodeMetrics.cpp             |   2 +-
 llvm/lib/Analysis/InlineCost.cpp              |  12 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  13 +-
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |   4 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |   4 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |   9 +-
 .../Hexagon/HexagonTargetTransformInfo.h      |   5 +-
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |  12 +-
 .../Target/PowerPC/PPCTargetTransformInfo.h   |   5 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   4 +-
 .../lib/Target/X86/X86TargetTransformInfo.cpp |   5 +
 .../Transforms/IPO/FunctionSpecialization.cpp |   3 +-
 llvm/lib/Transforms/Scalar/JumpThreading.cpp  |   4 +-
 llvm/lib/Transforms/Scalar/LICM.cpp           |   6 +-
 llvm/lib/Transforms/Scalar/LoopFlatten.cpp    |   2 +-
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |   4 +-
 .../Transforms/Scalar/SimpleLoopUnswitch.cpp  |   2 +-
 .../Scalar/SpeculativeExecution.cpp           |   2 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |   9 +-
 .../Analysis/CostModel/AArch64/sve-math.ll    |   2 +-
 .../CostModel/ARM/intrinsic-cost-kinds.ll     |  36 +--
 .../CostModel/ARM/target-intrinsics.ll        |   2 +-
 .../CostModel/SystemZ/ext-of-icmp-cost.ll     |   2 +-
 .../CostModel/X86/arith-fp-costkinds.ll       | 237 +++++++++---------
 llvm/test/Analysis/CostModel/X86/costmodel.ll |   2 +-
 .../CostModel/X86/intrinsic-cost-kinds.ll     |  52 ++--
 29 files changed, 263 insertions(+), 302 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8ff6626e1e0c..c579f7742289 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -219,29 +219,6 @@ public:
     TCK_SizeAndLatency   ///< The weighted sum of size and latency.
   };
 
-  /// Query the cost of a specified instruction.
-  ///
-  /// Clients should use this interface to query the cost of an existing
-  /// instruction. The instruction must have a valid parent (basic block).
-  ///
-  /// Note, this method does not cache the cost calculation and it
-  /// can be expensive in some cases.
-  InstructionCost getInstructionCost(const Instruction *I,
-                                     enum TargetCostKind kind) const {
-    InstructionCost Cost;
-    switch (kind) {
-    case TCK_Latency:
-      Cost = getInstructionLatency(I);
-      break;
-    case TCK_RecipThroughput:
-    case TCK_CodeSize:
-    case TCK_SizeAndLatency:
-      Cost = getUserCost(I, kind);
-      break;
-    }
-    return Cost;
-  }
-
   /// Underlying constants for 'cost' values in this interface.
   ///
   /// Many APIs in this interface return a cost. This enum defines the
@@ -320,14 +297,16 @@ public:
   ///
   /// The returned cost is defined in terms of \c TargetCostConstants, see its
   /// comments for a detailed explanation of the cost values.
-  InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
-                              TargetCostKind CostKind) const;
+  InstructionCost getInstructionCost(const User *U,
+                                     ArrayRef<const Value *> Operands,
+                                     TargetCostKind CostKind) const;
 
-  /// This is a helper function which calls the two-argument getUserCost
-  /// with \p Operands which are the current operands U has.
-  InstructionCost getUserCost(const User *U, TargetCostKind CostKind) const {
+  /// This is a helper function which calls the three-argument
+  /// getInstructionCost with \p Operands which are the current operands U has.
+  InstructionCost getInstructionCost(const User *U,
+                                     TargetCostKind CostKind) const {
     SmallVector<const Value *, 4> Operands(U->operand_values());
-    return getUserCost(U, Operands, CostKind);
+    return getInstructionCost(U, Operands, CostKind);
   }
 
   /// If a branch or a select condition is skewed in one direction by more than
@@ -432,11 +411,11 @@ public:
   /// Parameters that control the generic loop unrolling transformation.
   struct UnrollingPreferences {
     /// The cost threshold for the unrolled loop. Should be relative to the
-    /// getUserCost values returned by this API, and the expectation is that
-    /// the unrolled loop's instructions when run through that interface should
-    /// not exceed this cost. However, this is only an estimate. Also, specific
-    /// loops may be unrolled even with a cost above this threshold if deemed
-    /// profitable. Set this to UINT_MAX to disable the loop body cost
+    /// getInstructionCost values returned by this API, and the expectation is
+    /// that the unrolled loop's instructions when run through that interface
+    /// should not exceed this cost. However, this is only an estimate. Also,
+    /// specific loops may be unrolled even with a cost above this threshold if
+    /// deemed profitable. Set this to UINT_MAX to disable the loop body cost
     /// restriction.
     unsigned Threshold;
     /// If complete unrolling will reduce the cost of the loop, we will boost
@@ -1519,10 +1498,6 @@ public:
   /// @}
 
 private:
-  /// Estimate the latency of specified instruction.
-  /// Returns 1 as the default value.
-  InstructionCost getInstructionLatency(const Instruction *I) const;
-
   /// The abstract base class used to type erase specific TTI
   /// implementations.
   class Concept;
@@ -1549,9 +1524,9 @@ public:
   getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
                                    ProfileSummaryInfo *PSI,
                                    BlockFrequencyInfo *BFI) = 0;
-  virtual InstructionCost getUserCost(const User *U,
-                                      ArrayRef<const Value *> Operands,
-                                      TargetCostKind CostKind) = 0;
+  virtual InstructionCost getInstructionCost(const User *U,
+                                             ArrayRef<const Value *> Operands,
+                                             TargetCostKind CostKind) = 0;
   virtual BranchProbability getPredictableBranchThreshold() = 0;
   virtual bool hasBranchDivergence() = 0;
   virtual bool useGPUDivergenceAnalysis() = 0;
@@ -1866,7 +1841,6 @@ public:
   virtual bool supportsScalableVectors() const = 0;
   virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const = 0;
-  virtual InstructionCost getInstructionLatency(const Instruction *I) = 0;
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
 };
@@ -1901,9 +1875,10 @@ public:
   InstructionCost getMemcpyCost(const Instruction *I) override {
     return Impl.getMemcpyCost(I);
   }
-  InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
-                              TargetCostKind CostKind) override {
-    return Impl.getUserCost(U, Operands, CostKind);
+  InstructionCost getInstructionCost(const User *U,
+                                     ArrayRef<const Value *> Operands,
+                                     TargetCostKind CostKind) override {
+    return Impl.getInstructionCost(U, Operands, CostKind);
   }
   BranchProbability getPredictableBranchThreshold() override {
     return Impl.getPredictableBranchThreshold();
@@ -2518,10 +2493,6 @@ public:
     return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
   }
 
-  InstructionCost getInstructionLatency(const Instruction *I) override {
-    return Impl.getInstructionLatency(I);
-  }
-
   VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
     return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index abc9116a5a2a..620c2e4e93cf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -500,6 +500,12 @@ public:
       // FIXME: Unlikely to be true for CodeSize.
       return TTI::TCC_Expensive;
     }
+
+    // Assume a 3cy latency for fp arithmetic ops.
+    if (CostKind == TTI::TCK_Latency)
+      if (Ty->getScalarType()->isFloatingPointTy())
+        return 3;
+
     return 1;
   }
 
@@ -993,8 +999,9 @@ public:
     return TTI::TCC_Basic;
   }
 
-  InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
-                              TTI::TargetCostKind CostKind) {
+  InstructionCost getInstructionCost(const User *U,
+                                     ArrayRef<const Value *> Operands,
+                                     TTI::TargetCostKind CostKind) {
     using namespace llvm::PatternMatch;
 
     auto *TargetTTI = static_cast<T *>(this);
@@ -1097,6 +1104,9 @@ public:
                                         CostKind, I);
     }
     case Instruction::Load: {
+      // FIXME: Arbitary cost which could come from the backend.
+      if (CostKind == TTI::TCK_Latency)
+        return 4;
       auto *LI = cast<LoadInst>(U);
       Type *LoadType = U->getType();
       // If there is a non-register sized type, the cost estimation may expand
@@ -1248,39 +1258,10 @@ public:
       return TargetTTI->getVectorInstrCost(*EEI, DstTy, Idx);
     }
     }
-    // By default, just classify everything as 'basic'.
-    return TTI::TCC_Basic;
-  }
 
-  InstructionCost getInstructionLatency(const Instruction *I) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    if (getUserCost(I, Operands, TTI::TCK_Latency) == TTI::TCC_Free)
-      return 0;
-
-    if (isa<LoadInst>(I))
-      return 4;
-
-    Type *DstTy = I->getType();
-
-    // Usually an intrinsic is a simple instruction.
-    // A real function call is much slower.
-    if (auto *CI = dyn_cast<CallInst>(I)) {
-      const Function *F = CI->getCalledFunction();
-      if (!F || static_cast<T *>(this)->isLoweredToCall(F))
-        return 40;
-      // Some intrinsics return a value and a flag, we use the value type
-      // to decide its latency.
-      if (StructType *StructTy = dyn_cast<StructType>(DstTy))
-        DstTy = StructTy->getElementType(0);
-      // Fall through to simple instructions.
-    }
-
-    if (VectorType *VectorTy = dyn_cast<VectorType>(DstTy))
-      DstTy = VectorTy->getElementType();
-    if (DstTy->isFloatingPointTy())
-      return 3;
-
-    return 1;
+    // By default, just classify everything as 'basic' or -1 to represent that
+    // don't know the throughput cost.
+    return CostKind == TTI::TCK_RecipThroughput ? -1 : TTI::TCC_Basic;
   }
 };
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 2be86d040727..826fd9106f99 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -638,13 +638,6 @@ public:
         SimplifyAndSetOp);
   }
 
-  InstructionCost getInstructionLatency(const Instruction *I) {
-    if (isa<LoadInst>(I))
-      return getST()->getSchedModel().DefaultLoadLatency;
-
-    return BaseT::getInstructionLatency(I);
-  }
-
   virtual Optional<unsigned>
   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
     return Optional<unsigned>(
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index ded842b92ae1..2637e2f97dbb 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -177,7 +177,7 @@ void CodeMetrics::analyzeBasicBlock(
       if (InvI->cannotDuplicate())
         notDuplicatable = true;
 
-    NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
+    NumInsts += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
   }
 
   if (isa<ReturnInst>(BB->getTerminator()))
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 0992a4f119d6..86e0b4ba403b 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -1361,8 +1361,8 @@ bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
       Operands.push_back(SimpleOp);
     else
       Operands.push_back(Op);
-  return TTI.getUserCost(&GEP, Operands,
-                         TargetTransformInfo::TCK_SizeAndLatency) ==
+  return TTI.getInstructionCost(&GEP, Operands,
+                                TargetTransformInfo::TCK_SizeAndLatency) ==
          TargetTransformInfo::TCC_Free;
 }
 
@@ -1639,7 +1639,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
     SROAArgValues[&I] = SROAArg;
 
-  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
          TargetTransformInfo::TCC_Free;
 }
 
@@ -1662,7 +1662,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
     SROAArgValues[&I] = SROAArg;
 
-  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
          TargetTransformInfo::TCC_Free;
 }
 
@@ -1692,7 +1692,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
     break;
   }
 
-  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
          TargetTransformInfo::TCC_Free;
 }
 
@@ -2390,7 +2390,7 @@ bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
 bool CallAnalyzer::visitInstruction(Instruction &I) {
   // Some instructions are free. All of the free intrinsics can also be
   // handled by SROA, etc.
-  if (TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+  if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
       TargetTransformInfo::TCC_Free)
     return true;
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 382ed27f1e7b..001075d07bf8 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -221,10 +221,10 @@ unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
 }
 
 InstructionCost
-TargetTransformInfo::getUserCost(const User *U,
-                                 ArrayRef<const Value *> Operands,
-                                 enum TargetCostKind CostKind) const {
-  InstructionCost Cost = TTIImpl->getUserCost(U, Operands, CostKind);
+TargetTransformInfo::getInstructionCost(const User *U,
+                                        ArrayRef<const Value *> Operands,
+                                        enum TargetCostKind CostKind) const {
+  InstructionCost Cost = TTIImpl->getInstructionCost(U, Operands, CostKind);
   assert((CostKind == TTI::TCK_RecipThroughput || Cost >= 0) &&
          "TTI should not produce negative costs!");
   return Cost;
@@ -1149,11 +1149,6 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
-InstructionCost
-TargetTransformInfo::getInstructionLatency(const Instruction *I) const {
-  return TTIImpl->getInstructionLatency(I);
-}
-
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 92fe929da535..099be15be50f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -6603,8 +6603,8 @@ static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
   // If it's safe to speculatively execute, then it should not have side
   // effects; therefore, it's safe to sink and possibly *not* execute.
   return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
-         TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
-         TargetTransformInfo::TCC_Expensive;
+         TTI->getInstructionCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
+             TargetTransformInfo::TCC_Expensive;
 }
 
 /// Returns true if a SelectInst should be turned into an explicit branch.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 11f3dd3094b6..afafad702f32 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2340,8 +2340,8 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       }
 
       SmallVector<const Value*, 4> Operands(I.operand_values());
-      Cost +=
-        getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
+      Cost += getInstructionCost(&I, Operands,
+                                 TargetTransformInfo::TCK_SizeAndLatency);
     }
   }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index e11bef95e14e..13b96642a1ae 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -340,9 +340,10 @@ unsigned HexagonTTIImpl::getCacheLineSize() const {
   return ST.getL1CacheLineSize();
 }
 
-InstructionCost HexagonTTIImpl::getUserCost(const User *U,
-                                            ArrayRef<const Value *> Operands,
-                                            TTI::TargetCostKind CostKind) {
+InstructionCost
+HexagonTTIImpl::getInstructionCost(const User *U,
+                                   ArrayRef<const Value *> Operands,
+                                   TTI::TargetCostKind CostKind) {
   auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
     if (!CI->isIntegerCast())
       return false;
@@ -364,7 +365,7 @@ InstructionCost HexagonTTIImpl::getUserCost(const User *U,
   if (const CastInst *CI = dyn_cast<const CastInst>(U))
     if (isCastFoldedIntoLoad(CI))
       return TargetTransformInfo::TCC_Free;
-  return BaseT::getUserCost(U, Operands, CostKind);
+  return BaseT::getInstructionCost(U, Operands, CostKind);
 }
 
 bool HexagonTTIImpl::shouldBuildLookupTables() const {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 9d263642173a..e79e9d02e177 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -165,8 +165,9 @@ public:
 
   /// @}
 
-  InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
-                              TTI::TargetCostKind CostKind);
+  InstructionCost getInstructionCost(const User *U,
+                                     ArrayRef<const Value *> Operands,
+                                     TTI::TargetCostKind CostKind);
 
   // Hexagon specific decision to generate a lookup table.
   bool shouldBuildLookupTables() const;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 27ba6d780dd7..86c39cdf7f53 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -321,21 +321,21 @@ static bool isMMAType(Type *Ty) {
          (Ty->getPrimitiveSizeInBits() > 128);
 }
 
-InstructionCost PPCTTIImpl::getUserCost(const User *U,
-                                        ArrayRef<const Value *> Operands,
-                                        TTI::TargetCostKind CostKind) {
+InstructionCost PPCTTIImpl::getInstructionCost(const User *U,
+                                               ArrayRef<const Value *> Operands,
+                                               TTI::TargetCostKind CostKind) {
   // We already implement getCastInstrCost and getMemoryOpCost where we perform
   // the vector adjustment there.
   if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
-    return BaseT::getUserCost(U, Operands, CostKind);
+    return BaseT::getInstructionCost(U, Operands, CostKind);
 
   if (U->getType()->isVectorTy()) {
     // Instructions that need to be split should cost more.
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
-    return LT.first * BaseT::getUserCost(U, Operands, CostKind);
+    return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
   }
 
-  return BaseT::getUserCost(U, Operands, CostKind);
+  return BaseT::getInstructionCost(U, Operands, CostKind);
 }
 
 // Determining the address of a TLS variable results in a function call in
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 9a5b26bd1b90..6058ff9158c4 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -59,8 +59,9 @@ public:
                                       const APInt &Imm, Type *Ty,
                                       TTI::TargetCostKind CostKind);
 
-  InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands,
-                              TTI::TargetCostKind CostKind);
+  InstructionCost getInstructionCost(const User *U,
+                                     ArrayRef<const Value *> Operands,
+                                     TTI::TargetCostKind CostKind);
 
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index cd414a761121..ef26a82ce358 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -492,8 +492,8 @@ void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       }
 
       SmallVector<const Value *> Operands(I.operand_values());
-      Cost +=
-          getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
+      Cost += getInstructionCost(&I, Operands,
+                                 TargetTransformInfo::TCK_SizeAndLatency);
     }
   }
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d409885a9131..c5bb48edd67f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2621,6 +2621,11 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                CmpInst::Predicate VecPred,
                                                TTI::TargetCostKind CostKind,
                                                const Instruction *I) {
+  // Assume a 3cy latency for fp select ops.
+  if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
+    if (ValTy->getScalarType()->isFloatingPointTy())
+      return 3;
+
   // TODO: Handle other cost kinds.
   if (CostKind != TTI::TCK_RecipThroughput)
     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 5f9f52e6503e..fbc963885091 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -573,7 +573,8 @@ private:
     if (!I)
       return std::numeric_limits<unsigned>::min();
 
-    auto Cost = TTI.getUserCost(U, TargetTransformInfo::TCK_SizeAndLatency);
+    InstructionCost Cost =
+        TTI.getInstructionCost(U, TargetTransformInfo::TCK_SizeAndLatency);
 
     // Traverse recursively if there are more uses.
     // TODO: Any other instructions to be added here?
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 3fc078a05e32..a23f5a96646b 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -561,8 +561,8 @@ static unsigned getJumpThreadDuplicationCost(const TargetTransformInfo *TTI,
       if (CI->cannotDuplicate() || CI->isConvergent())
         return ~0U;
 
-    if (TTI->getUserCost(&*I, TargetTransformInfo::TCK_SizeAndLatency)
-            == TargetTransformInfo::TCC_Free)
+    if (TTI->getInstructionCost(&*I, TargetTransformInfo::TCK_SizeAndLatency) ==
+        TargetTransformInfo::TCC_Free)
       continue;
 
     // All other instructions count for at least one unit.
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 8e114b998354..ef0dab301871 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1317,13 +1317,13 @@ static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
 static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
                          const TargetTransformInfo *TTI) {
   InstructionCost CostI =
-      TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+      TTI->getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
 
   if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
     if (CostI != TargetTransformInfo::TCC_Free)
       return false;
-    // For a GEP, we cannot simply use getUserCost because currently it
-    // optimistically assumes that a GEP will fold into addressing mode
+    // For a GEP, we cannot simply use getInstructionCost because currently
+    // it optimistically assumes that a GEP will fold into addressing mode
     // regardless of its users.
     const BasicBlock *BB = GEP->getParent();
     for (const User *U : GEP->users()) {
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index aa75af240631..c530e77c844f 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -552,7 +552,7 @@ checkOuterLoopInsts(FlattenInfo &FI,
                             m_Specific(FI.InnerTripCount))))
         continue;
       InstructionCost Cost =
-          TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+          TTI->getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
       LLVM_DEBUG(dbgs() << "Cost " << Cost << ": "; I.dump());
       RepeatedInstrCost += Cost;
     }
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f993282b2e7d..5f958eebfb64 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -443,7 +443,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
 
         // First accumulate the cost of this instruction.
         if (!Cost.IsFree) {
-          UnrolledCost += TTI.getUserCost(I, CostKind);
+          UnrolledCost += TTI.getInstructionCost(I, CostKind);
           LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
                             << Iteration << "): ");
           LLVM_DEBUG(I->dump());
@@ -537,7 +537,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
 
         // Track this instruction's expected baseline cost when executing the
         // rolled loop form.
-        RolledDynamicCost += TTI.getUserCost(&I, CostKind);
+        RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);
 
         // Visit the instruction to analyze its loop cost after unrolling,
         // and if the visitor returns true, mark the instruction as free after
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 8a4477d16d5c..70f97d807cb5 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2878,7 +2878,7 @@ static bool unswitchBestCondition(
         if (CB->isConvergent() || CB->cannotDuplicate())
           return false;
 
-      Cost += TTI.getUserCost(&I, CostKind);
+      Cost += TTI.getInstructionCost(&I, CostKind);
     }
     assert(Cost >= 0 && "Must not have negative costs!");
     LoopCost += Cost;
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index 9ac4608134c2..65f8d760ede3 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -252,7 +252,7 @@ static InstructionCost ComputeSpeculationCost(const Instruction *I,
     case Instruction::ShuffleVector:
     case Instruction::ExtractValue:
     case Instruction::InsertValue:
-      return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+      return TTI.getInstructionCost(I, TargetTransformInfo::TCK_SizeAndLatency);
 
     default:
       return InstructionCost::getInvalid(); // Disallow anything not explicitly
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 76f09c0bb14d..21f8d54703de 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -381,7 +381,7 @@ static InstructionCost computeSpeculationCost(const User *I,
   assert((!isa<Instruction>(I) ||
           isSafeToSpeculativelyExecute(cast<Instruction>(I))) &&
          "Instruction is not safe to speculatively execute!");
-  return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+  return TTI.getInstructionCost(I, TargetTransformInfo::TCK_SizeAndLatency);
 }
 
 /// If we have a merge point of an "if condition" as accepted above,
@@ -3626,8 +3626,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
 
     // Account for the cost of duplicating this instruction into each
     // predecessor. Ignore free instructions.
-    if (!TTI ||
-        TTI->getUserCost(&I, CostKind) != TargetTransformInfo::TCC_Free) {
+    if (!TTI || TTI->getInstructionCost(&I, CostKind) !=
+                    TargetTransformInfo::TCC_Free) {
       NumBonusInsts += PredCount;
 
       // Early exits once we reach the limit.
@@ -3799,7 +3799,8 @@ static bool mergeConditionalStoreToAddress(
         return false; // Not in white-list - not worthwhile folding.
       // And finally, if this is a non-free instruction that we are okay
       // speculating, ensure that we consider the speculation budget.
-      Cost += TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+      Cost +=
+          TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
       if (Cost > Budget)
         return false; // Eagerly refuse to fold as soon as we're out of budget.
     }
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-math.ll b/llvm/test/Analysis/CostModel/AArch64/sve-math.ll
index f45ed57564b7..551fc89c93ae 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-math.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-math.ll
@@ -33,7 +33,7 @@ define <vscale x 2 x double> @sqrt_v2f64(<vscale x 2 x double> %a) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <vscale x 2 x double> %r
 ;
 ; LATE-LABEL: 'sqrt_v2f64'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <vscale x 2 x double> %r
 ;
 ; SIZE-LABEL: 'sqrt_v2f64'
diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
index a91c4627483e..1e48a157c906 100644
--- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
@@ -49,8 +49,8 @@ define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'smax'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'smax'
@@ -75,8 +75,8 @@ define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'fmuladd'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmuladd'
@@ -101,8 +101,8 @@ define void @log2(float %a, <16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'log2'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %s = call float @llvm.log2.f32(float %a)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'log2'
@@ -127,8 +127,8 @@ define void @constrained_fadd(float %a, <16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'constrained_fadd'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; LATE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'constrained_fadd'
@@ -153,8 +153,8 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'fmaximum'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmaximum'
@@ -180,7 +180,7 @@ define void @cttz(i32 %a, <16 x i32> %va) {
 ;
 ; LATE-LABEL: 'cttz'
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'cttz'
@@ -206,7 +206,7 @@ define void @ctlz(i32 %a, <16 x i32> %va) {
 ;
 ; LATE-LABEL: 'ctlz'
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'ctlz'
@@ -231,8 +231,8 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'fshl'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 250 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fshl'
@@ -256,7 +256,7 @@ define void @maskedgather(<16 x float*> %va, <16 x i1> %vb, <16 x float> %vc) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedgather'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maskedgather'
@@ -277,7 +277,7 @@ define void @maskedscatter(<16 x float> %va, <16 x float*> %vb, <16 x i1> %vc) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedscatter'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maskedscatter'
@@ -298,7 +298,7 @@ define void @reduce_fmax(<16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'reduce_fmax'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fmax'
@@ -319,7 +319,7 @@ define void @memcpy(i8* %a, i8* %b, i32 %c) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'memcpy'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %a, i8* align 1 %b, i32 32, i1 false)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %a, i8* align 1 %b, i32 32, i1 false)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'memcpy'
diff --git a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll b/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
index 97add5ec2dc5..36cc4642b4c9 100644
--- a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
@@ -16,7 +16,7 @@ define void @intrinsics() {
 ;
 ; CHECK-THUMB2-LAT-LABEL: 'intrinsics'
 ; CHECK-THUMB2-LAT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-LAT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0f16(half* undef)
+; CHECK-THUMB2-LAT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0f16(half* undef)
 ; CHECK-THUMB2-LAT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
 ; CHECK-THUMB2-LAT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
 ; CHECK-THUMB2-LAT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/SystemZ/ext-of-icmp-cost.ll b/llvm/test/Analysis/CostModel/SystemZ/ext-of-icmp-cost.ll
index 7cb0d7f4c905..921c84731609 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/ext-of-icmp-cost.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/ext-of-icmp-cost.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output \
 ; RUN:   -mtriple=s390x-unknown-linux -mcpu=z13  | FileCheck %s
 ;
-; Check that getUserCost() does not return TCC_Free for extensions of
+; Check that getInstructionCost() does not return TCC_Free for extensions of
 ; i1 returned from icmp.
 
 define i64 @fun1(i64 %v) {
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp-costkinds.ll b/llvm/test/Analysis/CostModel/X86/arith-fp-costkinds.ll
index e20dde0797fc..c8a45d8163da 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp-costkinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp-costkinds.ll
@@ -1,15 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 -cost-kind=latency      < %s | FileCheck %s --check-prefixes=LATE
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 -cost-kind=code-size    < %s | FileCheck %s --check-prefixes=SIZE,SSE-SIZE
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 -cost-kind=size-latency < %s | FileCheck %s --check-prefixes=SIZE_LATE,SSE-SIZE_LATE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 -cost-kind=latency      < %s | FileCheck %s --check-prefixes=CHECK,LATE,SSE-LATE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 -cost-kind=code-size    < %s | FileCheck %s --check-prefixes=CHECK,SIZE,SSE-SIZE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 -cost-kind=size-latency < %s | FileCheck %s --check-prefixes=CHECK,SIZE_LATE,SSE-SIZE_LATE
 ;
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx  -cost-kind=latency      < %s | FileCheck %s --check-prefixes=LATE
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx  -cost-kind=code-size    < %s | FileCheck %s --check-prefixes=SIZE,AVX-SIZE
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx  -cost-kind=size-latency < %s | FileCheck %s --check-prefixes=SIZE_LATE,AVX-SIZE_LATE,AVX1-SIZE_LATE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx  -cost-kind=latency      < %s | FileCheck %s --check-prefixes=CHECK,LATE,AVX-LATE,AVX1-LATE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx  -cost-kind=code-size    < %s | FileCheck %s --check-prefixes=CHECK,SIZE,AVX-SIZE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx  -cost-kind=size-latency < %s | FileCheck %s --check-prefixes=CHECK,SIZE_LATE,AVX-SIZE_LATE,AVX1-SIZE_LATE
 ;
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 -cost-kind=latency      < %s | FileCheck %s --check-prefixes=LATE
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 -cost-kind=code-size    < %s | FileCheck %s --check-prefixes=SIZE,AVX-SIZE
-; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 -cost-kind=size-latency < %s | FileCheck %s --check-prefixes=SIZE_LATE,AVX-SIZE_LATE,AVX2-SIZE_LATE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 -cost-kind=latency      < %s | FileCheck %s --check-prefixes=CHECK,LATE,AVX-LATE,AVX2-LATE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 -cost-kind=code-size    < %s | FileCheck %s --check-prefixes=CHECK,SIZE,AVX-SIZE
+; RUN: opt -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 -cost-kind=size-latency < %s | FileCheck %s --check-prefixes=CHECK,SIZE_LATE,AVX-SIZE_LATE,AVX2-SIZE_LATE
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -250,38 +250,16 @@ define i32 @fmul(i32 %arg) {
 }
 
 define i32 @fdiv(i32 %arg) {
-; LATE-LABEL: 'fdiv'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = fdiv float undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = fdiv <16 x float> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = fdiv double undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = fdiv <8 x double> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SIZE-LABEL: 'fdiv'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = fdiv float undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = fdiv double undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SIZE_LATE-LABEL: 'fdiv'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = fdiv float undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = fdiv double undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'fdiv'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = fdiv float undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = fdiv <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = fdiv double undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = fdiv <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = fdiv float undef, undef
   %V4F32 = fdiv <4 x float> undef, undef
@@ -297,38 +275,16 @@ define i32 @fdiv(i32 %arg) {
 }
 
 define i32 @frem(i32 %arg) {
-; LATE-LABEL: 'frem'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = frem float undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = frem <4 x float> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = frem <8 x float> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = frem <16 x float> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = frem double undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = frem <2 x double> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = frem <4 x double> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = frem <8 x double> undef, undef
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SIZE-LABEL: 'frem'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = frem <16 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = frem <4 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = frem <8 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; SIZE_LATE-LABEL: 'frem'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = frem <16 x float> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = frem <4 x double> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = frem <8 x double> undef, undef
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'frem'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = frem <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = frem <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = frem <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = frem <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = frem <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = frem float undef, undef
   %V4F32 = frem <4 x float> undef, undef
@@ -344,16 +300,16 @@ define i32 @frem(i32 %arg) {
 }
 
 define i32 @fsqrt(i32 %arg) {
-; LATE-LABEL: 'fsqrt'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LATE-LABEL: 'fsqrt'
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE-SIZE-LABEL: 'fsqrt'
 ; SSE-SIZE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
@@ -377,6 +333,17 @@ define i32 @fsqrt(i32 %arg) {
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; AVX1-LATE-LABEL: 'fsqrt'
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; AVX1-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-SIZE_LATE-LABEL: 'fsqrt'
 ; AVX1-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
 ; AVX1-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -388,6 +355,17 @@ define i32 @fsqrt(i32 %arg) {
 ; AVX1-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
 ; AVX1-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; AVX2-LATE-LABEL: 'fsqrt'
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; AVX2-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX2-SIZE_LATE-LABEL: 'fsqrt'
 ; AVX2-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
 ; AVX2-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -413,16 +391,16 @@ define i32 @fsqrt(i32 %arg) {
 }
 
 define i32 @fabs(i32 %arg) {
-; LATE-LABEL: 'fabs'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.fabs.f32(float undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.fabs.f64(double undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LATE-LABEL: 'fabs'
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.fabs.f32(float undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.fabs.f64(double undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE-SIZE-LABEL: 'fabs'
 ; SSE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.fabs.f32(float undef)
@@ -446,6 +424,17 @@ define i32 @fabs(i32 %arg) {
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; AVX-LATE-LABEL: 'fabs'
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.fabs.f32(float undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.fabs.f64(double undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX-SIZE-LABEL: 'fabs'
 ; AVX-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.fabs.f32(float undef)
 ; AVX-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
@@ -482,16 +471,16 @@ define i32 @fabs(i32 %arg) {
 }
 
 define i32 @fcopysign(i32 %arg) {
-; LATE-LABEL: 'fcopysign'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LATE-LABEL: 'fcopysign'
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE-SIZE-LABEL: 'fcopysign'
 ; SSE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
@@ -515,6 +504,17 @@ define i32 @fcopysign(i32 %arg) {
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; AVX-LATE-LABEL: 'fcopysign'
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX-SIZE-LABEL: 'fcopysign'
 ; AVX-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
 ; AVX-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
@@ -551,16 +551,16 @@ define i32 @fcopysign(i32 %arg) {
 }
 
 define i32 @fma(i32 %arg) {
-; LATE-LABEL: 'fma'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LATE-LABEL: 'fma'
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; SSE-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE-SIZE-LABEL: 'fma'
 ; SSE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
@@ -584,6 +584,17 @@ define i32 @fma(i32 %arg) {
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
 ; SSE-SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; AVX-LATE-LABEL: 'fma'
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; AVX-LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX-SIZE-LABEL: 'fma'
 ; AVX-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
 ; AVX-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/costmodel.ll b/llvm/test/Analysis/CostModel/X86/costmodel.ll
index 4a96ca102f96..336a2f6be7bb 100644
--- a/llvm/test/Analysis/CostModel/X86/costmodel.ll
+++ b/llvm/test/Analysis/CostModel/X86/costmodel.ll
@@ -19,7 +19,7 @@ define i64 @foo(i64 %arg) {
 ; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %P2I = ptrtoint i8* undef to i64
 ; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %TC = trunc i64 undef to i32
 ; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
-; LATENCY-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void undef()
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void undef()
 ; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 undef
 ;
 ; CODESIZE-LABEL: 'foo'
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 78b1855f1353..8d68623e2527 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -55,8 +55,8 @@ define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'umul'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'umul'
@@ -81,8 +81,8 @@ define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'smax'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'smax'
@@ -107,8 +107,8 @@ define void @fcopysign(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'fcopysign'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.copysign.f32(float %a, float %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.copysign.v16f32(<16 x float> %va, <16 x float> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.copysign.f32(float %a, float %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.copysign.v16f32(<16 x float> %va, <16 x float> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fcopysign'
@@ -133,8 +133,8 @@ define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'fmuladd'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmuladd'
@@ -159,8 +159,8 @@ define void @log2(float %a, <16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'log2'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.log2.f32(float %a)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'log2'
@@ -185,8 +185,8 @@ define void @constrained_fadd(float %a, <16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'constrained_fadd'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; LATE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'constrained_fadd'
@@ -211,8 +211,8 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'fmaximum'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmaximum'
@@ -237,8 +237,8 @@ define void @cttz(i32 %a, <16 x i32> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'cttz'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'cttz'
@@ -263,8 +263,8 @@ define void @ctlz(i32 %a, <16 x i32> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'ctlz'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'ctlz'
@@ -289,8 +289,8 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'fshl'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fshl'
@@ -314,7 +314,7 @@ define void @maskedgather(<16 x float*> %va, <16 x i1> %vb, <16 x float> %vc) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedgather'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maskedgather'
@@ -335,7 +335,7 @@ define void @maskedscatter(<16 x float> %va, <16 x float*> %vb, <16 x i1> %vc) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedscatter'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maskedscatter'
@@ -356,7 +356,7 @@ define void @reduce_fmax(<16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'reduce_fmax'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fmax'
@@ -377,7 +377,7 @@ define void @reduce_fmul(<16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'reduce_fmul'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v = call float @llvm.vector.reduce.fmul.v16f32(float 4.200000e+01, <16 x float> %va)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fmul'
@@ -398,7 +398,7 @@ define void @reduce_fadd_fast(<16 x float> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'reduce_fadd_fast'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %va)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %va)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fadd_fast'
@@ -419,7 +419,7 @@ define void @memcpy(i8* %a, i8* %b, i32 %c) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'memcpy'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %a, i8* align 1 %b, i32 32, i1 false)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %a, i8* align 1 %b, i32 32, i1 false)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'memcpy'