Irreducible loop metadata for more accurate block frequency under PGO.

Summary:
Currently the block frequency analysis is an approximation for irreducible
loops.

The new irreducible loop metadata is used to annotate the irreducible loop
headers with their header weights based on the PGO profile (currently this is
approximated to be evenly weighted) and to help improve the accuracy of the
block frequency analysis for irreducible loops.

This patch is a basic support for this.

Reviewers: davidxl

Reviewed By: davidxl

Subscribers: mehdi_amini, llvm-commits, eraman

Differential Revision: https://reviews.llvm.org/D39028

llvm-svn: 317278
This commit is contained in:
Hiroshi Yamauchi 2017-11-02 22:26:51 +00:00
parent 64b6e5af13
commit dce9def3dd
21 changed files with 600 additions and 10 deletions

View File

@ -5194,6 +5194,29 @@ the loop identifier metadata node directly:
!1 = !{!1} ; an identifier for the inner loop
!2 = !{!2} ; an identifier for the outer loop
'``irr_loop``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^
``irr_loop`` metadata may be attached to the terminator instruction of a basic
block that's an irreducible loop header (note that an irreducible loop has more
than once header basic blocks.) If ``irr_loop`` metadata is attached to the
terminator instruction of a basic block that is not really an irreducible loop
header, the behavior is undefined. The intent of this metadata is to improve the
accuracy of the block frequency propagation. For example, in the code below, the
block ``header0`` may have a loop header weight (relative to the other headers of
the irreducible loop) of 100:
.. code-block:: llvm
header0:
...
br i1 %cmp, label %t1, label %t2, !irr_loop !0
...
!0 = !{"loop_header_weight", i64 100}
Irreducible loop header weights are typically based on profile data.
'``invariant.group``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -75,6 +75,10 @@ public:
/// the enclosing function's count (if available) and returns the value.
Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
/// \brief Returns true if \p BB is an irreducible loop header
/// block. Otherwise false.
bool isIrrLoopHeader(const BasicBlock *BB);
// Set the frequency of the given basic block.
void setBlockFreq(const BasicBlock *BB, uint64_t Freq);

View File

@ -20,6 +20,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
@ -414,6 +415,10 @@ public:
/// \brief Data about each block. This is used downstream.
std::vector<FrequencyData> Freqs;
/// \brief Whether each block is an irreducible loop header.
/// This is used downstream.
SparseBitVector<> IsIrrLoopHeader;
/// \brief Loop data: see initializeLoops().
std::vector<WorkingData> Working;
@ -492,6 +497,8 @@ public:
/// the backedges going into each of the loop headers.
void adjustLoopHeaderMass(LoopData &Loop);
void distributeIrrLoopHeaderMass(Distribution &Dist);
/// \brief Package up a loop.
void packageLoop(LoopData &Loop);
@ -520,6 +527,7 @@ public:
const BlockNode &Node) const;
Optional<uint64_t> getProfileCountFromFreq(const Function &F,
uint64_t Freq) const;
bool isIrrLoopHeader(const BlockNode &Node);
void setBlockFreq(const BlockNode &Node, uint64_t Freq);
@ -973,6 +981,10 @@ public:
return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq);
}
bool isIrrLoopHeader(const BlockT *BB) {
return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
}
void setBlockFreq(const BlockT *BB, uint64_t Freq);
Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
@ -1140,17 +1152,39 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
if (Loop.isIrreducible()) {
BlockMass Remaining = BlockMass::getFull();
DEBUG(dbgs() << "isIrreducible = true\n");
Distribution Dist;
unsigned NumHeadersWithWeight = 0;
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
auto &Mass = Working[Loop.Nodes[H].Index].getMass();
Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H);
Remaining -= Mass;
auto &HeaderNode = Loop.Nodes[H];
const BlockT *Block = getBlock(HeaderNode);
IsIrrLoopHeader.set(Loop.Nodes[H].Index);
Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
if (!HeaderWeight)
continue;
DEBUG(dbgs() << getBlockName(HeaderNode)
<< " has irr loop header weight " << HeaderWeight.getValue()
<< "\n");
NumHeadersWithWeight++;
uint64_t HeaderWeightValue = HeaderWeight.getValue();
if (HeaderWeightValue)
Dist.addLocal(HeaderNode, HeaderWeightValue);
}
if (NumHeadersWithWeight != Loop.NumHeaders) {
// Not all headers have a weight metadata. Distribute weight evenly.
Dist = Distribution();
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
auto &HeaderNode = Loop.Nodes[H];
Dist.addLocal(HeaderNode, 1);
}
}
distributeIrrLoopHeaderMass(Dist);
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
adjustLoopHeaderMass(Loop);
if (NumHeadersWithWeight != Loop.NumHeaders)
// Not all headers have a weight metadata. Adjust header mass.
adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
@ -1285,6 +1319,9 @@ raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
BlockFrequencyInfoImplBase::getBlockProfileCount(
*F->getFunction(), getNode(&BB)))
OS << ", count = " << ProfileCount.getValue();
if (Optional<uint64_t> IrrLoopHeaderWeight =
BB.getIrrLoopHeaderWeight())
OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue();
OS << "\n";
}

View File

@ -97,6 +97,8 @@ private:
using const_probability_iterator =
std::vector<BranchProbability>::const_iterator;
Optional<uint64_t> IrrLoopHeaderWeight;
/// Keep track of the physical registers that are livein of the basicblock.
using LiveInVector = std::vector<RegisterMaskPair>;
LiveInVector LiveIns;
@ -729,6 +731,14 @@ public:
/// Return the MCSymbol for this basic block.
MCSymbol *getSymbol() const;
Optional<uint64_t> getIrrLoopHeaderWeight() const {
return IrrLoopHeaderWeight;
}
void setIrrLoopHeaderWeight(uint64_t Weight) {
IrrLoopHeaderWeight = Weight;
}
private:
/// Return probability iterator corresponding to the I successor iterator.
probability_iterator getProbabilityIterator(succ_iterator I);

View File

@ -62,6 +62,8 @@ public:
Optional<uint64_t> getBlockProfileCount(const MachineBasicBlock *MBB) const;
Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
bool isIrrLoopHeader(const MachineBasicBlock *MBB);
const MachineFunction *getFunction() const;
const MachineBranchProbabilityInfo *getMBPI() const;
void view(const Twine &Name, bool isSimple = true) const;

View File

@ -398,6 +398,8 @@ public:
/// \brief Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() const;
Optional<uint64_t> getIrrLoopHeaderWeight() const;
private:
/// \brief Increment the internal refcount of the number of BlockAddresses
/// referencing this BasicBlock by \p Amt.

View File

@ -101,6 +101,7 @@ public:
MD_absolute_symbol = 21, // "absolute_symbol"
MD_associated = 22, // "associated"
MD_callees = 23, // "callees"
MD_irr_loop = 24, // "irr_loop"
};
/// Known operand bundle tag IDs, which always have the same value. All

View File

@ -173,6 +173,9 @@ public:
/// base type, access type and offset relative to the base type.
MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
uint64_t Offset, bool IsConstant = false);
/// \brief Return metadata containing an irreducible loop header weight.
MDNode *createIrrLoopHeaderWeight(uint64_t Weight);
};
} // end namespace llvm

View File

@ -68,6 +68,8 @@ public:
void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
uint64_t MaxCount);
void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_PGOINSTRUMENTATION_H

View File

@ -218,6 +218,11 @@ BlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
return BFI->getProfileCountFromFreq(*getFunction(), Freq);
}
bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) {
assert(BFI && "Expected analysis to be available");
return BFI->isIrrLoopHeader(BB);
}
void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
assert(BFI && "Expected analysis to be available");
BFI->setBlockFreq(BB, Freq);

View File

@ -271,6 +271,7 @@ void BlockFrequencyInfoImplBase::clear() {
// Swap with a default-constructed std::vector, since std::vector<>::clear()
// does not actually clear heap storage.
std::vector<FrequencyData>().swap(Freqs);
IsIrrLoopHeader.clear();
std::vector<WorkingData>().swap(Working);
Loops.clear();
}
@ -280,8 +281,10 @@ void BlockFrequencyInfoImplBase::clear() {
/// Releases all memory not used downstream. In particular, saves Freqs.
static void cleanup(BlockFrequencyInfoImplBase &BFI) {
std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader));
BFI.clear();
BFI.Freqs = std::move(SavedFreqs);
BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader);
}
bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
@ -572,6 +575,13 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F,
return BlockCount.getLimitedValue();
}
bool
BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) {
if (!Node.isValid())
return false;
return IsIrrLoopHeader.test(Node.Index);
}
Scaled64
BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
if (!Node.isValid())
@ -819,3 +829,14 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
}
}
void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) {
BlockMass LoopMass = BlockMass::getFull();
DitheringDistributer D(Dist, LoopMass);
for (const Weight &W : Dist.Weights) {
BlockMass Taken = D.takeMass(W.Amount);
assert(W.Type == Weight::Local && "all weights should be local");
Working[W.TargetNode.Index].getMass() = Taken;
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
}
}

View File

@ -42,6 +42,8 @@ using namespace llvm;
MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B)
: BB(B), Number(-1), xParent(&MF) {
Insts.Parent = this;
if (B)
IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight();
}
MachineBasicBlock::~MachineBasicBlock() {
@ -338,6 +340,12 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
OS << '\n';
}
if (IrrLoopHeaderWeight) {
if (Indexes) OS << '\t';
OS << " Irreducible loop header weight: "
<< IrrLoopHeaderWeight.getValue();
OS << '\n';
}
}
void MachineBasicBlock::printAsOperand(raw_ostream &OS,

View File

@ -234,6 +234,12 @@ MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
return MBFI ? MBFI->getProfileCountFromFreq(*F, Freq) : None;
}
bool
MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) {
assert(MBFI && "Expected analysis to be available");
return MBFI->isIrrLoopHeader(MBB);
}
const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
return MBFI ? MBFI->getFunction() : nullptr;
}

View File

@ -447,3 +447,16 @@ bool BasicBlock::isLandingPad() const {
const LandingPadInst *BasicBlock::getLandingPadInst() const {
return dyn_cast<LandingPadInst>(getFirstNonPHI());
}
Optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
const TerminatorInst *TI = getTerminator();
if (MDNode *MDIrrLoopHeader =
TI->getMetadata(LLVMContext::MD_irr_loop)) {
MDString *MDName = cast<MDString>(MDIrrLoopHeader->getOperand(0));
if (MDName->getString().equals("loop_header_weight")) {
auto *CI = mdconst::extract<ConstantInt>(MDIrrLoopHeader->getOperand(1));
return Optional<uint64_t>(CI->getValue().getZExtValue());
}
}
return Optional<uint64_t>();
}

View File

@ -60,6 +60,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
{MD_absolute_symbol, "absolute_symbol"},
{MD_associated, "associated"},
{MD_callees, "callees"},
{MD_irr_loop, "irr_loop"},
};
for (auto &MDKind : MDKinds) {

View File

@ -197,3 +197,10 @@ MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
}
return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)});
}
MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) {
SmallVector<Metadata *, 2> Vals(2);
Vals[0] = createString("loop_header_weight");
Vals[1] = createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight));
return MDNode::get(Context, Vals);
}

View File

@ -844,8 +844,9 @@ public:
PGOUseFunc(Function &Func, Module *Modu,
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
BranchProbabilityInfo *BPI = nullptr,
BlockFrequencyInfo *BFI = nullptr)
: F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI),
BlockFrequencyInfo *BFIin = nullptr)
: F(Func), M(Modu), BFI(BFIin),
FuncInfo(Func, ComdatMembers, false, BPI, BFIin),
FreqAttr(FFA_Normal) {}
// Read counts for the instrumented BB from profile.
@ -863,6 +864,9 @@ public:
// Annotate the value profile call sites for one value kind.
void annotateValueSites(uint32_t Kind);
// Annotate the irreducible loop header weights.
void annotateIrrLoopHeaderWeights();
// The hotness of the function from the profile count.
enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
@ -894,6 +898,7 @@ public:
private:
Function &F;
Module *M;
BlockFrequencyInfo *BFI;
// This member stores the shared information with class PGOGenFunc.
FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
@ -1183,6 +1188,18 @@ void PGOUseFunc::setBranchWeights() {
}
}
void PGOUseFunc::annotateIrrLoopHeaderWeights() {
DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
// Find irr loop headers
for (auto &BB : F) {
if (BFI->isIrrLoopHeader(&BB)) {
TerminatorInst *TI = BB.getTerminator();
const UseBBInfo &BBCountInfo = getBBInfo(&BB);
setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
}
}
}
void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
Module *M = F.getParent();
IRBuilder<> Builder(&SI);
@ -1441,6 +1458,7 @@ static bool annotateAllFunctions(
Func.populateCounters();
Func.setBranchWeights();
Func.annotateValueSites();
Func.annotateIrrLoopHeaderWeights();
PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
if (FreqAttr == PGOUseFunc::FFA_Cold)
ColdFunctions.push_back(&F);
@ -1582,6 +1600,12 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
namespace llvm {
void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
MDBuilder MDB(M->getContext());
TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
MDB.createIrrLoopHeaderWeight(Count));
}
template <> struct GraphTraits<PGOUseFunc *> {
using NodeRef = const BasicBlock *;
using ChildIteratorType = succ_const_iterator;

View File

@ -0,0 +1,208 @@
; RUN: opt < %s -analyze -block-freq | FileCheck %s
; RUN: opt < %s -passes='print<block-freq>' -disable-output 2>&1 | FileCheck %s
; Function Attrs: noinline norecurse nounwind readnone uwtable
define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr !prof !27 {
entry:
%cmp24 = icmp sgt i32 %iter_outer, 0
br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge, !prof !28
entry.for.cond.cleanup_crit_edge: ; preds = %entry
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.end, %entry.for.cond.cleanup_crit_edge
%sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
ret i32 %sum.0.lcssa
for.body: ; preds = %for.end, %entry
%k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
%sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
%rem23 = and i32 %k.026, 1
%cmp1 = icmp eq i32 %rem23, 0
br i1 %cmp1, label %entry8, label %for.cond2, !prof !29
for.cond2: ; preds = %if.end9, %for.body
%sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
%i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
%cmp3 = icmp slt i32 %i.0, %iter_inner
br i1 %cmp3, label %for.body4, label %for.end, !prof !30, !irr_loop !31
for.body4: ; preds = %for.cond2
%rem5 = srem i32 %k.026, 3
%cmp6 = icmp eq i32 %rem5, 0
br i1 %cmp6, label %entry8, label %if.end9, !prof !32
entry8: ; preds = %for.body4, %for.body
%sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
%i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
%add = add nsw i32 %sum.2, 4
br label %if.end9, !irr_loop !33
if.end9: ; preds = %entry8, %for.body4
%sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
%i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
%add10 = add nsw i32 %sum.3, 1
%inc = add nsw i32 %i.2, 1
br label %for.cond2, !irr_loop !34
for.end: ; preds = %for.cond2
%inc12 = add nuw nsw i32 %k.026, 1
%exitcond = icmp eq i32 %inc12, %iter_outer
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !35
}
!27 = !{!"function_entry_count", i64 1}
!28 = !{!"branch_weights", i32 1, i32 0}
!29 = !{!"branch_weights", i32 50, i32 50}
!30 = !{!"branch_weights", i32 950, i32 100}
!31 = !{!"loop_header_weight", i64 1050}
!32 = !{!"branch_weights", i32 323, i32 627}
!33 = !{!"loop_header_weight", i64 373}
!34 = !{!"loop_header_weight", i64 1000}
!35 = !{!"branch_weights", i32 1, i32 99}
; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreducibleii':
; CHECK-NEXT: block-frequency-info: _Z11irreducibleii
; CHECK-NEXT: - entry: {{.*}} count = 1
; CHECK-NEXT: - entry.for.cond.cleanup_crit_edge: {{.*}} count = 0
; CHECK-NEXT: - for.cond.cleanup: {{.*}} count = 1
; CHECK-NEXT: - for.body: {{.*}} count = 100
; CHECK-NEXT: - for.cond2: {{.*}} count = 1050, irr_loop_header_weight = 1050
; CHECK-NEXT: - for.body4: {{.*}} count = 950
; CHECK-NEXT: - entry8: {{.*}} count = 373, irr_loop_header_weight = 373
; CHECK-NEXT: - if.end9: {{.*}} count = 1000, irr_loop_header_weight = 1000
; CHECK-NEXT: - for.end: {{.*}} count = 100
@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
@tracing = local_unnamed_addr global i32 0, align 4
; Function Attrs: noinline norecurse nounwind uwtable
define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) !prof !27 {
entry:
store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
%0 = load i32, i32* @tracing, align 4
%tobool = icmp eq i32 %0, 0
br label %for.cond1
for.cond1: ; preds = %sw.default, %entry
%p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
%sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
%incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
%1 = load i8, i8* %p.addr.0, align 1
%incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
%2 = load i8, i8* %incdec.ptr, align 1
%conv3 = zext i8 %2 to i32
br label %dispatch_op
dispatch_op: ; preds = %sw.bb6, %for.cond1
%p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
%op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
%oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
%sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
switch i8 %op.0, label %sw.default [
i8 0, label %sw.bb
i8 1, label %dispatch_op.sw.bb6_crit_edge
i8 2, label %sw.bb15
], !prof !36
dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
br label %sw.bb6
sw.bb: ; preds = %indirectgoto, %dispatch_op
%oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
%sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
%add.neg = sub i32 -5, %oparg.1
%sub = add i32 %add.neg, %sum.2
br label %exit
TARGET_1: ; preds = %indirectgoto
%incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
%3 = load i8, i8* %p.addr.5, align 1
%conv5 = zext i8 %3 to i32
br label %sw.bb6
sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
%p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
%oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
%sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
%mul = mul nsw i32 %oparg.2, 7
%add7 = add nsw i32 %sum.3, %mul
%rem46 = and i32 %add7, 1
%cmp8 = icmp eq i32 %rem46, 0
br i1 %cmp8, label %dispatch_op, label %if.then, !prof !37, !irr_loop !38
if.then: ; preds = %sw.bb6
%mul9 = mul nsw i32 %add7, 9
br label %indirectgoto
TARGET_2: ; preds = %indirectgoto
%incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
%4 = load i8, i8* %p.addr.5, align 1
%conv14 = zext i8 %4 to i32
br label %sw.bb15
sw.bb15: ; preds = %TARGET_2, %dispatch_op
%p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
%oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
%sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
%add16 = add nsw i32 %oparg.3, 3
%add17 = add nsw i32 %add16, %sum.4
br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40
if.then18: ; preds = %sw.bb15
%idx.ext = sext i32 %oparg.3 to i64
%add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
%mul19 = mul nsw i32 %add17, 17
br label %indirectgoto
unknown_op: ; preds = %indirectgoto
%sub24 = add nsw i32 %sum.7, -4
br label %sw.default
sw.default: ; preds = %unknown_op, %dispatch_op
%p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
%sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
%add25 = add nsw i32 %sum.5, 11
br label %for.cond1
exit: ; preds = %sw.bb15, %sw.bb
%sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
ret i32 %sum.6
indirectgoto: ; preds = %if.then18, %if.then
%add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
%sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
%p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
%5 = load i8, i8* %add.ptr.pn, align 1
%idxprom21 = zext i8 %5 to i64
%arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
%6 = load i8*, i8** %arrayidx22, align 8
indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42
}
!36 = !{!"branch_weights", i32 0, i32 0, i32 201, i32 1}
!37 = !{!"branch_weights", i32 201, i32 300}
!38 = !{!"loop_header_weight", i64 501}
!39 = !{!"branch_weights", i32 100, i32 0}
!40 = !{!"loop_header_weight", i64 100}
!41 = !{!"branch_weights", i32 0, i32 1, i32 300, i32 99}
!42 = !{!"loop_header_weight", i64 400}
; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh':
; CHECK-NEXT: block-frequency-info: _Z11irreduciblePh
; CHECK-NEXT: - entry: {{.*}} count = 1
; CHECK-NEXT: - for.cond1: {{.*}} count = 1
; CHECK-NEXT: - dispatch_op: {{.*}} count = 201
; CHECK-NEXT: - dispatch_op.sw.bb6_crit_edge: {{.*}} count = 200
; CHECK-NEXT: - sw.bb: {{.*}} count = 0
; CHECK-NEXT: - TARGET_1: {{.*}} count = 299
; CHECK-NEXT: - sw.bb6: {{.*}} count = 500, irr_loop_header_weight = 501
; CHECK-NEXT: - if.then: {{.*}} count = 299
; CHECK-NEXT: - TARGET_2: {{.*}} count = 98
; CHECK-NEXT: - sw.bb15: {{.*}} count = 99, irr_loop_header_weight = 100
; CHECK-NEXT: - if.then18: {{.*}} count = 99
; CHECK-NEXT: - unknown_op: {{.*}} count = 0
; CHECK-NEXT: - sw.default: {{.*}} count = 0
; CHECK-NEXT: - exit: {{.*}} count = 1
; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400

View File

@ -10,13 +10,13 @@
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY
; LAZY: 53 bitcode-reader - Number of Metadata records loaded
; LAZY: 55 bitcode-reader - Number of Metadata records loaded
; LAZY: 2 bitcode-reader - Number of MDStrings loaded
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY
; NOTLAZY: 62 bitcode-reader - Number of Metadata records loaded
; NOTLAZY: 64 bitcode-reader - Number of Metadata records loaded
; NOTLAZY: 7 bitcode-reader - Number of MDStrings loaded

View File

@ -0,0 +1,29 @@
:ir
_Z11irreducibleii
# Func Hash:
64451410787
# Num Counters:
6
# Counter Values:
1000
950
100
373
1
0
_Z11irreduciblePh
# Func Hash:
104649601521
# Num Counters:
9
# Counter Values:
100
300
99
300
201
1
1
0
0

View File

@ -0,0 +1,184 @@
; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata
; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
; GEN: $__llvm_profile_raw_version = comdat any
; Function Attrs: noinline norecurse nounwind readnone uwtable
define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 {
entry:
%cmp24 = icmp sgt i32 %iter_outer, 0
br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge
entry.for.cond.cleanup_crit_edge: ; preds = %entry
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %entry.for.cond.cleanup_crit_edge, %for.end
%sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
ret i32 %sum.0.lcssa
for.body: ; preds = %entry, %for.end
%k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
%sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
%rem23 = and i32 %k.026, 1
%cmp1 = icmp eq i32 %rem23, 0
br i1 %cmp1, label %entry8, label %for.cond2
for.cond2: ; preds = %for.body, %if.end9
%sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
%i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
%cmp3 = icmp slt i32 %i.0, %iter_inner
br i1 %cmp3, label %for.body4, label %for.end
; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}},
; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]]
for.body4: ; preds = %for.cond2
%rem5 = srem i32 %k.026, 3
%cmp6 = icmp eq i32 %rem5, 0
br i1 %cmp6, label %entry8, label %if.end9
entry8: ; preds = %for.body4, %for.body
%sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
%i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
%add = add nsw i32 %sum.2, 4
br label %if.end9
; USE: br label %if.end9,
; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]]
if.end9: ; preds = %entry8, %for.body4
%sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
%i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
%add10 = add nsw i32 %sum.3, 1
%inc = add nsw i32 %i.2, 1
br label %for.cond2
; USE: br label %for.cond2,
; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]]
for.end: ; preds = %for.cond2
%inc12 = add nuw nsw i32 %k.026, 1
%exitcond = icmp eq i32 %inc12, %iter_outer
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
@tracing = local_unnamed_addr global i32 0, align 4
; Function Attrs: noinline norecurse nounwind uwtable
define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) {
entry:
store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
%0 = load i32, i32* @tracing, align 4
%tobool = icmp eq i32 %0, 0
br label %for.cond1
for.cond1: ; preds = %sw.default, %entry
%p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
%sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
%incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
%1 = load i8, i8* %p.addr.0, align 1
%incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
%2 = load i8, i8* %incdec.ptr, align 1
%conv3 = zext i8 %2 to i32
br label %dispatch_op
dispatch_op: ; preds = %sw.bb6, %for.cond1
%p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
%op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
%oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
%sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
switch i8 %op.0, label %sw.default [
i8 0, label %sw.bb
i8 1, label %dispatch_op.sw.bb6_crit_edge
i8 2, label %sw.bb15
]
dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
br label %sw.bb6
sw.bb: ; preds = %indirectgoto, %dispatch_op
%oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
%sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
%add.neg = sub i32 -5, %oparg.1
%sub = add i32 %add.neg, %sum.2
br label %exit
TARGET_1: ; preds = %indirectgoto
%incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
%3 = load i8, i8* %p.addr.5, align 1
%conv5 = zext i8 %3 to i32
br label %sw.bb6
sw.bb6: ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1
%p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
%oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
%sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
%mul = mul nsw i32 %oparg.2, 7
%add7 = add nsw i32 %sum.3, %mul
%rem46 = and i32 %add7, 1
%cmp8 = icmp eq i32 %rem46, 0
br i1 %cmp8, label %dispatch_op, label %if.then
; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}},
; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]]
if.then: ; preds = %sw.bb6
%mul9 = mul nsw i32 %add7, 9
br label %indirectgoto
TARGET_2: ; preds = %indirectgoto
%incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
%4 = load i8, i8* %p.addr.5, align 1
%conv14 = zext i8 %4 to i32
br label %sw.bb15
sw.bb15: ; preds = %TARGET_2, %dispatch_op
%p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
%oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
%sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
%add16 = add nsw i32 %oparg.3, 3
%add17 = add nsw i32 %add16, %sum.4
br i1 %tobool, label %if.then18, label %exit
; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}},
; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]]
if.then18: ; preds = %sw.bb15
%idx.ext = sext i32 %oparg.3 to i64
%add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
%mul19 = mul nsw i32 %add17, 17
br label %indirectgoto
unknown_op: ; preds = %indirectgoto
%sub24 = add nsw i32 %sum.7, -4
br label %sw.default
sw.default: ; preds = %unknown_op, %dispatch_op
%p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
%sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
%add25 = add nsw i32 %sum.5, 11
br label %for.cond1
exit: ; preds = %sw.bb15, %sw.bb
%sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
ret i32 %sum.6
indirectgoto: ; preds = %if.then18, %if.then
%add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
%sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
%p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
%5 = load i8, i8* %add.ptr.pn, align 1
%idxprom21 = zext i8 %5 to i64
%arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
%6 = load i8*, i8** %arrayidx22, align 8
indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2]
; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}},
; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]]
}
; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050}
; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373}
; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000}
; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501}
; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100}
; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400}