[BOLT] Increase coverage of shrink wrapping [3/5]
Add the option to run -equalize-bb-counts before shrink wrapping to avoid unnecessarily optimizing some CFGs where profile is inaccurate but we can prove two blocks have the same frequency. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D126113
This commit is contained in:
parent
3508ced6ea
commit
3332904ad6
|
@ -13,6 +13,7 @@ namespace llvm {
|
|||
namespace bolt {
|
||||
|
||||
class BinaryFunction;
|
||||
class DataflowInfoManager;
|
||||
|
||||
enum MCFCostFunction : char {
|
||||
MCF_DISABLE = 0,
|
||||
|
@ -22,6 +23,12 @@ enum MCFCostFunction : char {
|
|||
MCF_BLAMEFTS
|
||||
};
|
||||
|
||||
/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
|
||||
/// without the Usability Burden" by Diego Novillo to make basic block counts
|
||||
/// equal if we show that A dominates B, B post-dominates A and they are in the
|
||||
/// same loop and same loop nesting level.
|
||||
void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF);
|
||||
|
||||
/// Fill edge counts based on the basic block count. Used in nonLBR mode when
|
||||
/// we only have bb count.
|
||||
void estimateEdgeCounts(BinaryFunction &BF);
|
||||
|
|
|
@ -35,6 +35,7 @@ extern llvm::cl::opt<bool> AggregateOnly;
|
|||
extern llvm::cl::opt<unsigned> BucketsPerLine;
|
||||
extern llvm::cl::opt<bool> DiffOnly;
|
||||
extern llvm::cl::opt<bool> EnableBAT;
|
||||
extern llvm::cl::opt<bool> EqualizeBBCounts;
|
||||
extern llvm::cl::opt<bool> RemoveSymtab;
|
||||
extern llvm::cl::opt<unsigned> ExecutionCountThreshold;
|
||||
extern llvm::cl::opt<unsigned> HeatmapBlock;
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include "bolt/Passes/ShrinkWrapping.h"
|
||||
#include "bolt/Passes/StackAvailableExpressions.h"
|
||||
#include "bolt/Passes/StackReachingUses.h"
|
||||
#include "bolt/Utils/CommandLineOpts.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
#include <deque>
|
||||
#include <unordered_map>
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "bolt/Passes/MCF.h"
|
||||
#include "bolt/Core/BinaryFunction.h"
|
||||
#include "bolt/Passes/DataflowInfoManager.h"
|
||||
#include "bolt/Utils/CommandLineOpts.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include <algorithm>
|
||||
|
@ -33,15 +34,8 @@ extern cl::opt<bool> TimeOpts;
|
|||
static cl::opt<bool> IterativeGuess(
|
||||
"iterative-guess",
|
||||
cl::desc("in non-LBR mode, guess edge counts using iterative technique"),
|
||||
|
||||
cl::Hidden, cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
EqualizeBBCounts("equalize-bb-counts",
|
||||
cl::desc("in non-LBR mode, use same count for BBs "
|
||||
"that should have equivalent count"),
|
||||
cl::Hidden, cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool> UseRArcs(
|
||||
"mcf-use-rarcs",
|
||||
cl::desc("in MCF, consider the possibility of cancelling flow to balance "
|
||||
|
@ -370,12 +364,12 @@ createLoopNestLevelMap(BinaryFunction &BF) {
|
|||
return LoopNestLevel;
|
||||
}
|
||||
|
||||
/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
|
||||
/// without the Usability Burden" by Diego Novillo to make basic block counts
|
||||
/// equal if we show that A dominates B, B post-dominates A and they are in the
|
||||
/// same loop and same loop nesting level.
|
||||
void equalizeBBCounts(BinaryFunction &BF) {
|
||||
auto Info = DataflowInfoManager(BF, nullptr, nullptr);
|
||||
} // end anonymous namespace
|
||||
|
||||
void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF) {
|
||||
if (BF.begin() == BF.end())
|
||||
return;
|
||||
|
||||
DominatorAnalysis<false> &DA = Info.getDominatorAnalysis();
|
||||
DominatorAnalysis<true> &PDA = Info.getPostDominatorAnalysis();
|
||||
auto &InsnToBB = Info.getInsnToBBMap();
|
||||
|
@ -448,8 +442,6 @@ void equalizeBBCounts(BinaryFunction &BF) {
|
|||
}
|
||||
}
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
void estimateEdgeCounts(BinaryFunction &BF) {
|
||||
EdgeWeightMap PredEdgeWeights;
|
||||
EdgeWeightMap SuccEdgeWeights;
|
||||
|
@ -459,7 +451,8 @@ void estimateEdgeCounts(BinaryFunction &BF) {
|
|||
}
|
||||
if (opts::EqualizeBBCounts) {
|
||||
LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts", true));
|
||||
equalizeBBCounts(BF);
|
||||
auto Info = DataflowInfoManager(BF, nullptr, nullptr);
|
||||
equalizeBBCounts(Info, BF);
|
||||
LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts", true));
|
||||
}
|
||||
if (opts::IterativeGuess)
|
||||
|
|
|
@ -13,6 +13,8 @@
|
|||
#include "bolt/Passes/ShrinkWrapping.h"
|
||||
#include "bolt/Core/MCPlus.h"
|
||||
#include "bolt/Passes/DataflowInfoManager.h"
|
||||
#include "bolt/Passes/MCF.h"
|
||||
#include "bolt/Utils/CommandLineOpts.h"
|
||||
#include <numeric>
|
||||
#include <stack>
|
||||
|
||||
|
@ -1982,6 +1984,9 @@ bool ShrinkWrapping::perform(bool HotOnly) {
|
|||
if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold()))
|
||||
return false;
|
||||
|
||||
if (opts::EqualizeBBCounts)
|
||||
equalizeBBCounts(Info, BF);
|
||||
|
||||
if (BF.checkForAmbiguousJumpTables()) {
|
||||
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
|
||||
<< ".\n");
|
||||
|
|
|
@ -73,6 +73,12 @@ EnableBAT("enable-bat",
|
|||
cl::ZeroOrMore,
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
cl::opt<bool> EqualizeBBCounts(
|
||||
"equalize-bb-counts",
|
||||
cl::desc("use same count for BBs that should have equivalent count (used "
|
||||
"in non-LBR and shrink wrapping)"),
|
||||
cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
|
||||
|
||||
cl::opt<bool> RemoveSymtab("remove-symtab", cl::desc("Remove .symtab section"),
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
# This checks that shrink wrapping does not pessimize a CFG pattern where two
|
||||
# blocks can be proved to have the same execution count but, because of profile
|
||||
# inaccuricies, we could move saves into the second block. We can prove two
|
||||
# blocks have the same frequency when B post-dominate A and A dominates B and
|
||||
# are at the same loop nesting level. This would be a pessimization because
|
||||
# shrink wrapping is unlikely to be able to cleanly move PUSH instructions,
|
||||
# inserting additional store instructions.
|
||||
|
||||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
|
||||
# RUN: %s -o %t.o
|
||||
# RUN: link_fdata %s %t.o %t.fdata
|
||||
# RUN: llvm-strip --strip-unneeded %t.o
|
||||
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
|
||||
# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \
|
||||
# RUN: -frame-opt=all -equalize-bb-counts | FileCheck %s
|
||||
|
||||
# Here we create a CFG pattern with two blocks A and B belonging to the same
|
||||
# equivalency class as defined by dominance relations and having in theory
|
||||
# the same frequency. But we tweak edge counts from profile to make block A
|
||||
# hotter than block B.
|
||||
.globl _start
|
||||
.type _start, %function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
# Hot prologue
|
||||
# FDATA: 0 [unknown] 0 1 _start 0 0 10
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
push %rbx
|
||||
push %r14
|
||||
subq $0x20, %rsp
|
||||
b: je end_if_1
|
||||
# FDATA: 1 _start #b# 1 _start #end_if_1# 0 1
|
||||
if_false:
|
||||
movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs
|
||||
c: jmp end_if_1
|
||||
# Reduce frequency from 9 to 1 to simulate an inaccurate profile
|
||||
# FDATA: 1 _start #c# 1 _start #end_if_1# 0 1
|
||||
end_if_1:
|
||||
# first uses of R14 and RBX appear at this point, possible move point for SW
|
||||
mov %r14, %rdi
|
||||
mov %rbx, %rdi
|
||||
leaq -0x20(%rbp), %r14
|
||||
movq -0x20(%rbp), %rdi
|
||||
addq $0x20, %rsp
|
||||
pop %r14
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size _start, .-_start
|
||||
|
||||
.data
|
||||
rel: .quad end_if_1
|
||||
|
||||
# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 0 spills inserting push/pops
|
Loading…
Reference in New Issue