[BOLT] Increase coverage of shrink wrapping [3/5]

Add the option to run -equalize-bb-counts before shrink
wrapping to avoid unnecessarily optimizing some CFGs where profile is
inaccurate but we can prove two blocks have the same frequency.

Reviewed By: Amir

Differential Revision: https://reviews.llvm.org/D126113
This commit is contained in:
Rafael Auler 2022-05-20 19:43:07 -07:00
parent 3508ced6ea
commit 3332904ad6
7 changed files with 87 additions and 16 deletions

View File

@ -13,6 +13,7 @@ namespace llvm {
namespace bolt {
class BinaryFunction;
class DataflowInfoManager;
enum MCFCostFunction : char {
MCF_DISABLE = 0,
@ -22,6 +23,12 @@ enum MCFCostFunction : char {
MCF_BLAMEFTS
};
/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
/// without the Usability Burden" by Diego Novillo to make basic block counts
/// equal if we show that A dominates B, B post-dominates A and they are in the
/// same loop and same loop nesting level.
void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF);
/// Fill edge counts based on the basic block count. Used in nonLBR mode when
/// we only have bb count.
void estimateEdgeCounts(BinaryFunction &BF);

View File

@ -35,6 +35,7 @@ extern llvm::cl::opt<bool> AggregateOnly;
extern llvm::cl::opt<unsigned> BucketsPerLine;
extern llvm::cl::opt<bool> DiffOnly;
extern llvm::cl::opt<bool> EnableBAT;
extern llvm::cl::opt<bool> EqualizeBBCounts;
extern llvm::cl::opt<bool> RemoveSymtab;
extern llvm::cl::opt<unsigned> ExecutionCountThreshold;
extern llvm::cl::opt<unsigned> HeatmapBlock;

View File

@ -17,6 +17,7 @@
#include "bolt/Passes/ShrinkWrapping.h"
#include "bolt/Passes/StackAvailableExpressions.h"
#include "bolt/Passes/StackReachingUses.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/Timer.h"
#include <deque>
#include <unordered_map>

View File

@ -13,6 +13,7 @@
#include "bolt/Passes/MCF.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
@ -33,15 +34,8 @@ extern cl::opt<bool> TimeOpts;
static cl::opt<bool> IterativeGuess(
"iterative-guess",
cl::desc("in non-LBR mode, guess edge counts using iterative technique"),
cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<bool>
EqualizeBBCounts("equalize-bb-counts",
cl::desc("in non-LBR mode, use same count for BBs "
"that should have equivalent count"),
cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<bool> UseRArcs(
"mcf-use-rarcs",
cl::desc("in MCF, consider the possibility of cancelling flow to balance "
@ -370,12 +364,12 @@ createLoopNestLevelMap(BinaryFunction &BF) {
return LoopNestLevel;
}
/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
/// without the Usability Burden" by Diego Novillo to make basic block counts
/// equal if we show that A dominates B, B post-dominates A and they are in the
/// same loop and same loop nesting level.
void equalizeBBCounts(BinaryFunction &BF) {
auto Info = DataflowInfoManager(BF, nullptr, nullptr);
} // end anonymous namespace
void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF) {
if (BF.begin() == BF.end())
return;
DominatorAnalysis<false> &DA = Info.getDominatorAnalysis();
DominatorAnalysis<true> &PDA = Info.getPostDominatorAnalysis();
auto &InsnToBB = Info.getInsnToBBMap();
@ -448,8 +442,6 @@ void equalizeBBCounts(BinaryFunction &BF) {
}
}
} // end anonymous namespace
void estimateEdgeCounts(BinaryFunction &BF) {
EdgeWeightMap PredEdgeWeights;
EdgeWeightMap SuccEdgeWeights;
@ -459,7 +451,8 @@ void estimateEdgeCounts(BinaryFunction &BF) {
}
if (opts::EqualizeBBCounts) {
LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts", true));
equalizeBBCounts(BF);
auto Info = DataflowInfoManager(BF, nullptr, nullptr);
equalizeBBCounts(Info, BF);
LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts", true));
}
if (opts::IterativeGuess)

View File

@ -13,6 +13,8 @@
#include "bolt/Passes/ShrinkWrapping.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/MCF.h"
#include "bolt/Utils/CommandLineOpts.h"
#include <numeric>
#include <stack>
@ -1982,6 +1984,9 @@ bool ShrinkWrapping::perform(bool HotOnly) {
if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold()))
return false;
if (opts::EqualizeBBCounts)
equalizeBBCounts(Info, BF);
if (BF.checkForAmbiguousJumpTables()) {
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
<< ".\n");

View File

@ -73,6 +73,12 @@ EnableBAT("enable-bat",
cl::ZeroOrMore,
cl::cat(BoltCategory));
cl::opt<bool> EqualizeBBCounts(
"equalize-bb-counts",
cl::desc("use same count for BBs that should have equivalent count (used "
"in non-LBR and shrink wrapping)"),
cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
cl::opt<bool> RemoveSymtab("remove-symtab", cl::desc("Remove .symtab section"),
cl::cat(BoltCategory));

View File

@ -0,0 +1,58 @@
# This checks that shrink wrapping does not pessimize a CFG pattern where two
# blocks can be proved to have the same execution count but, because of profile
# inaccuricies, we could move saves into the second block. We can prove two
# blocks have the same frequency when B post-dominate A and A dominates B and
# are at the same loop nesting level. This would be a pessimization because
# shrink wrapping is unlikely to be able to cleanly move PUSH instructions,
# inserting additional store instructions.
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \
# RUN: -frame-opt=all -equalize-bb-counts | FileCheck %s
# Here we create a CFG pattern with two blocks A and B belonging to the same
# equivalency class as defined by dominance relations and having in theory
# the same frequency. But we tweak edge counts from profile to make block A
# hotter than block B.
.globl _start
.type _start, %function
_start:
.cfi_startproc
# Hot prologue
# FDATA: 0 [unknown] 0 1 _start 0 0 10
push %rbp
mov %rsp, %rbp
push %rbx
push %r14
subq $0x20, %rsp
b: je end_if_1
# FDATA: 1 _start #b# 1 _start #end_if_1# 0 1
if_false:
movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs
c: jmp end_if_1
# Reduce frequency from 9 to 1 to simulate an inaccurate profile
# FDATA: 1 _start #c# 1 _start #end_if_1# 0 1
end_if_1:
# first uses of R14 and RBX appear at this point, possible move point for SW
mov %r14, %rdi
mov %rbx, %rdi
leaq -0x20(%rbp), %r14
movq -0x20(%rbp), %rdi
addq $0x20, %rsp
pop %r14
pop %rbx
pop %rbp
ret
.cfi_endproc
.size _start, .-_start
.data
rel: .quad end_if_1
# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 0 spills inserting push/pops