[BOLT] Increase coverage of shrink wrapping [1/5]
Change how function score is calculated and provide more detailed statistics when reporting back frame optimizer and shrink wrapping results. In these new statistics, we provide dynamic coverage numbers. The main metric for shrink wrapping is the number of executed stores that were saved because of shrink wrapping (push instructions that were either entirely moved away from the hot block or converted to a stack adjustment instruction). There are still a number of reduced load instructions (pops) that we are not counting at the moment. Also update the alloc combiner to report dynamic numbers, as well as the frame optimizer. For debugging purposes, we also include a list of the top 10 functions optimized by shrink wrapping. These changes are aimed at better understanding the impact of shrink wrapping in a given binary. We also remove an assertion in dataflow analysis so that it does not choke on empty functions (for which the analysis makes no sense). Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D126111
This commit is contained in:
parent
77ccc480ba
commit
42465efd17
|
@ -17,6 +17,7 @@ namespace bolt {
|
||||||
class AllocCombinerPass : public BinaryFunctionPass {
|
class AllocCombinerPass : public BinaryFunctionPass {
|
||||||
/// Stats aggregating variables
|
/// Stats aggregating variables
|
||||||
uint64_t NumCombined{0};
|
uint64_t NumCombined{0};
|
||||||
|
uint64_t DynamicCountCombined{0};
|
||||||
DenseSet<const BinaryFunction *> FuncsChanged;
|
DenseSet<const BinaryFunction *> FuncsChanged;
|
||||||
|
|
||||||
void combineAdjustments(BinaryFunction &BF);
|
void combineAdjustments(BinaryFunction &BF);
|
||||||
|
|
|
@ -315,6 +315,8 @@ public:
|
||||||
void run() {
|
void run() {
|
||||||
derived().preflight();
|
derived().preflight();
|
||||||
|
|
||||||
|
if (Func.begin() == Func.end())
|
||||||
|
return;
|
||||||
// Initialize state for all points of the function
|
// Initialize state for all points of the function
|
||||||
for (BinaryBasicBlock &BB : Func) {
|
for (BinaryBasicBlock &BB : Func) {
|
||||||
StateTy &St = getOrCreateStateAt(BB);
|
StateTy &St = getOrCreateStateAt(BB);
|
||||||
|
@ -324,7 +326,6 @@ public:
|
||||||
St = derived().getStartingStateAtPoint(Inst);
|
St = derived().getStartingStateAtPoint(Inst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert(Func.begin() != Func.end() && "Unexpected empty function");
|
|
||||||
|
|
||||||
std::queue<BinaryBasicBlock *> Worklist;
|
std::queue<BinaryBasicBlock *> Worklist;
|
||||||
// TODO: Pushing this in a DFS ordering will greatly speed up the dataflow
|
// TODO: Pushing this in a DFS ordering will greatly speed up the dataflow
|
||||||
|
|
|
@ -130,7 +130,6 @@ class FrameAnalysis {
|
||||||
/// Analysis stats counters
|
/// Analysis stats counters
|
||||||
uint64_t NumFunctionsNotOptimized{0};
|
uint64_t NumFunctionsNotOptimized{0};
|
||||||
uint64_t NumFunctionsFailedRestoreFI{0};
|
uint64_t NumFunctionsFailedRestoreFI{0};
|
||||||
uint64_t CountFunctionsNotOptimized{0};
|
|
||||||
uint64_t CountFunctionsFailedRestoreFI{0};
|
uint64_t CountFunctionsFailedRestoreFI{0};
|
||||||
uint64_t CountDenominator{0};
|
uint64_t CountDenominator{0};
|
||||||
|
|
||||||
|
|
|
@ -77,9 +77,12 @@ class FrameOptimizerPass : public BinaryFunctionPass {
|
||||||
/// Stats aggregating variables
|
/// Stats aggregating variables
|
||||||
uint64_t NumRedundantLoads{0};
|
uint64_t NumRedundantLoads{0};
|
||||||
uint64_t NumRedundantStores{0};
|
uint64_t NumRedundantStores{0};
|
||||||
uint64_t NumLoadsChangedToReg{0};
|
uint64_t FreqRedundantLoads{0};
|
||||||
uint64_t NumLoadsChangedToImm{0};
|
uint64_t FreqRedundantStores{0};
|
||||||
|
uint64_t FreqLoadsChangedToReg{0};
|
||||||
|
uint64_t FreqLoadsChangedToImm{0};
|
||||||
uint64_t NumLoadsDeleted{0};
|
uint64_t NumLoadsDeleted{0};
|
||||||
|
uint64_t FreqLoadsDeleted{0};
|
||||||
|
|
||||||
DenseSet<const BinaryFunction *> FuncsChanged;
|
DenseSet<const BinaryFunction *> FuncsChanged;
|
||||||
|
|
||||||
|
|
|
@ -310,6 +310,10 @@ class ShrinkWrapping {
|
||||||
/// Pass stats
|
/// Pass stats
|
||||||
static std::atomic_uint64_t SpillsMovedRegularMode;
|
static std::atomic_uint64_t SpillsMovedRegularMode;
|
||||||
static std::atomic_uint64_t SpillsMovedPushPopMode;
|
static std::atomic_uint64_t SpillsMovedPushPopMode;
|
||||||
|
static std::atomic_uint64_t SpillsMovedDynamicCount;
|
||||||
|
static std::atomic_uint64_t SpillsFailedDynamicCount;
|
||||||
|
static std::atomic_uint64_t InstrDynamicCount;
|
||||||
|
static std::atomic_uint64_t StoreDynamicCount;
|
||||||
|
|
||||||
Optional<unsigned> AnnotationIndex;
|
Optional<unsigned> AnnotationIndex;
|
||||||
|
|
||||||
|
@ -515,7 +519,7 @@ public:
|
||||||
BC.MIB->removeAnnotation(Inst, getAnnotationIndex());
|
BC.MIB->removeAnnotation(Inst, getAnnotationIndex());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool perform();
|
bool perform(bool HotOnly = false);
|
||||||
|
|
||||||
static void printStats();
|
static void printStats();
|
||||||
};
|
};
|
||||||
|
|
|
@ -2299,7 +2299,7 @@ uint64_t BinaryFunction::getFunctionScore() const {
|
||||||
uint64_t BBExecCount = BB->getExecutionCount();
|
uint64_t BBExecCount = BB->getExecutionCount();
|
||||||
if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
|
if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
|
||||||
continue;
|
continue;
|
||||||
TotalScore += BBExecCount;
|
TotalScore += BBExecCount * BB->getNumNonPseudos();
|
||||||
}
|
}
|
||||||
FunctionScore = TotalScore;
|
FunctionScore = TotalScore;
|
||||||
return FunctionScore;
|
return FunctionScore;
|
||||||
|
|
|
@ -101,6 +101,7 @@ void AllocCombinerPass::combineAdjustments(BinaryFunction &BF) {
|
||||||
|
|
||||||
BB.eraseInstruction(BB.findInstruction(Prev));
|
BB.eraseInstruction(BB.findInstruction(Prev));
|
||||||
++NumCombined;
|
++NumCombined;
|
||||||
|
DynamicCountCombined += BB.getKnownExecutionCount();
|
||||||
FuncsChanged.insert(&BF);
|
FuncsChanged.insert(&BF);
|
||||||
Prev = &Inst;
|
Prev = &Inst;
|
||||||
}
|
}
|
||||||
|
@ -116,7 +117,8 @@ void AllocCombinerPass::runOnFunctions(BinaryContext &BC) {
|
||||||
});
|
});
|
||||||
|
|
||||||
outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
|
outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
|
||||||
<< " empty spaces coalesced.\n";
|
<< " empty spaces coalesced (dyn count: " << DynamicCountCombined
|
||||||
|
<< ").\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
} // end namespace bolt
|
} // end namespace bolt
|
||||||
|
|
|
@ -527,16 +527,12 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto &I : BC.getBinaryFunctions()) {
|
for (auto &I : BC.getBinaryFunctions()) {
|
||||||
uint64_t Count = I.second.getExecutionCount();
|
CountDenominator += I.second.getFunctionScore();
|
||||||
if (Count != BinaryFunction::COUNT_NO_PROFILE)
|
|
||||||
CountDenominator += Count;
|
|
||||||
|
|
||||||
// "shouldOptimize" for passes that run after finalize
|
// "shouldOptimize" for passes that run after finalize
|
||||||
if (!(I.second.isSimple() && I.second.hasCFG() && !I.second.isIgnored()) ||
|
if (!(I.second.isSimple() && I.second.hasCFG() && !I.second.isIgnored()) ||
|
||||||
!opts::shouldFrameOptimize(I.second)) {
|
!opts::shouldFrameOptimize(I.second)) {
|
||||||
++NumFunctionsNotOptimized;
|
++NumFunctionsNotOptimized;
|
||||||
if (Count != BinaryFunction::COUNT_NO_PROFILE)
|
|
||||||
CountFunctionsNotOptimized += Count;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -545,9 +541,7 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
|
||||||
"FA breakdown", opts::TimeFA);
|
"FA breakdown", opts::TimeFA);
|
||||||
if (!restoreFrameIndex(I.second)) {
|
if (!restoreFrameIndex(I.second)) {
|
||||||
++NumFunctionsFailedRestoreFI;
|
++NumFunctionsFailedRestoreFI;
|
||||||
uint64_t Count = I.second.getExecutionCount();
|
CountFunctionsFailedRestoreFI += I.second.getFunctionScore();
|
||||||
if (Count != BinaryFunction::COUNT_NO_PROFILE)
|
|
||||||
CountFunctionsFailedRestoreFI += Count;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -568,10 +562,7 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
|
||||||
|
|
||||||
void FrameAnalysis::printStats() {
|
void FrameAnalysis::printStats() {
|
||||||
outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized
|
outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized
|
||||||
<< " function(s) "
|
<< " function(s) were not optimized.\n"
|
||||||
<< format("(%.1lf%% dyn cov)",
|
|
||||||
(100.0 * CountFunctionsNotOptimized / CountDenominator))
|
|
||||||
<< " were not optimized.\n"
|
|
||||||
<< "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI
|
<< "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI
|
||||||
<< " function(s) "
|
<< " function(s) "
|
||||||
<< format("(%.1lf%% dyn cov)",
|
<< format("(%.1lf%% dyn cov)",
|
||||||
|
|
|
@ -108,6 +108,7 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
++NumRedundantLoads;
|
++NumRedundantLoads;
|
||||||
|
FreqRedundantLoads += BB.getKnownExecutionCount();
|
||||||
Changed = true;
|
Changed = true;
|
||||||
LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
|
LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
|
||||||
LLVM_DEBUG(Inst.dump());
|
LLVM_DEBUG(Inst.dump());
|
||||||
|
@ -120,11 +121,12 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
|
||||||
LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
|
LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
++NumLoadsChangedToReg;
|
FreqLoadsChangedToReg += BB.getKnownExecutionCount();
|
||||||
MIB->removeAnnotation(Inst, "FrameAccessEntry");
|
MIB->removeAnnotation(Inst, "FrameAccessEntry");
|
||||||
LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
|
LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
|
||||||
if (MIB->isRedundantMove(Inst)) {
|
if (MIB->isRedundantMove(Inst)) {
|
||||||
++NumLoadsDeleted;
|
++NumLoadsDeleted;
|
||||||
|
FreqLoadsDeleted += BB.getKnownExecutionCount();
|
||||||
LLVM_DEBUG(dbgs() << "Created a redundant move\n");
|
LLVM_DEBUG(dbgs() << "Created a redundant move\n");
|
||||||
// Delete it!
|
// Delete it!
|
||||||
ToErase.push_front(std::make_pair(&BB, &Inst));
|
ToErase.push_front(std::make_pair(&BB, &Inst));
|
||||||
|
@ -136,7 +138,7 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
|
||||||
if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
|
if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
|
||||||
LLVM_DEBUG(dbgs() << "FAILED\n");
|
LLVM_DEBUG(dbgs() << "FAILED\n");
|
||||||
} else {
|
} else {
|
||||||
++NumLoadsChangedToImm;
|
FreqLoadsChangedToImm += BB.getKnownExecutionCount();
|
||||||
MIB->removeAnnotation(Inst, "FrameAccessEntry");
|
MIB->removeAnnotation(Inst, "FrameAccessEntry");
|
||||||
LLVM_DEBUG(dbgs() << "Ok\n");
|
LLVM_DEBUG(dbgs() << "Ok\n");
|
||||||
}
|
}
|
||||||
|
@ -199,6 +201,7 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
++NumRedundantStores;
|
++NumRedundantStores;
|
||||||
|
FreqRedundantStores += BB.getKnownExecutionCount();
|
||||||
Changed = true;
|
Changed = true;
|
||||||
LLVM_DEBUG(dbgs() << "Unused store instruction: ");
|
LLVM_DEBUG(dbgs() << "Unused store instruction: ");
|
||||||
LLVM_DEBUG(Inst.dump());
|
LLVM_DEBUG(Inst.dump());
|
||||||
|
@ -286,11 +289,16 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
|
||||||
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
|
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
|
||||||
<< " redundant load(s) and " << NumRedundantStores
|
<< " redundant load(s) and " << NumRedundantStores
|
||||||
<< " unused store(s)\n";
|
<< " unused store(s)\n";
|
||||||
outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg
|
outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
|
||||||
<< " load(s) to use a register instead of a stack access, and "
|
<< " and frequency of unused stores is " << FreqRedundantStores
|
||||||
<< NumLoadsChangedToImm << " to use an immediate.\n"
|
<< "\n";
|
||||||
<< "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
|
outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
|
||||||
<< NumRedundantStores << " store(s).\n";
|
<< FreqLoadsChangedToReg
|
||||||
|
<< " and frequency of loads changed to use an immediate is "
|
||||||
|
<< FreqLoadsChangedToImm << "\n";
|
||||||
|
outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
|
||||||
|
<< " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
|
||||||
|
<< NumRedundantStores << " store(s)\n";
|
||||||
FA->printStats();
|
FA->printStats();
|
||||||
ShrinkWrapping::printStats();
|
ShrinkWrapping::printStats();
|
||||||
}
|
}
|
||||||
|
@ -323,34 +331,51 @@ void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
|
||||||
BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
|
BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
|
||||||
BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
|
BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
|
||||||
|
|
||||||
|
std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
|
||||||
|
auto LogFunc = [&](BinaryFunction &BF) {
|
||||||
|
auto Lower = std::lower_bound(
|
||||||
|
Top10Funcs.begin(), Top10Funcs.end(), BF.getKnownExecutionCount(),
|
||||||
|
[](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
|
||||||
|
uint64_t Value) { return Elmt.first > Value; });
|
||||||
|
if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
|
||||||
|
return;
|
||||||
|
Top10Funcs.insert(Lower,
|
||||||
|
std::make_pair<>(BF.getKnownExecutionCount(), &BF));
|
||||||
|
if (Top10Funcs.size() > 10)
|
||||||
|
Top10Funcs.resize(10);
|
||||||
|
};
|
||||||
|
(void)LogFunc;
|
||||||
|
|
||||||
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
|
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
|
||||||
if (!FA.hasFrameInfo(BF))
|
if (BF.getFunctionScore() == 0)
|
||||||
return true;
|
|
||||||
|
|
||||||
if (opts::FrameOptimization == FOP_HOT &&
|
|
||||||
(BF.getKnownExecutionCount() < BC.getHotThreshold()))
|
|
||||||
return true;
|
|
||||||
|
|
||||||
if (BF.getKnownExecutionCount() == 0)
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const bool HotOnly = opts::FrameOptimization == FOP_HOT;
|
||||||
|
|
||||||
ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
|
ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
|
||||||
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
|
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
|
||||||
DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
|
DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
|
||||||
ShrinkWrapping SW(FA, BF, Info, AllocatorId);
|
ShrinkWrapping SW(FA, BF, Info, AllocatorId);
|
||||||
|
|
||||||
if (SW.perform()) {
|
if (SW.perform(HotOnly)) {
|
||||||
std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
|
std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
|
||||||
FuncsChanged.insert(&BF);
|
FuncsChanged.insert(&BF);
|
||||||
|
LLVM_DEBUG(LogFunc(BF));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
|
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
|
||||||
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
|
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
|
||||||
SkipPredicate, "shrink-wrapping");
|
SkipPredicate, "shrink-wrapping");
|
||||||
|
|
||||||
|
if (!Top10Funcs.empty()) {
|
||||||
|
outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
|
||||||
|
for (const auto &Elmt : Top10Funcs)
|
||||||
|
outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace bolt
|
} // namespace bolt
|
||||||
|
|
|
@ -712,6 +712,10 @@ void StackLayoutModifier::initialize() {
|
||||||
|
|
||||||
std::atomic_uint64_t ShrinkWrapping::SpillsMovedRegularMode{0};
|
std::atomic_uint64_t ShrinkWrapping::SpillsMovedRegularMode{0};
|
||||||
std::atomic_uint64_t ShrinkWrapping::SpillsMovedPushPopMode{0};
|
std::atomic_uint64_t ShrinkWrapping::SpillsMovedPushPopMode{0};
|
||||||
|
std::atomic_uint64_t ShrinkWrapping::SpillsMovedDynamicCount{0};
|
||||||
|
std::atomic_uint64_t ShrinkWrapping::SpillsFailedDynamicCount{0};
|
||||||
|
std::atomic_uint64_t ShrinkWrapping::InstrDynamicCount{0};
|
||||||
|
std::atomic_uint64_t ShrinkWrapping::StoreDynamicCount{0};
|
||||||
|
|
||||||
using BBIterTy = BinaryBasicBlock::iterator;
|
using BBIterTy = BinaryBasicBlock::iterator;
|
||||||
|
|
||||||
|
@ -1273,16 +1277,19 @@ void ShrinkWrapping::moveSaveRestores() {
|
||||||
// Keeps info about successfully moved regs: reg index, save position and
|
// Keeps info about successfully moved regs: reg index, save position and
|
||||||
// save size
|
// save size
|
||||||
std::vector<std::tuple<unsigned, MCInst *, size_t>> MovedRegs;
|
std::vector<std::tuple<unsigned, MCInst *, size_t>> MovedRegs;
|
||||||
|
uint64_t TotalEstimatedWin = 0;
|
||||||
|
|
||||||
for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
|
for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
|
||||||
MCInst *BestPosSave = nullptr;
|
MCInst *BestPosSave = nullptr;
|
||||||
uint64_t TotalEstimatedWin = 0;
|
uint64_t EstimatedWin = 0;
|
||||||
if (!isBestSavePosCold(I, BestPosSave, TotalEstimatedWin))
|
if (!isBestSavePosCold(I, BestPosSave, EstimatedWin))
|
||||||
continue;
|
continue;
|
||||||
SmallVector<ProgramPoint, 4> RestorePoints =
|
SmallVector<ProgramPoint, 4> RestorePoints =
|
||||||
doRestorePlacement(BestPosSave, I, TotalEstimatedWin);
|
doRestorePlacement(BestPosSave, I, EstimatedWin);
|
||||||
if (RestorePoints.empty())
|
if (RestorePoints.empty()) {
|
||||||
|
SpillsFailedDynamicCount += EstimatedWin;
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
const FrameIndexEntry *FIESave = CSA.SaveFIEByReg[I];
|
const FrameIndexEntry *FIESave = CSA.SaveFIEByReg[I];
|
||||||
const FrameIndexEntry *FIELoad = CSA.LoadFIEByReg[I];
|
const FrameIndexEntry *FIELoad = CSA.LoadFIEByReg[I];
|
||||||
|
@ -1295,8 +1302,10 @@ void ShrinkWrapping::moveSaveRestores() {
|
||||||
|
|
||||||
// If we don't know stack state at this point, bail
|
// If we don't know stack state at this point, bail
|
||||||
if ((SPFP.first == SPT.SUPERPOSITION || SPFP.first == SPT.EMPTY) &&
|
if ((SPFP.first == SPT.SUPERPOSITION || SPFP.first == SPT.EMPTY) &&
|
||||||
(SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY))
|
(SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY)) {
|
||||||
|
SpillsFailedDynamicCount += EstimatedWin;
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Operation mode: if true, will insert push/pops instead of loads/restores
|
// Operation mode: if true, will insert push/pops instead of loads/restores
|
||||||
bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset);
|
bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset);
|
||||||
|
@ -1319,6 +1328,7 @@ void ShrinkWrapping::moveSaveRestores() {
|
||||||
scheduleOldSaveRestoresRemoval(I, UsePushPops);
|
scheduleOldSaveRestoresRemoval(I, UsePushPops);
|
||||||
scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops);
|
scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops);
|
||||||
MovedRegs.emplace_back(std::make_tuple(I, BestPosSave, SaveSize));
|
MovedRegs.emplace_back(std::make_tuple(I, BestPosSave, SaveSize));
|
||||||
|
TotalEstimatedWin += EstimatedWin;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Revert push-pop mode if it failed for a single CSR
|
// Revert push-pop mode if it failed for a single CSR
|
||||||
|
@ -1348,6 +1358,7 @@ void ShrinkWrapping::moveSaveRestores() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
SpillsMovedDynamicCount += TotalEstimatedWin;
|
||||||
|
|
||||||
// Update statistics
|
// Update statistics
|
||||||
if (!UsedPushPopMode) {
|
if (!UsedPushPopMode) {
|
||||||
|
@ -1941,12 +1952,36 @@ void ShrinkWrapping::rebuildCFI() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ShrinkWrapping::perform() {
|
bool ShrinkWrapping::perform(bool HotOnly) {
|
||||||
HasDeletedOffsetCFIs = BitVector(BC.MRI->getNumRegs(), false);
|
HasDeletedOffsetCFIs = BitVector(BC.MRI->getNumRegs(), false);
|
||||||
PushOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
|
PushOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
|
||||||
PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
|
PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
|
||||||
DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);
|
DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);
|
||||||
|
|
||||||
|
// Update pass statistics
|
||||||
|
uint64_t TotalInstrs = 0ULL;
|
||||||
|
uint64_t TotalStoreInstrs = 0ULL;
|
||||||
|
for (BinaryBasicBlock *BB : BF.layout()) {
|
||||||
|
uint64_t BBExecCount = BB->getExecutionCount();
|
||||||
|
if (!BBExecCount || BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
|
||||||
|
continue;
|
||||||
|
for (const auto &Instr : *BB) {
|
||||||
|
if (BC.MIB->isPseudo(Instr))
|
||||||
|
continue;
|
||||||
|
if (BC.MIB->isStore(Instr))
|
||||||
|
TotalStoreInstrs += BBExecCount;
|
||||||
|
TotalInstrs += BBExecCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
InstrDynamicCount += TotalInstrs;
|
||||||
|
StoreDynamicCount += TotalStoreInstrs;
|
||||||
|
|
||||||
|
if (!FA.hasFrameInfo(BF))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold()))
|
||||||
|
return false;
|
||||||
|
|
||||||
if (BF.checkForAmbiguousJumpTables()) {
|
if (BF.checkForAmbiguousJumpTables()) {
|
||||||
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
|
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
|
||||||
<< ".\n");
|
<< ".\n");
|
||||||
|
@ -1993,6 +2028,24 @@ void ShrinkWrapping::printStats() {
|
||||||
outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
|
outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
|
||||||
<< " spills inserting load/stores and " << SpillsMovedPushPopMode
|
<< " spills inserting load/stores and " << SpillsMovedPushPopMode
|
||||||
<< " spills inserting push/pops\n";
|
<< " spills inserting push/pops\n";
|
||||||
|
if (!InstrDynamicCount || !StoreDynamicCount)
|
||||||
|
return;
|
||||||
|
outs() << "BOLT-INFO: Shrink wrapping reduced " << SpillsMovedDynamicCount
|
||||||
|
<< " store executions ("
|
||||||
|
<< format("%.1lf%%",
|
||||||
|
(100.0 * SpillsMovedDynamicCount / InstrDynamicCount))
|
||||||
|
<< " total instructions executed, "
|
||||||
|
<< format("%.1lf%%",
|
||||||
|
(100.0 * SpillsMovedDynamicCount / StoreDynamicCount))
|
||||||
|
<< " store instructions)\n";
|
||||||
|
outs() << "BOLT-INFO: Shrink wrapping failed at reducing "
|
||||||
|
<< SpillsFailedDynamicCount << " store executions ("
|
||||||
|
<< format("%.1lf%%",
|
||||||
|
(100.0 * SpillsFailedDynamicCount / InstrDynamicCount))
|
||||||
|
<< " total instructions executed, "
|
||||||
|
<< format("%.1lf%%",
|
||||||
|
(100.0 * SpillsFailedDynamicCount / StoreDynamicCount))
|
||||||
|
<< " store instructions)\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
// Operators necessary as a result of using MCAnnotation
|
// Operators necessary as a result of using MCAnnotation
|
||||||
|
|
|
@ -34,6 +34,8 @@ a:
|
||||||
|
|
||||||
# Check shrink wrapping results:
|
# Check shrink wrapping results:
|
||||||
# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 2 spills inserting push/pops
|
# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 2 spills inserting push/pops
|
||||||
|
# CHECK: BOLT-INFO: Shrink wrapping reduced 6 store executions (28.6% total instructions executed, 100.0% store instructions)
|
||||||
|
# CHECK: BOLT-INFO: Shrink wrapping failed at reducing 0 store executions (0.0% total instructions executed, 0.0% store instructions)
|
||||||
|
|
||||||
# Check that order is correct
|
# Check that order is correct
|
||||||
# CHECK: Binary Function "_start" after frame-optimizer
|
# CHECK: Binary Function "_start" after frame-optimizer
|
||||||
|
|
Loading…
Reference in New Issue