[BOLT] Fix instruction encoding validation

Always use non-symbolizing disassembler for instruction encoding
validation as symbols will be treated as undefined/zeros be the encoder
and causing byte sequence mismatches.

Reviewed By: Amir

Differential Revision: https://reviews.llvm.org/D136118
This commit is contained in:
Maksim Panchenko 2022-10-17 16:15:59 -07:00
parent 5b773dcd2d
commit bcc4c90954
4 changed files with 50 additions and 18 deletions

View File

@ -1236,10 +1236,10 @@ public:
return Size;
}
/// Verify that assembling instruction \p Inst results in the same sequence of
/// bytes as \p Encoding.
bool validateEncoding(const MCInst &Instruction,
ArrayRef<uint8_t> Encoding) const;
/// Validate that disassembling the \p Sequence of bytes into an instruction
/// and assembling the instruction again, results in a byte sequence identical
/// to the original one.
bool validateInstructionEncoding(ArrayRef<uint8_t> Sequence) const;
/// Return a function execution count threshold for determining whether
/// the function is 'hot'. Consider it hot if count is above the average exec

View File

@ -2287,19 +2287,25 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
return std::make_pair(HotSize, ColdSize);
}
bool BinaryContext::validateEncoding(const MCInst &Inst,
ArrayRef<uint8_t> InputEncoding) const {
bool BinaryContext::validateInstructionEncoding(
ArrayRef<uint8_t> InputSequence) const {
MCInst Inst;
uint64_t InstSize;
DisAsm->getInstruction(Inst, InstSize, InputSequence, 0, nulls());
assert(InstSize == InputSequence.size() &&
"Disassembled instruction size does not match the sequence.");
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
MCE->encodeInstruction(Inst, VecOS, Fixups, *STI);
auto EncodedData = ArrayRef<uint8_t>((uint8_t *)Code.data(), Code.size());
if (InputEncoding != EncodedData) {
auto OutputSequence = ArrayRef<uint8_t>((uint8_t *)Code.data(), Code.size());
if (InputSequence != OutputSequence) {
if (opts::Verbosity > 1) {
errs() << "BOLT-WARNING: mismatched encoding detected\n"
<< " input: " << InputEncoding << '\n'
<< " output: " << EncodedData << '\n';
<< " input: " << InputSequence << '\n'
<< " output: " << OutputSequence << '\n';
}
return false;
}

View File

@ -1217,7 +1217,7 @@ bool BinaryFunction::disassemble() {
// Check integrity of LLVM assembler/disassembler.
if (opts::CheckEncoding && !BC.MIB->isBranch(Instruction) &&
!BC.MIB->isCall(Instruction) && !BC.MIB->isNoop(Instruction)) {
if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) {
if (!BC.validateInstructionEncoding(FunctionData.slice(Offset, Size))) {
errs() << "BOLT-WARNING: mismatching LLVM encoding detected in "
<< "function " << *this << " for instruction :\n";
BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
@ -1233,15 +1233,10 @@ bool BinaryFunction::disassemble() {
break;
}
// Disassemble again without the symbolizer and check that the disassembly
// matches the assembler output.
MCInst TempInst;
BC.DisAsm->getInstruction(TempInst, Size, FunctionData.slice(Offset),
AbsoluteInstrAddr, nulls());
if (!BC.validateEncoding(TempInst, FunctionData.slice(Offset, Size))) {
if (!BC.validateInstructionEncoding(FunctionData.slice(Offset, Size))) {
errs() << "BOLT-WARNING: internal assembler/disassembler error "
"detected for AVX512 instruction:\n";
BC.printInstruction(errs(), TempInst, AbsoluteInstrAddr);
BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
errs() << " in function " << *this << '\n';
setIgnored();
break;

View File

@ -0,0 +1,31 @@
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o
# RUN: ld.lld %t.o -o %t.exe -q
# RUN: llvm-bolt %t.exe --relocs -o %t.out --check-encoding |& FileCheck %s
.text
.globl _start
.type _start, %function
_start:
.cfi_startproc
## Check that llvm-bolt uses non-symbolizing disassembler while validating
## instruction encodings. If symbol "foo" below is symbolized, the encoded
## instruction would have a different sequence of bytes from the input
## sequence, as "foo" will not have any address assigned at that point.
movq foo(%rip), %rax
# CHECK-NOT: mismatching LLVM encoding detected
ret
.cfi_endproc
.size _start, .-_start
.globl foo
.type foo, %function
foo:
.cfi_startproc
ret
.cfi_endproc
.size foo, .-foo