[NVPTX] Use the mask() operator to initialize packed structs with pointers

The current implementation assumes that all pointers used in the
initialization of an aggregate are aligned according to the pointer size
of the target; that might not be so if the object is packed. In that
case, an array of .u8 should be used and pointers should be decorated
with the mask() operator.

The operator was introduced in PTX ISA 7.1, so an error is issued if the
case is detected for an earlier version.

Differential Revision: https://reviews.llvm.org/D127504
This commit is contained in:
Igor Kudrin 2022-07-18 04:03:55 -07:00
parent e27f9214c0
commit 04e978ccba
4 changed files with 160 additions and 20 deletions

View File

@ -75,6 +75,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@ -1172,11 +1173,26 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
bufferAggregateConstant(Initializer, &aggBuffer);
if (aggBuffer.numSymbols()) {
unsigned int ptrSize = MAI->getCodePointerSize();
O << " .u" << ptrSize * 8 << " ";
getSymbol(GVar)->print(O, MAI);
O << "[" << ElementSize / ptrSize << "] = {";
aggBuffer.printWords(O);
O << "}";
if (ElementSize % ptrSize ||
!aggBuffer.allSymbolsAligned(ptrSize)) {
// Print in bytes and use the mask() operator for pointers.
if (!STI.hasMaskOperator())
report_fatal_error(
"initialized packed aggregate with pointers '" +
GVar->getName() +
"' requires at least PTX ISA version 7.1");
O << " .u8 ";
getSymbol(GVar)->print(O, MAI);
O << "[" << ElementSize << "] = {";
aggBuffer.printBytes(O);
O << "}";
} else {
O << " .u" << ptrSize * 8 << " ";
getSymbol(GVar)->print(O, MAI);
O << "[" << ElementSize / ptrSize << "] = {";
aggBuffer.printWords(O);
O << "}";
}
} else {
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
@ -1233,10 +1249,33 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
}
void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
for (unsigned int pos = 0; pos < size; ++pos) {
unsigned int ptrSize = AP.MAI->getCodePointerSize();
symbolPosInBuffer.push_back(size);
unsigned int nSym = 0;
unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
for (unsigned int pos = 0; pos < size;) {
if (pos)
os << ", ";
os << (unsigned int)buffer[pos];
if (pos != nextSymbolPos) {
os << (unsigned int)buffer[pos];
++pos;
continue;
}
// Generate a per-byte mask() operator for the symbol, which looks like:
// .global .u8 addr[] = {0xFF(foo), 0xFF00(foo), 0xFF0000(foo), ...};
// See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers
std::string symText;
llvm::raw_string_ostream oss(symText);
printSymbol(nSym, oss);
for (unsigned i = 0; i < ptrSize; ++i) {
if (i)
os << ", ";
llvm::write_hex(os, 0xFFULL << i * 8, HexPrintStyle::PrefixUpper);
os << "(" << symText << ")";
}
pos += ptrSize;
nextSymbolPos = symbolPosInBuffer[++nSym];
assert(nextSymbolPos >= pos);
}
}

View File

@ -61,26 +61,31 @@ class MCOperand;
class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
class AggBuffer {
// Used to buffer the emitted string for initializing global
// aggregates.
// Used to buffer the emitted string for initializing global aggregates.
//
// Normally an aggregate (array, vector or structure) is emitted
// as a u8[]. However, if one element/field of the aggregate
// is a non-NULL address, then the aggregate is emitted as u32[]
// or u64[].
// Normally an aggregate (array, vector, or structure) is emitted as a u8[].
// However, if either element/field of the aggregate is a non-NULL address,
// and all such addresses are properly aligned, then the aggregate is
// emitted as u32[] or u64[]. In the case of unaligned addresses, the
// aggregate is emitted as u8[], and the mask() operator is used for all
// pointers.
//
// We first layout the aggregate in 'buffer' in bytes, except for
// those symbol addresses. For the i-th symbol address in the
//aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
// are filled with 0s. symbolPosInBuffer[i-1] records its position
// in 'buffer', and Symbols[i-1] records the Value*.
// We first layout the aggregate in 'buffer' in bytes, except for those
// symbol addresses. For the i-th symbol address in the aggregate, its
// corresponding 4-byte or 8-byte elements in 'buffer' are filled with 0s.
// symbolPosInBuffer[i-1] records its position in 'buffer', and Symbols[i-1]
// records the Value*.
//
// Once we have this AggBuffer setup, we can choose how to print
// it out.
// Once we have this AggBuffer setup, we can choose how to print it out.
public:
// number of symbol addresses
unsigned numSymbols() const { return Symbols.size(); }
bool allSymbolsAligned(unsigned ptrSize) const {
return llvm::all_of(symbolPosInBuffer,
[=](unsigned pos) { return pos % ptrSize == 0; });
}
private:
const unsigned size; // size of the buffer in bytes
std::vector<unsigned char> buffer; // the buffer

View File

@ -77,6 +77,7 @@ public:
bool hasImageHandles() const;
bool hasFP16Math() const { return SmVersion >= 53; }
bool allowFP16Math() const;
bool hasMaskOperator() const { return PTXVersion >= 71; }
unsigned int getSmVersion() const { return SmVersion; }
std::string getTargetName() const { return TargetName; }

View File

@ -0,0 +1,95 @@
; RUN: not --crash llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx70 2>&1 | \
; RUN: FileCheck %s --check-prefix=ERROR
; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | \
; RUN: FileCheck %s --check-prefixes=CHECK,CHECK32
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | \
; RUN: FileCheck %s --check-prefixes=CHECK,CHECK64
; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
; RUN: %if ptxas-11.1 %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
;; Test that packed structs with symbol references are represented using the
;; mask() operator.
declare void @func()
@p = addrspace(1) global i8 0
; CHECK: .extern .func func
; CHECK: .u8 p;
%t1 = type <{ i16, i8*, i8, void ()*, i8*, i32 }>
@s1 = addrspace(1) global %t1 <{
; ERROR: initialized packed aggregate with pointers 's1' requires at least PTX ISA version 7.1
; CHECK32: .global .align 1 .u8 s1[19] = {
; CHECK64: .global .align 1 .u8 s1[31] = {
i16 12,
; CHECK-SAME: 12, 0,
i8* addrspacecast (i8 addrspace(1)* @p to i8*),
; CHECK-SAME: 0xFF(generic(p)), 0xFF00(generic(p)), 0xFF0000(generic(p)), 0xFF000000(generic(p)),
; CHECK64-SAME: 0xFF00000000(generic(p)), 0xFF0000000000(generic(p)), 0xFF000000000000(generic(p)), 0xFF00000000000000(generic(p)),
i8 34,
; CHECK-SAME: 34
void ()* @func,
; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
i8* addrspacecast (i8 addrspace(1)* getelementptr (i8, i8 addrspace(1)* @p, i32 3) to i8*),
; CHECK-SAME: 0xFF(generic(p)+3), 0xFF00(generic(p)+3), 0xFF0000(generic(p)+3), 0xFF000000(generic(p)+3),
; CHECK64-SAME: 0xFF00000000(generic(p)+3), 0xFF0000000000(generic(p)+3), 0xFF000000000000(generic(p)+3), 0xFF00000000000000(generic(p)+3),
i32 56 }>, align 1
; CHECK-SAME: 56, 0, 0, 0};
;; Test a case than an unaligned pointer is in a nested struct.
%t2i = type <{ void ()* }>
%t2o = type { i8, %t2i, i32 }
@s2 = addrspace(1) global %t2o {
; CHECK32: .global .align 8 .u8 s2[12] = {
; CHECK64: .global .align 8 .u8 s2[16] = {
i8 12,
; CHECK-SAME: 12,
%t2i <{ void()* @func }>,
; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
i32 34}
; CHECK-SAME: 0, 0, 0,
; CHECK-SAME: 34, 0, 0, 0};
;; Test that a packed struct which size is not multiple of the pointer size
;; is printed in bytes and uses the mask() operator for pointers even though
;; the pointers are aligned.
%t3 = type <{ void ()*, i8 }>
@s3 = addrspace(1) global %t3 <{
; CHECK32: .global .align 1 .u8 s3[5] = {
; CHECK64: .global .align 1 .u8 s3[9] = {
void ()* @func,
; CHECK-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
i8 56 }>, align 1
; CHECK-SAME: 56};
;; Test that a packed struct with aligned pointers is printed in words.
%t4 = type <{ void ()*, i64 }>
@s4 = addrspace(1) global %t4 <{
; CHECK32: .global .align 1 .u32 s4[3] = {
; CHECK64: .global .align 1 .u64 s4[2] = {
void()* @func,
; CHECK-SAME: func,
i64 15}>, align 1
; CHECK32-SAME: 15, 0};
; CHECK64-SAME: 15};
;; Test that a packed struct with unaligned pointers inside an array is handled.
%t5 = type <{ void ()*, i16 }>
@a5 = addrspace(1) global [2 x %t5] [%t5 <{ void()* @func, i16 5 }>, %t5 <{ void()* @func, i16 9 }> ]
; CHECK32: .global .align 8 .u8 a5[12] = {
; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 5, 0,
; CHECK32-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func), 9, 0};
; CHECK64: .global .align 8 .u8 a5[20] = {
; CHECK64-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
; CHECK64-SAME: 5, 0,
; CHECK64-SAME: 0xFF(func), 0xFF00(func), 0xFF0000(func), 0xFF000000(func),
; CHECK64-SAME: 0xFF00000000(func), 0xFF0000000000(func), 0xFF000000000000(func), 0xFF00000000000000(func),
; CHECK64-SAME: 9, 0};