AMDGPU: Use attributor to propagate amdgpu-flat-work-group-size

This can merge the acceptable ranges based on the call graph, rather
than the simple application of the attribute. Remove the handling from
the old pass.
This commit is contained in:
Matt Arsenault 2021-09-09 19:57:12 -04:00
parent 8d4b74ac3f
commit ec57b37551
4 changed files with 343 additions and 54 deletions

View File

@ -128,6 +128,17 @@ public:
return ST.hasApertureRegs();
}
/// Return the min/max flat work group sizes the subtarget derives for \p F
/// from its current amdgpu-flat-work-group-size attribute (or the defaults
/// when the attribute is absent).
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
  return TM.getSubtarget<GCNSubtarget>(F).getFlatWorkGroupSizes(F);
}
std::pair<unsigned, unsigned>
getMaximumFlatWorkGroupRange(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
}
private:
/// Check if the ConstantExpr \p CE requires queue ptr attribute.
static bool visitConstExpr(const ConstantExpr *CE) {
@ -470,6 +481,118 @@ AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDAttributes is only valid for function position");
}
/// Propagate amdgpu-flat-work-group-size attribute.
///
/// Abstract attribute over an IntegerRangeState: the assumed range of legal
/// flat work group sizes for the associated function. The range is narrowed
/// by intersecting the ranges of all callers, so a callee ends up with the
/// union of sizes it can actually be launched with.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  // 32 is the bit width of the tracked integer range, not a group size.
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  /// Seed the known range from the function's current attribute (or the
  /// subtarget defaults) as interpreted by the subtarget.
  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    // ConstantRange's upper bound is exclusive, hence MaxGroupSize + 1.
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
  }

  /// Clamp this function's assumed range by every caller's range: the callee
  /// must be valid for any group size a caller may run with.
  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      // REQUIRED: this AA's result is meaningless without the caller's.
      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    // If not every call site is visible we cannot bound the callers, so
    // fall back to the pessimistic (widest known) range.
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Write the final range back as an amdgpu-flat-work-group-size string
  /// attribute, unless it matches the subtarget's implied default range.
  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    // Note: assumed upper bound is exclusive, hence the - 1 on comparison
    // and when printing below.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    // ForceReplace: overwrite any attribute already on the function.
    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  /// Human-readable summary of the assumed (inclusive) range for debugging.
  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDFlatWorkGroupSize::ID = 0;

/// Factory for AAAMDFlatWorkGroupSize; only valid on a function position.
AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  // Guard clause: any non-function position is a usage error.
  if (IRP.getPositionKind() != IRPosition::IRP_FUNCTION)
    llvm_unreachable(
        "AAAMDFlatWorkGroupSize is only valid for function position");
  return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
}
class AMDGPUAttributor : public ModulePass {
public:
AMDGPUAttributor() : ModulePass(ID) {}
@ -497,7 +620,8 @@ public:
BumpPtrAllocator Allocator;
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AACallEdges::ID});
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID});
Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
@ -505,6 +629,9 @@ public:
if (!F.isIntrinsic()) {
A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
}
}
}

View File

@ -55,10 +55,7 @@ static constexpr const FeatureBitset TargetFeatures = {
// Attributes to propagate.
// TODO: Support conservative min/max merging instead of cloning.
static constexpr const char* AttributeNames[] = {
"amdgpu-waves-per-eu",
"amdgpu-flat-work-group-size"
};
static constexpr const char *AttributeNames[] = {"amdgpu-waves-per-eu"};
static constexpr unsigned NumAttr =
sizeof(AttributeNames) / sizeof(AttributeNames[0]);

View File

@ -1,49 +0,0 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-propagate-attributes-late %s | FileCheck %s
; CHECK: define internal void @max_flat_1_1024() #0 {
define internal void @max_flat_1_1024() #0 {
ret void
}
; CHECK: define internal void @max_flat_1_256() #1 {
define internal void @max_flat_1_256() #1 {
ret void
}
; CHECK: define amdgpu_kernel void @kernel_1_256_call_default() #1 {
define amdgpu_kernel void @kernel_1_256_call_default() #1 {
call void @default()
ret void
}
; CHECK: define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
call void @max_flat_1_256()
ret void
}
; CHECK: define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
call void @max_flat_64_64()
ret void
}
; CHECK: define internal void @max_flat_64_64() #2 {
define internal void @max_flat_64_64() #2 {
ret void
}
; CHECK: define internal void @default() #2 {
define internal void @default() #3 {
ret void
}
attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024" }
attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256" }
attributes #2 = { noinline "amdgpu-flat-work-group-size"="64,64" }
attributes #3 = { noinline }
; CHECK: attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024"
; CHECK-NEXT: attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256"
; CHECK-NEXT: attributes #2 = { noinline "amdgpu-flat-work-group-size"="1,256"

View File

@ -0,0 +1,214 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
; Check propagation of amdgpu-flat-work-group-size attribute.
; Called from a single kernel with 1,256
define internal void @default_to_1_256() {
; CHECK-LABEL: define {{[^@]+}}@default_to_1_256
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define amdgpu_kernel void @kernel_1_256() #0 {
; CHECK-LABEL: define {{[^@]+}}@kernel_1_256
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: call void @default_to_1_256()
; CHECK-NEXT: ret void
;
call void @default_to_1_256()
ret void
}
; Called from a single kernel with 64,128
define internal void @default_to_64_128() {
; CHECK-LABEL: define {{[^@]+}}@default_to_64_128
; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define amdgpu_kernel void @kernel_64_128() #1 {
; CHECK-LABEL: define {{[^@]+}}@kernel_64_128
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: call void @default_to_64_128()
; CHECK-NEXT: call void @flat_group_64_64()
; CHECK-NEXT: call void @default_to_64_256()
; CHECK-NEXT: call void @flat_group_128_256()
; CHECK-NEXT: ret void
;
call void @default_to_64_128()
call void @flat_group_64_64()
call void @default_to_64_256()
call void @flat_group_128_256()
ret void
}
; Called from kernels with 128,512 and 512,512
define internal void @default_to_128_512() {
; CHECK-LABEL: define {{[^@]+}}@default_to_128_512
; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
; This already has a strict bounds, but called from kernels with wider
; bounds, and should not be changed.
define internal void @flat_group_64_64() #2 {
; CHECK-LABEL: define {{[^@]+}}@flat_group_64_64
; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
; 128,256 -> 128,128
define internal void @flat_group_128_256() #3 {
; CHECK-LABEL: define {{[^@]+}}@flat_group_128_256
; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define internal void @flat_group_512_1024() #4 {
; CHECK-LABEL: define {{[^@]+}}@flat_group_512_1024
; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
define amdgpu_kernel void @kernel_128_512() #5 {
; CHECK-LABEL: define {{[^@]+}}@kernel_128_512
; CHECK-SAME: () #[[ATTR2]] {
; CHECK-NEXT: call void @default_to_128_512()
; CHECK-NEXT: call void @flat_group_64_64()
; CHECK-NEXT: ret void
;
call void @default_to_128_512()
call void @flat_group_64_64()
ret void
}
define amdgpu_kernel void @kernel_512_512() #6 {
; CHECK-LABEL: define {{[^@]+}}@kernel_512_512
; CHECK-SAME: () #[[ATTR5]] {
; CHECK-NEXT: call void @default_to_128_512()
; CHECK-NEXT: call void @flat_group_512_1024()
; CHECK-NEXT: ret void
;
call void @default_to_128_512()
call void @flat_group_512_1024()
ret void
}
; Called from kernels with 128,256 and 64,128 => 64,256
define internal void @default_to_64_256() {
; CHECK-LABEL: define {{[^@]+}}@default_to_64_256
; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}
; The kernel's lower bound is higher than the callee's lower bound, so
; this should probably be illegal.
define amdgpu_kernel void @kernel_128_256() #3 {
; CHECK-LABEL: define {{[^@]+}}@kernel_128_256
; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
; CHECK-NEXT: call void @default_to_64_256()
; CHECK-NEXT: ret void
;
call void @default_to_64_256()
ret void
}
; 64,128 -> 64,128
define internal void @merge_cycle_0() #1 {
; CHECK-LABEL: define {{[^@]+}}@merge_cycle_0
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: call void @merge_cycle_1()
; CHECK-NEXT: ret void
;
call void @merge_cycle_1()
ret void
}
; 128,256 -> 128,128
define internal void @merge_cycle_1() #3 {
; CHECK-LABEL: define {{[^@]+}}@merge_cycle_1
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @merge_cycle_0()
; CHECK-NEXT: ret void
;
call void @merge_cycle_0()
ret void
}
define amdgpu_kernel void @kernel_64_256() #7 {
; CHECK-LABEL: define {{[^@]+}}@kernel_64_256
; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: call void @merge_cycle_0()
; CHECK-NEXT: call void @default_captured_address()
; CHECK-NEXT: call void @externally_visible_default()
; CHECK-NEXT: [[F32:%.*]] = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
; CHECK-NEXT: ret void
;
call void @merge_cycle_0()
call void @default_captured_address()
call void @externally_visible_default()
%f32 = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
ret void
}
define internal void @default_captured_address() {
; CHECK-LABEL: define {{[^@]+}}@default_captured_address
; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
; CHECK-NEXT: store volatile void ()* @default_captured_address, void ()** undef, align 8
; CHECK-NEXT: ret void
;
store volatile void ()* @default_captured_address, void ()** undef, align 8
ret void
}
define void @externally_visible_default() {
; CHECK-LABEL: define {{[^@]+}}@externally_visible_default
; CHECK-SAME: () #[[ATTR8]] {
; CHECK-NEXT: ret void
;
ret void
}
; 1,1024 -> 64,256
define internal i32 @bitcasted_function() {
; CHECK-LABEL: define {{[^@]+}}@bitcasted_function
; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: ret i32 0
;
ret i32 0
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { "amdgpu-flat-work-group-size"="64,128" }
attributes #2 = { "amdgpu-flat-work-group-size"="64,64" }
attributes #3 = { "amdgpu-flat-work-group-size"="128,256" }
attributes #4 = { "amdgpu-flat-work-group-size"="512,1024" }
attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.