[Libomptarget] Add basic support for dynamic shared memory on AMDGPU
This patchs adds the arguments necessary to allocate the size of the dynamic shared memory via the `LIBOMPTARGET_SHARED_MEMORY_SIZE` environment variable. This patch only allocates the memory, AMDGPU has a limitation that shared memory can only be accessed from the kernel directly. So this will currently only work with optimizations to inline the accessor function. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D125252
This commit is contained in:
parent
716d428ab5
commit
f4f23de1a4
|
@ -336,6 +336,7 @@ struct EnvironmentVariables {
|
|||
int TeamLimit;
|
||||
int TeamThreadLimit;
|
||||
int MaxTeamsDefault;
|
||||
int DynamicMemSize;
|
||||
};
|
||||
|
||||
template <uint32_t wavesize>
|
||||
|
@ -692,9 +693,9 @@ public:
|
|||
return HostFineGrainedMemoryPool;
|
||||
}
|
||||
|
||||
static int readEnvElseMinusOne(const char *Env) {
|
||||
static int readEnv(const char *Env, int Default = -1) {
|
||||
const char *envStr = getenv(Env);
|
||||
int res = -1;
|
||||
int res = Default;
|
||||
if (envStr) {
|
||||
res = std::stoi(envStr);
|
||||
DP("Parsed %s=%d\n", Env, res);
|
||||
|
@ -811,10 +812,11 @@ public:
|
|||
}
|
||||
|
||||
// Get environment variables regarding teams
|
||||
Env.TeamLimit = readEnvElseMinusOne("OMP_TEAM_LIMIT");
|
||||
Env.NumTeams = readEnvElseMinusOne("OMP_NUM_TEAMS");
|
||||
Env.MaxTeamsDefault = readEnvElseMinusOne("OMP_MAX_TEAMS_DEFAULT");
|
||||
Env.TeamThreadLimit = readEnvElseMinusOne("OMP_TEAMS_THREAD_LIMIT");
|
||||
Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
|
||||
Env.NumTeams = readEnv("OMP_NUM_TEAMS");
|
||||
Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
|
||||
Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
|
||||
Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
|
||||
|
||||
// Default state.
|
||||
RequiresFlags = OMP_REQ_UNDEFINED;
|
||||
|
@ -1123,7 +1125,8 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
|
|||
|
||||
const atl_kernel_info_t KernelInfoEntry =
|
||||
KernelInfoTable[device_id][kernel_name];
|
||||
const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
|
||||
const uint32_t group_segment_size =
|
||||
KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize;
|
||||
const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
|
||||
const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
|
||||
const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
|
||||
|
@ -1182,7 +1185,7 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
|
|||
packet->grid_size_y = 1;
|
||||
packet->grid_size_z = 1;
|
||||
packet->private_segment_size = KernelInfoEntry.private_segment_size;
|
||||
packet->group_segment_size = KernelInfoEntry.group_segment_size;
|
||||
packet->group_segment_size = group_segment_size;
|
||||
packet->kernel_object = KernelInfoEntry.kernel_object;
|
||||
packet->kernarg_address = 0; // use the block allocator
|
||||
packet->reserved2 = 0; // impl writes id_ here
|
||||
|
@ -1530,14 +1533,14 @@ struct device_environment {
|
|||
__tgt_device_image *image;
|
||||
const size_t img_size;
|
||||
|
||||
device_environment(int device_id, int number_devices,
|
||||
device_environment(int device_id, int number_devices, int dynamic_mem_size,
|
||||
__tgt_device_image *image, const size_t img_size)
|
||||
: image(image), img_size(img_size) {
|
||||
|
||||
host_device_env.NumDevices = number_devices;
|
||||
host_device_env.DeviceNum = device_id;
|
||||
host_device_env.DebugKind = 0;
|
||||
host_device_env.DynamicMemSize = 0;
|
||||
host_device_env.DynamicMemSize = dynamic_mem_size;
|
||||
if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
|
||||
host_device_env.DebugKind = std::stoi(envStr);
|
||||
}
|
||||
|
@ -1861,8 +1864,9 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
|
|||
}
|
||||
|
||||
{
|
||||
auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image,
|
||||
img_size);
|
||||
auto env =
|
||||
device_environment(device_id, DeviceInfo.NumberOfDevices,
|
||||
DeviceInfo.Env.DynamicMemSize, image, img_size);
|
||||
|
||||
auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id];
|
||||
auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device
|
||||
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
|
||||
// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
|
||||
// REQUIRES: amdgcn-amd-amdhsa
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main() {
|
||||
int x;
|
||||
#pragma omp target parallel map(from : x)
|
||||
{
|
||||
int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
|
||||
#pragma omp barrier
|
||||
if (omp_get_thread_num() == 0)
|
||||
*buf = 1;
|
||||
#pragma omp barrier
|
||||
if (omp_get_thread_num() == 1)
|
||||
x = *buf;
|
||||
}
|
||||
|
||||
// CHECK: PASS
|
||||
if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
|
||||
printf("PASS\n");
|
||||
}
|
Loading…
Reference in New Issue