From f4f23de1a46f94762b8192e82f20fb86b41c339f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 9 May 2022 14:26:01 -0400 Subject: [PATCH] [Libomptarget] Add basic support for dynamic shared memory on AMDGPU This patchs adds the arguments necessary to allocate the size of the dynamic shared memory via the `LIBOMPTARGET_SHARED_MEMORY_SIZE` environment variable. This patch only allocates the memory, AMDGPU has a limitation that shared memory can only be accessed from the kernel directly. So this will currently only work with optimizations to inline the accessor function. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D125252 --- .../libomptarget/plugins/amdgpu/src/rtl.cpp | 28 +++++++++++-------- .../api/omp_dynamic_shared_memory_amdgpu.c | 25 +++++++++++++++++ 2 files changed, 41 insertions(+), 12 deletions(-) create mode 100644 openmp/libomptarget/test/api/omp_dynamic_shared_memory_amdgpu.c diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 3086334f8ef6..4b5dd0de4f06 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -336,6 +336,7 @@ struct EnvironmentVariables { int TeamLimit; int TeamThreadLimit; int MaxTeamsDefault; + int DynamicMemSize; }; template @@ -692,9 +693,9 @@ public: return HostFineGrainedMemoryPool; } - static int readEnvElseMinusOne(const char *Env) { + static int readEnv(const char *Env, int Default = -1) { const char *envStr = getenv(Env); - int res = -1; + int res = Default; if (envStr) { res = std::stoi(envStr); DP("Parsed %s=%d\n", Env, res); @@ -811,10 +812,11 @@ public: } // Get environment variables regarding teams - Env.TeamLimit = readEnvElseMinusOne("OMP_TEAM_LIMIT"); - Env.NumTeams = readEnvElseMinusOne("OMP_NUM_TEAMS"); - Env.MaxTeamsDefault = readEnvElseMinusOne("OMP_MAX_TEAMS_DEFAULT"); - Env.TeamThreadLimit = readEnvElseMinusOne("OMP_TEAMS_THREAD_LIMIT"); + Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); + Env.NumTeams = readEnv("OMP_NUM_TEAMS"); + Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); + Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); + Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); // Default state. RequiresFlags = OMP_REQ_UNDEFINED; @@ -1123,7 +1125,8 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, const atl_kernel_info_t KernelInfoEntry = KernelInfoTable[device_id][kernel_name]; - const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; + const uint32_t group_segment_size = + KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize; const uint32_t sgpr_count = KernelInfoEntry.sgpr_count; const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; @@ -1182,7 +1185,7 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, packet->grid_size_y = 1; packet->grid_size_z = 1; packet->private_segment_size = KernelInfoEntry.private_segment_size; - packet->group_segment_size = KernelInfoEntry.group_segment_size; + packet->group_segment_size = group_segment_size; packet->kernel_object = KernelInfoEntry.kernel_object; packet->kernarg_address = 0; // use the block allocator packet->reserved2 = 0; // impl writes id_ here @@ -1530,14 +1533,14 @@ struct device_environment { __tgt_device_image *image; const size_t img_size; - device_environment(int device_id, int number_devices, + device_environment(int device_id, int number_devices, int dynamic_mem_size, __tgt_device_image *image, const size_t img_size) : image(image), img_size(img_size) { host_device_env.NumDevices = number_devices; host_device_env.DeviceNum = device_id; host_device_env.DebugKind = 0; - host_device_env.DynamicMemSize = 0; + host_device_env.DynamicMemSize = dynamic_mem_size; if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { host_device_env.DebugKind = std::stoi(envStr); } @@ -1861,8 +1864,9 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, } { - auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, - img_size); + auto env = + device_environment(device_id, DeviceInfo.NumberOfDevices, + DeviceInfo.Env.DynamicMemSize, image, img_size); auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_amdgpu.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_amdgpu.c new file mode 100644 index 000000000000..0b4d9d6ea9d4 --- /dev/null +++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_amdgpu.c @@ -0,0 +1,25 @@ +// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device +// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \ +// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa +// REQUIRES: amdgcn-amd-amdhsa + +#include +#include + +int main() { + int x; +#pragma omp target parallel map(from : x) + { + int *buf = llvm_omp_target_dynamic_shared_alloc() + 252; +#pragma omp barrier + if (omp_get_thread_num() == 0) + *buf = 1; +#pragma omp barrier + if (omp_get_thread_num() == 1) + x = *buf; + } + + // CHECK: PASS + if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL) + printf("PASS\n"); +}