From b09a5e5cb3802ddc8efa32c53146fcae007a4c2f Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Tue, 29 Nov 2022 15:19:44 -0600 Subject: [PATCH] Revert "Add mean_anyway to hpc config" my bad, wrong repo ,so sorry. This reverts commit 0b9350f3da7daf1d740bbbfab79d01613fcd29f4. --- openmp/libomptarget/DeviceRTL/src/Mapping.cpp | 2 +- .../amdgpu/impl/get_elf_mach_gfx_name.cpp | 6 -- .../amdgpu/impl/get_elf_mach_gfx_name.h | 45 ---------- .../plugins/amdgpu/impl/internal.h | 11 +++ .../plugins/amdgpu/impl/system.cpp | 45 +--------- .../libomptarget/plugins/amdgpu/src/rtl.cpp | 86 ++----------------- openmp/runtime/cmake/LibompHandleFlags.cmake | 2 +- openmp/runtime/cmake/config-ix.cmake | 2 +- 8 files changed, 27 insertions(+), 172 deletions(-) diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp index 6dd935e1128a..512577c06f9e 100644 --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -16,7 +16,7 @@ #include "Utils.h" #pragma omp begin declare target device_type(nohost) -extern const uint16_t __oclc_ABI_version; + #include "llvm/Frontend/OpenMP/OMPGridValues.h" using namespace _OMP; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp index f4a4ceaa92a8..69f2a716a8fd 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp @@ -11,7 +11,6 @@ // identifier) and contains more up to date values for the enum checked here. // rtl.cpp uses the system elf.h. #include "llvm/BinaryFormat/ELF.h" -using namespace llvm::ELF; const char *get_elf_mach_gfx_name(uint32_t EFlags) { using namespace llvm::ELF; @@ -79,8 +78,3 @@ const char *get_elf_mach_gfx_name(uint32_t EFlags) { return "--unknown gfx"; } } - -const uint16_t implicitArgsSize(uint16_t Version) { - return Version < ELFABIVERSION_AMDGPU_HSA_V5 ? IMPLICITARGS::COV4_SIZE - : IMPLICITARGS::COV5_SIZE; -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h index a5404bd3d793..177963e1b8b5 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h @@ -12,49 +12,4 @@ const char *get_elf_mach_gfx_name(uint32_t EFlags); -enum IMPLICITARGS : uint16_t { - COV4_SIZE = 56, - COV4_HOSTCALL_PTR_OFFSET = 24, - HOSTCALL_PTR_SIZE = 8, - - COV5_SIZE = 256, - - COV5_BLOCK_COUNT_X_OFFSET = 0, - COV5_BLOCK_COUNT_X_SIZE = 4, - - COV5_BLOCK_COUNT_Y_OFFSET = 4, - COV5_BLOCK_COUNT_Y_SIZE = 4, - - COV5_BLOCK_COUNT_Z_OFFSET = 8, - COV5_BLOCK_COUNT_Z_SIZE = 4, - - COV5_GROUP_SIZE_X_OFFSET = 12, - COV5_GROUP_SIZE_X_SIZE = 2, - - COV5_GROUP_SIZE_Y_OFFSET = 14, - COV5_GROUP_SIZE_Y_SIZE = 2, - - COV5_GROUP_SIZE_Z_OFFSET = 16, - COV5_GROUP_SIZE_Z_SIZE = 2, - - COV5_REMAINDER_X_OFFSET = 18, - COV5_REMAINDER_X_SIZE = 2, - - COV5_REMAINDER_Y_OFFSET = 20, - COV5_REMAINDER_Y_SIZE = 2, - - COV5_REMAINDER_Z_OFFSET = 22, - COV5_REMAINDER_Z_SIZE = 2, - - COV5_GRID_DIMS_OFFSET = 64, - COV5_GRID_DIMS_SIZE = 2, - - COV5_HOSTCALL_PTR_OFFSET = 80, - - COV5_HEAPV1_PTR_OFFSET = 96, - COV5_HEAPV1_PTR_SIZE = 8 -}; - -const uint16_t implicitArgsSize(uint16_t Version); - #endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h index dc94b0ed01f2..63b60b24a557 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -33,6 +33,17 @@ #define MAX_NUM_KERNELS (1024 * 16) +typedef struct impl_implicit_args_s { + uint64_t offset_x; + uint64_t offset_y; + uint64_t offset_z; + uint64_t hostcall_ptr; + uint64_t unused0; + uint64_t unused1; + uint64_t unused2; +} impl_implicit_args_t; +static_assert(sizeof(impl_implicit_args_t) == 56, ""); + // ---------------------- Kernel Start ------------- typedef struct atl_kernel_info_s { uint64_t kernel_object; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp index 0170cd4440f5..e8dba47b6cde 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -67,17 +67,6 @@ public: HiddenMultiGridSyncArg, HiddenHostcallBuffer, HiddenHeapV1, - HiddenBlockCountX, - HiddenBlockCountY, - HiddenBlockCountZ, - HiddenGroupSizeX, - HiddenGroupSizeY, - HiddenGroupSizeZ, - HiddenRemainderX, - HiddenRemainderY, - HiddenRemainderZ, - HiddenGridDims, - HiddenQueuePtr, Unknown }; @@ -113,19 +102,7 @@ static const std::map ArgValueKind = { {"hidden_multigrid_sync_arg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, - {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}, - {"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX}, - {"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY}, - {"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ}, - {"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX}, - {"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY}, - {"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ}, - {"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX}, - {"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY}, - {"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ}, - {"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims}, - {"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr}, -}; + {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}}; namespace core { @@ -187,17 +164,6 @@ static bool isImplicit(KernelArgMD::ValueKind value_kind) { case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: case KernelArgMD::ValueKind::HiddenHostcallBuffer: case KernelArgMD::ValueKind::HiddenHeapV1: - case KernelArgMD::ValueKind::HiddenBlockCountX: - case KernelArgMD::ValueKind::HiddenBlockCountY: - case KernelArgMD::ValueKind::HiddenBlockCountZ: - case KernelArgMD::ValueKind::HiddenGroupSizeX: - case KernelArgMD::ValueKind::HiddenGroupSizeY: - case KernelArgMD::ValueKind::HiddenGroupSizeZ: - case KernelArgMD::ValueKind::HiddenRemainderX: - case KernelArgMD::ValueKind::HiddenRemainderY: - case KernelArgMD::ValueKind::HiddenRemainderZ: - case KernelArgMD::ValueKind::HiddenGridDims: - case KernelArgMD::ValueKind::HiddenQueuePtr: return true; default: return false; @@ -507,7 +473,8 @@ static hsa_status_t get_code_object_custom_metadata( size_t new_offset = lcArg.offset_; size_t padding = new_offset - offset; offset = new_offset; - + DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_, + lcArg.offset_); offset += lcArg.size_; // check if the arg is a hidden/implicit arg @@ -515,13 +482,9 @@ static hsa_status_t get_code_object_custom_metadata( if (!isImplicit(lcArg.valueKind_)) { info.explicit_argument_count++; kernel_explicit_args_size += lcArg.size_; - DP("Explicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i, - lcArg.name_.c_str(), lcArg.size_, lcArg.offset_); } else { info.implicit_argument_count++; hasHiddenArgs = true; - DP("Implicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i, - lcArg.name_.c_str(), lcArg.size_, lcArg.offset_); } kernel_explicit_args_size += padding; } @@ -529,7 +492,7 @@ static hsa_status_t get_code_object_custom_metadata( // TODO: Probably don't want this arithmetic info.kernel_segment_size = - (!hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size); + (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size); DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), kernel_segment_size, info.kernel_segment_size); diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 38879c8e6eb8..b0e29cb6e4e9 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -124,10 +124,9 @@ public: uint32_t KernargSegmentSize; void *KernargRegion = nullptr; std::queue FreeKernargSegments; - uint16_t CodeObjectVersion; uint32_t kernargSizeIncludingImplicit() { - return KernargSegmentSize + implicitArgsSize(CodeObjectVersion); + return KernargSegmentSize + sizeof(impl_implicit_args_t); } ~KernelArgPool() { @@ -144,10 +143,8 @@ public: KernelArgPool(const KernelArgPool &) = delete; KernelArgPool(KernelArgPool &&) = delete; - KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool, - uint16_t CodeObjectVersion) - : KernargSegmentSize(KernargSegmentSize), - CodeObjectVersion(CodeObjectVersion) { + KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) + : KernargSegmentSize(KernargSegmentSize) { // impl uses one pool per kernel for all gpus, with a fixed upper size // preserving that exact scheme here, including the queue @@ -231,16 +228,16 @@ struct KernelTy { KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, int32_t DeviceId, void *CallStackAddr, const char *Name, uint32_t KernargSegmentSize, - hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion) + hsa_amd_memory_pool_t &KernArgMemoryPool) : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); std::string N(Name); if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { - KernelArgPoolMap.insert(std::make_pair( - N, std::unique_ptr(new KernelArgPool( - KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion)))); + KernelArgPoolMap.insert( + std::make_pair(N, std::unique_ptr(new KernelArgPool( + KernargSegmentSize, KernArgMemoryPool)))); } } }; @@ -477,7 +474,6 @@ public: std::vector WarpSize; std::vector GPUName; std::vector TargetID; - uint16_t CodeObjectVersion; // OpenMP properties std::vector NumTeams; @@ -491,7 +487,6 @@ public: // Resource pools SignalPoolT FreeSignalPool; - std::vector PreallocatedDeviceHeap; bool HostcallRequired = false; @@ -866,6 +861,7 @@ public: "Unexpected device id!"); FuncGblEntries[DeviceId].emplace_back(); FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); + // KernelArgPoolMap.clear(); E.Entries.clear(); E.Table.EntriesBegin = E.Table.EntriesEnd = 0; } @@ -1036,7 +1032,6 @@ public: SymbolInfoTable.resize(NumberOfDevices); DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); DeviceFineGrainedMemoryPools.resize(NumberOfDevices); - PreallocatedDeviceHeap.resize(NumberOfDevices); Err = setupDevicePools(HSAAgents); if (Err != HSA_STATUS_SUCCESS) { @@ -1366,27 +1361,6 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { return PacketId; } -const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) { - char *ImageBegin = (char *)Image->ImageStart; - size_t ImageSize = (char *)Image->ImageEnd - ImageBegin; - - StringRef Buffer = StringRef(ImageBegin, ImageSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str()); - return 1; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) { - auto Header = ELFObj->getELFFile().getHeader(); - uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]); - DP("ELFABIVERSION Version: %u\n", Version); - return Version; - } - return 0; -} - int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripcount) { @@ -1464,7 +1438,6 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, } uint64_t PacketId = acquireAvailablePacketId(Queue); - uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion; const uint32_t Mask = Queue->size - 1; // size is a power of 2 hsa_kernel_dispatch_packet_t *Packet = (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); @@ -2187,40 +2160,6 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, return Res; } -static void preAllocateHeapMemoryForCov5() { - void *DevPtr; - for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) { - DevPtr = nullptr; - size_t PreAllocSize = 131072; // 128KB per device - - hsa_amd_memory_pool_t MemoryPool = - DeviceInfo().DeviceCoarseGrainedMemoryPools[I]; - hsa_status_t Err = - hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error allocating preallocated heap device memory: %s\n", - get_error_string(Err)); - } - - Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL, - DevPtr); - if (Err != HSA_STATUS_SUCCESS) { - DP("hsa allow_access_to_all_gpu_agents failed: %s\n", - get_error_string(Err)); - } - - uint64_t Rounded = - sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t)); - Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t)); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error zero-initializing preallocated heap device memory:%s\n", - get_error_string(Err)); - } - - DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr; - } -} - __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image) { // This function loads the device image onto gpu[DeviceId] and does other @@ -2255,12 +2194,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, if (!elfMachineIdIsAmdgcn(Image)) return NULL; - DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image); - if (DeviceInfo().CodeObjectVersion >= - llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) { - preAllocateHeapMemoryForCov5(); - } - { auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, @@ -2584,8 +2517,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name, KernargSegmentSize, - DeviceInfo().KernArgPool, - DeviceInfo().CodeObjectVersion)); + DeviceInfo().KernArgPool)); __tgt_offload_entry Entry = *E; Entry.addr = (void *)&KernelsList.back(); DeviceInfo().addOffloadEntry(DeviceId, Entry); diff --git a/openmp/runtime/cmake/LibompHandleFlags.cmake b/openmp/runtime/cmake/LibompHandleFlags.cmake index 684eae9f0b25..a6adbe3f2f54 100644 --- a/openmp/runtime/cmake/LibompHandleFlags.cmake +++ b/openmp/runtime/cmake/LibompHandleFlags.cmake @@ -100,7 +100,7 @@ function(libomp_get_ldflags ldflags) libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}" IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG) libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG) - libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG) + libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG) libomp_append(ldflags_local "-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858 libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG) libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG) diff --git a/openmp/runtime/cmake/config-ix.cmake b/openmp/runtime/cmake/config-ix.cmake index d1346121edf5..1e02d5a8b5cf 100644 --- a/openmp/runtime/cmake/config-ix.cmake +++ b/openmp/runtime/cmake/config-ix.cmake @@ -131,7 +131,7 @@ if(WIN32) elseif(NOT APPLE) libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG) libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG) - libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG) + libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG) libomp_check_linker_flag("-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858 libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG) libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)