From c9353eb4bcf8180446d8401796527fe7790ba23c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 27 Jun 2022 12:26:58 -0400 Subject: [PATCH] [Libomptarget] Use new tripcount argument in the runtime. The previous patch added an argument to the `__tgt_target_kernel` runtime function which includes the tripcount used for the loop clause. This was originally passed in via the `__kmpc_push_target_tripcount` function. Now we move this logic to the kernel launch itself and remove the need for the push function. Depends on D128816 Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D128817 --- openmp/libomptarget/include/omptarget.h | 4 +++- openmp/libomptarget/src/interface.cpp | 11 ++++++----- openmp/libomptarget/src/omptarget.cpp | 17 ++++++++++------- openmp/libomptarget/src/private.h | 3 ++- openmp/libomptarget/src/rtl.cpp | 2 +- 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 6651cc8787b2..f55b779cd745 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -118,8 +118,10 @@ struct __tgt_kernel_arguments { int64_t *ArgSizes; // Size of the argument data in bytes. int64_t *ArgTypes; // Type of the data (e.g. to / from). void **ArgNames; // Name of the data for debugging, possibly null. - void **ArgMappers; // User-defined mappers, possible null. + void **ArgMappers; // User-defined mappers, possibly null. + int64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise. }; +static_assert(sizeof(__tgt_kernel_arguments) == 64 && "Invalid struct size"); /// This struct is a record of an entry point or global. For a function /// entry point the size is expected to be zero diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index b27d5dfccbf7..ca54c06a6aa6 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -278,8 +278,8 @@ EXTERN int __tgt_target_mapper(ident_t *Loc, int64_t DeviceId, void *HostPtr, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers) { TIMESCOPE_WITH_IDENT(Loc); - __tgt_kernel_arguments KernelArgs{1, ArgNum, ArgsBase, Args, - ArgSizes, ArgTypes, ArgNames, ArgMappers}; + __tgt_kernel_arguments KernelArgs{ + 1, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, -1}; return __tgt_target_kernel(Loc, DeviceId, -1, 0, HostPtr, &KernelArgs); } @@ -326,8 +326,8 @@ EXTERN int __tgt_target_teams_mapper(ident_t *Loc, int64_t DeviceId, void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit) { TIMESCOPE_WITH_IDENT(Loc); - __tgt_kernel_arguments KernelArgs{1, ArgNum, ArgsBase, Args, - ArgSizes, ArgTypes, ArgNames, ArgMappers}; + __tgt_kernel_arguments KernelArgs{ + 1, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, -1}; return __tgt_target_kernel(Loc, DeviceId, TeamNum, ThreadLimit, HostPtr, &KernelArgs); } @@ -381,7 +381,8 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, AsyncInfoTy AsyncInfo(Device); int Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, - Args->ArgMappers, NumTeams, ThreadLimit, IsTeams, AsyncInfo); + Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount, + IsTeams, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) Rc = AsyncInfo.synchronize(); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 1430b28e6077..1174c39bc626 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -190,9 +190,9 @@ static int initLibrary(DeviceTy &Device) { DP("Has pending ctors... call now\n"); for (auto &Entry : Lib.second.PendingCtors) { void *Ctor = Entry; - int Rc = - target(nullptr, Device, Ctor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo); + int Rc = target(nullptr, Device, Ctor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, 0, true /*team*/, + AsyncInfo); if (Rc != OFFLOAD_SUCCESS) { REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(Ctor)); return OFFLOAD_FAIL; @@ -1140,7 +1140,6 @@ uint64_t getLoopTripCount(int64_t DeviceId) { if (I != Device.LoopTripCnt.end()) { LoopTripCount = I->second; Device.LoopTripCnt.erase(I); - DP("loop trip count is %" PRIu64 ".\n", LoopTripCount); } } @@ -1488,9 +1487,9 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, - int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) { + int32_t ThreadLimit, uint64_t Tripcount, int IsTeamConstruct, + AsyncInfoTy &AsyncInfo) { int32_t DeviceId = Device.DeviceID; - TableMap *TM = getTableMap(HostPtr); // No map for this host pointer found! if (!TM) { @@ -1509,6 +1508,10 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, } assert(TargetTable && "Global data has not been mapped\n"); + // FIXME: Use legacy tripcount method if it is '-1'. + Tripcount = Tripcount == -1 ? getLoopTripCount(DeviceId) : Tripcount; + DP("loop trip count is %" PRIu64 ".\n", Tripcount); + // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we // need to manifest base pointers prior to launching a kernel. Even if we have // mapped an object only partially, e.g. A[N:M], although the kernel is @@ -1546,7 +1549,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, if (IsTeamConstruct) Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], TgtArgs.size(), TeamNum, ThreadLimit, - getLoopTripCount(DeviceId), AsyncInfo); + Tripcount, AsyncInfo); else Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], TgtArgs.size(), AsyncInfo); diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index 49a9f1f25fc0..6be4a7f0e090 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -42,7 +42,8 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit, - int IsTeamConstruct, AsyncInfoTy &AsyncInfo); + uint64_t Tripcount, int IsTeamConstruct, + AsyncInfoTy &AsyncInfo); extern void handleTargetOutcome(bool Success, ident_t *Loc); extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc); diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp index f8c421940cc6..7337854c5d1f 100644 --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -444,7 +444,7 @@ void RTLsTy::unregisterLib(__tgt_bin_desc *Desc) { AsyncInfoTy AsyncInfo(Device); for (auto &Dtor : Device.PendingCtorsDtors[Desc].PendingDtors) { int Rc = target(nullptr, Device, Dtor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/, + nullptr, nullptr, nullptr, 1, 1, 0, true /*team*/, AsyncInfo); if (Rc != OFFLOAD_SUCCESS) { DP("Running destructor " DPxMOD " failed.\n", DPxPTR(Dtor));