[Libomptarget] Allow the device runtime to be compiled for the host

Currently the OpenMP offloading device runtime is only expected to be
compiled for the specific architecture it's targeting. This is
problematic if we want to make compiling the device runtime more general
via the standar `clang` driver rather than invoking the clang front-end
directly. This patch addresses this by primarily changing the declare
type to `nohost` so the host will not contain any of this code.
Additionally we forward declare the functions that are defined via
variants, otherwise these would cause problems on the host.

Reviewed By: jdoerfert, tianshilei1992

Differential Revision: https://reviews.llvm.org/D125260
This commit is contained in:
Joseph Huber 2022-05-09 14:22:59 -04:00
parent 0a22dfcb11
commit b4f8443d97
14 changed files with 63 additions and 16 deletions

View File

@ -18,7 +18,7 @@ namespace _OMP {
namespace mapping {
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
inline constexpr uint32_t MaxThreadsPerTeam = 1024;

View File

@ -15,7 +15,7 @@
#include "Debug.h"
#include "Types.h"
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
namespace _OMP {

View File

@ -18,7 +18,7 @@
using namespace _OMP;
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
// defined by CGOpenMPRuntimeGPU
extern uint32_t __omp_rtl_debug_kind;

View File

@ -18,7 +18,7 @@
using namespace _OMP;
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
extern "C" {
void __assert_assume(bool condition) { __builtin_assume(condition); }
@ -30,6 +30,10 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
__builtin_trap();
}
namespace impl {
int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t);
}
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
int32_t vprintf(const char *, void *);
@ -55,8 +59,7 @@ int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
}
/// Current indentation level for the function trace. Only accessed by thread 0.
__attribute__((loader_uninitialized))
static uint32_t Level;
__attribute__((loader_uninitialized)) static uint32_t Level;
#pragma omp allocate(Level) allocator(omp_pteam_mem_alloc)
DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,

View File

@ -19,7 +19,7 @@
using namespace _OMP;
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
static void inititializeRuntime(bool IsSPMD) {
// Order is important here.

View File

@ -15,7 +15,7 @@
#include "Types.h"
#include "Utils.h"
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
@ -24,6 +24,23 @@ using namespace _OMP;
namespace _OMP {
namespace impl {
// Forward declarations defined to be defined for AMDGCN and NVPTX.
const llvm::omp::GV &getGridValue();
uint32_t getGridDim(uint32_t n, uint16_t d);
uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size,
uint16_t group_size);
uint32_t getNumHardwareThreadsInBlock();
LaneMaskTy activemask();
LaneMaskTy lanemaskLT();
LaneMaskTy lanemaskGT();
uint32_t getThreadIdInWarp();
uint32_t getThreadIdInBlock();
uint32_t getKernelSize();
uint32_t getBlockId();
uint32_t getNumberOfBlocks();
uint32_t getWarpId();
uint32_t getNumberOfWarpsInBlock();
/// AMDGCN Implementation
///
///{

View File

@ -13,11 +13,15 @@
#include "Debug.h"
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
namespace _OMP {
namespace impl {
double getWTick();
double getWTime();
/// AMDGCN Implementation
///
///{

View File

@ -42,7 +42,7 @@
using namespace _OMP;
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
namespace {

View File

@ -22,7 +22,7 @@ using namespace _OMP;
namespace {
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {

View File

@ -19,7 +19,7 @@
using namespace _OMP;
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
/// Memory implementation
///

View File

@ -19,7 +19,7 @@
#include "Types.h"
#include "Utils.h"
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
using namespace _OMP;
@ -63,6 +63,22 @@ uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
}
///}
// Forward declarations defined to be defined for AMDGCN and NVPTX.
uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering);
void namedBarrierInit();
void namedBarrier();
void fenceTeam(int Ordering);
void fenceKernel(int Ordering);
void fenceSystem(int Ordering);
void syncWarp(__kmpc_impl_lanemask_t);
void syncThreads();
void syncThreadsAligned() { syncThreads(); }
void unsetLock(omp_lock_t *);
int testLock(omp_lock_t *);
void initLock(omp_lock_t *);
void destroyLock(omp_lock_t *);
void setLock(omp_lock_t *);
/// AMDGCN Implementation
///
///{

View File

@ -20,7 +20,7 @@
using namespace _OMP;
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t,
uint64_t TaskSizeInclPrivateValues,

View File

@ -15,7 +15,7 @@
#include "Interface.h"
#include "Mapping.h"
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
using namespace _OMP;
@ -32,6 +32,9 @@ __attribute__((used, retain, weak, optnone, cold)) void keepAlive() {
namespace impl {
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
/// AMDGCN Implementation
///
///{
@ -72,6 +75,10 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
#pragma omp end declare variant
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
/// AMDGCN Implementation
///
///{

View File

@ -43,7 +43,7 @@ struct DynamicScheduleTracker {
#define NOT_FINISHED 1
#define LAST_CHUNK 2
#pragma omp declare target
#pragma omp begin declare target device_type(nohost)
// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);