[libomptarget] Implement host plugin for amdgpu

Replacement for D71384. The primary difference is inlining the dependency on
atmi, followed by extensive simplification and bugfixes. This is the latest
version from https://github.com/ROCm-Developer-Tools/amd-llvm-project/tree/aomp12
with minor patches and a rename from hsa to amdgpu, on the basis that this
can't be used by other implementations of hsa without additional work.

This will not build unless the ROCM_DIR variable is passed, so it won't break
other builds. That variable is used to locate two amdgpu-specific libraries
that ship as part of rocm:
libhsakmt at https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface
libhsa-runtime64 at https://github.com/RadeonOpenCompute/ROCR-Runtime
These libraries build from source. The build scripts in those repos are for
shared libraries, but can be adapted to statically link both into this plugin.

There are caveats:
- This works well enough to run various tests and benchmarks, and will be used
  to support the current clang bring up.
- It is adequately thread safe for the above, but there will be races remaining.
- It is not stylistically correct for llvm, though it has had clang-format run.
- It has suboptimal memory management and locking strategies.
- The debug printing / error handling is inconsistent.

I would like to contribute this pretty much as-is and then improve it in-tree.
This would be advantageous because the aomp12 branch that was in use for fixing
this codebase has just been joined with the amd internal rocm dev process.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D85742
commit d0b312955f (parent a49b05bb61)
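Build note (illustrative, not part of the patch): a minimal sketch of configuring a
standalone build of the openmp runtime so that this plugin is enabled. The source
path, install prefix /opt/rocm, and the Ninja generator are assumptions; only
ROCM_DIR and the omptarget.rtl.amdgpu target come from the CMakeLists.txt added below.

  # Hypothetical configure step; adjust paths to your checkout and ROCm install.
  cmake -G Ninja ../llvm-project/openmp \
    -DCMAKE_BUILD_TYPE=Release \
    -DROCM_DIR=/opt/rocm        # used to locate libhsa-runtime64 and libhsakmt
  ninja omptarget.rtl.amdgpu    # builds the new amdgpu offloading plugin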
openmp/libomptarget/plugins/CMakeLists.txt
@@ -66,6 +66,7 @@ endif()
 endmacro()

 add_subdirectory(aarch64)
+add_subdirectory(amdgpu)
 add_subdirectory(cuda)
 add_subdirectory(ppc64)
 add_subdirectory(ppc64le)
openmp/libomptarget/plugins/amdgpu/CMakeLists.txt (new file)
@@ -0,0 +1,84 @@
##===----------------------------------------------------------------------===##
#
# The LLVM Compiler Infrastructure
#
# This file is dual licensed under the MIT and the University of Illinois Open
# Source Licenses. See LICENSE.txt for details.
#
##===----------------------------------------------------------------------===##
#
# Build a plugin for an AMDGPU machine if available.
#
##===----------------------------------------------------------------------===##

################################################################################

if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
  libomptarget_say("Not building AMDGPU plugin: LIBELF not found")
  return()
endif()

if(NOT ROCM_DIR)
  libomptarget_say("Not building AMDGPU plugin: ROCM_DIR is not set")
  return()
endif()

set(LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS ${ROCM_DIR}/hsa/include ${ROCM_DIR}/hsa/include/hsa)
set(LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${ROCM_DIR}/hsa/lib)
set(LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS ${ROCM_DIR}/lib)

mark_as_advanced(LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS)

if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
  libomptarget_say("Not building amdgpu plugin: only support amdgpu in Linux x86_64, ppc64le, or aarch64 hosts.")
  return()
endif()
libomptarget_say("Building amdgpu offloading plugin using ROCM_DIR = ${ROCM_DIR}")

libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS: ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}")
libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS: ${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS}")
libomptarget_say("LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS: ${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}")

################################################################################
# Define the suffix for the runtime messaging dumps.
add_definitions(-DTARGET_NAME=AMDGPU)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
  add_definitions(-DLITTLEENDIAN_CPU=1)
endif()

if(CMAKE_BUILD_TYPE MATCHES Debug)
  add_definitions(-DDEBUG)
endif()

include_directories(
  ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}
  ${CMAKE_CURRENT_SOURCE_DIR}/impl
)

add_library(omptarget.rtl.amdgpu SHARED
  impl/atmi.cpp
  impl/atmi_interop_hsa.cpp
  impl/data.cpp
  impl/machine.cpp
  impl/system.cpp
  impl/utils.cpp
  impl/msgpack.cpp
  src/rtl.cpp
)

# Install plugin under the lib destination folder.
# When we build for debug, OPENMP_LIBDIR_SUFFIX gets set to -debug
install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}")

target_link_libraries(
  omptarget.rtl.amdgpu
  -lpthread -ldl -Wl,-rpath,${OPENMP_INSTALL_LIBDIR}
  -L${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS} -L${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS} -lhsa-runtime64 -lhsakmt -Wl,-rpath,${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS},-rpath,${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}
  -lelf
  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
  "-Wl,-z,defs"
)

# Report to the parent scope that we are building a plugin for amdgpu
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp (new file)
@@ -0,0 +1,44 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#include "rt.h"
/*
 * Initialize/Finalize
 */
atmi_status_t atmi_init() { return core::Runtime::Initialize(); }

atmi_status_t atmi_finalize() { return core::Runtime::Finalize(); }

/*
 * Machine Info
 */
atmi_machine_t *atmi_machine_get_info() {
  return core::Runtime::GetMachineInfo();
}

/*
 * Modules
 */
atmi_status_t atmi_module_register_from_memory_to_place(
    void *module_bytes, size_t module_size, atmi_place_t place,
    atmi_status_t (*on_deserialized_data)(void *data, size_t size,
                                          void *cb_state),
    void *cb_state) {
  return core::Runtime::getInstance().RegisterModuleFromMemory(
      module_bytes, module_size, place, on_deserialized_data, cb_state);
}

/*
 * Data
 */
atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size) {
  return core::Runtime::Memcpy(dest, src, size);
}

atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }

atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {
  return core::Runtime::Malloc(ptr, size, place);
}
openmp/libomptarget/plugins/amdgpu/impl/atmi.h (new file)
@@ -0,0 +1,203 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef INCLUDE_ATMI_H_
#define INCLUDE_ATMI_H_

#define ROCM_VERSION_MAJOR 3
#define ROCM_VERSION_MINOR 2

/** \defgroup enumerations Enumerated Types
 * @{
 */

/**
 * @brief Status codes.
 */
typedef enum atmi_status_t {
  /**
   * The function has been executed successfully.
   */
  ATMI_STATUS_SUCCESS = 0,
  /**
   * An undocumented error has occurred.
   */
  ATMI_STATUS_UNKNOWN = 1,
  /**
   * A generic error has occurred.
   */
  ATMI_STATUS_ERROR = 2,
} atmi_status_t;

/**
 * @brief Device Types.
 */
typedef enum atmi_devtype_s {
  ATMI_DEVTYPE_CPU = 0x0001,
  ATMI_DEVTYPE_iGPU = 0x0010,                               // Integrated GPU
  ATMI_DEVTYPE_dGPU = 0x0100,                               // Discrete GPU
  ATMI_DEVTYPE_GPU = ATMI_DEVTYPE_iGPU | ATMI_DEVTYPE_dGPU, // Any GPU
  ATMI_DEVTYPE_ALL = 0x111 // Union of all device types
} atmi_devtype_t;

/**
 * @brief Memory Access Type.
 */
typedef enum atmi_memtype_s {
  ATMI_MEMTYPE_FINE_GRAINED = 0,
  ATMI_MEMTYPE_COARSE_GRAINED = 1,
  ATMI_MEMTYPE_ANY
} atmi_memtype_t;

/**
 * @brief ATMI Memory Fences for Tasks.
 */
typedef enum atmi_task_fence_scope_s {
  /**
   * No memory fence applied; external fences have to be applied around the task
   * launch/completion.
   */
  ATMI_FENCE_SCOPE_NONE = 0,
  /**
   * The fence is applied to the device.
   */
  ATMI_FENCE_SCOPE_DEVICE = 1,
  /**
   * The fence is applied to the entire system.
   */
  ATMI_FENCE_SCOPE_SYSTEM = 2
} atmi_task_fence_scope_t;

/** @} */

/** \defgroup common Common ATMI Structures
 * @{
 */

/**
 * @brief ATMI Compute Place
 */
typedef struct atmi_place_s {
  /**
   * The node in a cluster where computation should occur.
   * Default is node_id = 0 for local computations.
   */
  unsigned int node_id;
  /**
   * Device type: CPU, GPU or DSP
   */
  atmi_devtype_t type;
  /**
   * The device ordinal number ordered by runtime; -1 for any
   */
  int device_id;
} atmi_place_t;

/**
 * @brief ATMI Memory Place
 */
typedef struct atmi_mem_place_s {
  /**
   * The node in a cluster where computation should occur.
   * Default is node_id = 0 for local computations.
   */
  unsigned int node_id;
  /**
   * Device type: CPU, GPU or DSP
   */
  atmi_devtype_t dev_type;
  /**
   * The device ordinal number ordered by runtime; -1 for any
   */
  int dev_id;
  // atmi_memtype_t mem_type; // Fine grained or Coarse grained
  /**
   * The memory space/region ordinal number ordered by runtime; -1 for any
   */
  int mem_id;
} atmi_mem_place_t;

/**
 * @brief ATMI Memory Space/region Structure
 */
typedef struct atmi_memory_s {
  /**
   * Memory capacity
   */
  unsigned long int capacity;
  /**
   * Memory type
   */
  atmi_memtype_t type;
} atmi_memory_t;

/**
 * @brief ATMI Device Structure
 */
typedef struct atmi_device_s {
  /**
   * Device type: CPU, GPU or DSP
   */
  atmi_devtype_t type;
  /**
   * The number of compute cores
   */
  unsigned int core_count;
  /**
   * The number of memory spaces/regions that are accessible
   * from this device
   */
  unsigned int memory_count;
  /**
   * Array of memory spaces/regions that are accessible
   * from this device.
   */
  atmi_memory_t *memories;
} atmi_device_t;

/**
 * @brief ATMI Machine Structure
 */
typedef struct atmi_machine_s {
  /**
   * The number of devices categorized by the device type
   */
  unsigned int device_count_by_type[ATMI_DEVTYPE_ALL];
  /**
   * The device structures categorized by the device type
   */
  atmi_device_t *devices_by_type[ATMI_DEVTYPE_ALL];
} atmi_machine_t;

// Below are some helper macros that can be used to setup
// some of the ATMI data structures.
#define ATMI_PLACE_CPU(node, cpu_id) \
  { .node_id = node, .type = ATMI_DEVTYPE_CPU, .device_id = cpu_id }
#define ATMI_PLACE_GPU(node, gpu_id) \
  { .node_id = node, .type = ATMI_DEVTYPE_GPU, .device_id = gpu_id }
#define ATMI_MEM_PLACE_CPU(node, cpu_id) \
  { \
    .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \
    .mem_id = -1 \
  }
#define ATMI_MEM_PLACE_GPU(node, gpu_id) \
  { \
    .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \
    .mem_id = -1 \
  }
#define ATMI_MEM_PLACE_CPU_MEM(node, cpu_id, cpu_mem_id) \
  { \
    .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \
    .mem_id = cpu_mem_id \
  }
#define ATMI_MEM_PLACE_GPU_MEM(node, gpu_id, gpu_mem_id) \
  { \
    .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \
    .mem_id = gpu_mem_id \
  }
#define ATMI_MEM_PLACE(d_type, d_id, m_id) \
  { .node_id = 0, .dev_type = d_type, .dev_id = d_id, .mem_id = m_id }

#endif // INCLUDE_ATMI_H_
openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp (new file)
@@ -0,0 +1,96 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#include "atmi_interop_hsa.h"
#include "internal.h"

using core::atl_is_atmi_initialized;

atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place,
                                               const char *symbol,
                                               void **var_addr,
                                               unsigned int *var_size) {
  /*
     // Typical usage:
     void *var_addr;
     size_t var_size;
     atmi_interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr,
                                      &var_size);
     atmi_memcpy(host_add, var_addr, var_size);
  */

  if (!atl_is_atmi_initialized())
    return ATMI_STATUS_ERROR;
  atmi_machine_t *machine = atmi_machine_get_info();
  if (!symbol || !var_addr || !var_size || !machine)
    return ATMI_STATUS_ERROR;
  if (place.dev_id < 0 ||
      place.dev_id >= machine->device_count_by_type[place.dev_type])
    return ATMI_STATUS_ERROR;

  // get the symbol info
  std::string symbolStr = std::string(symbol);
  if (SymbolInfoTable[place.dev_id].find(symbolStr) !=
      SymbolInfoTable[place.dev_id].end()) {
    atl_symbol_info_t info = SymbolInfoTable[place.dev_id][symbolStr];
    *var_addr = reinterpret_cast<void *>(info.addr);
    *var_size = info.size;
    return ATMI_STATUS_SUCCESS;
  } else {
    *var_addr = NULL;
    *var_size = 0;
    return ATMI_STATUS_ERROR;
  }
}

atmi_status_t atmi_interop_hsa_get_kernel_info(
    atmi_mem_place_t place, const char *kernel_name,
    hsa_executable_symbol_info_t kernel_info, uint32_t *value) {
  /*
     // Typical usage:
     uint32_t value;
     atmi_interop_hsa_get_kernel_addr(gpu_place, "kernel_name",
                  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
                  &val);
  */

  if (!atl_is_atmi_initialized())
    return ATMI_STATUS_ERROR;
  atmi_machine_t *machine = atmi_machine_get_info();
  if (!kernel_name || !value || !machine)
    return ATMI_STATUS_ERROR;
  if (place.dev_id < 0 ||
      place.dev_id >= machine->device_count_by_type[place.dev_type])
    return ATMI_STATUS_ERROR;

  atmi_status_t status = ATMI_STATUS_SUCCESS;
  // get the kernel info
  std::string kernelStr = std::string(kernel_name);
  if (KernelInfoTable[place.dev_id].find(kernelStr) !=
      KernelInfoTable[place.dev_id].end()) {
    atl_kernel_info_t info = KernelInfoTable[place.dev_id][kernelStr];
    switch (kernel_info) {
    case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE:
      *value = info.group_segment_size;
      break;
    case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE:
      *value = info.private_segment_size;
      break;
    case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE:
      // return the size for non-implicit args
      *value = info.kernel_segment_size - sizeof(atmi_implicit_args_t);
      break;
    default:
      *value = 0;
      status = ATMI_STATUS_ERROR;
      break;
    }
  } else {
    *value = 0;
    status = ATMI_STATUS_ERROR;
  }

  return status;
}
openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h (new file)
@@ -0,0 +1,86 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef INCLUDE_ATMI_INTEROP_HSA_H_
#define INCLUDE_ATMI_INTEROP_HSA_H_

#include "atmi_runtime.h"
#include "hsa.h"
#include "hsa_ext_amd.h"

#ifdef __cplusplus
extern "C" {
#endif
/** \defgroup interop_hsa_functions ATMI-HSA Interop
 * @{
 */

/**
 * @brief Get the device address and size of an HSA global symbol
 *
 * @detail Use this function to query the device address and size of an HSA
 * global symbol.
 * The symbol can be set by the compiler or by the application writer in a
 * language-specific manner. This function is meaningful only after calling one
 * of the @p atmi_module_register functions.
 *
 * @param[in] place The ATMI memory place
 *
 * @param[in] symbol Pointer to a non-NULL global symbol name
 *
 * @param[in] var_addr Pointer to a non-NULL @p void* variable that will
 * hold the device address of the global symbol object.
 *
 * @param[in] var_size Pointer to a non-NULL @p uint variable that will
 * hold the size of the global symbol object.
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size point to
 * an invalid location in the current node, or if ATMI is not initialized.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 */
atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place,
                                               const char *symbol,
                                               void **var_addr,
                                               unsigned int *var_size);

/**
 * @brief Get the HSA-specific kernel info from a kernel name
 *
 * @detail Use this function to query the HSA-specific kernel info from the
 * kernel name.
 * This function is meaningful only after calling one
 * of the @p atmi_module_register functions.
 *
 * @param[in] place The ATMI memory place
 *
 * @param[in] kernel_name Pointer to a char array with the kernel name
 *
 * @param[in] info The different possible kernel properties
 *
 * @param[in] value Pointer to a non-NULL @p uint variable that will
 * hold the return value of the kernel property.
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size point to
 * an invalid location in the current node, or if ATMI is not initialized.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 */
atmi_status_t atmi_interop_hsa_get_kernel_info(
    atmi_mem_place_t place, const char *kernel_name,
    hsa_executable_symbol_info_t info, uint32_t *value);
/** @} */

#ifdef __cplusplus
}
#endif

#endif // INCLUDE_ATMI_INTEROP_HSA_H_
openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h (new file)
@@ -0,0 +1,39 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef INCLUDE_ATMI_KL_H_
#define INCLUDE_ATMI_KL_H_

#include "atmi.h"
#ifdef __OPENCL_C_VERSION__
#include "ockl_hsa.h"
#endif
#define MAX_NUM_KERNELS (1024 * 16)

typedef struct atmi_implicit_args_s {
  unsigned long offset_x;
  unsigned long offset_y;
  unsigned long offset_z;
  unsigned long hostcall_ptr;
  char num_gpu_queues;
  unsigned long gpu_queue_ptr;
  char num_cpu_queues;
  unsigned long cpu_worker_signals;
  unsigned long cpu_queue_ptr;
  unsigned long kernarg_template_ptr;
  // possible TODO: send signal pool to be used by DAGs on GPU
  // uint8_t num_signals;
  // unsigned long signal_ptr;
} atmi_implicit_args_t;

typedef struct atmi_kernel_enqueue_template_s {
  unsigned long kernel_handle;
  hsa_kernel_dispatch_packet_t k_packet;
  hsa_agent_dispatch_packet_t a_packet;
  unsigned long kernarg_segment_size;
  void *kernarg_regions;
} atmi_kernel_enqueue_template_t;

#endif // INCLUDE_ATMI_KL_H_
openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h (new file)
@@ -0,0 +1,193 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef INCLUDE_ATMI_RUNTIME_H_
#define INCLUDE_ATMI_RUNTIME_H_

#include "atmi.h"
#include <inttypes.h>
#include <stdlib.h>
#ifndef __cplusplus
#include <stdbool.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

/** \defgroup context_functions ATMI Context Setup and Finalize
 * @{
 */
/**
 * @brief Initialize the ATMI runtime environment.
 *
 * @detail All ATMI runtime functions will fail if this function is not called
 * at least once. The user may initialize different device types at different
 * regions in the program for optimization purposes.
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR The function encountered errors.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 */
atmi_status_t atmi_init();

/**
 * @brief Finalize the ATMI runtime environment.
 *
 * @detail ATMI runtime functions will fail if called after finalize.
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR The function encountered errors.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 */
atmi_status_t atmi_finalize();
/** @} */

/** \defgroup module_functions ATMI Module
 * @{
 */

/**
 * @brief Register the ATMI code module from memory on to a specific place
 * (device).
 *
 * @detail Currently, only GPU devices need explicit module registration because
 * of their specific ISAs that require a separate compilation phase. On the
 * other hand, CPU devices execute regular x86 functions that are compiled with
 * the host program.
 *
 * @param[in] module_bytes A memory region that contains the GPU modules
 * targeting ::AMDGCN platform types. Value cannot be NULL.
 *
 * @param[in] module_size Size of module region
 *
 * @param[in] place Denotes the execution place (device) on which the module
 * should be registered and loaded.
 *
 * @param[in] on_deserialized_data Callback run on deserialized code object,
 * before loading it
 *
 * @param[in] cb_state void* passed to on_deserialized_data callback
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR The function encountered errors.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 *
 */
atmi_status_t atmi_module_register_from_memory_to_place(
    void *module_bytes, size_t module_size, atmi_place_t place,
    atmi_status_t (*on_deserialized_data)(void *data, size_t size,
                                          void *cb_state),
    void *cb_state);

/** @} */

/** \defgroup machine ATMI Machine
 * @{
 */
/**
 * @brief ATMI's device discovery function to get the current machine's
 * topology.
 *
 * @detail The @p atmi_machine_t structure is a tree-based representation of the
 * compute and memory elements in the current node. Once ATMI is initialized,
 * this function can be called to retrieve the pointer to this global structure.
 *
 * @return Returns a pointer to a global structure of type @p atmi_machine_t.
 * Returns NULL if ATMI is not initialized.
 */
atmi_machine_t *atmi_machine_get_info();
/** @} */

/** \defgroup memory_functions ATMI Data Management
 * @{
 */
/**
 * @brief Allocate memory from the specified memory place.
 *
 * @detail This function allocates memory from the specified memory place. If
 * the memory place belongs primarily to the CPU, then the memory will be
 * accessible by other GPUs and CPUs in the system. If the memory place belongs
 * primarily to a GPU, then it cannot be accessed by other devices in the system.
 *
 * @param[in] ptr The pointer to the memory that will be allocated.
 *
 * @param[in] size The size of the allocation in bytes.
 *
 * @param[in] place The memory place in the system to perform the allocation.
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR The function encountered errors.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 *
 */
atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);

/**
 * @brief Frees memory that was previously allocated.
 *
 * @detail This function frees memory that was previously allocated by calling
 * @p atmi_malloc. It throws an error otherwise. It is illegal to access a
 * pointer after a call to this function.
 *
 * @param[in] ptr The pointer to the memory that has to be freed.
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR The function encountered errors.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 *
 */
atmi_status_t atmi_free(void *ptr);

/**
 * @brief Synchronously copy memory from the source to destination memory
 * locations.
 *
 * @detail This function assumes that the source and destination regions are
 * non-overlapping. The runtime determines the memory place of the source and
 * the destination and executes the appropriate optimized data movement
 * methodology.
 *
 * @param[in] dest The destination pointer previously allocated by a system
 * allocator or @p atmi_malloc.
 *
 * @param[in] src The source pointer previously allocated by a system
 * allocator or @p atmi_malloc.
 *
 * @param[in] size The size of the data to be copied in bytes.
 *
 * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
 *
 * @retval ::ATMI_STATUS_ERROR The function encountered errors.
 *
 * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
 *
 */
atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size);

/** @} */

/** \defgroup cpu_dev_runtime ATMI CPU Device Runtime
 * @{
 */

#ifdef __cplusplus
}
#endif

#endif // INCLUDE_ATMI_RUNTIME_H_
openmp/libomptarget/plugins/amdgpu/impl/data.cpp (new file)
@@ -0,0 +1,203 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#include "data.h"
#include "atmi_runtime.h"
#include "internal.h"
#include "machine.h"
#include "rt.h"
#include <cassert>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <thread>
#include <vector>

using core::TaskImpl;
extern ATLMachine g_atl_machine;
extern hsa_signal_t IdentityCopySignal;

namespace core {
ATLPointerTracker g_data_map; // Track all am pointer allocations.
void allow_access_to_all_gpu_agents(void *ptr);

const char *getPlaceStr(atmi_devtype_t type) {
  switch (type) {
  case ATMI_DEVTYPE_CPU:
    return "CPU";
  case ATMI_DEVTYPE_GPU:
    return "GPU";
  default:
    return NULL;
  }
}

std::ostream &operator<<(std::ostream &os, const ATLData *ap) {
  atmi_mem_place_t place = ap->place();
  os << " devicePointer:" << ap->ptr() << " sizeBytes:" << ap->size()
     << " place:(" << getPlaceStr(place.dev_type) << ", " << place.dev_id
     << ", " << place.mem_id << ")";
  return os;
}

void ATLPointerTracker::insert(void *pointer, ATLData *p) {
  std::lock_guard<std::mutex> l(mutex_);

  DEBUG_PRINT("insert: %p + %zu\n", pointer, p->size());
  tracker_.insert(std::make_pair(ATLMemoryRange(pointer, p->size()), p));
}

void ATLPointerTracker::remove(void *pointer) {
  std::lock_guard<std::mutex> l(mutex_);
  DEBUG_PRINT("remove: %p\n", pointer);
  tracker_.erase(ATLMemoryRange(pointer, 1));
}

ATLData *ATLPointerTracker::find(const void *pointer) {
  std::lock_guard<std::mutex> l(mutex_);
  ATLData *ret = NULL;
  auto iter = tracker_.find(ATLMemoryRange(pointer, 1));
  DEBUG_PRINT("find: %p\n", pointer);
  if (iter != tracker_.end()) // found
    ret = iter->second;
  return ret;
}

ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) {
  int dev_id = place.dev_id;
  switch (place.dev_type) {
  case ATMI_DEVTYPE_CPU:
    return g_atl_machine.processors<ATLCPUProcessor>()[dev_id];
  case ATMI_DEVTYPE_GPU:
    return g_atl_machine.processors<ATLGPUProcessor>()[dev_id];
  }
}

static hsa_agent_t get_mem_agent(atmi_mem_place_t place) {
  return get_processor_by_mem_place(place).agent();
}

hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) {
  ATLProcessor &proc = get_processor_by_mem_place(place);
  return get_memory_pool(proc, place.mem_id);
}

void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) {
  ATLData *data = new ATLData(ptr, size, place);
  g_data_map.insert(ptr, data);
  if (place.dev_type == ATMI_DEVTYPE_CPU)
    allow_access_to_all_gpu_agents(ptr);
  // TODO(ashwinma): what if one GPU wants to access another GPU?
}

atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) {
  atmi_status_t ret = ATMI_STATUS_SUCCESS;
  hsa_amd_memory_pool_t pool = get_memory_pool_by_mem_place(place);
  hsa_status_t err = hsa_amd_memory_pool_allocate(pool, size, 0, ptr);
  ErrorCheck(atmi_malloc, err);
  DEBUG_PRINT("Malloced [%s %d] %p\n",
              place.dev_type == ATMI_DEVTYPE_CPU ? "CPU" : "GPU", place.dev_id,
              *ptr);
  if (err != HSA_STATUS_SUCCESS)
    ret = ATMI_STATUS_ERROR;

  register_allocation(*ptr, size, place);

  return ret;
}

atmi_status_t Runtime::Memfree(void *ptr) {
  atmi_status_t ret = ATMI_STATUS_SUCCESS;
  hsa_status_t err;
  ATLData *data = g_data_map.find(ptr);
  if (!data)
    ErrorCheck(Checking pointer info userData,
               HSA_STATUS_ERROR_INVALID_ALLOCATION);

  g_data_map.remove(ptr);
  delete data;

  err = hsa_amd_memory_pool_free(ptr);
  ErrorCheck(atmi_free, err);
  DEBUG_PRINT("Freed %p\n", ptr);

  if (err != HSA_STATUS_SUCCESS || !data)
    ret = ATMI_STATUS_ERROR;
  return ret;
}

static hsa_status_t invoke_hsa_copy(void *dest, const void *src, size_t size,
                                    hsa_agent_t agent) {
  // TODO: Use thread safe signal
  hsa_signal_store_release(IdentityCopySignal, 1);

  hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
                                               NULL, IdentityCopySignal);
  ErrorCheck(Copy async between memory pools, err);

  // TODO: async reports errors in the signal, use NE 1
  hsa_signal_wait_acquire(IdentityCopySignal, HSA_SIGNAL_CONDITION_EQ, 0,
                          UINT64_MAX, ATMI_WAIT_STATE);

  return err;
}

atmi_status_t Runtime::Memcpy(void *dest, const void *src, size_t size) {
  atmi_status_t ret;
  hsa_status_t err;
  ATLData *src_data = g_data_map.find(src);
  ATLData *dest_data = g_data_map.find(dest);
  atmi_mem_place_t cpu = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
  void *temp_host_ptr;

  if (src_data && !dest_data) {
    // Copy from device to scratch to host
    hsa_agent_t agent = get_mem_agent(src_data->place());
    DEBUG_PRINT("Memcpy D2H device agent: %lu\n", agent.handle);
    ret = atmi_malloc(&temp_host_ptr, size, cpu);
    if (ret != ATMI_STATUS_SUCCESS) {
      return ret;
    }

    err = invoke_hsa_copy(temp_host_ptr, src, size, agent);
    if (err != HSA_STATUS_SUCCESS) {
      return ATMI_STATUS_ERROR;
    }

    memcpy(dest, temp_host_ptr, size);

  } else if (!src_data && dest_data) {
    // Copy from host to scratch to device
    hsa_agent_t agent = get_mem_agent(dest_data->place());
    DEBUG_PRINT("Memcpy H2D device agent: %lu\n", agent.handle);
    ret = atmi_malloc(&temp_host_ptr, size, cpu);
    if (ret != ATMI_STATUS_SUCCESS) {
      return ret;
    }

    memcpy(temp_host_ptr, src, size);

    DEBUG_PRINT("Memcpy device agent: %lu\n", agent.handle);
    err = invoke_hsa_copy(dest, temp_host_ptr, size, agent);

  } else if (!src_data && !dest_data) {
    DEBUG_PRINT("atmi_memcpy invoked without metadata\n");
    // would be host to host, just call memcpy, or missing metadata
    return ATMI_STATUS_ERROR;
  } else {
    DEBUG_PRINT("atmi_memcpy unimplemented device to device copy\n");
    return ATMI_STATUS_ERROR;
  }

  ret = atmi_free(temp_host_ptr);

  if (err != HSA_STATUS_SUCCESS || ret != ATMI_STATUS_SUCCESS)
    ret = ATMI_STATUS_ERROR;
  return ret;
}

} // namespace core
openmp/libomptarget/plugins/amdgpu/impl/data.h (new file)
@@ -0,0 +1,83 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef SRC_RUNTIME_INCLUDE_DATA_H_
#define SRC_RUNTIME_INCLUDE_DATA_H_
#include "atmi.h"
#include <hsa.h>
#include <map>
#include <mutex>
#include <stdio.h>
#include <stdlib.h>
// we maintain our own mapping of device addr to a user specified data object
// in order to work around a (possibly historic) bug in ROCr's
// hsa_amd_pointer_info_set_userdata for variable symbols
// this is expected to be temporary

namespace core {
// Internal representation of any data that is created and managed by ATMI.
// Data can be located on any device memory or host memory.
class ATLData {
public:
  ATLData(void *ptr, size_t size, atmi_mem_place_t place)
      : ptr_(ptr), size_(size), place_(place) {}

  void *ptr() const { return ptr_; }
  size_t size() const { return size_; }
  atmi_mem_place_t place() const { return place_; }

private:
  void *ptr_;
  size_t size_;
  atmi_mem_place_t place_;
};

//---
struct ATLMemoryRange {
  const void *base_pointer;
  const void *end_pointer;
  ATLMemoryRange(const void *bp, size_t size_bytes)
      : base_pointer(bp),
        end_pointer(reinterpret_cast<const unsigned char *>(bp) + size_bytes -
                    1) {}
};

// Functor to compare ranges:
struct ATLMemoryRangeCompare {
  // Return true if LHS range is less than RHS - used to order the ranges
  bool operator()(const ATLMemoryRange &lhs, const ATLMemoryRange &rhs) const {
    return lhs.end_pointer < rhs.base_pointer;
  }
};

//-------------------------------------------------------------------------------------------------
// This structure tracks information for each pointer.
// Uses memory-range-based lookups - so pointers that exist anywhere in the
// range of hostPtr + size
// will find the associated ATLPointerInfo.
// The insertions and lookups use a self-balancing binary tree and should
// support O(logN) lookup speed.
// The structure is thread-safe - writers obtain a mutex before modifying the
// tree. Multiple simultaneous readers are supported.
class ATLPointerTracker {
  typedef std::map<ATLMemoryRange, ATLData *, ATLMemoryRangeCompare>
      MapTrackerType;

public:
  void insert(void *pointer, ATLData *data);
  void remove(void *pointer);
  ATLData *find(const void *pointer);

private:
  MapTrackerType tracker_;
  std::mutex mutex_;
};

extern ATLPointerTracker g_data_map; // Track all am pointer allocations.

enum class Direction { ATMI_H2D, ATMI_D2H, ATMI_D2D, ATMI_H2H };

} // namespace core
#endif // SRC_RUNTIME_INCLUDE_DATA_H_
openmp/libomptarget/plugins/amdgpu/impl/internal.h (new file)
@@ -0,0 +1,266 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_
#define SRC_RUNTIME_INCLUDE_INTERNAL_H_
#include <inttypes.h>
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <atomic>
#include <cstring>
#include <deque>
#include <map>
#include <queue>
#include <string>
#include <utility>
#include <vector>

#include "hsa.h"
#include "hsa_ext_amd.h"
#include "hsa_ext_finalize.h"

#include "atmi.h"
#include "atmi_runtime.h"
#include "rt.h"

#define MAX_NUM_KERNELS (1024 * 16)

typedef struct atmi_implicit_args_s {
  unsigned long offset_x;
  unsigned long offset_y;
  unsigned long offset_z;
  unsigned long hostcall_ptr;
  char num_gpu_queues;
  unsigned long gpu_queue_ptr;
  char num_cpu_queues;
  unsigned long cpu_worker_signals;
  unsigned long cpu_queue_ptr;
  unsigned long kernarg_template_ptr;
} atmi_implicit_args_t;

#ifdef __cplusplus
extern "C" {
#endif

#define check(msg, status) \
  if (status != HSA_STATUS_SUCCESS) { \
    printf("%s failed.\n", #msg); \
    exit(1); \
  }

#ifdef DEBUG
#define DEBUG_PRINT(fmt, ...) \
  if (core::Runtime::getInstance().getDebugMode()) { \
    fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__); \
  }
#else
#define DEBUG_PRINT(...) \
  do { \
  } while (false)
#endif

#ifndef HSA_RUNTIME_INC_HSA_H_
typedef struct hsa_signal_s {
  uint64_t handle;
} hsa_signal_t;
#endif

/* All global values go in this global structure */
typedef struct atl_context_s {
  bool struct_initialized;
  bool g_hsa_initialized;
  bool g_gpu_initialized;
  bool g_tasks_initialized;
} atl_context_t;
extern atl_context_t atlc;
extern atl_context_t *atlc_p;

#ifdef __cplusplus
}
#endif

/* ---------------------------------------------------------------------------------
 * Simulated CPU Data Structures and API
 * ---------------------------------------------------------------------------------
 */

#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED

// ---------------------- Kernel Start -------------
typedef struct atl_kernel_info_s {
  uint64_t kernel_object;
  uint32_t group_segment_size;
  uint32_t private_segment_size;
  uint32_t kernel_segment_size;
  uint32_t num_args;
  std::vector<uint64_t> arg_alignments;
  std::vector<uint64_t> arg_offsets;
  std::vector<uint64_t> arg_sizes;
} atl_kernel_info_t;

typedef struct atl_symbol_info_s {
  uint64_t addr;
  uint32_t size;
} atl_symbol_info_t;

extern std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
extern std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

// ---------------------- Kernel End -------------

extern struct timespec context_init_time;

namespace core {
class TaskgroupImpl;
class TaskImpl;
class Kernel;
class KernelImpl;
} // namespace core

struct SignalPoolT {
  SignalPoolT() {
    // If no signals are created, and none can be created later,
    // will ultimately fail at pop()

    unsigned N = 1024; // default max pool size from atmi
    for (unsigned i = 0; i < N; i++) {
      hsa_signal_t new_signal;
      hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
      if (err != HSA_STATUS_SUCCESS) {
        break;
      }
      state.push(new_signal);
    }
    DEBUG_PRINT("Signal Pool Initial Size: %lu\n", state.size());
  }
  SignalPoolT(const SignalPoolT &) = delete;
  SignalPoolT(SignalPoolT &&) = delete;
  ~SignalPoolT() {
    size_t N = state.size();
    for (size_t i = 0; i < N; i++) {
      hsa_signal_t signal = state.front();
      state.pop();
      hsa_status_t rc = hsa_signal_destroy(signal);
      if (rc != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("Signal pool destruction failed\n");
      }
    }
  }
  size_t size() {
    lock l(&mutex);
    return state.size();
  }
  void push(hsa_signal_t s) {
    lock l(&mutex);
    state.push(s);
  }
  hsa_signal_t pop(void) {
    lock l(&mutex);
    if (!state.empty()) {
      hsa_signal_t res = state.front();
      state.pop();
      return res;
    }

    // Pool empty, attempt to create another signal
    hsa_signal_t new_signal;
    hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
    if (err == HSA_STATUS_SUCCESS) {
      return new_signal;
    }

    // Fail
    return {0};
  }

private:
  static pthread_mutex_t mutex;
  std::queue<hsa_signal_t> state;
  struct lock {
    lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
    ~lock() { pthread_mutex_unlock(m); }
    pthread_mutex_t *m;
  };
};

extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;

namespace core {
atmi_status_t atl_init_gpu_context();

hsa_status_t init_hsa();
hsa_status_t finalize_hsa();
/*
 * Generic utils
 */
template <typename T> inline T alignDown(T value, size_t alignment) {
  return (T)(value & ~(alignment - 1));
}

template <typename T> inline T *alignDown(T *value, size_t alignment) {
  return reinterpret_cast<T *>(alignDown((intptr_t)value, alignment));
}

template <typename T> inline T alignUp(T value, size_t alignment) {
  return alignDown((T)(value + alignment - 1), alignment);
}

template <typename T> inline T *alignUp(T *value, size_t alignment) {
  return reinterpret_cast<T *>(
      alignDown((intptr_t)(value + alignment - 1), alignment));
}

extern void register_allocation(void *addr, size_t size,
                                atmi_mem_place_t place);
extern hsa_amd_memory_pool_t
get_memory_pool_by_mem_place(atmi_mem_place_t place);
extern bool atl_is_atmi_initialized();

bool handle_group_signal(hsa_signal_value_t value, void *arg);

void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest);
uint16_t
create_header(hsa_packet_type_t type, int barrier,
              atmi_task_fence_scope_t acq_fence = ATMI_FENCE_SCOPE_SYSTEM,
              atmi_task_fence_scope_t rel_fence = ATMI_FENCE_SCOPE_SYSTEM);

void allow_access_to_all_gpu_agents(void *ptr);
} // namespace core

const char *get_error_string(hsa_status_t err);
const char *get_atmi_error_string(atmi_status_t err);

#define ATMIErrorCheck(msg, status) \
  if (status != ATMI_STATUS_SUCCESS) { \
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \
           get_atmi_error_string(status)); \
    exit(1); \
  } else { \
    /* printf("%s succeeded.\n", #msg);*/ \
  }

#define ErrorCheck(msg, status) \
  if (status != HSA_STATUS_SUCCESS) { \
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \
           get_error_string(status)); \
    exit(1); \
  } else { \
    /* printf("%s succeeded.\n", #msg);*/ \
  }

#define ErrorCheckAndContinue(msg, status) \
  if (status != HSA_STATUS_SUCCESS) { \
    DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \
                get_error_string(status)); \
    continue; \
  } else { \
    /* printf("%s succeeded.\n", #msg);*/ \
  }

#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_
openmp/libomptarget/plugins/amdgpu/impl/machine.cpp (new file)
@@ -0,0 +1,128 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#include "machine.h"
#include "atmi_runtime.h"
#include "internal.h"
#include <cassert>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

extern ATLMachine g_atl_machine;
extern hsa_region_t atl_cpu_kernarg_region;

void *ATLMemory::alloc(size_t sz) {
  void *ret;
  hsa_status_t err = hsa_amd_memory_pool_allocate(memory_pool_, sz, 0, &ret);
  ErrorCheck(Allocate from memory pool, err);
  return ret;
}

void ATLMemory::free(void *ptr) {
  hsa_status_t err = hsa_amd_memory_pool_free(ptr);
  ErrorCheck(Free memory pool, err);
}

void ATLProcessor::addMemory(const ATLMemory &mem) {
  for (auto &mem_obj : memories_) {
    // if the memory already exists, then just return
    if (mem.memory().handle == mem_obj.memory().handle)
      return;
  }
  memories_.push_back(mem);
}

const std::vector<ATLMemory> &ATLProcessor::memories() const {
  return memories_;
}

template <> std::vector<ATLCPUProcessor> &ATLMachine::processors() {
  return cpu_processors_;
}

template <> std::vector<ATLGPUProcessor> &ATLMachine::processors() {
  return gpu_processors_;
}

hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc,
                                      const int mem_id) {
  hsa_amd_memory_pool_t pool;
  const std::vector<ATLMemory> &mems = proc.memories();
  assert(mems.size() && mem_id >= 0 && mem_id < mems.size() &&
         "Invalid memory pools for this processor");
  pool = mems[mem_id].memory();
  return pool;
}

template <> void ATLMachine::addProcessor(const ATLCPUProcessor &p) {
  cpu_processors_.push_back(p);
}

template <> void ATLMachine::addProcessor(const ATLGPUProcessor &p) {
  gpu_processors_.push_back(p);
}

void callbackQueue(hsa_status_t status, hsa_queue_t *source, void *data) {
  if (status != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "[%s:%d] GPU error in queue %p %d\n", __FILE__, __LINE__,
            source, status);
    abort();
  }
}

void ATLGPUProcessor::createQueues(const int count) {
  int *num_cus = reinterpret_cast<int *>(calloc(count, sizeof(int)));

  hsa_status_t err;
  /* Query the maximum size of the queue. */
  uint32_t queue_size = 0;
  err = hsa_agent_get_info(agent_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
  ErrorCheck(Querying the agent maximum queue size, err);
  if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
    queue_size = core::Runtime::getInstance().getMaxQueueSize();
  }

  /* Create queues for each device. */
  int qid;
  for (qid = 0; qid < count; qid++) {
    hsa_queue_t *this_Q;
    err =
        hsa_queue_create(agent_, queue_size, HSA_QUEUE_TYPE_MULTI,
                         callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &this_Q);
    ErrorCheck(Creating the queue, err);
    err = hsa_amd_profiling_set_profiler_enabled(this_Q, 1);
    ErrorCheck(Enabling profiling support, err);

    queues_.push_back(this_Q);

    DEBUG_PRINT("Queue[%d]: %p\n", qid, this_Q);
  }

  free(num_cus);
}

void ATLCPUProcessor::createQueues(const int) {}

void ATLProcessor::destroyQueues() {
  for (auto queue : queues_) {
    hsa_status_t err = hsa_queue_destroy(queue);
    ErrorCheck(Destroying the queue, err);
  }
}

int ATLProcessor::num_cus() const {
  hsa_status_t err;
  /* Query the number of compute units. */
  uint32_t num_cus = 0;
  err = hsa_agent_get_info(
      agent_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
      &num_cus);
  ErrorCheck(Querying the agent number of compute units, err);

  return num_cus;
}
openmp/libomptarget/plugins/amdgpu/impl/machine.h (new file)
@@ -0,0 +1,109 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef SRC_RUNTIME_INCLUDE_MACHINE_H_
#define SRC_RUNTIME_INCLUDE_MACHINE_H_
#include "atmi.h"
#include "internal.h"
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <vector>

class ATLMemory;

class ATLProcessor {
public:
  explicit ATLProcessor(hsa_agent_t agent,
                        atmi_devtype_t type = ATMI_DEVTYPE_ALL)
      : agent_(agent), type_(type) {
    queues_.clear();
    memories_.clear();
  }
  void addMemory(const ATLMemory &p);
  hsa_agent_t agent() const { return agent_; }
  // TODO(ashwinma): Do we need this or are we building the machine structure
  // just once in the program?
  // void removeMemory(ATLMemory &p);
  const std::vector<ATLMemory> &memories() const;
  atmi_devtype_t type() const { return type_; }

  virtual void createQueues(const int count) {}
  virtual void destroyQueues();
  std::vector<hsa_queue_t *> queues() const { return queues_; }

  int num_cus() const;

protected:
  hsa_agent_t agent_;
  atmi_devtype_t type_;
  std::vector<hsa_queue_t *> queues_;
  std::vector<ATLMemory> memories_;
};

class ATLCPUProcessor : public ATLProcessor {
public:
  explicit ATLCPUProcessor(hsa_agent_t agent)
      : ATLProcessor(agent, ATMI_DEVTYPE_CPU) {}
  void createQueues(const int count);
};

class ATLGPUProcessor : public ATLProcessor {
public:
  explicit ATLGPUProcessor(hsa_agent_t agent,
                           atmi_devtype_t type = ATMI_DEVTYPE_dGPU)
      : ATLProcessor(agent, type) {}
  void createQueues(const int count);
};

class ATLMemory {
public:
  ATLMemory(hsa_amd_memory_pool_t pool, ATLProcessor p, atmi_memtype_t t)
      : memory_pool_(pool), processor_(p), type_(t) {}
  ATLProcessor &processor() { return processor_; }
  hsa_amd_memory_pool_t memory() const { return memory_pool_; }

  atmi_memtype_t type() const { return type_; }

  void *alloc(size_t s);
  void free(void *p);

private:
  hsa_amd_memory_pool_t memory_pool_;
  ATLProcessor processor_;
  atmi_memtype_t type_;
};

class ATLMachine {
public:
  ATLMachine() {
    cpu_processors_.clear();
    gpu_processors_.clear();
  }
  template <typename T> void addProcessor(const T &p);
  template <typename T> std::vector<T> &processors();
  template <typename T> size_t processorCount() {
    return processors<T>().size();
  }

private:
  std::vector<ATLCPUProcessor> cpu_processors_;
  std::vector<ATLGPUProcessor> gpu_processors_;
};

hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc,
                                      const int mem_id);

extern ATLMachine g_atl_machine;
template <typename T> T &get_processor(atmi_place_t place) {
  int dev_id = place.device_id;
  if (dev_id == -1) {
    // user is asking runtime to pick a device
    // TODO(ashwinma): best device of this type? pick 0 for now
    dev_id = 0;
  }
  return g_atl_machine.processors<T>()[dev_id];
}

#endif // SRC_RUNTIME_INCLUDE_MACHINE_H_
@ -0,0 +1,264 @@
#include <cassert>
#include <cstdint>
#include <cstring>
#include <functional>
#include <string>

#include "msgpack.h"

namespace msgpack {

[[noreturn]] void internal_error() {
printf("internal error\n");
exit(1);
}

const char *type_name(type ty) {
switch (ty) {
#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
case NAME: \
return #NAME;
#include "msgpack.def"
#undef X
}
internal_error();
}

unsigned bytes_used_fixed(msgpack::type ty) {
using namespace msgpack;
switch (ty) {
#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
case NAME: \
return WIDTH;
#include "msgpack.def"
#undef X
}
internal_error();
}

msgpack::type parse_type(unsigned char x) {

#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
if (x >= LOWER && x <= UPPER) { \
return NAME; \
} else
#include "msgpack.def"
#undef X
{ internal_error(); }
}

template <typename T, typename R> R bitcast(T x) {
static_assert(sizeof(T) == sizeof(R), "");
R tmp;
memcpy(&tmp, &x, sizeof(T));
return tmp;
}
template int64_t bitcast<uint64_t, int64_t>(uint64_t);
} // namespace msgpack

// Helper functions for reading additional payload from the header
// Depending on the type, this can be a number of bytes, elements,
// key-value pairs or an embedded integer.
// Each takes a pointer to the start of the header and returns a uint64_t

namespace {
namespace payload {
uint64_t read_zero(const unsigned char *) { return 0; }

// Read the first byte and zero/sign extend it
uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; }
uint64_t read_embedded_s8(const unsigned char *start) {
int64_t res = msgpack::bitcast<uint8_t, int8_t>(start[0]);
return msgpack::bitcast<int64_t, uint64_t>(res);
}

// Read a masked part of the first byte
uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; }
uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; }
uint64_t read_via_mask_0x1f(const unsigned char *start) {
return *start & 0x1fu;
}

// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend
// Big endian format.
uint64_t read_size_field_u8(const unsigned char *from) {
from++;
return from[0];
}

// TODO: detect whether host is little endian or not, and whether the intrinsic
// is available. And probably use the builtin to test the diy
const bool use_bswap = false;

uint64_t read_size_field_u16(const unsigned char *from) {
from++;
if (use_bswap) {
uint16_t b;
memcpy(&b, from, 2);
return __builtin_bswap16(b);
} else {
return (from[0] << 8u) | from[1];
}
}
uint64_t read_size_field_u32(const unsigned char *from) {
from++;
if (use_bswap) {
uint32_t b;
memcpy(&b, from, 4);
return __builtin_bswap32(b);
} else {
return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) |
(from[3] << 0u);
}
}
uint64_t read_size_field_u64(const unsigned char *from) {
from++;
if (use_bswap) {
uint64_t b;
memcpy(&b, from, 8);
return __builtin_bswap64(b);
} else {
return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) |
((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) |
(from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) |
(from[7] << 0u);
}
}

uint64_t read_size_field_s8(const unsigned char *from) {
uint8_t u = read_size_field_u8(from);
int64_t res = msgpack::bitcast<uint8_t, int8_t>(u);
return msgpack::bitcast<int64_t, uint64_t>(res);
}
uint64_t read_size_field_s16(const unsigned char *from) {
uint16_t u = read_size_field_u16(from);
int64_t res = msgpack::bitcast<uint16_t, int16_t>(u);
return msgpack::bitcast<int64_t, uint64_t>(res);
}
uint64_t read_size_field_s32(const unsigned char *from) {
uint32_t u = read_size_field_u32(from);
int64_t res = msgpack::bitcast<uint32_t, int32_t>(u);
return msgpack::bitcast<int64_t, uint64_t>(res);
}
uint64_t read_size_field_s64(const unsigned char *from) {
uint64_t u = read_size_field_u64(from);
int64_t res = msgpack::bitcast<uint64_t, int64_t>(u);
return msgpack::bitcast<int64_t, uint64_t>(res);
}
} // namespace payload
} // namespace

namespace msgpack {

payload_info_t payload_info(msgpack::type ty) {
using namespace msgpack;
switch (ty) {
#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
case NAME: \
return payload::PAYLOAD;
#include "msgpack.def"
#undef X
}
internal_error();
}

} // namespace msgpack

const unsigned char *msgpack::skip_next_message(const unsigned char *start,
const unsigned char *end) {
class f : public functors_defaults<f> {};
return handle_msgpack({start, end}, f());
}

namespace msgpack {
bool message_is_string(byte_range bytes, const char *needle) {
bool matched = false;
size_t needleN = strlen(needle);

foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) {
if (N == needleN) {
if (memcmp(needle, str, N) == 0) {
matched = true;
}
}
});
return matched;
}

void dump(byte_range bytes) {
struct inner : functors_defaults<inner> {
inner(unsigned indent) : indent(indent) {}
const unsigned by = 2;
unsigned indent = 0;

void handle_string(size_t N, const unsigned char *bytes) {
char *tmp = (char *)malloc(N + 1);
memcpy(tmp, bytes, N);
tmp[N] = '\0';
printf("\"%s\"", tmp);
free(tmp);
}

void handle_signed(int64_t x) { printf("%ld", x); }
void handle_unsigned(uint64_t x) { printf("%lu", x); }

const unsigned char *handle_array(uint64_t N, byte_range bytes) {
printf("\n%*s[\n", indent, "");
indent += by;

for (uint64_t i = 0; i < N; i++) {
indent += by;
printf("%*s", indent, "");
const unsigned char *next = handle_msgpack<inner>(bytes, {indent});
printf(",\n");
indent -= by;
bytes.start = next;
if (!next) {
break;
}
}
indent -= by;
printf("%*s]", indent, "");

return bytes.start;
}

const unsigned char *handle_map(uint64_t N, byte_range bytes) {
printf("\n%*s{\n", indent, "");
indent += by;

for (uint64_t i = 0; i < 2 * N; i += 2) {
const unsigned char *start_key = bytes.start;
printf("%*s", indent, "");
const unsigned char *end_key =
handle_msgpack<inner>({start_key, bytes.end}, {indent});
if (!end_key) {
break;
}

printf(" : ");

const unsigned char *start_value = end_key;
const unsigned char *end_value =
handle_msgpack<inner>({start_value, bytes.end}, {indent});

if (!end_value) {
break;
}

printf(",\n");
bytes.start = end_value;
}

indent -= by;
printf("%*s}", indent, "");

return bytes.start;
}
};

handle_msgpack<inner>(bytes, {0});
printf("\n");
}

} // namespace msgpack
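A small standalone check of how the pieces above fit together (illustrative only, not part of the patch; it needs msgpack.h on the include path and links against this file). The str16 layout used here follows msgpack.def: one type byte, a big-endian u16 length, then the payload.

#include <cassert>
#include <cstddef>
#include <cstdint>

#include "msgpack.h"

int main() {
  // "hello" encoded as str16: 0xda, big-endian length 5, then the bytes.
  const unsigned char msg[] = {0xda, 0x00, 0x05, 'h', 'e', 'l', 'l', 'o'};
  assert(msgpack::parse_type(msg[0]) == msgpack::str16);
  assert(msgpack::bytes_used_fixed(msgpack::str16) == 3);
  // skip_next_message walks over the 3 header bytes plus the 5 payload bytes.
  const unsigned char *next =
      msgpack::skip_next_message(msg, msg + sizeof(msg));
  assert(next == msg + sizeof(msg));
  return 0;
}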
@ -0,0 +1,38 @@
// name, header width, reader, [lower, upper] encoding
X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)
X(negfixint, 1, read_embedded_s8, 0xe0, 0xff)
X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f)
X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f)
X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf)
X(nil, 1, read_zero, 0xc0, 0xc0)
X(never_used, 1, read_zero, 0xc1, 0xc1)
X(f, 1, read_via_mask_0x1, 0xc2, 0xc2)
X(t, 1, read_via_mask_0x1, 0xc3, 0xc3)
X(bin8, 2, read_size_field_u8, 0xc4, 0xc4)
X(bin16, 3, read_size_field_u16, 0xc5, 0xc5)
X(bin32, 5, read_size_field_u32, 0xc6, 0xc6)
X(ext8, 3, read_size_field_u8, 0xc7, 0xc7)
X(ext16, 4, read_size_field_u16, 0xc8, 0xc8)
X(ext32, 6, read_size_field_u32, 0xc9, 0xc9)
X(float32, 5, read_zero, 0xca, 0xca)
X(float64, 9, read_zero, 0xcb, 0xcb)
X(uint8, 2, read_size_field_u8, 0xcc, 0xcc)
X(uint16, 3, read_size_field_u16, 0xcd, 0xcd)
X(uint32, 5, read_size_field_u32, 0xce, 0xce)
X(uint64, 9, read_size_field_u64, 0xcf, 0xcf)
X(int8, 2, read_size_field_s8, 0xd0, 0xd0)
X(int16, 3, read_size_field_s16, 0xd1, 0xd1)
X(int32, 5, read_size_field_s32, 0xd2, 0xd2)
X(int64, 9, read_size_field_s64, 0xd3, 0xd3)
X(fixext1, 3, read_zero, 0xd4, 0xd4)
X(fixext2, 4, read_zero, 0xd5, 0xd5)
X(fixext4, 6, read_zero, 0xd6, 0xd6)
X(fixext8, 10, read_zero, 0xd7, 0xd7)
X(fixext16, 18, read_zero, 0xd8, 0xd8)
X(str8, 2, read_size_field_u8, 0xd9, 0xd9)
X(str16, 3, read_size_field_u16, 0xda, 0xda)
X(str32, 5, read_size_field_u32, 0xdb, 0xdb)
X(array16, 3, read_size_field_u16, 0xdc, 0xdc)
X(array32, 5, read_size_field_u32, 0xdd, 0xdd)
X(map16, 3, read_size_field_u16, 0xde, 0xde)
X(map32, 5, read_size_field_u32, 0xdf, 0xdf)
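For readers unfamiliar with the X-macro pattern used above and in msgpack.h and msgpack.cpp, a sketch of one hypothetical consumer of this table (not part of the patch):

// Build a table of format names; the row order matches the msgpack::type
// enum in msgpack.h, so e.g. format_names[msgpack::fixstr] == "fixstr".
static const char *const format_names[] = {
#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) #NAME,
#include "msgpack.def"
#undef X
};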
@ -0,0 +1,275 @@
#ifndef MSGPACK_H
#define MSGPACK_H

#include <functional>

namespace msgpack {

// The message pack format is dynamically typed, schema-less. Format is:
// message: [type][header][payload]
// where type is one byte, header length is a fixed length function of type
// payload is zero to N bytes, with the length encoded in [type][header]

// Scalar fields include boolean, signed integer, float, string etc
// Composite types are sequences of messages
// Array field is [header][element][element]...
// Map field is [header][key][value][key][value]...

// Multibyte integer fields are big endian encoded
// The map key can be any message type
// Maps may contain duplicate keys
// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or
// in as many as nine, as signed or unsigned. Implementation defined.
// Similarly "foo" may embed the length in the type field or in multiple bytes

// This parser is structured as an iterator over a sequence of bytes.
// It calls a user provided function on each message in order to extract fields
// The default implementation for each scalar type is to do nothing. For map or
// arrays, the default implementation returns just after that message to support
// iterating to the next message, but otherwise has no effect.

struct byte_range {
const unsigned char *start;
const unsigned char *end;
};

const unsigned char *skip_next_message(const unsigned char *start,
const unsigned char *end);

template <typename Derived> class functors_defaults {
public:
void cb_string(size_t N, const unsigned char *str) {
derived().handle_string(N, str);
}
void cb_boolean(bool x) { derived().handle_boolean(x); }
void cb_signed(int64_t x) { derived().handle_signed(x); }
void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); }
void cb_array_elements(byte_range bytes) {
derived().handle_array_elements(bytes);
}
void cb_map_elements(byte_range key, byte_range value) {
derived().handle_map_elements(key, value);
}
const unsigned char *cb_array(uint64_t N, byte_range bytes) {
return derived().handle_array(N, bytes);
}
const unsigned char *cb_map(uint64_t N, byte_range bytes) {
return derived().handle_map(N, bytes);
}

private:
Derived &derived() { return *static_cast<Derived *>(this); }

// Default implementations for scalar ops are no-ops
void handle_string(size_t, const unsigned char *) {}
void handle_boolean(bool) {}
void handle_signed(int64_t) {}
void handle_unsigned(uint64_t) {}
void handle_array_elements(byte_range) {}
void handle_map_elements(byte_range, byte_range) {}

// Default implementation for sequences is to skip over the messages
const unsigned char *handle_array(uint64_t N, byte_range bytes) {
for (uint64_t i = 0; i < N; i++) {
const unsigned char *next = skip_next_message(bytes.start, bytes.end);
if (!next) {
return nullptr;
}
cb_array_elements(bytes);
bytes.start = next;
}
return bytes.start;
}
const unsigned char *handle_map(uint64_t N, byte_range bytes) {
for (uint64_t i = 0; i < N; i++) {
const unsigned char *start_key = bytes.start;
const unsigned char *end_key = skip_next_message(start_key, bytes.end);
if (!end_key) {
return nullptr;
}
const unsigned char *start_value = end_key;
const unsigned char *end_value =
skip_next_message(start_value, bytes.end);
if (!end_value) {
return nullptr;
}
cb_map_elements({start_key, end_key}, {start_value, end_value});
bytes.start = end_value;
}
return bytes.start;
}
};

typedef enum : uint8_t {
#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME,
#include "msgpack.def"
#undef X
} type;

[[noreturn]] void internal_error();
type parse_type(unsigned char x);
unsigned bytes_used_fixed(type ty);

typedef uint64_t (*payload_info_t)(const unsigned char *);
payload_info_t payload_info(msgpack::type ty);

template <typename T, typename R> R bitcast(T x);

template <typename F, msgpack::type ty>
const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) {
const unsigned char *start = bytes.start;
const unsigned char *end = bytes.end;
const uint64_t available = end - start;
assert(available != 0);
assert(ty == parse_type(*start));

const uint64_t bytes_used = bytes_used_fixed(ty);
if (available < bytes_used) {
return 0;
}
const uint64_t available_post_header = available - bytes_used;

const payload_info_t info = payload_info(ty);
const uint64_t N = info(start);

switch (ty) {
case msgpack::t:
case msgpack::f: {
// t is 0b11000010, f is 0b11000011, masked with 0x1
f.cb_boolean(N);
return start + bytes_used;
}

case msgpack::posfixint:
case msgpack::uint8:
case msgpack::uint16:
case msgpack::uint32:
case msgpack::uint64: {
f.cb_unsigned(N);
return start + bytes_used;
}

case msgpack::negfixint:
case msgpack::int8:
case msgpack::int16:
case msgpack::int32:
case msgpack::int64: {
f.cb_signed(bitcast<uint64_t, int64_t>(N));
return start + bytes_used;
}

case msgpack::fixstr:
case msgpack::str8:
case msgpack::str16:
case msgpack::str32: {
if (available_post_header < N) {
return 0;
} else {
f.cb_string(N, start + bytes_used);
return start + bytes_used + N;
}
}

case msgpack::fixarray:
case msgpack::array16:
case msgpack::array32: {
return f.cb_array(N, {start + bytes_used, end});
}

case msgpack::fixmap:
case msgpack::map16:
case msgpack::map32: {
return f.cb_map(N, {start + bytes_used, end});
}

case msgpack::nil:
case msgpack::bin8:
case msgpack::bin16:
case msgpack::bin32:
case msgpack::float32:
case msgpack::float64:
case msgpack::ext8:
case msgpack::ext16:
case msgpack::ext32:
case msgpack::fixext1:
case msgpack::fixext2:
case msgpack::fixext4:
case msgpack::fixext8:
case msgpack::fixext16:
case msgpack::never_used: {
if (available_post_header < N) {
return 0;
}
return start + bytes_used + N;
}
}
internal_error();
}

template <typename F>
const unsigned char *handle_msgpack(byte_range bytes, F f) {
const unsigned char *start = bytes.start;
const unsigned char *end = bytes.end;
const uint64_t available = end - start;
if (available == 0) {
return 0;
}
const type ty = parse_type(*start);

switch (ty) {
#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
case msgpack::NAME: \
return handle_msgpack_given_type<F, msgpack::NAME>(bytes, f);
#include "msgpack.def"
#undef X
}

internal_error();
}

bool message_is_string(byte_range bytes, const char *str);

template <typename C> void foronly_string(byte_range bytes, C callback) {
struct inner : functors_defaults<inner> {
inner(C &cb) : cb(cb) {}
C &cb;
void handle_string(size_t N, const unsigned char *str) { cb(N, str); }
};
handle_msgpack<inner>(bytes, {callback});
}

template <typename C> void foronly_unsigned(byte_range bytes, C callback) {
struct inner : functors_defaults<inner> {
inner(C &cb) : cb(cb) {}
C &cb;
void handle_unsigned(uint64_t x) { cb(x); }
};
handle_msgpack<inner>(bytes, {callback});
}

template <typename C> void foreach_array(byte_range bytes, C callback) {
struct inner : functors_defaults<inner> {
inner(C &cb) : cb(cb) {}
C &cb;
void handle_array_elements(byte_range element) { cb(element); }
};
handle_msgpack<inner>(bytes, {callback});
}

template <typename C> void foreach_map(byte_range bytes, C callback) {
struct inner : functors_defaults<inner> {
inner(C &cb) : cb(cb) {}
C &cb;
void handle_map_elements(byte_range key, byte_range value) {
cb(key, value);
}
};
handle_msgpack<inner>(bytes, {callback});
}

// Crude approximation to json
void dump(byte_range);

} // namespace msgpack

#endif
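A usage sketch for the callback interface above (illustrative only, not part of the patch; the buffer contents and the "name" key are made up, and the program needs to link against msgpack.cpp):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>

#include "msgpack.h"

// Print every string value stored under the key "name" in a msgpack map.
static void print_names(msgpack::byte_range whole) {
  msgpack::foreach_map(whole, [](msgpack::byte_range key,
                                 msgpack::byte_range value) {
    if (msgpack::message_is_string(key, "name")) {
      msgpack::foronly_string(value, [](size_t N, const unsigned char *str) {
        printf("name: %.*s\n", (int)N, (const char *)str);
      });
    }
  });
}

int main() {
  // fixmap with one pair, {"name": "gfx906"}, encoded by hand.
  const unsigned char buf[] = {0x81,                      // fixmap, 1 pair
                               0xa4, 'n', 'a', 'm', 'e',  // fixstr "name"
                               0xa6, 'g', 'f', 'x', '9', '0', '6'}; // fixstr
  print_names({buf, buf + sizeof(buf)});
  return 0;
}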
@ -0,0 +1,108 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#ifndef SRC_RUNTIME_INCLUDE_RT_H_
#define SRC_RUNTIME_INCLUDE_RT_H_

#include "atmi_runtime.h"
#include "hsa.h"
#include <cstdarg>
#include <string>

namespace core {

#define DEFAULT_MAX_QUEUE_SIZE 4096
#define DEFAULT_MAX_KERNEL_TYPES 32
#define DEFAULT_NUM_GPU_QUEUES -1 // computed in code
#define DEFAULT_NUM_CPU_QUEUES -1 // computed in code
#define DEFAULT_DEBUG_MODE 0
class Environment {
public:
Environment()
: max_queue_size_(DEFAULT_MAX_QUEUE_SIZE),
max_kernel_types_(DEFAULT_MAX_KERNEL_TYPES),
num_gpu_queues_(DEFAULT_NUM_GPU_QUEUES),
num_cpu_queues_(DEFAULT_NUM_CPU_QUEUES),
debug_mode_(DEFAULT_DEBUG_MODE) {
GetEnvAll();
}

~Environment() {}

void GetEnvAll();

int getMaxQueueSize() const { return max_queue_size_; }
int getMaxKernelTypes() const { return max_kernel_types_; }
int getNumGPUQueues() const { return num_gpu_queues_; }
int getNumCPUQueues() const { return num_cpu_queues_; }
// TODO(ashwinma): int may change to enum if we have more debug modes
int getDebugMode() const { return debug_mode_; }
// TODO(ashwinma): int may change to enum if we have more profile modes

private:
std::string GetEnv(const char *name) {
char *env = getenv(name);
std::string ret;
if (env) {
ret = env;
}
return ret;
}

int max_queue_size_;
int max_kernel_types_;
int num_gpu_queues_;
int num_cpu_queues_;
int debug_mode_;
};

class Runtime final {
public:
static Runtime &getInstance() {
static Runtime instance;
return instance;
}

// init/finalize
static atmi_status_t Initialize();
static atmi_status_t Finalize();

// modules
static atmi_status_t RegisterModuleFromMemory(
void *, size_t, atmi_place_t,
atmi_status_t (*on_deserialized_data)(void *data, size_t size,
void *cb_state),
void *cb_state);

// machine info
static atmi_machine_t *GetMachineInfo();

// data
static atmi_status_t Memcpy(void *, const void *, size_t);
static atmi_status_t Memfree(void *);
static atmi_status_t Malloc(void **, size_t, atmi_mem_place_t);

// environment variables
int getMaxQueueSize() const { return env_.getMaxQueueSize(); }
int getMaxKernelTypes() const { return env_.getMaxKernelTypes(); }
int getNumGPUQueues() const { return env_.getNumGPUQueues(); }
int getNumCPUQueues() const { return env_.getNumCPUQueues(); }
// TODO(ashwinma): int may change to enum if we have more debug modes
int getDebugMode() const { return env_.getDebugMode(); }

protected:
Runtime() = default;
~Runtime() = default;
Runtime(const Runtime &) = delete;
Runtime &operator=(const Runtime &) = delete;

protected:
// variable to track environment variables
Environment env_;
};

} // namespace core

#endif // SRC_RUNTIME_INCLUDE_RT_H_
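A brief sketch of how a client might drive this singleton (illustrative only, not part of the patch; the header name is assumed from the include guard, and the ATMI status values come in via atmi_runtime.h):

#include <cstdio>

#include "rt.h" // assumed header name

int main() {
  if (core::Runtime::Initialize() != ATMI_STATUS_SUCCESS)
    return 1;
  // The getters forward to the Environment member populated from ATMI_* vars.
  core::Runtime &rt = core::Runtime::getInstance();
  printf("max queue size: %d, gpu queues: %d, debug: %d\n",
         rt.getMaxQueueSize(), rt.getNumGPUQueues(), rt.getDebugMode());
  return core::Runtime::Finalize() == ATMI_STATUS_SUCCESS ? 0 : 1;
}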
File diff suppressed because it is too large
@ -0,0 +1,136 @@
/*===--------------------------------------------------------------------------
 * ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#include "internal.h"
#include "rt.h"

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <errno.h>
#include <iostream>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

/*
 * Helper functions
 */
const char *get_atmi_error_string(atmi_status_t err) {
switch (err) {
case ATMI_STATUS_SUCCESS:
return "ATMI_STATUS_SUCCESS";
case ATMI_STATUS_UNKNOWN:
return "ATMI_STATUS_UNKNOWN";
case ATMI_STATUS_ERROR:
return "ATMI_STATUS_ERROR";
default:
return "";
}
}

const char *get_error_string(hsa_status_t err) {
switch (err) {
case HSA_STATUS_SUCCESS:
return "HSA_STATUS_SUCCESS";
case HSA_STATUS_INFO_BREAK:
return "HSA_STATUS_INFO_BREAK";
case HSA_STATUS_ERROR:
return "HSA_STATUS_ERROR";
case HSA_STATUS_ERROR_INVALID_ARGUMENT:
return "HSA_STATUS_ERROR_INVALID_ARGUMENT";
case HSA_STATUS_ERROR_INVALID_QUEUE_CREATION:
return "HSA_STATUS_ERROR_INVALID_QUEUE_CREATION";
case HSA_STATUS_ERROR_INVALID_ALLOCATION:
return "HSA_STATUS_ERROR_INVALID_ALLOCATION";
case HSA_STATUS_ERROR_INVALID_AGENT:
return "HSA_STATUS_ERROR_INVALID_AGENT";
case HSA_STATUS_ERROR_INVALID_REGION:
return "HSA_STATUS_ERROR_INVALID_REGION";
case HSA_STATUS_ERROR_INVALID_SIGNAL:
return "HSA_STATUS_ERROR_INVALID_SIGNAL";
case HSA_STATUS_ERROR_INVALID_QUEUE:
return "HSA_STATUS_ERROR_INVALID_QUEUE";
case HSA_STATUS_ERROR_OUT_OF_RESOURCES:
return "HSA_STATUS_ERROR_OUT_OF_RESOURCES";
case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT:
return "HSA_STATUS_ERROR_INVALID_PACKET_FORMAT";
case HSA_STATUS_ERROR_RESOURCE_FREE:
return "HSA_STATUS_ERROR_RESOURCE_FREE";
case HSA_STATUS_ERROR_NOT_INITIALIZED:
return "HSA_STATUS_ERROR_NOT_INITIALIZED";
case HSA_STATUS_ERROR_REFCOUNT_OVERFLOW:
return "HSA_STATUS_ERROR_REFCOUNT_OVERFLOW";
case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS:
return "HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS";
case HSA_STATUS_ERROR_INVALID_INDEX:
return "HSA_STATUS_ERROR_INVALID_INDEX";
case HSA_STATUS_ERROR_INVALID_ISA:
return "HSA_STATUS_ERROR_INVALID_ISA";
case HSA_STATUS_ERROR_INVALID_ISA_NAME:
return "HSA_STATUS_ERROR_INVALID_ISA_NAME";
case HSA_STATUS_ERROR_INVALID_CODE_OBJECT:
return "HSA_STATUS_ERROR_INVALID_CODE_OBJECT";
case HSA_STATUS_ERROR_INVALID_EXECUTABLE:
return "HSA_STATUS_ERROR_INVALID_EXECUTABLE";
case HSA_STATUS_ERROR_FROZEN_EXECUTABLE:
return "HSA_STATUS_ERROR_FROZEN_EXECUTABLE";
case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME:
return "HSA_STATUS_ERROR_INVALID_SYMBOL_NAME";
case HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED:
return "HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED";
case HSA_STATUS_ERROR_VARIABLE_UNDEFINED:
return "HSA_STATUS_ERROR_VARIABLE_UNDEFINED";
case HSA_STATUS_ERROR_EXCEPTION:
return "HSA_STATUS_ERROR_EXCEPTION";
}
}

namespace core {
/*
 * Environment variables
 */
void Environment::GetEnvAll() {
std::string var = GetEnv("ATMI_HELP");
if (!var.empty()) {
std::cout << "ATMI_MAX_HSA_QUEUE_SIZE : positive integer" << std::endl
<< "ATMI_MAX_KERNEL_TYPES : positive integer" << std::endl
<< "ATMI_DEVICE_GPU_WORKERS : positive integer" << std::endl
<< "ATMI_DEVICE_CPU_WORKERS : positive integer" << std::endl
<< "ATMI_DEBUG : 1 for printing out trace/debug info"
<< std::endl;
exit(0);
}

var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE");
if (!var.empty())
max_queue_size_ = std::stoi(var);

var = GetEnv("ATMI_MAX_KERNEL_TYPES");
if (!var.empty())
max_kernel_types_ = std::stoi(var);

/* TODO: If we get a good use case for device-specific worker count, we
 * should explore it, but let us keep the worker count uniform for all
 * devices of a type until that time
 */
var = GetEnv("ATMI_DEVICE_GPU_WORKERS");
if (!var.empty())
num_gpu_queues_ = std::stoi(var);

/* TODO: If we get a good use case for device-specific worker count, we
 * should explore it, but let us keep the worker count uniform for all
 * devices of a type until that time
 */
var = GetEnv("ATMI_DEVICE_CPU_WORKERS");
if (!var.empty())
num_cpu_queues_ = std::stoi(var);

var = GetEnv("ATMI_DEBUG");
if (!var.empty())
debug_mode_ = std::stoi(var);
}
} // namespace core
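These variables are read once, when the Environment object is constructed, so they have to be set before the runtime is initialized; a sketch (illustrative only, not part of the patch; it assumes the singleton and its Environment member are created during Initialize()):

#include <cstdlib>

#include "rt.h"

int main() {
  // Must be set before core::Runtime::Initialize(), which is assumed to
  // construct the singleton (and therefore its Environment member).
  setenv("ATMI_MAX_HSA_QUEUE_SIZE", "1024", /*overwrite=*/1);
  setenv("ATMI_DEBUG", "1", /*overwrite=*/1);
  return core::Runtime::Initialize() == ATMI_STATUS_SUCCESS ? 0 : 1;
}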
File diff suppressed because it is too large
@ -27,7 +27,9 @@ static const char *RTLNames[] = {
/* PowerPC target */ "libomptarget.rtl.ppc64.so",
/* x86_64 target */ "libomptarget.rtl.x86_64.so",
/* CUDA target */ "libomptarget.rtl.cuda.so",
/* AArch64 target */ "libomptarget.rtl.aarch64.so"};
/* AArch64 target */ "libomptarget.rtl.aarch64.so",
/* AMDGPU target */ "libomptarget.rtl.amdgpu.so",
};

RTLsTy *RTLs;
std::mutex *RTLsMtx;