[SE] Remove StreamExecutor
Summary: The project has been renamed to Acxxel, so this old directory needs to be deleted. Reviewers: jlebar, jprice Subscribers: beanz, mgorny, parallel_libs-commits, modocache Differential Revision: https://reviews.llvm.org/D25964 llvm-svn: 285115
This commit is contained in:
parent
209a77d8d9
commit
b3f709e10f
|
@ -1,3 +1 @@
|
|||
# Top-level build script: it only pins the minimum CMake version and hands the
# rest of the build over to the streamexecutor subdirectory.
cmake_minimum_required(VERSION 3.1)

add_subdirectory(streamexecutor)
|
||||
|
|
|
@ -1,118 +0,0 @@
|
|||
cmake_minimum_required(VERSION 3.1)

# User-facing build switches. All of them are boolean cache options; only the
# CUDA platform is disabled by default.

# Controls whether the unittests subdirectory is added to the build.
option(STREAM_EXECUTOR_UNIT_TESTS "enable unit tests" ON)

# Controls whether the Doxygen 'doc' target is created.
option(STREAM_EXECUTOR_ENABLE_DOXYGEN "enable StreamExecutor doxygen" ON)

# Controls whether tools/streamexecutor-config is added to the build.
option(STREAM_EXECUTOR_ENABLE_CONFIG_TOOL
       "enable building streamexecutor-config tool"
       ON)

# Controls whether CUDA is located and the CUDA platform variables are set.
option(
  STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
  "enable building the CUDA StreamExecutor platform \
(see CMake's 'FindCUDA' documentation for info on specifying the CUDA path)"
  OFF)
|
||||
|
||||
# Generate PlatformOptions.h in the build tree from its template in the source
# tree.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/include/streamexecutor/PlatformOptions.h.in"
  "${CMAKE_CURRENT_BINARY_DIR}/include/streamexecutor/PlatformOptions.h")

# Put the streamexecutor source-tree headers at the front of the include path.
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include)
# Put the build-tree include directory (where configured headers are written)
# at the front as well. Because this call also prepends, the build-tree
# directory ends up ahead of the source-tree directory.
include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/include)
|
||||
|
||||
# StreamExecutor is treated as a standalone project (with LLVM consumed as an
# external package) exactly when this directory is the root of the CMake
# source tree. The comparison result is stored in STREAM_EXECUTOR_STANDALONE.
string(COMPARE EQUAL
       "${CMAKE_SOURCE_DIR}"
       "${CMAKE_CURRENT_SOURCE_DIR}"
       STREAM_EXECUTOR_STANDALONE)

if(STREAM_EXECUTOR_STANDALONE)
  project(StreamExecutor)

  # Locate an LLVM package through its LLVMConfig.cmake and report what was
  # found.
  find_package(LLVM REQUIRED CONFIG)
  message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
  message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")

  # Pick up LLVM's headers and compile definitions.
  include_directories(${LLVM_INCLUDE_DIRS})
  add_definitions(${LLVM_DEFINITIONS})

  # Match LLVM's RTTI setting: when LLVM was built without RTTI, build this
  # project without it too.
  if(NOT LLVM_ENABLE_RTTI)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
  endif()

  # Make LLVM's CMake modules visible, then load AddLLVM.
  set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm")
  list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_PATH}")
  include(AddLLVM)

  # Standalone unit tests rely on an externally installed GoogleTest and on
  # the system threads package.
  if(STREAM_EXECUTOR_UNIT_TESTS)
    enable_testing()
    find_package(GTest REQUIRED)
    include_directories(${GTEST_INCLUDE_DIRS})
    find_package(Threads REQUIRED)
  endif()
elseif(STREAM_EXECUTOR_UNIT_TESTS)
  # In-tree build: use the GoogleTest headers that live in the LLVM source
  # tree.
  include_directories(
    "${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include")
endif()
|
||||
|
||||
# Translate the LLVM components used by this project (support, symbolize) into
# library names and store them in llvm_libs.
llvm_map_components_to_libnames(llvm_libs support symbolize)

# C++11 is mandatory: request it and refuse to fall back to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 11)

# Append the warning flags to whatever compiler flags are already set.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter")
|
||||
|
||||
# Locate CUDA when the CUDA platform has been requested.
if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
  # The CUDA package is mandatory once the platform is enabled; its headers
  # are added to the include path.
  find_package(CUDA REQUIRED)
  include_directories(${CUDA_INCLUDE_DIRS})

  # Look for the CUDA driver library (libcuda) and stop the configure step
  # with an explanatory message if it cannot be found.
  find_library(CUDA_DRIVER_LIBRARY cuda)
  if(NOT CUDA_DRIVER_LIBRARY)
    message(FATAL_ERROR
      "could not find libcuda, \
is the CUDA driver installed on your system?")
  endif()

  # Record the object files of the streamexecutor_cuda_platform target and the
  # driver library in variables.
  # NOTE(review): presumably consumed by the lib/ subdirectory when building
  # and linking the streamexecutor library -- confirm there.
  set(STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT
      $<TARGET_OBJECTS:streamexecutor_cuda_platform>)
  set(STREAM_EXECUTOR_LIBCUDA_LIBRARIES ${CUDA_DRIVER_LIBRARY})
endif()
|
||||
|
||||
# Subdirectories that are always part of the build, in this order.
foreach(se_subdir lib examples)
  add_subdirectory(${se_subdir})
endforeach()

# Optional subdirectories, each gated by its own option.
if(STREAM_EXECUTOR_UNIT_TESTS)
  add_subdirectory(unittests)
endif()

if(STREAM_EXECUTOR_ENABLE_CONFIG_TOOL)
  add_subdirectory(tools/streamexecutor-config)
endif()

# Install the contents of the source-tree include/ directory under
# <prefix>/include.
install(DIRECTORY include/ DESTINATION include)
|
||||
|
||||
# Optional API documentation target.
if(STREAM_EXECUTOR_ENABLE_DOXYGEN)
  find_package(Doxygen REQUIRED)

  # Instantiate the Doxyfile template in the build tree. @ONLY restricts
  # substitution to @VAR@ references, leaving ${...} text untouched.
  configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
    ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
    @ONLY)

  # 'doc' runs Doxygen on the generated Doxyfile from the build directory.
  add_custom_target(
    doc
    COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    COMMENT "Generating API documentation with Doxygen"
    VERBATIM)
endif()
|
File diff suppressed because it is too large
Load Diff
|
@ -1,11 +0,0 @@
|
|||
StreamExecutor
|
||||
==============
|
||||
|
||||
StreamExecutor is a wrapper around CUDA and OpenCL (host-side) programming
|
||||
models (runtimes). This abstraction cleanly permits host code to target either
|
||||
CUDA or OpenCL devices with identically-functioning data parallel kernels. It
|
||||
manages the execution of concurrent work targeting the accelerator, similar to a
|
||||
host-side Executor.
|
||||
|
||||
This version of StreamExecutor can be built either as a sub-project of the LLVM
|
||||
project or as a standalone project depending on LLVM as an external package.
|
|
@ -1,20 +0,0 @@
|
|||
/* Page background. */
body {
  background-color: #e0e0eb;
}

/* Header: centered column, at most 60em wide, with horizontal padding. */
div.header {
  max-width: 60em;
  margin-left: auto;
  margin-right: auto;
  padding-left: 2em;
  padding-right: 2em;
}

/* Main content: centered white card with rounded corners. */
div.contents {
  max-width: 60em;
  margin-left: auto;
  margin-right: auto;
  padding: 2em;
  border-radius: 1em;
  background-color: white;
}
|
|
@ -1,5 +0,0 @@
|
|||
# Example programs. Each one is a single-source executable linked against the
# streamexecutor library. The PRIVATE keyword is used because executables have
# no consumers to propagate the dependency to, and it avoids the legacy
# keyword-less signature of target_link_libraries.

# saxpy on the CUDA platform.
add_executable(cuda_saxpy_example CUDASaxpy.cpp)
target_link_libraries(cuda_saxpy_example PRIVATE streamexecutor)

# saxpy on the host platform.
add_executable(host_saxpy_example HostSaxpy.cpp)
target_link_libraries(host_saxpy_example PRIVATE streamexecutor)
|
|
@ -1,141 +0,0 @@
|
|||
//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains example code demonstrating the usage of the
|
||||
/// StreamExecutor API. Snippets of this file will be included as code examples
|
||||
/// in documentation. Taking these examples from a real source file guarantees
|
||||
/// that the examples will always compile.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
#include "streamexecutor/StreamExecutor.h"
|
||||
|
||||
/// [Example saxpy compiler-generated]
|
||||
// Code in this namespace is generated by the compiler (e.g. clang).
|
||||
//
|
||||
// The name of this namespace may depend on the compiler that generated it, so
|
||||
// this is just an example name.
|
||||
namespace __compilergen {
|
||||
|
||||
// Specialization of the streamexecutor::Kernel template class for the parameter
|
||||
// types of the saxpy(float A, float *X, float *Y) kernel.
|
||||
using SaxpyKernel =
|
||||
streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
|
||||
streamexecutor::GlobalDeviceMemory<float>>;
|
||||
|
||||
// A string containing the PTX code generated by the device compiler for the
|
||||
// saxpy kernel. String contents not shown here.
|
||||
extern const char *SaxpyPTX;
|
||||
|
||||
// A global instance of a loader spec that knows how to load the code in the
|
||||
// SaxpyPTX string.
|
||||
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
|
||||
streamexecutor::MultiKernelLoaderSpec Spec;
|
||||
Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
|
||||
return Spec;
|
||||
}();
|
||||
|
||||
} // namespace __compilergen
|
||||
/// [Example saxpy compiler-generated]
|
||||
|
||||
/// [Example saxpy host PTX]
|
||||
// The PTX text for a saxpy kernel.
|
||||
const char *__compilergen::SaxpyPTX = R"(
|
||||
.version 4.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
|
||||
.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
|
||||
.reg .f32 %AValue;
|
||||
.reg .f32 %XValue;
|
||||
.reg .f32 %YValue;
|
||||
.reg .f32 %Result;
|
||||
|
||||
.reg .b64 %XBaseAddrGeneric;
|
||||
.reg .b64 %YBaseAddrGeneric;
|
||||
.reg .b64 %XBaseAddrGlobal;
|
||||
.reg .b64 %YBaseAddrGlobal;
|
||||
.reg .b64 %XAddr;
|
||||
.reg .b64 %YAddr;
|
||||
.reg .b64 %ThreadByteOffset;
|
||||
|
||||
.reg .b32 %TID;
|
||||
|
||||
ld.param.f32 %AValue, [A];
|
||||
ld.param.u64 %XBaseAddrGeneric, [X];
|
||||
ld.param.u64 %YBaseAddrGeneric, [Y];
|
||||
cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
|
||||
cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
|
||||
mov.u32 %TID, %tid.x;
|
||||
mul.wide.u32 %ThreadByteOffset, %TID, 4;
|
||||
add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
|
||||
add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
|
||||
ld.global.f32 %XValue, [%XAddr];
|
||||
ld.global.f32 %YValue, [%YAddr];
|
||||
fma.rn.f32 %Result, %AValue, %XValue, %YValue;
|
||||
st.global.f32 [%XAddr], %Result;
|
||||
ret;
|
||||
}
|
||||
)";
|
||||
/// [Example saxpy host PTX]
|
||||
|
||||
int main() {
|
||||
/// [Example saxpy host main]
|
||||
namespace se = ::streamexecutor;
|
||||
namespace cg = ::__compilergen;
|
||||
|
||||
// Create some host data.
|
||||
float A = 42.0f;
|
||||
std::vector<float> HostX = {0, 1, 2, 3};
|
||||
std::vector<float> HostY = {4, 5, 6, 7};
|
||||
size_t ArraySize = HostX.size();
|
||||
|
||||
// Get a device object.
|
||||
se::Platform *Platform =
|
||||
getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
|
||||
if (Platform->getDeviceCount() == 0) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
se::Device Device = getOrDie(Platform->getDevice(0));
|
||||
|
||||
// Load the kernel onto the device.
|
||||
cg::SaxpyKernel Kernel =
|
||||
getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
|
||||
|
||||
se::RegisteredHostMemory<float> RegisteredX =
|
||||
getOrDie(Device.registerHostMemory<float>(HostX));
|
||||
se::RegisteredHostMemory<float> RegisteredY =
|
||||
getOrDie(Device.registerHostMemory<float>(HostY));
|
||||
|
||||
// Allocate memory on the device.
|
||||
se::GlobalDeviceMemory<float> X =
|
||||
getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
|
||||
se::GlobalDeviceMemory<float> Y =
|
||||
getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
|
||||
|
||||
// Run operations on a stream.
|
||||
se::Stream Stream = getOrDie(Device.createStream());
|
||||
Stream.thenCopyH2D(RegisteredX, X)
|
||||
.thenCopyH2D(RegisteredY, Y)
|
||||
.thenLaunch(ArraySize, 1, Kernel, A, X, Y)
|
||||
.thenCopyD2H(X, RegisteredX);
|
||||
// Wait for the stream to complete.
|
||||
se::dieIfError(Stream.blockHostUntilDone());
|
||||
|
||||
// Process output data in HostX.
|
||||
std::vector<float> ExpectedX = {4, 47, 90, 133};
|
||||
assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
|
||||
/// [Example saxpy host main]
|
||||
}
|
|
@ -1,94 +0,0 @@
|
|||
//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains example code demonstrating the usage of the
|
||||
/// StreamExecutor API for a host platform.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include "streamexecutor/StreamExecutor.h"
|
||||
|
||||
/// Host implementation of saxpy.
///
/// Overwrites each of the first N elements of X with A * X[i] + Y[i],
/// processing elements in increasing index order. Y is only read.
void Saxpy(float A, float *X, float *Y, size_t N) {
  for (float *const XEnd = X + N; X != XEnd; ++X, ++Y)
    *X = A * *X + *Y;
}
|
||||
|
||||
namespace __compilergen {
|
||||
using SaxpyKernel =
|
||||
streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
|
||||
streamexecutor::GlobalDeviceMemory<float>, size_t>;
|
||||
|
||||
// Wrapper function converts argument addresses to arguments.
|
||||
void SaxpyWrapper(const void *const *ArgumentAddresses) {
|
||||
Saxpy(*static_cast<const float *>(ArgumentAddresses[0]),
|
||||
*static_cast<float **>(const_cast<void *>(ArgumentAddresses[1])),
|
||||
*static_cast<float **>(const_cast<void *>(ArgumentAddresses[2])),
|
||||
*static_cast<const size_t *>(ArgumentAddresses[3]));
|
||||
}
|
||||
|
||||
// The wrapper function is what gets registered.
|
||||
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
|
||||
streamexecutor::MultiKernelLoaderSpec Spec;
|
||||
Spec.addHostFunction("Saxpy", SaxpyWrapper);
|
||||
return Spec;
|
||||
}();
|
||||
} // namespace __compilergen
|
||||
|
||||
int main() {
|
||||
namespace se = ::streamexecutor;
|
||||
namespace cg = ::__compilergen;
|
||||
|
||||
// Create some host data.
|
||||
float A = 42.0f;
|
||||
std::vector<float> HostX = {0, 1, 2, 3};
|
||||
std::vector<float> HostY = {4, 5, 6, 7};
|
||||
size_t ArraySize = HostX.size();
|
||||
|
||||
// Get a device object.
|
||||
se::Platform *Platform =
|
||||
getOrDie(se::PlatformManager::getPlatformByName("host"));
|
||||
if (Platform->getDeviceCount() == 0) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
se::Device Device = getOrDie(Platform->getDevice(0));
|
||||
|
||||
// Load the kernel onto the device.
|
||||
cg::SaxpyKernel Kernel =
|
||||
getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
|
||||
|
||||
se::RegisteredHostMemory<float> RegisteredX =
|
||||
getOrDie(Device.registerHostMemory<float>(HostX));
|
||||
se::RegisteredHostMemory<float> RegisteredY =
|
||||
getOrDie(Device.registerHostMemory<float>(HostY));
|
||||
|
||||
// Allocate memory on the device.
|
||||
se::GlobalDeviceMemory<float> X =
|
||||
getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
|
||||
se::GlobalDeviceMemory<float> Y =
|
||||
getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
|
||||
|
||||
// Run operations on a stream.
|
||||
se::Stream Stream = getOrDie(Device.createStream());
|
||||
Stream.thenCopyH2D(RegisteredX, X)
|
||||
.thenCopyH2D(RegisteredY, Y)
|
||||
.thenLaunch(1, 1, Kernel, A, X, Y, ArraySize)
|
||||
.thenCopyD2H(X, RegisteredX);
|
||||
// Wait for the stream to complete.
|
||||
se::dieIfError(Stream.blockHostUntilDone());
|
||||
|
||||
// Process output data in HostX.
|
||||
std::vector<float> ExpectedX = {4, 47, 90, 133};
|
||||
assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
|
||||
}
|
|
@ -1,302 +0,0 @@
|
|||
//===-- Device.h - The Device class -----------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// The Device class which represents a single device of a specific platform.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_DEVICE_H
|
||||
#define STREAMEXECUTOR_DEVICE_H
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
#include "streamexecutor/Error.h"
|
||||
#include "streamexecutor/HostMemory.h"
|
||||
#include "streamexecutor/KernelSpec.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
class Stream;
|
||||
|
||||
/// A class representing a StreamExecutor device.
|
||||
///
|
||||
/// Device instances are basically just pointers to the underlying
|
||||
/// implementation, so they are small and can be passed around by value.
|
||||
class Device {
|
||||
public:
|
||||
explicit Device(PlatformDevice *PDevice);
|
||||
virtual ~Device();
|
||||
|
||||
/// Gets the name of this device.
|
||||
std::string getName() const { return PDevice->getName(); }
|
||||
|
||||
/// Creates a kernel object for this device.
|
||||
template <typename KernelT>
|
||||
Expected<typename std::enable_if<std::is_base_of<KernelBase, KernelT>::value,
|
||||
KernelT>::type>
|
||||
createKernel(const MultiKernelLoaderSpec &Spec) {
|
||||
Expected<const void *> MaybeKernelHandle = PDevice->createKernel(Spec);
|
||||
if (!MaybeKernelHandle)
|
||||
return MaybeKernelHandle.takeError();
|
||||
return KernelT(PDevice, *MaybeKernelHandle, Spec.getKernelName());
|
||||
}
|
||||
|
||||
/// Creates a stream object for this device.
|
||||
Expected<Stream> createStream();
|
||||
|
||||
/// Allocates an array of ElementCount entries of type T in device memory.
|
||||
template <typename T>
|
||||
Expected<GlobalDeviceMemory<T>> allocateDeviceMemory(size_t ElementCount) {
|
||||
Expected<void *> MaybeMemory =
|
||||
PDevice->allocateDeviceMemory(ElementCount * sizeof(T));
|
||||
if (!MaybeMemory)
|
||||
return MaybeMemory.takeError();
|
||||
return GlobalDeviceMemory<T>(this, *MaybeMemory, ElementCount);
|
||||
}
|
||||
|
||||
/// Registers a previously allocated host array of type T for asynchronous
|
||||
/// memory operations.
|
||||
///
|
||||
/// Host memory registered by this function can be used for asynchronous
|
||||
/// memory copies on streams. See Stream::thenCopyD2H and Stream::thenCopyH2D.
|
||||
template <typename T>
|
||||
Expected<RegisteredHostMemory<T>>
|
||||
registerHostMemory(llvm::MutableArrayRef<T> Memory) {
|
||||
if (Error E = PDevice->registerHostMemory(Memory.data(),
|
||||
Memory.size() * sizeof(T)))
|
||||
return std::move(E);
|
||||
return RegisteredHostMemory<T>(this, Memory.data(), Memory.size());
|
||||
}
|
||||
|
||||
/// \anchor DeviceHostSyncCopyGroup
|
||||
/// \name Host-synchronous device memory copying functions
|
||||
///
|
||||
/// These methods block the calling host thread while copying data to or from
|
||||
/// device memory. On the device side, these methods do not block any ongoing
|
||||
/// device calls.
|
||||
///
|
||||
/// There are no restrictions on the host memory that is used as a source or
|
||||
/// destination in these copy methods, so there is no need to register that
|
||||
/// host memory with registerHostMemory.
|
||||
///
|
||||
/// Each of these methods has a single template parameter, T, that specifies
|
||||
/// the type of data being copied. The ElementCount arguments specify the
|
||||
/// number of objects of type T to be copied.
|
||||
///
|
||||
/// For ease of use, each of the methods is overloaded to take either a
|
||||
/// GlobalDeviceMemorySlice or a GlobalDeviceMemory argument in the device
|
||||
/// memory argument slots, and the GlobalDeviceMemory arguments are just
|
||||
/// converted to GlobalDeviceMemorySlice arguments internally by using
|
||||
/// GlobalDeviceMemory::asSlice.
|
||||
///
|
||||
/// These methods perform bounds checking to make sure that the ElementCount
|
||||
/// is not too large for the source or destination. For methods that do not
|
||||
/// take an ElementCount argument, an error is returned if the source size
|
||||
/// does not exactly match the destination size.
|
||||
///@{
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2H(GlobalDeviceMemorySlice<T> Src,
|
||||
llvm::MutableArrayRef<T> Dst, size_t ElementCount) {
|
||||
if (ElementCount > Src.getElementCount())
|
||||
return make_error("copying too many elements, " +
|
||||
llvm::Twine(ElementCount) +
|
||||
", from a device array of element count " +
|
||||
llvm::Twine(Src.getElementCount()));
|
||||
if (ElementCount > Dst.size())
|
||||
return make_error(
|
||||
"copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", to a host array of element count " + llvm::Twine(Dst.size()));
|
||||
return PDevice->synchronousCopyD2H(Src.getBaseMemory().getHandle(),
|
||||
Src.getElementOffset() * sizeof(T),
|
||||
Dst.data(), 0, ElementCount * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2H(GlobalDeviceMemorySlice<T> Src,
|
||||
llvm::MutableArrayRef<T> Dst) {
|
||||
if (Src.getElementCount() != Dst.size())
|
||||
return make_error(
|
||||
"array size mismatch for D2H, device source has element count " +
|
||||
llvm::Twine(Src.getElementCount()) +
|
||||
" but host destination has element count " + llvm::Twine(Dst.size()));
|
||||
return synchronousCopyD2H(Src, Dst, Src.getElementCount());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2H(GlobalDeviceMemorySlice<T> Src, T *Dst,
|
||||
size_t ElementCount) {
|
||||
return synchronousCopyD2H(Src, llvm::MutableArrayRef<T>(Dst, ElementCount),
|
||||
ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2H(const GlobalDeviceMemory<T> &Src,
|
||||
llvm::MutableArrayRef<T> Dst, size_t ElementCount) {
|
||||
return synchronousCopyD2H(Src.asSlice(), Dst, ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2H(const GlobalDeviceMemory<T> &Src,
|
||||
llvm::MutableArrayRef<T> Dst) {
|
||||
return synchronousCopyD2H(Src.asSlice(), Dst);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2H(const GlobalDeviceMemory<T> &Src, T *Dst,
|
||||
size_t ElementCount) {
|
||||
return synchronousCopyD2H(Src.asSlice(), Dst, ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyH2D(llvm::ArrayRef<T> Src,
|
||||
GlobalDeviceMemorySlice<T> Dst,
|
||||
size_t ElementCount) {
|
||||
if (ElementCount > Src.size())
|
||||
return make_error(
|
||||
"copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", from a host array of element count " + llvm::Twine(Src.size()));
|
||||
if (ElementCount > Dst.getElementCount())
|
||||
return make_error("copying too many elements, " +
|
||||
llvm::Twine(ElementCount) +
|
||||
", to a device array of element count " +
|
||||
llvm::Twine(Dst.getElementCount()));
|
||||
return PDevice->synchronousCopyH2D(
|
||||
Src.data(), 0, Dst.getBaseMemory().getHandle(),
|
||||
Dst.getElementOffset() * sizeof(T), ElementCount * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyH2D(llvm::ArrayRef<T> Src,
|
||||
GlobalDeviceMemorySlice<T> Dst) {
|
||||
if (Src.size() != Dst.getElementCount())
|
||||
return make_error(
|
||||
"array size mismatch for H2D, host source has element count " +
|
||||
llvm::Twine(Src.size()) +
|
||||
" but device destination has element count " +
|
||||
llvm::Twine(Dst.getElementCount()));
|
||||
return synchronousCopyH2D(Src, Dst, Dst.getElementCount());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyH2D(T *Src, GlobalDeviceMemorySlice<T> Dst,
|
||||
size_t ElementCount) {
|
||||
return synchronousCopyH2D(llvm::ArrayRef<T>(Src, ElementCount), Dst,
|
||||
ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemory<T> &Dst,
|
||||
size_t ElementCount) {
|
||||
return synchronousCopyH2D(Src, Dst.asSlice(), ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemory<T> &Dst) {
|
||||
return synchronousCopyH2D(Src, Dst.asSlice());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyH2D(T *Src, GlobalDeviceMemory<T> &Dst,
|
||||
size_t ElementCount) {
|
||||
return synchronousCopyH2D(Src, Dst.asSlice(), ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
|
||||
GlobalDeviceMemorySlice<T> Dst,
|
||||
size_t ElementCount) {
|
||||
if (ElementCount > Src.getElementCount())
|
||||
return make_error("copying too many elements, " +
|
||||
llvm::Twine(ElementCount) +
|
||||
", from a device array of element count " +
|
||||
llvm::Twine(Src.getElementCount()));
|
||||
if (ElementCount > Dst.getElementCount())
|
||||
return make_error("copying too many elements, " +
|
||||
llvm::Twine(ElementCount) +
|
||||
", to a device array of element count " +
|
||||
llvm::Twine(Dst.getElementCount()));
|
||||
return PDevice->synchronousCopyD2D(
|
||||
Src.getBaseMemory().getHandle(), Src.getElementOffset() * sizeof(T),
|
||||
Dst.getBaseMemory().getHandle(), Dst.getElementOffset() * sizeof(T),
|
||||
ElementCount * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
|
||||
GlobalDeviceMemorySlice<T> Dst) {
|
||||
if (Src.getElementCount() != Dst.getElementCount())
|
||||
return make_error(
|
||||
"array size mismatch for D2D, device source has element count " +
|
||||
llvm::Twine(Src.getElementCount()) +
|
||||
" but device destination has element count " +
|
||||
llvm::Twine(Dst.getElementCount()));
|
||||
return synchronousCopyD2D(Src, Dst, Src.getElementCount());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(const GlobalDeviceMemory<T> &Src,
|
||||
GlobalDeviceMemorySlice<T> Dst,
|
||||
size_t ElementCount) {
|
||||
return synchronousCopyD2D(Src.asSlice(), Dst, ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(const GlobalDeviceMemory<T> &Src,
|
||||
GlobalDeviceMemorySlice<T> Dst) {
|
||||
return synchronousCopyD2D(Src.asSlice(), Dst);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
|
||||
GlobalDeviceMemory<T> &Dst, size_t ElementCount) {
|
||||
return synchronousCopyD2D(Src, Dst.asSlice(), ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
|
||||
GlobalDeviceMemory<T> &Dst) {
|
||||
return synchronousCopyD2D(Src, Dst.asSlice());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(const GlobalDeviceMemory<T> &Src,
|
||||
GlobalDeviceMemory<T> &Dst, size_t ElementCount) {
|
||||
return synchronousCopyD2D(Src.asSlice(), Dst.asSlice(), ElementCount);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Error synchronousCopyD2D(const GlobalDeviceMemory<T> &Src,
|
||||
GlobalDeviceMemory<T> &Dst) {
|
||||
return synchronousCopyD2D(Src.asSlice(), Dst.asSlice());
|
||||
}
|
||||
|
||||
///@} End host-synchronous device memory copying functions
|
||||
|
||||
private:
|
||||
// Only a GlobalDeviceMemoryBase may free device memory.
|
||||
friend GlobalDeviceMemoryBase;
|
||||
Error freeDeviceMemory(const GlobalDeviceMemoryBase &Memory) {
|
||||
return PDevice->freeDeviceMemory(Memory.getHandle());
|
||||
}
|
||||
|
||||
// Only destroyRegisteredHostMemoryInternals may unregister host memory.
|
||||
friend void internal::destroyRegisteredHostMemoryInternals(Device *, void *);
|
||||
Error unregisterHostMemory(const void *Pointer) {
|
||||
return PDevice->unregisterHostMemory(Pointer);
|
||||
}
|
||||
|
||||
PlatformDevice *PDevice;
|
||||
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_DEVICE_H
|
|
@ -1,278 +0,0 @@
|
|||
//===-- DeviceMemory.h - Types representing device memory -------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file defines types that represent device memory buffers. Two memory
|
||||
/// spaces are represented here: global and shared. Host code can have a handle
|
||||
/// to device global memory, and that handle can be used to copy data to and
|
||||
/// from the device. Host code cannot have a handle to device shared memory
|
||||
/// because that memory only exists during the execution of a kernel.
|
||||
///
|
||||
/// GlobalDeviceMemory<T> is a handle to an array of elements of type T in
|
||||
/// global device memory. It is similar to a pair of a std::unique_ptr<T> and an
|
||||
/// element count to tell how many elements of type T fit in the memory pointed
|
||||
/// to by that T*.
|
||||
///
|
||||
/// SharedDeviceMemory<T> is just the size in elements of an array of elements
|
||||
/// of type T in device shared memory. No resources are actually attached to
|
||||
/// this class, it is just like a memo to the device to allocate space in shared
|
||||
/// memory.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_DEVICEMEMORY_H
|
||||
#define STREAMEXECUTOR_DEVICEMEMORY_H
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
|
||||
#include "streamexecutor/Error.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
class Device;
|
||||
|
||||
template <typename ElemT> class GlobalDeviceMemory;
|
||||
|
||||
/// Reference to a slice of device memory.
|
||||
///
|
||||
/// Contains a base memory handle, an element count offset into that base
|
||||
/// memory, and an element count for the size of the slice.
|
||||
template <typename ElemT> class GlobalDeviceMemorySlice {
|
||||
public:
|
||||
using ElementTy = ElemT;
|
||||
|
||||
/// Intentionally implicit so GlobalDeviceMemory<T> can be passed to functions
|
||||
/// expecting GlobalDeviceMemorySlice<T> arguments.
|
||||
GlobalDeviceMemorySlice(const GlobalDeviceMemory<ElemT> &Memory)
|
||||
: BaseMemory(Memory), ElementOffset(0),
|
||||
ElementCount(Memory.getElementCount()) {}
|
||||
|
||||
GlobalDeviceMemorySlice(const GlobalDeviceMemory<ElemT> &BaseMemory,
|
||||
size_t ElementOffset, size_t ElementCount)
|
||||
: BaseMemory(BaseMemory), ElementOffset(ElementOffset),
|
||||
ElementCount(ElementCount) {
|
||||
assert(ElementOffset + ElementCount <= BaseMemory.getElementCount() &&
|
||||
"slicing past the end of a GlobalDeviceMemory buffer");
|
||||
}
|
||||
|
||||
/// Gets the GlobalDeviceMemory backing this slice.
|
||||
const GlobalDeviceMemory<ElemT> &getBaseMemory() const { return BaseMemory; }
|
||||
|
||||
/// Gets the offset of this slice from the base memory.
|
||||
///
|
||||
/// The offset is measured in elements, not bytes.
|
||||
size_t getElementOffset() const { return ElementOffset; }
|
||||
|
||||
/// Gets the number of elements in this slice.
|
||||
size_t getElementCount() const { return ElementCount; }
|
||||
|
||||
/// Returns the number of bytes that can fit in this slice.
|
||||
size_t getByteCount() const { return ElementCount * sizeof(ElemT); }
|
||||
|
||||
/// Creates a slice of the memory with the first DropCount elements removed.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
GlobalDeviceMemorySlice<ElemT> slice(size_t DropCount) const {
|
||||
assert(DropCount <= ElementCount &&
|
||||
"dropping more than the size of a slice");
|
||||
return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset + DropCount,
|
||||
ElementCount - DropCount);
|
||||
}
|
||||
|
||||
/// Creates a slice of the memory with the last DropCount elements removed.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
GlobalDeviceMemorySlice<ElemT> drop_back(size_t DropCount) const {
|
||||
assert(DropCount <= ElementCount &&
|
||||
"dropping more than the size of a slice");
|
||||
return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset,
|
||||
ElementCount - DropCount);
|
||||
}
|
||||
|
||||
/// Creates a slice of the memory that chops off the first DropCount elements
|
||||
/// and keeps the next TakeCount elements.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
GlobalDeviceMemorySlice<ElemT> slice(size_t DropCount,
|
||||
size_t TakeCount) const {
|
||||
assert(DropCount + TakeCount <= ElementCount &&
|
||||
"sub-slice operation overruns slice bounds");
|
||||
return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset + DropCount,
|
||||
TakeCount);
|
||||
}
|
||||
|
||||
private:
|
||||
const GlobalDeviceMemory<ElemT> &BaseMemory;
|
||||
size_t ElementOffset;
|
||||
size_t ElementCount;
|
||||
};
|
||||
|
||||
/// Wrapper around a generic global device memory allocation.
|
||||
///
|
||||
/// This class represents a buffer of untyped bytes in the global memory space
|
||||
/// of a device. See GlobalDeviceMemory<T> for the corresponding type that
|
||||
/// includes type information for the elements in its buffer.
|
||||
///
|
||||
/// This is effectively a pair consisting of an opaque handle and a buffer size
|
||||
/// in bytes. The opaque handle is a platform-dependent handle to the actual
|
||||
/// memory that is allocated on the device.
|
||||
///
|
||||
/// In some cases, such as in the CUDA platform, the opaque handle may actually
|
||||
/// be a pointer in the virtual address space and it may be valid to perform
|
||||
/// arithmetic on it to obtain other device pointers, but this is not the case
|
||||
/// in general.
|
||||
///
|
||||
/// For example, in the OpenCL platform, the handle is a pointer to a _cl_mem
|
||||
/// handle object which really is completely opaque to the user.
|
||||
class GlobalDeviceMemoryBase {
|
||||
public:
|
||||
/// Returns an opaque handle to the underlying memory.
|
||||
const void *getHandle() const { return Handle; }
|
||||
|
||||
/// Returns the address of the opaque handle as stored by this object.
|
||||
const void *const *getHandleAddress() const { return &Handle; }
|
||||
|
||||
// Cannot copy because the handle must be owned by a single object.
|
||||
GlobalDeviceMemoryBase(const GlobalDeviceMemoryBase &) = delete;
|
||||
GlobalDeviceMemoryBase &operator=(const GlobalDeviceMemoryBase &) = delete;
|
||||
|
||||
protected:
|
||||
/// Creates a GlobalDeviceMemoryBase from a handle and a byte count.
|
||||
GlobalDeviceMemoryBase(Device *D, const void *Handle, size_t ByteCount)
|
||||
: TheDevice(D), Handle(Handle), ByteCount(ByteCount) {}
|
||||
|
||||
/// Transfer ownership of the underlying handle.
|
||||
GlobalDeviceMemoryBase(GlobalDeviceMemoryBase &&Other) noexcept
|
||||
: TheDevice(Other.TheDevice), Handle(Other.Handle),
|
||||
ByteCount(Other.ByteCount) {
|
||||
Other.TheDevice = nullptr;
|
||||
Other.Handle = nullptr;
|
||||
Other.ByteCount = 0;
|
||||
}
|
||||
|
||||
GlobalDeviceMemoryBase &operator=(GlobalDeviceMemoryBase &&Other) noexcept {
|
||||
TheDevice = Other.TheDevice;
|
||||
Handle = Other.Handle;
|
||||
ByteCount = Other.ByteCount;
|
||||
Other.TheDevice = nullptr;
|
||||
Other.Handle = nullptr;
|
||||
Other.ByteCount = 0;
|
||||
return *this;
|
||||
}
|
||||
|
||||
~GlobalDeviceMemoryBase();
|
||||
|
||||
Device *TheDevice; // Pointer to the device on which this memory lives.
|
||||
const void *Handle; // Platform-dependent value representing allocated memory.
|
||||
size_t ByteCount; // Size in bytes of this allocation.
|
||||
};
|
||||
|
||||
/// Typed wrapper around the "void *"-like GlobalDeviceMemoryBase class.
|
||||
///
|
||||
/// For example, GlobalDeviceMemory<int> is a simple wrapper around
|
||||
/// GlobalDeviceMemoryBase that represents a buffer of integers stored in global
|
||||
/// device memory.
|
||||
template <typename ElemT>
|
||||
class GlobalDeviceMemory : public GlobalDeviceMemoryBase {
|
||||
public:
|
||||
using ElementTy = ElemT;
|
||||
|
||||
GlobalDeviceMemory(GlobalDeviceMemory &&) noexcept;
|
||||
GlobalDeviceMemory &operator=(GlobalDeviceMemory &&) noexcept;
|
||||
|
||||
/// Returns the number of elements of type ElemT that constitute this
|
||||
/// allocation.
|
||||
size_t getElementCount() const { return ByteCount / sizeof(ElemT); }
|
||||
|
||||
/// Returns the number of bytes that can fit in this memory buffer.
|
||||
size_t getByteCount() const { return ByteCount; }
|
||||
|
||||
/// Converts this memory object into a slice.
|
||||
GlobalDeviceMemorySlice<ElemT> asSlice() const {
|
||||
return GlobalDeviceMemorySlice<ElemT>(*this);
|
||||
}
|
||||
|
||||
private:
|
||||
GlobalDeviceMemory(const GlobalDeviceMemory &) = delete;
|
||||
GlobalDeviceMemory &operator=(const GlobalDeviceMemory &) = delete;
|
||||
|
||||
// Only a Device can create a GlobalDeviceMemory instance.
|
||||
friend Device;
|
||||
GlobalDeviceMemory(Device *D, const void *Handle, size_t ElementCount)
|
||||
: GlobalDeviceMemoryBase(D, Handle, ElementCount * sizeof(ElemT)) {}
|
||||
};
|
||||
|
||||
template <typename ElemT>
|
||||
GlobalDeviceMemory<ElemT>::GlobalDeviceMemory(
|
||||
GlobalDeviceMemory<ElemT> &&) noexcept = default;
|
||||
|
||||
template <typename ElemT>
|
||||
GlobalDeviceMemory<ElemT> &GlobalDeviceMemory<ElemT>::
|
||||
operator=(GlobalDeviceMemory<ElemT> &&) noexcept = default;
|
||||
|
||||
/// A class to represent the size of a dynamic shared memory buffer of elements
|
||||
/// of type T on a device.
|
||||
///
|
||||
/// Shared memory buffers exist only on the device and cannot be manipulated
|
||||
/// from the host, so instances of this class do not have an opaque handle, only
|
||||
/// a size.
|
||||
///
|
||||
/// This type of memory is called "local" memory in OpenCL and "shared" memory
|
||||
/// in CUDA, and both platforms follow the rule that the host code only knows
|
||||
/// the size of these buffers and does not have a handle to them.
|
||||
///
|
||||
/// The treatment of shared memory in StreamExecutor matches the way it is done
|
||||
/// in OpenCL, where a kernel takes any number of shared memory sizes as kernel
|
||||
/// function arguments.
|
||||
///
|
||||
/// In CUDA only one shared memory size argument is allowed per kernel call.
|
||||
/// StreamExecutor handles this by allowing CUDA kernel signatures that take
|
||||
/// multiple SharedDeviceMemory arguments, and simply adding together all the
|
||||
/// shared memory sizes to get the final shared memory size that is used to
|
||||
/// launch the kernel.
|
||||
template <typename ElemT> class SharedDeviceMemory {
public:
  /// Builds a typed shared-memory descriptor that holds ElementCount elements.
  static SharedDeviceMemory<ElemT> makeFromElementCount(size_t ElementCount) {
    SharedDeviceMemory<ElemT> Result(ElementCount);
    return Result;
  }

  /// Copy construction is allowed: the object is nothing but an array size.
  SharedDeviceMemory(const SharedDeviceMemory &) = default;

  /// Copy assignment is allowed: the object is nothing but an array size.
  SharedDeviceMemory &operator=(const SharedDeviceMemory &) = default;

  /// Returns how many elements of type ElemT fit in this memory buffer.
  size_t getElementCount() const { return ElementCount; }

  /// Returns how many bytes fit in this memory buffer.
  size_t getByteCount() const { return sizeof(ElemT) * ElementCount; }

  /// Returns true when the buffer holds exactly one element.
  bool isScalar() const { return ElementCount == 1; }

private:
  /// Builds an instance from a number of elements.
  ///
  /// Kept private because a bare size argument could be mistaken for a size
  /// in bytes. Users go through makeFromElementCount instead, whose name
  /// spells out what the size parameter means.
  explicit SharedDeviceMemory(size_t ElementCount)
      : ElementCount(ElementCount) {}

  size_t ElementCount;
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_DEVICEMEMORY_H
|
|
@ -1,215 +0,0 @@
|
|||
//===-- Error.h - Error handling --------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Error types used in the public API and internally in StreamExecutor.
|
||||
///
|
||||
/// StreamExecutor's error handling is based on the types streamexecutor::Error
|
||||
/// and streamexecutor::Expected<T>.
|
||||
///
|
||||
///
|
||||
/// \section error The Error Class
|
||||
///
|
||||
/// The Error class either represents success or contains an error message
|
||||
/// describing the cause of the error. Error instances are created by calling
|
||||
/// Error::success for successes or make_error for errors.
|
||||
///
|
||||
/// \code{.cpp}
|
||||
/// Error achieveWorldPeace() {
|
||||
/// if (WorldPeaceAlreadyAchieved) {
|
||||
/// return Error::success();
|
||||
/// } else {
|
||||
/// return make_error("Can't someone else do it?");
|
||||
/// }
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// Error instances are implicitly convertible to bool. Error values convert to
|
||||
/// true and successes convert to false. Error instances must have their boolean
|
||||
/// values checked or they must be moved before they go out of scope, otherwise
|
||||
/// their destruction will cause the program to abort with a warning about an
|
||||
/// unchecked Error.
|
||||
///
|
||||
/// If the Error represents success, then checking the boolean value is all that
|
||||
/// is required, but if the Error represents a real error, the Error value must
|
||||
/// be consumed. The function consumeAndGetMessage is the way to extract the
|
||||
/// error message from an Error and consume the Error at the same time, so
|
||||
/// typical error handling will first check whether there was an error and then
|
||||
/// extract the error message if so. Here is an example:
|
||||
///
|
||||
/// \code{.cpp}
|
||||
/// if (Error E = achieveWorldPeace()) {
|
||||
/// printf("An error occurred: %s\n", consumeAndGetMessage(E).c_str());
|
||||
/// exit(EXIT_FAILURE);
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// It is also common to simply pass an error along up the call stack if it
|
||||
/// cannot be handled in the current function.
|
||||
///
|
||||
/// \code{.cpp}
|
||||
/// Error doTask() {
|
||||
/// if (Error E = achieveWorldPeace()) {
|
||||
/// return E;
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// There is also a function consumeError that consumes an error value without
|
||||
/// fetching the error message. This is useful when we want to ignore an error.
|
||||
///
|
||||
/// The dieIfError function is also provided for quick-and-dirty error handling.
|
||||
///
|
||||
///
|
||||
/// \section expected The Expected Class
|
||||
///
|
||||
/// The Expected<T> class either represents a value of type T or an Error.
|
||||
/// Expected<T> has one constructor that takes a T value and another constructor
|
||||
/// that takes an Error rvalue reference, so Expected instances can be
|
||||
/// constructed either from values or from errors:
|
||||
///
|
||||
/// \code{.cpp}
|
||||
/// Expected<int> getMyFavoriteInt() {
|
||||
/// int MyFavorite = 42;
|
||||
/// if (IsThereAFavorite) {
|
||||
/// return MyFavorite;
|
||||
/// } else {
|
||||
/// return make_error("I don't have a favorite");
|
||||
/// }
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// Expected<T> instances are implicitly convertible to bool and are true if
|
||||
/// they contain a value and false if they contain an error. Note that this is
|
||||
/// the opposite convention of the Error type conversion to bool, where true
|
||||
/// meant error and false meant success.
|
||||
///
|
||||
/// If the Expected<T> instance is not an error, the stored value can be
|
||||
/// obtained by using operator*. If access to members of the value is desired
|
||||
/// instead of the value itself, operator-> can be used as well.
|
||||
///
|
||||
/// Expected<T> instances must have their boolean value checked or they must be
|
||||
/// moved before they go out of scope, otherwise they will cause the program to
|
||||
/// abort with a warning about an unchecked error. If the Expected<T> instance
|
||||
/// contains a value, then checking the boolean value is all that is required,
|
||||
/// but if it contains an Error object, that Error object must be handled by
|
||||
/// calling Expected<T>::takeError() to get the underlying error.
|
||||
///
|
||||
/// Here is an example of the use of an Expected<T> value returned from a
|
||||
/// function:
|
||||
///
|
||||
/// \code{.cpp}
|
||||
/// Expected<int> ExpectedInt = getMyFavoriteInt();
|
||||
/// if (ExpectedInt) {
|
||||
/// printf("My favorite integer is %d\n", *ExpectedInt);
|
||||
/// } else {
|
||||
/// printf("An error occurred: %s\n",
|
||||
/// consumeAndGetMessage(ExpectedInt.takeError()).c_str());
|
||||
/// exit(EXIT_FAILURE);
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// The following snippet shows some examples of how Errors and Expected values
|
||||
/// can be passed up the stack if they should not be handled in the current
|
||||
/// function.
|
||||
///
|
||||
/// \code{.cpp}
|
||||
/// Expected<double> doTask3() {
|
||||
/// Error WorldPeaceError = achieveWorldPeace();
|
||||
/// if (WorldPeaceError) {
|
||||
/// return WorldPeaceError;
|
||||
/// }
|
||||
///
|
||||
/// Expected<martian> ExpectedMartian = getMyFavoriteMartian();
|
||||
/// if (!ExpectedMartian) {
|
||||
/// // Must extract the error because martian cannot be converted to double.
|
||||
/// return ExpectedMartian.takeError();
|
||||
/// }
|
||||
///
|
||||
/// // It's fine to return Expected<int> for Expected<double> because int can
|
||||
/// // be converted to double.
|
||||
/// return getMyFavoriteInt();
|
||||
/// }
|
||||
/// \endcode
|
||||
///
|
||||
/// The getOrDie function is also available for quick-and-dirty error handling.
|
||||
///
|
||||
///
|
||||
/// \section llvm Relation to llvm::Error and llvm::Expected
|
||||
///
|
||||
/// The streamexecutor::Error and streamexecutor::Expected classes are actually
|
||||
/// just their LLVM counterparts redeclared in the streamexecutor namespace, but
|
||||
/// they should be treated as separate types, even so.
|
||||
///
|
||||
/// StreamExecutor does not support any underlying llvm::ErrorInfo class except
|
||||
/// the one it defines internally for itself, so a streamexecutor::Error can be
|
||||
/// thought of as a restricted llvm::Error that is guaranteed to hold a specific
|
||||
/// error type.
|
||||
///
|
||||
/// Although code may compile if llvm functions are used to handle these
|
||||
/// StreamExecutor error types, it is likely that code will lead to runtime
|
||||
/// errors, so it is strongly recommended that only the functions from the
|
||||
/// streamexecutor namespace are used on these StreamExecutor error types.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_ERROR_H
|
||||
#define STREAMEXECUTOR_ERROR_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "llvm/Support/Error.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
using llvm::consumeError;
|
||||
using llvm::Error;
|
||||
using llvm::Expected;
|
||||
using llvm::Twine;
|
||||
|
||||
/// Makes an Error object from an error message.
|
||||
Error make_error(const Twine &Message);
|
||||
|
||||
/// Consumes the input error and returns its error message.
|
||||
///
|
||||
/// Assumes the input was created by the make_error function above.
|
||||
std::string consumeAndGetMessage(Error &&E);
|
||||
|
||||
/// Extracts the T value from an Expected<T> or prints an error message to
|
||||
/// stderr and exits the program with code EXIT_FAILURE if the Expected<T> is an
|
||||
/// error.
|
||||
///
|
||||
/// This function and the dieIfError function are provided for applications that
|
||||
/// are OK with aborting the program if an error occurs, and which don't have
|
||||
/// any special error logging needs. Applications with different error handling
|
||||
/// needs will likely want to declare their own functions with similar
|
||||
/// signatures but which log error messages in a different way or attempt to
|
||||
/// recover from errors instead of aborting the program.
|
||||
template <typename T> T getOrDie(Expected<T> &&E) {
  // Success path: hand the stored value to the caller.
  if (E)
    return std::move(*E);

  // Failure path: report the message on stderr and terminate the program.
  std::fprintf(stderr, "Error extracting an expected value: %s.\n",
               consumeAndGetMessage(E.takeError()).c_str());
  std::exit(EXIT_FAILURE);
}
|
||||
|
||||
/// Prints an error message to stderr and exits the program with code
|
||||
/// EXIT_FAILURE if the input is an error.
|
||||
///
|
||||
/// \sa getOrDie
|
||||
void dieIfError(Error &&E);
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_ERROR_H
|
|
@ -1,195 +0,0 @@
|
|||
//===-- HostMemory.h - Types for registered host memory ---------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
///
|
||||
/// This file defines types that represent registered host memory buffers. Host
|
||||
/// memory must be registered to participate in asynchronous copies to or from
|
||||
/// device memory.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_HOSTMEMORY_H
|
||||
#define STREAMEXECUTOR_HOSTMEMORY_H
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <type_traits>
|
||||
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
class Device;
|
||||
template <typename ElemT> class RegisteredHostMemory;
|
||||
|
||||
/// A mutable slice of registered host memory.
|
||||
///
|
||||
/// The memory is registered in the sense of
|
||||
/// streamexecutor::Device::registerHostMemory.
|
||||
///
|
||||
/// Holds a reference to an underlying registered host memory buffer. Must not
|
||||
/// be used after the underlying buffer is freed or unregistered.
|
||||
template <typename ElemT> class MutableRegisteredHostMemorySlice {
|
||||
public:
|
||||
using ElementTy = ElemT;
|
||||
|
||||
MutableRegisteredHostMemorySlice(RegisteredHostMemory<ElemT> &Registered)
|
||||
: MutableArrayRef(Registered.getPointer(), Registered.getElementCount()) {
|
||||
}
|
||||
|
||||
ElemT *getPointer() const { return MutableArrayRef.data(); }
|
||||
size_t getElementCount() const { return MutableArrayRef.size(); }
|
||||
|
||||
/// Chops off the first DropCount elements of the slice.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
MutableRegisteredHostMemorySlice slice(size_t DropCount) const {
|
||||
return MutableRegisteredHostMemorySlice(MutableArrayRef.slice(DropCount));
|
||||
}
|
||||
|
||||
/// Chops off the first DropCount elements of the slice and keeps the next
|
||||
/// TakeCount elements.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
MutableRegisteredHostMemorySlice slice(size_t DropCount,
|
||||
size_t TakeCount) const {
|
||||
return MutableRegisteredHostMemorySlice(
|
||||
MutableArrayRef.slice(DropCount, TakeCount));
|
||||
}
|
||||
|
||||
/// Chops off the last DropCount elements of the slice.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
MutableRegisteredHostMemorySlice drop_back(size_t DropCount) const {
|
||||
return MutableRegisteredHostMemorySlice(
|
||||
MutableArrayRef.drop_back(DropCount));
|
||||
}
|
||||
|
||||
private:
|
||||
MutableRegisteredHostMemorySlice(llvm::MutableArrayRef<ElemT> MutableArrayRef)
|
||||
: MutableArrayRef(MutableArrayRef) {}
|
||||
|
||||
llvm::MutableArrayRef<ElemT> MutableArrayRef;
|
||||
};
|
||||
|
||||
/// An immutable slice of registered host memory.
|
||||
///
|
||||
/// The memory is registered in the sense of
|
||||
/// streamexecutor::Device::registerHostMemory.
|
||||
///
|
||||
/// Holds a reference to an underlying registered host memory buffer. Must not
|
||||
/// be used after the underlying buffer is freed or unregistered.
|
||||
template <typename ElemT> class RegisteredHostMemorySlice {
|
||||
public:
|
||||
using ElementTy = ElemT;
|
||||
|
||||
RegisteredHostMemorySlice(const RegisteredHostMemory<ElemT> &Registered)
|
||||
: ArrayRef(Registered.getPointer(), Registered.getElementCount()) {}
|
||||
|
||||
RegisteredHostMemorySlice(
|
||||
MutableRegisteredHostMemorySlice<ElemT> MutableSlice)
|
||||
: ArrayRef(MutableSlice.getPointer(), MutableSlice.getElementCount()) {}
|
||||
|
||||
const ElemT *getPointer() const { return ArrayRef.data(); }
|
||||
size_t getElementCount() const { return ArrayRef.size(); }
|
||||
|
||||
/// Chops off the first N elements of the slice.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
RegisteredHostMemorySlice slice(size_t N) const {
|
||||
return RegisteredHostMemorySlice(ArrayRef.slice(N));
|
||||
}
|
||||
|
||||
/// Chops off the first N elements of the slice and keeps the next M elements.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
RegisteredHostMemorySlice slice(size_t N, size_t M) const {
|
||||
return RegisteredHostMemorySlice(ArrayRef.slice(N, M));
|
||||
}
|
||||
|
||||
/// Chops off the last N elements of the slice.
|
||||
LLVM_ATTRIBUTE_UNUSED_RESULT
|
||||
RegisteredHostMemorySlice drop_back(size_t N) const {
|
||||
return RegisteredHostMemorySlice(ArrayRef.drop_back(N));
|
||||
}
|
||||
|
||||
private:
|
||||
llvm::ArrayRef<ElemT> ArrayRef;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// Helper function to unregister host memory.
|
||||
///
|
||||
/// This is a thin wrapper around streamexecutor::Device::unregisterHostMemory.
|
||||
/// It is defined so this operation can be performed from the destructor of the
|
||||
/// template class RegisteredHostMemory without including Device.h in this
|
||||
/// header and creating a header inclusion cycle.
|
||||
void destroyRegisteredHostMemoryInternals(Device *TheDevice, void *Pointer);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// Registered host memory that knows how to unregister itself upon destruction.
|
||||
///
|
||||
/// The memory is registered in the sense of
|
||||
/// streamexecutor::Device::registerHostMemory.
|
||||
///
|
||||
/// ElemT is the type of element stored in the host buffer.
|
||||
template <typename ElemT> class RegisteredHostMemory {
|
||||
public:
|
||||
using ElementTy = ElemT;
|
||||
|
||||
RegisteredHostMemory(Device *TheDevice, ElemT *Pointer, size_t ElementCount)
|
||||
: TheDevice(TheDevice), Pointer(Pointer), ElementCount(ElementCount) {
|
||||
assert(TheDevice != nullptr && "cannot construct a "
|
||||
"RegisteredHostMemoryBase with a null "
|
||||
"platform device");
|
||||
}
|
||||
|
||||
RegisteredHostMemory(const RegisteredHostMemory &) = delete;
|
||||
RegisteredHostMemory &operator=(const RegisteredHostMemory &) = delete;
|
||||
|
||||
RegisteredHostMemory(RegisteredHostMemory &&Other) noexcept
|
||||
: TheDevice(Other.TheDevice), Pointer(Other.Pointer),
|
||||
ElementCount(Other.ElementCount) {
|
||||
Other.TheDevice = nullptr;
|
||||
Other.Pointer = nullptr;
|
||||
}
|
||||
|
||||
RegisteredHostMemory &operator=(RegisteredHostMemory &&Other) noexcept {
|
||||
TheDevice = Other.TheDevice;
|
||||
Pointer = Other.Pointer;
|
||||
ElementCount = Other.ElementCount;
|
||||
Other.TheDevice = nullptr;
|
||||
Other.Pointer = nullptr;
|
||||
}
|
||||
|
||||
~RegisteredHostMemory() {
|
||||
internal::destroyRegisteredHostMemoryInternals(TheDevice, Pointer);
|
||||
}
|
||||
|
||||
ElemT *getPointer() { return static_cast<ElemT *>(Pointer); }
|
||||
const ElemT *getPointer() const { return static_cast<ElemT *>(Pointer); }
|
||||
size_t getElementCount() const { return ElementCount; }
|
||||
|
||||
/// Creates an immutable slice for the entire contents of this memory.
|
||||
RegisteredHostMemorySlice<ElemT> asSlice() const {
|
||||
return RegisteredHostMemorySlice<ElemT>(*this);
|
||||
}
|
||||
|
||||
/// Creates a mutable slice for the entire contents of this memory.
|
||||
MutableRegisteredHostMemorySlice<ElemT> asSlice() {
|
||||
return MutableRegisteredHostMemorySlice<ElemT>(*this);
|
||||
}
|
||||
|
||||
private:
|
||||
Device *TheDevice;
|
||||
void *Pointer;
|
||||
size_t ElementCount;
|
||||
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_HOSTMEMORY_H
|
|
@ -1,84 +0,0 @@
|
|||
//===-- Kernel.h - StreamExecutor kernel types ------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Types to represent device kernels (code compiled to run on GPU or other
|
||||
/// accelerator).
|
||||
///
|
||||
/// See the \ref index "main page" for an example of how a compiler-generated
|
||||
/// specialization of the Kernel class template can be used along with the
|
||||
/// streamexecutor::Stream::thenLaunch method to create a typesafe interface for
|
||||
/// kernel launches.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_KERNEL_H
|
||||
#define STREAMEXECUTOR_KERNEL_H
|
||||
|
||||
#include "streamexecutor/Error.h"
|
||||
#include "streamexecutor/KernelSpec.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
class PlatformDevice;
|
||||
|
||||
/// The base class for all kernel types.
|
||||
///
|
||||
/// Stores the name of the kernel in both mangled and demangled forms.
|
||||
class KernelBase {
|
||||
public:
|
||||
KernelBase(PlatformDevice *D, const void *PlatformKernelHandle,
|
||||
llvm::StringRef Name);
|
||||
|
||||
KernelBase(const KernelBase &Other) = delete;
|
||||
KernelBase &operator=(const KernelBase &Other) = delete;
|
||||
|
||||
KernelBase(KernelBase &&Other) noexcept;
|
||||
KernelBase &operator=(KernelBase &&Other) noexcept;
|
||||
|
||||
~KernelBase();
|
||||
|
||||
const void *getPlatformHandle() const { return PlatformKernelHandle; }
|
||||
const std::string &getName() const { return Name; }
|
||||
const std::string &getDemangledName() const { return DemangledName; }
|
||||
|
||||
private:
|
||||
PlatformDevice *PDevice;
|
||||
const void *PlatformKernelHandle;
|
||||
|
||||
std::string Name;
|
||||
std::string DemangledName;
|
||||
};
|
||||
|
||||
/// A StreamExecutor kernel.
|
||||
///
|
||||
/// The template parameters are the types of the parameters to the kernel
|
||||
/// function.
|
||||
template <typename... ParameterTs> class Kernel : public KernelBase {
|
||||
public:
|
||||
Kernel(PlatformDevice *D, const void *PlatformKernelHandle,
|
||||
llvm::StringRef Name)
|
||||
: KernelBase(D, PlatformKernelHandle, Name) {}
|
||||
|
||||
Kernel(Kernel &&Other) noexcept;
|
||||
Kernel &operator=(Kernel &&Other) noexcept;
|
||||
};
|
||||
|
||||
template <typename... ParameterTs>
|
||||
Kernel<ParameterTs...>::Kernel(Kernel<ParameterTs...> &&) noexcept = default;
|
||||
|
||||
template <typename... ParameterTs>
|
||||
Kernel<ParameterTs...> &Kernel<ParameterTs...>::
|
||||
operator=(Kernel<ParameterTs...> &&) noexcept = default;
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_KERNEL_H
|
|
@ -1,287 +0,0 @@
|
|||
//===-- KernelSpec.h - Kernel loader spec types -----------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// KernelLoaderSpec is the base class for types that know where to find the
|
||||
/// code for a data-parallel kernel in a particular format on a particular
|
||||
/// platform. So, for example, there will be one subclass that deals with CUDA
|
||||
/// PTX code, another subclass that deals with CUDA fatbin code, and yet another
|
||||
/// subclass that deals with OpenCL text code.
|
||||
///
|
||||
/// A MultiKernelLoaderSpec is basically a collection of KernelLoaderSpec
|
||||
/// instances. This is useful when code is available for the same kernel in
|
||||
/// several different formats or targeted for several different platforms. All
|
||||
/// the various KernelLoaderSpec instances for this kernel can be combined
|
||||
/// together in one MultiKernelLoaderSpec and the specific platform consumer can
|
||||
/// decide which instance of the code it wants to use.
|
||||
///
|
||||
/// MultiKernelLoaderSpec provides several helper functions to build and
|
||||
/// register KernelLoaderSpec instances all in a single operation. For example,
|
||||
/// MultiKernelLoaderSpec::addCUDAPTXInMemory can be used to construct and
|
||||
/// register a CUDAPTXInMemorySpec KernelLoaderSpec.
|
||||
///
|
||||
/// The loader spec classes declared here are designed primarily to be
|
||||
/// instantiated by the compiler, but they can also be instantiated directly by
|
||||
/// the user. A simplified example workflow which a compiler might follow in the
|
||||
/// case of a CUDA kernel that is compiled to CUDA fatbin code is as follows:
|
||||
///
|
||||
/// 1. The user defines a kernel function called \c UserKernel.
|
||||
/// 2. The compiler compiles the kernel code into CUDA fatbin data and embeds
|
||||
/// that data into the host code at address \c __UserKernelFatbinAddress.
|
||||
/// 3. The compiler adds code at the beginning of the host code to instantiate a
|
||||
/// MultiKernelLoaderSpec:
|
||||
/// \code
|
||||
/// namespace compiler_cuda_namespace {
|
||||
/// MultiKernelLoaderSpec UserKernelLoaderSpec;
|
||||
/// } // namespace compiler_cuda_namespace
|
||||
/// \endcode
|
||||
/// 4. The compiler then adds code to the host code to add the fatbin data to
|
||||
/// the new MultiKernelLoaderSpec, and to associate that data with the kernel
|
||||
/// name \c "UserKernel":
|
||||
/// \code
|
||||
/// namespace compiler_cuda_namespace {
|
||||
/// UserKernelLoaderSpec.addCUDAFatbinInMemory(
|
||||
/// __UserKernelFatbinAddress, "UserKernel");
|
||||
/// } // namespace compiler_cuda_namespace
|
||||
/// \endcode
|
||||
/// 5. The host code, having known beforehand that the compiler would initialize
|
||||
/// a MultiKernelLoaderSpec based on the name of the CUDA kernel, makes use
|
||||
/// of the symbol \c compiler_cuda_namespace::UserKernelLoaderSpec without defining it.
|
||||
///
|
||||
/// In the example above, the MultiKernelLoaderSpec instance created by the
|
||||
/// compiler can be used by the host code to create StreamExecutor kernel
|
||||
/// objects. In turn, those StreamExecutor kernel objects can be used by the
|
||||
/// host code to launch the kernel on the device as desired.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_KERNELSPEC_H
|
||||
#define STREAMEXECUTOR_KERNELSPEC_H
|
||||
|
||||
#include <cassert>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// An object that knows how to find the code for a device kernel.
|
||||
///
|
||||
/// This is the base class for the hierarchy of loader specs. The different
|
||||
/// subclasses know how to find code in different formats (e.g. CUDA PTX, OpenCL
|
||||
/// binary).
|
||||
///
|
||||
/// This base class has functionality for storing and getting the name of the
|
||||
/// kernel as a string.
|
||||
class KernelLoaderSpec {
|
||||
public:
|
||||
/// Returns the name of the kernel this spec loads.
|
||||
const std::string &getKernelName() const { return KernelName; }
|
||||
|
||||
protected:
|
||||
explicit KernelLoaderSpec(llvm::StringRef KernelName);
|
||||
|
||||
private:
|
||||
std::string KernelName;
|
||||
|
||||
KernelLoaderSpec(const KernelLoaderSpec &) = delete;
|
||||
KernelLoaderSpec &operator=(const KernelLoaderSpec &) = delete;
|
||||
};
|
||||
|
||||
/// A KernelLoaderSpec for CUDA PTX code that resides in memory as a
|
||||
/// null-terminated string.
|
||||
class CUDAPTXInMemorySpec : public KernelLoaderSpec {
|
||||
public:
|
||||
/// First component is major version, second component is minor version.
|
||||
using ComputeCapability = std::pair<int, int>;
|
||||
|
||||
/// PTX code combined with its compute capability.
|
||||
struct PTXSpec {
|
||||
ComputeCapability TheComputeCapability;
|
||||
const char *PTXCode;
|
||||
};
|
||||
|
||||
/// Creates a CUDAPTXInMemorySpec from an array of PTXSpec objects.
|
||||
///
|
||||
/// Adds each item in SpecList to this object.
|
||||
///
|
||||
/// Does not take ownership of the PTXCode pointers in the SpecList elements.
|
||||
CUDAPTXInMemorySpec(
|
||||
llvm::StringRef KernelName,
|
||||
const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList);
|
||||
|
||||
/// Returns a pointer to the PTX code for the greatest compute capability not
|
||||
/// exceeding the requested compute capability.
|
||||
///
|
||||
/// Returns nullptr on failed lookup (if the requested version is not
|
||||
/// available and no lower versions are available).
|
||||
const char *getCode(int ComputeCapabilityMajor,
|
||||
int ComputeCapabilityMinor) const;
|
||||
|
||||
private:
|
||||
/// PTX code contents in memory.
|
||||
///
|
||||
/// The key is a pair (cc_major, cc_minor), i.e., (2, 0), (3, 0), (3, 5).
|
||||
std::map<ComputeCapability, const char *> PTXByComputeCapability;
|
||||
|
||||
CUDAPTXInMemorySpec(const CUDAPTXInMemorySpec &) = delete;
|
||||
CUDAPTXInMemorySpec &operator=(const CUDAPTXInMemorySpec &) = delete;
|
||||
};
|
||||
|
||||
/// A KernelLoaderSpec for CUDA fatbin code that resides in memory.
|
||||
class CUDAFatbinInMemorySpec : public KernelLoaderSpec {
|
||||
public:
|
||||
/// Creates a CUDAFatbinInMemorySpec with a reference to the given fatbin
|
||||
/// bytes.
|
||||
///
|
||||
/// Does not take ownership of the Bytes pointer.
|
||||
CUDAFatbinInMemorySpec(llvm::StringRef KernelName, const void *Bytes);
|
||||
|
||||
/// Gets the fatbin data bytes.
|
||||
const void *getBytes() const { return Bytes; }
|
||||
|
||||
private:
|
||||
const void *Bytes;
|
||||
|
||||
CUDAFatbinInMemorySpec(const CUDAFatbinInMemorySpec &) = delete;
|
||||
CUDAFatbinInMemorySpec &operator=(const CUDAFatbinInMemorySpec &) = delete;
|
||||
};
|
||||
|
||||
/// A KernelLoaderSpec for OpenCL text that resides in memory as a
|
||||
/// null-terminated string.
|
||||
class OpenCLTextInMemorySpec : public KernelLoaderSpec {
|
||||
public:
|
||||
/// Creates a OpenCLTextInMemorySpec with a reference to the given OpenCL text
|
||||
/// code bytes.
|
||||
///
|
||||
/// Does not take ownership of the Text pointer.
|
||||
OpenCLTextInMemorySpec(llvm::StringRef KernelName, const char *Text);
|
||||
|
||||
/// Returns the OpenCL text contents.
|
||||
const char *getText() const { return Text; }
|
||||
|
||||
private:
|
||||
const char *Text;
|
||||
|
||||
OpenCLTextInMemorySpec(const OpenCLTextInMemorySpec &) = delete;
|
||||
OpenCLTextInMemorySpec &operator=(const OpenCLTextInMemorySpec &) = delete;
|
||||
};
|
||||
|
||||
/// An object to store several different KernelLoaderSpecs for the same kernel.
|
||||
///
|
||||
/// This allows code in different formats and for different platforms to be
|
||||
/// stored all together for a single kernel.
|
||||
///
|
||||
/// Various methods are available to add a new KernelLoaderSpec to a
|
||||
/// MultiKernelLoaderSpec. There are also methods to query which formats and
|
||||
/// platforms are supported by the currently added KernelLoaderSpec objects, and
|
||||
/// methods to get the KernelLoaderSpec objects for each format and platform.
|
||||
///
|
||||
/// Since all stored KernelLoaderSpecs are supposed to reference the same
|
||||
/// kernel, they are all assumed to take the same number and type of parameters,
|
||||
/// but no checking is done to enforce this. In debug mode, all
|
||||
/// KernelLoaderSpecs are checked to make sure they have the same kernel name,
|
||||
/// so passing in specs with different kernel names can cause the program to
|
||||
/// abort.
|
||||
///
|
||||
/// This interface is prone to errors, so it is better to leave
|
||||
/// MultiKernelLoaderSpec creation and initialization to the compiler rather
|
||||
/// than doing it by hand.
|
||||
class MultiKernelLoaderSpec {
|
||||
public:
|
||||
/// Type of functions used as host platform kernels.
|
||||
using HostFunctionTy = std::function<void(const void **)>;
|
||||
|
||||
std::string getKernelName() const {
|
||||
if (TheKernelName)
|
||||
return *TheKernelName;
|
||||
return "";
|
||||
}
|
||||
|
||||
// Convenience getters for testing whether these platform variants have
|
||||
// kernel loader specifications available.
|
||||
|
||||
bool hasCUDAPTXInMemory() const { return TheCUDAPTXInMemorySpec != nullptr; }
|
||||
bool hasCUDAFatbinInMemory() const {
|
||||
return TheCUDAFatbinInMemorySpec != nullptr;
|
||||
}
|
||||
bool hasOpenCLTextInMemory() const {
|
||||
return TheOpenCLTextInMemorySpec != nullptr;
|
||||
}
|
||||
bool hasHostFunction() const { return HostFunction != nullptr; }
|
||||
|
||||
// Accessors for platform variant kernel load specifications.
|
||||
//
|
||||
// Precondition: corresponding has* method returns true.
|
||||
|
||||
const CUDAPTXInMemorySpec &getCUDAPTXInMemory() const {
|
||||
assert(hasCUDAPTXInMemory() && "getting spec that is not present");
|
||||
return *TheCUDAPTXInMemorySpec;
|
||||
}
|
||||
const CUDAFatbinInMemorySpec &getCUDAFatbinInMemory() const {
|
||||
assert(hasCUDAFatbinInMemory() && "getting spec that is not present");
|
||||
return *TheCUDAFatbinInMemorySpec;
|
||||
}
|
||||
const OpenCLTextInMemorySpec &getOpenCLTextInMemory() const {
|
||||
assert(hasOpenCLTextInMemory() && "getting spec that is not present");
|
||||
return *TheOpenCLTextInMemorySpec;
|
||||
}
|
||||
|
||||
const HostFunctionTy &getHostFunction() const {
|
||||
assert(hasHostFunction() && "getting spec that is not present");
|
||||
return *HostFunction;
|
||||
}
|
||||
|
||||
// Builder-pattern-like methods for use in initializing a
|
||||
// MultiKernelLoaderSpec.
|
||||
//
|
||||
// Each of these should be used at most once for a single
|
||||
// MultiKernelLoaderSpec object. See file comment for example usage.
|
||||
//
|
||||
// Note that the KernelName parameter must be consistent with the kernel in
|
||||
// the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
|
||||
// name may be mangled by the compiler if it is not declared extern "C".
|
||||
|
||||
/// Does not take ownership of the PTXCode pointers in the SpecList elements.
|
||||
MultiKernelLoaderSpec &
|
||||
addCUDAPTXInMemory(llvm::StringRef KernelName,
|
||||
llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList);
|
||||
|
||||
/// Does not take ownership of the FatbinBytes pointer.
|
||||
MultiKernelLoaderSpec &addCUDAFatbinInMemory(llvm::StringRef KernelName,
|
||||
const void *FatbinBytes);
|
||||
|
||||
/// Does not take ownership of the OpenCLText pointer.
|
||||
MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
|
||||
const char *OpenCLText);
|
||||
|
||||
MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName,
|
||||
HostFunctionTy Function) {
|
||||
HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function));
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
void setKernelName(llvm::StringRef KernelName);
|
||||
|
||||
std::unique_ptr<std::string> TheKernelName;
|
||||
std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
|
||||
std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
|
||||
std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
|
||||
std::unique_ptr<HostFunctionTy> HostFunction;
|
||||
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_KERNELSPEC_H
|
|
@ -1,47 +0,0 @@
|
|||
//===-- LaunchDimensions.h - Kernel block and grid sizes --------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Structures to hold sizes for blocks and grids which are used as parameters
|
||||
/// for kernel launches.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_LAUNCHDIMENSIONS_H
|
||||
#define STREAMEXECUTOR_LAUNCHDIMENSIONS_H
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Thread extents of one device block of execution.
///
/// A block is an X by Y by Z array of threads. Any extent that is not given
/// defaults to 1.
struct BlockDimensions {
  unsigned X;
  unsigned Y;
  unsigned Z;

  BlockDimensions(unsigned X = 1, unsigned Y = 1, unsigned Z = 1) {
    // The parameters shadow the members, hence the explicit this->.
    this->X = X;
    this->Y = Y;
    this->Z = Z;
  }
};
|
||||
|
||||
/// Block extents of one device grid of execution.
///
/// A grid is an X by Y by Z array of blocks. Any extent that is not given
/// defaults to 1.
struct GridDimensions {
  unsigned X;
  unsigned Y;
  unsigned Z;

  GridDimensions(unsigned X = 1, unsigned Y = 1, unsigned Z = 1) {
    // The parameters shadow the members, hence the explicit this->.
    this->X = X;
    this->Y = Y;
    this->Z = Z;
  }
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_LAUNCHDIMENSIONS_H
|
|
@ -1,234 +0,0 @@
|
|||
//===-- PackedKernelArgumentArray.h - Packed kernel arg types ---*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// The types in this file are designed to deal with the fact that device memory
|
||||
/// kernel arguments are treated differently from other arguments during kernel
|
||||
/// argument packing.
|
||||
///
|
||||
/// GlobalDeviceMemory<T> arguments are passed to a kernel by passing their
|
||||
/// opaque handle. SharedDeviceMemory<T> arguments have no associated address,
|
||||
/// only a size, so the size is the only information that gets passed to the
|
||||
/// kernel launch.
|
||||
///
|
||||
/// The KernelArgumentType enum is used to keep track of the type of each
|
||||
/// argument.
|
||||
///
|
||||
/// The PackedKernelArgumentArray class uses template metaprogramming to convert
|
||||
/// each argument to a PackedKernelArgument with minimal runtime overhead.
|
||||
///
|
||||
/// The design of the PackedKernelArgumentArray class has a few idiosyncrasies
|
||||
/// due to the fact that parameter packing has been identified as
|
||||
/// performance-critical in some applications. The packed argument data is
|
||||
/// stored as a struct of arrays rather than an array of structs because CUDA
|
||||
/// kernel launches in the CUDA driver API take an array of argument addresses.
|
||||
/// Having created the array of argument addresses here, no further work will
|
||||
/// need to be done in the CUDA driver layer to unpack and repack the addresses.
|
||||
///
|
||||
/// The shared memory argument count is maintained separately because in the
|
||||
/// common case where it is zero, the CUDA layer doesn't have to loop through
|
||||
/// the argument array and sum up all the shared memory sizes. This is another
|
||||
/// performance optimization that shows up as a quirk in this class interface.
|
||||
///
|
||||
/// The platform-interface kernel launch function will take the following
|
||||
/// arguments, which are provided by this interface:
|
||||
/// * argument count,
|
||||
/// * array of argument addresses,
|
||||
/// * array of argument sizes,
|
||||
/// * array of argument types, and
|
||||
/// * shared memory argument count.
|
||||
/// This information should be enough to allow any platform to launch the kernel
|
||||
/// efficiently, although it is probably more information than is needed for any
|
||||
/// specific platform.
|
||||
///
|
||||
/// The PackedKernelArgumentArrayBase class has no template parameters, so it
|
||||
/// does not benefit from compile-time type checking. However, since it has no
|
||||
/// template parameters, it can be passed as an argument to virtual functions,
|
||||
/// and this allows it to be passed to functions that use virtual function
|
||||
/// overriding to handle platform-specific kernel launching.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H
|
||||
#define STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "streamexecutor/DeviceMemory.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Tags each packed kernel argument with the kind of argument it is.
enum class KernelArgumentType {
|
||||
VALUE, ///< Non-device-memory argument.
|
||||
GLOBAL_DEVICE_MEMORY, ///< Non-shared device memory argument.
|
||||
SHARED_DEVICE_MEMORY ///< Shared device memory argument.
|
||||
};
|
||||
|
||||
/// An array of packed kernel arguments without compile-time type information.
|
||||
///
|
||||
/// This un-templated base class is useful because packed kernel arguments must
|
||||
/// at some point be passed to a virtual function that performs
|
||||
/// platform-specific kernel launches. Such a virtual function cannot be
|
||||
/// templated to handle all specializations of the
|
||||
/// PackedKernelArgumentArray<...> class template, so, instead, references to
|
||||
/// PackedKernelArgumentArray<...> are passed as references to this base class.
|
||||
class PackedKernelArgumentArrayBase {
|
||||
public:
|
||||
virtual ~PackedKernelArgumentArrayBase();
|
||||
|
||||
/// Gets the number of packed arguments.
|
||||
size_t getArgumentCount() const { return ArgumentCount; }
|
||||
|
||||
/// Gets the address of the argument at the given index.
|
||||
const void *getAddress(size_t Index) const { return AddressesData[Index]; }
|
||||
|
||||
/// Gets the size of the argument at the given index.
|
||||
size_t getSize(size_t Index) const { return SizesData[Index]; }
|
||||
|
||||
/// Gets the type of the argument at the given index.
|
||||
KernelArgumentType getType(size_t Index) const { return TypesData[Index]; }
|
||||
|
||||
/// Gets a pointer to the address array.
|
||||
const void *const *getAddresses() const { return AddressesData; }
|
||||
|
||||
/// Gets a pointer to the sizes array.
|
||||
const size_t *getSizes() const { return SizesData; }
|
||||
|
||||
/// Gets a pointer to the types array.
|
||||
const KernelArgumentType *getTypes() const { return TypesData; }
|
||||
|
||||
/// Gets the number of shared device memory arguments.
|
||||
size_t getSharedCount() const { return SharedCount; }
|
||||
|
||||
protected:
|
||||
PackedKernelArgumentArrayBase(size_t ArgumentCount)
|
||||
: ArgumentCount(ArgumentCount), SharedCount(0u) {}
|
||||
|
||||
size_t ArgumentCount;
|
||||
size_t SharedCount;
|
||||
const void *const *AddressesData;
|
||||
size_t *SizesData;
|
||||
KernelArgumentType *TypesData;
|
||||
};
|
||||
|
||||
/// An array of packed kernel arguments with compile-time type information.
|
||||
///
|
||||
/// This is used by the platform-independent StreamExecutor code to pack
|
||||
/// arguments in a compile-time type-safe way. In order to actually launch a
|
||||
/// kernel on a specific platform, however, a reference to this class will have
|
||||
/// to be passed to a virtual, platform-specific kernel launch function. Such a
|
||||
/// reference will be passed as a reference to the base class rather than a
|
||||
/// reference to this subclass itself because a virtual function cannot be
|
||||
/// templated in such a way to maintain the template parameter types of the
|
||||
/// subclass.
|
||||
template <typename... ParameterTs>
|
||||
class PackedKernelArgumentArray : public PackedKernelArgumentArrayBase {
|
||||
public:
|
||||
/// Constructs an instance by packing the specified arguments.
|
||||
///
|
||||
/// Rather than using this constructor directly, consider using the
|
||||
/// make_kernel_argument_pack function instead, to get the compiler to infer
|
||||
/// the parameter types for you.
|
||||
PackedKernelArgumentArray(const ParameterTs &... Arguments)
|
||||
: PackedKernelArgumentArrayBase(sizeof...(ParameterTs)) {
|
||||
AddressesData = Addresses.data();
|
||||
SizesData = Sizes.data();
|
||||
TypesData = Types.data();
|
||||
PackArguments(0, Arguments...);
|
||||
}
|
||||
|
||||
~PackedKernelArgumentArray() override = default;
|
||||
|
||||
private:
|
||||
// Base case for PackArguments when there are no arguments to pack.
|
||||
void PackArguments(size_t) {}
|
||||
|
||||
// Induction step for PackArguments.
|
||||
template <typename T, typename... RemainingParameterTs>
|
||||
void PackArguments(size_t Index, const T &Argument,
|
||||
const RemainingParameterTs &... RemainingArguments) {
|
||||
PackOneArgument(Index, Argument);
|
||||
PackArguments(Index + 1, RemainingArguments...);
|
||||
}
|
||||
|
||||
// Pack a normal, non-device-memory argument.
|
||||
template <typename T> void PackOneArgument(size_t Index, const T &Argument) {
|
||||
Addresses[Index] = &Argument;
|
||||
Sizes[Index] = sizeof(T);
|
||||
Types[Index] = KernelArgumentType::VALUE;
|
||||
}
|
||||
|
||||
// Pack a GlobalDeviceMemory<T> argument.
|
||||
template <typename T>
|
||||
void PackOneArgument(size_t Index, const GlobalDeviceMemory<T> &Argument) {
|
||||
Addresses[Index] = Argument.getHandleAddress();
|
||||
Sizes[Index] = sizeof(void *);
|
||||
Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
|
||||
}
|
||||
|
||||
// Pack a GlobalDeviceMemory<T> pointer argument.
|
||||
template <typename T>
|
||||
void PackOneArgument(size_t Index, GlobalDeviceMemory<T> *Argument) {
|
||||
Addresses[Index] = Argument->getHandleAddress();
|
||||
Sizes[Index] = sizeof(void *);
|
||||
Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
|
||||
}
|
||||
|
||||
// Pack a const GlobalDeviceMemory<T> pointer argument.
|
||||
template <typename T>
|
||||
void PackOneArgument(size_t Index, const GlobalDeviceMemory<T> *Argument) {
|
||||
Addresses[Index] = Argument->getHandleAddress();
|
||||
Sizes[Index] = sizeof(void *);
|
||||
Types[Index] = KernelArgumentType::GLOBAL_DEVICE_MEMORY;
|
||||
}
|
||||
|
||||
// Pack a SharedDeviceMemory argument.
|
||||
template <typename T>
|
||||
void PackOneArgument(size_t Index, const SharedDeviceMemory<T> &Argument) {
|
||||
++SharedCount;
|
||||
Addresses[Index] = nullptr;
|
||||
Sizes[Index] = Argument.getElementCount() * sizeof(T);
|
||||
Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY;
|
||||
}
|
||||
|
||||
// Pack a SharedDeviceMemory pointer argument.
|
||||
template <typename T>
|
||||
void PackOneArgument(size_t Index, SharedDeviceMemory<T> *Argument) {
|
||||
++SharedCount;
|
||||
Addresses[Index] = nullptr;
|
||||
Sizes[Index] = Argument->getElementCount() * sizeof(T);
|
||||
Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY;
|
||||
}
|
||||
|
||||
// Pack a const SharedDeviceMemory pointer argument.
|
||||
template <typename T>
|
||||
void PackOneArgument(size_t Index, const SharedDeviceMemory<T> *Argument) {
|
||||
++SharedCount;
|
||||
Addresses[Index] = nullptr;
|
||||
Sizes[Index] = Argument->getElementCount() * sizeof(T);
|
||||
Types[Index] = KernelArgumentType::SHARED_DEVICE_MEMORY;
|
||||
}
|
||||
|
||||
std::array<const void *, sizeof...(ParameterTs)> Addresses;
|
||||
std::array<size_t, sizeof...(ParameterTs)> Sizes;
|
||||
std::array<KernelArgumentType, sizeof...(ParameterTs)> Types;
|
||||
};
|
||||
|
||||
// Utility template function to call the PackedKernelArgumentArray constructor
|
||||
// with the template arguments matching the types of the arguments passed to
|
||||
// this function.
|
||||
template <typename... ParameterTs>
|
||||
PackedKernelArgumentArray<ParameterTs...>
|
||||
make_kernel_argument_pack(const ParameterTs &... Arguments) {
|
||||
return PackedKernelArgumentArray<ParameterTs...>(Arguments...);
|
||||
}
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PACKEDKERNELARGUMENTARRAY_H
|
|
@ -1,40 +0,0 @@
|
|||
//===-- Platform.h - The Platform class -------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// The Platform class which represents a platform such as CUDA or OpenCL.
|
||||
///
|
||||
/// This is an abstract base class that will be overridden by each specific
|
||||
/// platform.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORM_H
|
||||
#define STREAMEXECUTOR_PLATFORM_H
|
||||
|
||||
#include "streamexecutor/Error.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
class Device;
|
||||
|
||||
class Platform {
|
||||
public:
|
||||
virtual ~Platform();
|
||||
|
||||
/// Gets the number of devices available for this platform.
|
||||
virtual size_t getDeviceCount() const = 0;
|
||||
|
||||
/// Gets a Device with the given index for this platform.
|
||||
virtual Expected<Device> getDevice(size_t DeviceIndex) = 0;
|
||||
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORM_H
|
|
@ -1,171 +0,0 @@
|
|||
//===-- PlatformDevice.h - PlatformDevice class -----------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Declaration of the PlatformDevice class.
|
||||
///
|
||||
/// Each specific platform such as CUDA or OpenCL must subclass PlatformDevice
|
||||
/// and override streamexecutor::Platform::getDevice to return an instance of
|
||||
/// their PlatformDevice subclass.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMDEVICE_H
|
||||
#define STREAMEXECUTOR_PLATFORMDEVICE_H
|
||||
|
||||
#include "streamexecutor/DeviceMemory.h"
|
||||
#include "streamexecutor/Error.h"
|
||||
#include "streamexecutor/Kernel.h"
|
||||
#include "streamexecutor/LaunchDimensions.h"
|
||||
#include "streamexecutor/PackedKernelArgumentArray.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Raw executor methods that must be implemented by each platform.
|
||||
///
|
||||
/// The public Device and Stream classes have the type-safe versions of the
|
||||
/// functions in this interface.
|
||||
class PlatformDevice {
|
||||
public:
|
||||
virtual ~PlatformDevice();
|
||||
|
||||
virtual std::string getName() const = 0;
|
||||
|
||||
virtual std::string getPlatformName() const = 0;
|
||||
|
||||
/// Creates a platform-specific kernel.
|
||||
virtual Expected<const void *>
|
||||
createKernel(const MultiKernelLoaderSpec &Spec) {
|
||||
return make_error("createKernel not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
virtual Error destroyKernel(const void *Handle) {
|
||||
return make_error("destroyKernel not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Creates a platform-specific stream.
|
||||
virtual Expected<const void *> createStream() {
|
||||
return make_error("createStream not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
virtual Error destroyStream(const void *Handle) {
|
||||
return make_error("destroyStream not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Launches a kernel on the given stream.
|
||||
virtual Error launch(const void *PlatformStreamHandle,
|
||||
BlockDimensions BlockSize, GridDimensions GridSize,
|
||||
const void *PKernelHandle,
|
||||
const PackedKernelArgumentArrayBase &ArgumentArray) {
|
||||
return make_error("launch not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies data from the device to the host.
|
||||
///
|
||||
/// HostDst should have been registered with registerHostMemory.
|
||||
virtual Error copyD2H(const void *PlatformStreamHandle,
|
||||
const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
void *HostDst, size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("copyD2H not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies data from the host to the device.
|
||||
///
|
||||
/// HostSrc should have been registered with registerHostMemory.
|
||||
virtual Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("copyH2D not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies data from one device location to another.
|
||||
virtual Error copyD2D(const void *PlatformStreamHandle,
|
||||
const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
size_t ByteCount) {
|
||||
return make_error("copyD2D not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Blocks the host until the given stream completes all the work enqueued up
|
||||
/// to the point this function is called.
|
||||
virtual Error blockHostUntilDone(const void *PlatformStreamHandle) {
|
||||
return make_error("blockHostUntilDone not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Allocates untyped device memory of a given size in bytes.
|
||||
virtual Expected<void *> allocateDeviceMemory(size_t ByteCount) {
|
||||
return make_error("allocateDeviceMemory not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Frees device memory previously allocated by allocateDeviceMemory.
|
||||
virtual Error freeDeviceMemory(const void *Handle) {
|
||||
return make_error("freeDeviceMemory not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Registers previously allocated host memory so it can be used with copyH2D
|
||||
/// and copyD2H.
|
||||
virtual Error registerHostMemory(void *Memory, size_t ByteCount) {
|
||||
return make_error("registerHostMemory not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Unregisters host memory previously registered with registerHostMemory.
|
||||
virtual Error unregisterHostMemory(const void *Memory) {
|
||||
return make_error("unregisterHostMemory not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Copies the given number of bytes from device memory to host memory.
|
||||
///
|
||||
/// Blocks the calling host thread until the copy is completed. Can operate on
|
||||
/// any host memory, not just registered host memory. Does not block any
|
||||
/// ongoing device calls.
|
||||
virtual Error synchronousCopyD2H(const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset, void *HostDst,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("synchronousCopyD2H not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Similar to synchronousCopyD2H(const void *, size_t, void
|
||||
/// *, size_t, size_t), but copies memory from host to device rather than
|
||||
/// device to host.
|
||||
virtual Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("synchronousCopyH2D not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
|
||||
/// Similar to synchronousCopyD2H(const void *, size_t, void
|
||||
/// *, size_t, size_t), but copies memory from one location in device memory
|
||||
/// to another rather than from device to host.
|
||||
virtual Error synchronousCopyD2D(const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) {
|
||||
return make_error("synchronousCopyD2D not implemented for platform " +
|
||||
getPlatformName());
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMDEVICE_H
|
|
@ -1,53 +0,0 @@
|
|||
//===-- PlatformManager.h - The PlatformManager class -----------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// PlatformManager is the entry point into the StreamExecutor API. A user
|
||||
/// begins by calling PlatformManager::getPlatformByName("cuda") where "cuda"
|
||||
/// can be replaced by any supported platform name. This gives the user a
|
||||
/// Platform object that can be used to create Device objects for that platform,
|
||||
/// etc.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMMANAGER_H
|
||||
#define STREAMEXECUTOR_PLATFORMMANAGER_H
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "streamexecutor/Error.h"
|
||||
#include "streamexecutor/Platform.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// A singleton that holds a reference to a Platform object for each
|
||||
/// supported StreamExecutor platform.
|
||||
class PlatformManager {
|
||||
public:
|
||||
/// Gets a reference to the Platform with the given name.
|
||||
///
|
||||
/// The name parameter is not case-sensitive, so the following arguments are
|
||||
/// all equivalent: "cuda", "CUDA", "Cuda", "cUdA".
|
||||
///
|
||||
/// Returns an error if no platform is present for the name.
|
||||
///
|
||||
/// Ownership of the platform is NOT transferred to the caller.
|
||||
static Expected<Platform *> getPlatformByName(llvm::StringRef Name);
|
||||
|
||||
private:
|
||||
PlatformManager();
|
||||
PlatformManager(const PlatformManager &) = delete;
|
||||
PlatformManager operator=(const PlatformManager &) = delete;
|
||||
|
||||
std::map<std::string, std::unique_ptr<Platform>> PlatformsByName;
|
||||
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMMANAGER_H
|
|
@ -1,23 +0,0 @@
|
|||
//===-- PlatformOptions.h - Platform option macros --------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// The contents of this file are filled in at configuration time. This file
|
||||
/// defines macros that represent the platform configuration state of the build,
|
||||
/// e.g. which platforms are enabled.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMOPTIONS_H
|
||||
#define STREAMEXECUTOR_PLATFORMOPTIONS_H
|
||||
|
||||
#cmakedefine STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMOPTIONS_H
|
|
@ -1,313 +0,0 @@
|
|||
//===-- Stream.h - A stream of execution ------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
///
|
||||
/// A Stream instance represents a queue of sequential, host-asynchronous work
|
||||
/// to be performed on a device.
|
||||
///
|
||||
/// To enqueue work on a device, first create a Device instance then use that
|
||||
/// Device to create a Stream instance. The Stream instance will perform its
|
||||
/// work on the device managed by the Device object that created it.
|
||||
///
|
||||
/// The various "then" methods of the Stream object, such as thenCopyH2D and
|
||||
/// thenLaunch, may be used to enqueue work on the Stream, and the
|
||||
/// blockHostUntilDone() method may be used to block the host code until the
|
||||
/// Stream has completed all its work.
|
||||
///
|
||||
/// Multiple Stream instances can be created for the same Device. This allows
|
||||
/// several independent streams of computation to be performed simultaneously on
|
||||
/// a single device.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_STREAM_H
|
||||
#define STREAMEXECUTOR_STREAM_H
|
||||
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#include "streamexecutor/DeviceMemory.h"
|
||||
#include "streamexecutor/Error.h"
|
||||
#include "streamexecutor/HostMemory.h"
|
||||
#include "streamexecutor/Kernel.h"
|
||||
#include "streamexecutor/LaunchDimensions.h"
|
||||
#include "streamexecutor/PackedKernelArgumentArray.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/Support/RWMutex.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Represents a stream of dependent computations on a device.
|
||||
///
|
||||
/// The operations within a stream execute sequentially and asynchronously until
|
||||
/// blockHostUntilDone() is invoked, which synchronously joins host code with
|
||||
/// the execution of the stream.
|
||||
///
|
||||
/// If any given operation fails when entraining work for the stream, isOK()
|
||||
/// will indicate that an error has occurred and getStatus() will get the first
|
||||
/// error that occurred on the stream. There is no way to clear the error state
|
||||
/// of a stream once it is in an error state.
|
||||
class Stream {
|
||||
public:
|
||||
Stream(PlatformDevice *D, const void *PlatformStreamHandle);
|
||||
|
||||
Stream(const Stream &Other) = delete;
|
||||
Stream &operator=(const Stream &Other) = delete;
|
||||
|
||||
Stream(Stream &&Other) noexcept;
|
||||
Stream &operator=(Stream &&Other) noexcept;
|
||||
|
||||
~Stream();
|
||||
|
||||
/// Returns true if no error has occurred while entraining work on this
|
||||
/// stream.
|
||||
bool isOK() const {
|
||||
llvm::sys::ScopedReader ReaderLock(*ErrorMessageMutex);
|
||||
return !ErrorMessage;
|
||||
}
|
||||
|
||||
/// Returns the status created by the first error that occurred while
|
||||
/// entraining work on this stream.
|
||||
Error getStatus() const {
  // Hold the reader lock for the whole lookup so the stored message cannot
  // change between the check and the copy into the returned Error.
  llvm::sys::ScopedReader StatusGuard(*ErrorMessageMutex);
  if (!ErrorMessage)
    return Error::success();
  return make_error(*ErrorMessage);
}
|
||||
|
||||
// Blocks the calling host thread until all work enqueued on this Stream
|
||||
// completes.
|
||||
//
|
||||
// Returns the result of getStatus() after the Stream work completes.
|
||||
Error blockHostUntilDone() {
|
||||
setError(PDevice->blockHostUntilDone(PlatformStreamHandle));
|
||||
return getStatus();
|
||||
}
|
||||
|
||||
/// Entrains onto the stream of operations a kernel launch with the given
|
||||
/// arguments.
|
||||
///
|
||||
/// These arguments can be device memory types like GlobalDeviceMemory<T> and
|
||||
/// SharedDeviceMemory<T>, or they can be primitive types such as int. The
|
||||
/// allowable argument types are determined by the template parameters to the
|
||||
/// Kernel argument.
|
||||
template <typename... ParameterTs>
|
||||
Stream &thenLaunch(BlockDimensions BlockSize, GridDimensions GridSize,
|
||||
const Kernel<ParameterTs...> &K,
|
||||
const ParameterTs &... Arguments) {
|
||||
auto ArgumentArray =
|
||||
make_kernel_argument_pack<ParameterTs...>(Arguments...);
|
||||
setError(PDevice->launch(PlatformStreamHandle, BlockSize, GridSize,
|
||||
K.getPlatformHandle(), ArgumentArray));
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \name Device memory copying functions
|
||||
///
|
||||
/// These methods enqueue a device memory copy operation on the stream and
|
||||
/// return without waiting for the operation to complete.
|
||||
///
|
||||
/// The arguments and bounds checking for these methods match the API of the
|
||||
/// \ref DeviceHostSyncCopyGroup
|
||||
/// "host-synchronous device memory copying functions" of Device.
|
||||
///
|
||||
/// The template types SrcTy and DstTy must match the following constraints:
|
||||
/// * Must define typename ElementTy (the type of element stored in the
|
||||
/// memory);
|
||||
/// * ElementTy for the source argument must be the same as ElementTy for
|
||||
/// the destination argument;
|
||||
/// * Must be convertible to the correct slice type:
|
||||
/// * GlobalDeviceMemorySlice<ElementTy> for device memory arguments,
|
||||
/// * RegisteredHostMemorySlice<ElementTy> for host memory source
|
||||
/// arguments,
|
||||
/// * MutableRegisteredHostMemorySlice<ElementTy> for host memory
|
||||
/// destination arguments.
|
||||
///@{
|
||||
|
||||
// D2H
|
||||
|
||||
template <typename SrcTy, typename DstTy>
|
||||
Stream &thenCopyD2H(SrcTy &&Src, DstTy &&Dst, size_t ElementCount) {
|
||||
using SrcElemTy = typename std::remove_reference<SrcTy>::type::ElementTy;
|
||||
using DstElemTy = typename std::remove_reference<DstTy>::type::ElementTy;
|
||||
static_assert(std::is_same<SrcElemTy, DstElemTy>::value,
|
||||
"src/dst element type mismatch for thenCopyD2H");
|
||||
GlobalDeviceMemorySlice<SrcElemTy> SrcSlice(Src);
|
||||
MutableRegisteredHostMemorySlice<DstElemTy> DstSlice(Dst);
|
||||
if (ElementCount > Src.getElementCount())
|
||||
setError("copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", from a device array of element count " +
|
||||
llvm::Twine(SrcSlice.getElementCount()));
|
||||
else if (ElementCount > DstSlice.getElementCount())
|
||||
setError("copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", to a host array of element count " +
|
||||
llvm::Twine(DstSlice.getElementCount()));
|
||||
else
|
||||
setError(PDevice->copyD2H(
|
||||
PlatformStreamHandle, SrcSlice.getBaseMemory().getHandle(),
|
||||
SrcSlice.getElementOffset() * sizeof(SrcElemTy),
|
||||
DstSlice.getPointer(), 0, ElementCount * sizeof(DstElemTy)));
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename SrcTy, typename DstTy>
|
||||
Stream &thenCopyD2H(SrcTy &&Src, DstTy &&Dst) {
|
||||
using SrcElemTy = typename std::remove_reference<SrcTy>::type::ElementTy;
|
||||
using DstElemTy = typename std::remove_reference<DstTy>::type::ElementTy;
|
||||
static_assert(std::is_same<SrcElemTy, DstElemTy>::value,
|
||||
"src/dst element type mismatch for thenCopyD2H");
|
||||
GlobalDeviceMemorySlice<SrcElemTy> SrcSlice(Src);
|
||||
MutableRegisteredHostMemorySlice<DstElemTy> DstSlice(Dst);
|
||||
if (SrcSlice.getElementCount() != DstSlice.getElementCount())
|
||||
setError("array size mismatch for D2H, device source has element count " +
|
||||
llvm::Twine(SrcSlice.getElementCount()) +
|
||||
" but host destination has element count " +
|
||||
llvm::Twine(DstSlice.getElementCount()));
|
||||
else
|
||||
thenCopyD2H(SrcSlice, DstSlice, SrcSlice.getElementCount());
|
||||
return *this;
|
||||
}
|
||||
|
||||
// H2D
|
||||
|
||||
template <typename SrcTy, typename DstTy>
|
||||
Stream &thenCopyH2D(SrcTy &&Src, DstTy &&Dst, size_t ElementCount) {
|
||||
using SrcElemTy = typename std::remove_reference<SrcTy>::type::ElementTy;
|
||||
using DstElemTy = typename std::remove_reference<DstTy>::type::ElementTy;
|
||||
static_assert(std::is_same<SrcElemTy, DstElemTy>::value,
|
||||
"src/dst element type mismatch for thenCopyH2D");
|
||||
RegisteredHostMemorySlice<SrcElemTy> SrcSlice(Src);
|
||||
GlobalDeviceMemorySlice<DstElemTy> DstSlice(Dst);
|
||||
if (ElementCount > SrcSlice.getElementCount())
|
||||
setError("copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", from a host array of element count " +
|
||||
llvm::Twine(SrcSlice.getElementCount()));
|
||||
else if (ElementCount > DstSlice.getElementCount())
|
||||
setError("copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", to a device array of element count " +
|
||||
llvm::Twine(DstSlice.getElementCount()));
|
||||
else
|
||||
setError(PDevice->copyH2D(PlatformStreamHandle, SrcSlice.getPointer(), 0,
|
||||
DstSlice.getBaseMemory().getHandle(),
|
||||
DstSlice.getElementOffset() * sizeof(DstElemTy),
|
||||
ElementCount * sizeof(SrcElemTy)));
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename SrcTy, typename DstTy>
|
||||
Stream &thenCopyH2D(SrcTy &&Src, DstTy &&Dst) {
|
||||
using SrcElemTy = typename std::remove_reference<SrcTy>::type::ElementTy;
|
||||
using DstElemTy = typename std::remove_reference<DstTy>::type::ElementTy;
|
||||
static_assert(std::is_same<SrcElemTy, DstElemTy>::value,
|
||||
"src/dst element type mismatch for thenCopyH2D");
|
||||
RegisteredHostMemorySlice<SrcElemTy> SrcSlice(Src);
|
||||
GlobalDeviceMemorySlice<DstElemTy> DstSlice(Dst);
|
||||
if (SrcSlice.getElementCount() != DstSlice.getElementCount())
|
||||
setError("array size mismatch for H2D, host source has element count " +
|
||||
llvm::Twine(SrcSlice.getElementCount()) +
|
||||
" but device destination has element count " +
|
||||
llvm::Twine(DstSlice.getElementCount()));
|
||||
else
|
||||
thenCopyH2D(SrcSlice, DstSlice, DstSlice.getElementCount());
|
||||
return *this;
|
||||
}
|
||||
|
||||
// D2D
|
||||
|
||||
template <typename SrcTy, typename DstTy>
|
||||
Stream &thenCopyD2D(SrcTy &&Src, DstTy &&Dst, size_t ElementCount) {
|
||||
using SrcElemTy = typename std::remove_reference<SrcTy>::type::ElementTy;
|
||||
using DstElemTy = typename std::remove_reference<DstTy>::type::ElementTy;
|
||||
static_assert(std::is_same<SrcElemTy, DstElemTy>::value,
|
||||
"src/dst element type mismatch for thenCopyD2D");
|
||||
GlobalDeviceMemorySlice<SrcElemTy> SrcSlice(Src);
|
||||
GlobalDeviceMemorySlice<DstElemTy> DstSlice(Dst);
|
||||
if (ElementCount > SrcSlice.getElementCount())
|
||||
setError("copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", from a device array of element count " +
|
||||
llvm::Twine(SrcSlice.getElementCount()));
|
||||
else if (ElementCount > DstSlice.getElementCount())
|
||||
setError("copying too many elements, " + llvm::Twine(ElementCount) +
|
||||
", to a device array of element count " +
|
||||
llvm::Twine(DstSlice.getElementCount()));
|
||||
else
|
||||
setError(PDevice->copyD2D(PlatformStreamHandle,
|
||||
SrcSlice.getBaseMemory().getHandle(),
|
||||
SrcSlice.getElementOffset() * sizeof(SrcElemTy),
|
||||
DstSlice.getBaseMemory().getHandle(),
|
||||
DstSlice.getElementOffset() * sizeof(DstElemTy),
|
||||
ElementCount * sizeof(SrcElemTy)));
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename SrcTy, typename DstTy>
|
||||
Stream &thenCopyD2D(SrcTy &&Src, DstTy &&Dst) {
|
||||
using SrcElemTy = typename std::remove_reference<SrcTy>::type::ElementTy;
|
||||
using DstElemTy = typename std::remove_reference<DstTy>::type::ElementTy;
|
||||
static_assert(std::is_same<SrcElemTy, DstElemTy>::value,
|
||||
"src/dst element type mismatch for thenCopyD2D");
|
||||
GlobalDeviceMemorySlice<SrcElemTy> SrcSlice(Src);
|
||||
GlobalDeviceMemorySlice<DstElemTy> DstSlice(Dst);
|
||||
if (SrcSlice.getElementCount() != DstSlice.getElementCount())
|
||||
setError("array size mismatch for D2D, device source has element count " +
|
||||
llvm::Twine(SrcSlice.getElementCount()) +
|
||||
" but device destination has element count " +
|
||||
llvm::Twine(DstSlice.getElementCount()));
|
||||
else
|
||||
thenCopyD2D(SrcSlice, DstSlice, SrcSlice.getElementCount());
|
||||
return *this;
|
||||
}
|
||||
|
||||
///@} End device memory copying functions
|
||||
|
||||
private:
|
||||
/// Sets the error state from an Error object.
|
||||
///
|
||||
/// Does not overwrite the error if it is already set.
|
||||
void setError(Error &&E) {
|
||||
if (E) {
|
||||
llvm::sys::ScopedWriter WriterLock(*ErrorMessageMutex);
|
||||
if (!ErrorMessage)
|
||||
ErrorMessage = consumeAndGetMessage(std::move(E));
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the error state from an error message.
|
||||
///
|
||||
/// Does not overwrite the error if it is already set.
|
||||
void setError(const llvm::Twine &Message) {
|
||||
llvm::sys::ScopedWriter WriterLock(*ErrorMessageMutex);
|
||||
if (!ErrorMessage)
|
||||
ErrorMessage = Message.str();
|
||||
}
|
||||
|
||||
/// The PlatformDevice that supports the operations of this stream.
|
||||
PlatformDevice *PDevice;
|
||||
|
||||
/// The platform-specific stream handle for this instance.
|
||||
const void *PlatformStreamHandle;
|
||||
|
||||
/// Mutex that guards the error state flags.
|
||||
std::unique_ptr<llvm::sys::RWMutex> ErrorMessageMutex;
|
||||
|
||||
/// First error message for an operation in this stream or empty if there have
|
||||
/// been no errors.
|
||||
llvm::Optional<std::string> ErrorMessage;
|
||||
};
|
||||
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_STREAM_H
|
|
@ -1,75 +0,0 @@
|
|||
//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// \mainpage Welcome to StreamExecutor
|
||||
///
|
||||
/// \section Introduction
|
||||
/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming
|
||||
/// models (runtimes). This abstraction cleanly permits host code to target
|
||||
/// either CUDA or OpenCL devices with identically-functioning data parallel
|
||||
/// kernels. It manages the execution of concurrent work targeting the
|
||||
/// accelerator, similar to a host-side Executor.
|
||||
///
|
||||
/// This version of StreamExecutor can be built either as a sub-project of the
|
||||
/// LLVM project or as a standalone project depending on LLVM as an external
|
||||
/// package.
|
||||
///
|
||||
/// \subsection ExampleUsage Example Usage
|
||||
/// Below is an example of the use of the StreamExecutor API:
|
||||
///
|
||||
/// \snippet examples/CUDASaxpy.cpp Example saxpy host main
|
||||
///
|
||||
/// In the example, a couple of handler functions, \c getOrDie and \c
|
||||
/// dieIfError, are used to handle error return values in the StreamExecutor
|
||||
/// API. These functions are provided by StreamExecutor for quick-and-dirty
|
||||
/// error handling, but real applications will likely want to define their own
|
||||
/// versions of these handlers so that errors are handled more gracefully than
|
||||
/// just exiting the program.
|
||||
///
|
||||
/// \subsection CompilerGeneratedCode Compiler-Generated Code
|
||||
///
|
||||
/// The example also references some symbols from a compiler-generated
|
||||
/// namespace:
|
||||
///
|
||||
/// \snippet examples/CUDASaxpy.cpp Example saxpy compiler-generated
|
||||
///
|
||||
/// Instead of depending on the compiler to generate this code, you can
|
||||
/// technically write the code yourself, but this is not recommended because the
|
||||
/// code is very error-prone. For example, the template parameters for the
|
||||
/// Kernel specialization have to match the parameter types for the device
|
||||
/// kernel, and the MultiKernelLoaderSpec has to be initialized with valid
|
||||
/// device code for the kernel. Errors in this code will not show up until
|
||||
/// runtime, and may only show up as garbage output rather than an explicit
|
||||
/// error, which can be very hard to debug, so again, it is strongly advised not
|
||||
/// to write this code yourself.
|
||||
///
|
||||
/// The example compiler-generated code uses a PTX string in the source code to
|
||||
/// store the device code, but the device code can also be stored in other
|
||||
/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be
|
||||
/// stored for other platforms such as OpenCL, and StreamExecutor will pick the
|
||||
/// right device code at runtime based on the user's platform selection. See
|
||||
/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be
|
||||
/// stored for different platforms, but again, the code to set up the
|
||||
/// MultiKernelLoaderSpec instance should be generated by the compiler if
|
||||
/// possible, not by the user.
|
||||
|
||||
/// \example examples/CUDASaxpy.cpp
|
||||
/// Running saxpy on a CUDA device.
|
||||
|
||||
#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H
|
||||
#define STREAMEXECUTOR_STREAMEXECUTOR_H
|
||||
|
||||
#include "Device.h"
|
||||
#include "Kernel.h"
|
||||
#include "KernelSpec.h"
|
||||
#include "Platform.h"
|
||||
#include "PlatformManager.h"
|
||||
#include "Stream.h"
|
||||
|
||||
#endif // STREAMEXECUTOR_STREAMEXECUTOR_H
|
|
@ -1,42 +0,0 @@
|
|||
//===-- CUDAPlatform.h - CUDA platform subclass -----------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Declaration of the CUDAPlatform class.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
|
||||
#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
|
||||
|
||||
#include "streamexecutor/Platform.h"
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
|
||||
|
||||
#include "llvm/Support/Mutex.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
class CUDAPlatform : public Platform {
|
||||
public:
|
||||
size_t getDeviceCount() const override;
|
||||
|
||||
Expected<Device> getDevice(size_t DeviceIndex) override;
|
||||
|
||||
private:
|
||||
llvm::sys::Mutex Mutex;
|
||||
std::map<size_t, CUDAPlatformDevice> PlatformDevices;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
|
|
@ -1,93 +0,0 @@
|
|||
//===-- CUDAPlatformDevice.h - CUDAPlatformDevice class ---------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Declaration of the CUDAPlatformDevice class.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
|
||||
#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
|
||||
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
Error CUresultToError(int CUResult, const llvm::Twine &Message);
|
||||
|
||||
class CUDAPlatformDevice : public PlatformDevice {
|
||||
public:
|
||||
static Expected<CUDAPlatformDevice> create(size_t DeviceIndex);
|
||||
|
||||
CUDAPlatformDevice(const CUDAPlatformDevice &) = delete;
|
||||
CUDAPlatformDevice &operator=(const CUDAPlatformDevice &) = delete;
|
||||
|
||||
CUDAPlatformDevice(CUDAPlatformDevice &&) noexcept;
|
||||
CUDAPlatformDevice &operator=(CUDAPlatformDevice &&) noexcept;
|
||||
|
||||
~CUDAPlatformDevice() override;
|
||||
|
||||
std::string getName() const override;
|
||||
|
||||
std::string getPlatformName() const override { return "CUDA"; }
|
||||
|
||||
Expected<const void *>
|
||||
createKernel(const MultiKernelLoaderSpec &Spec) override;
|
||||
Error destroyKernel(const void *Handle) override;
|
||||
|
||||
Expected<const void *> createStream() override;
|
||||
Error destroyStream(const void *Handle) override;
|
||||
|
||||
Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
|
||||
GridDimensions GridSize, const void *PKernelHandle,
|
||||
const PackedKernelArgumentArrayBase &ArgumentArray) override;
|
||||
|
||||
Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) override;
|
||||
|
||||
Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) override;
|
||||
|
||||
Error blockHostUntilDone(const void *PlatformStreamHandle) override;
|
||||
|
||||
Expected<void *> allocateDeviceMemory(size_t ByteCount) override;
|
||||
Error freeDeviceMemory(const void *Handle) override;
|
||||
|
||||
Error registerHostMemory(void *Memory, size_t ByteCount) override;
|
||||
Error unregisterHostMemory(const void *Memory) override;
|
||||
|
||||
Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
void *HostDst, size_t DstByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
Error synchronousCopyD2D(const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
size_t ByteCount) override;
|
||||
|
||||
private:
|
||||
CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {}
|
||||
|
||||
int DeviceIndex;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
|
|
@ -1,53 +0,0 @@
|
|||
//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Declaration of the HostPlatform class.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
|
||||
#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
|
||||
|
||||
#include "HostPlatformDevice.h"
|
||||
#include "streamexecutor/Device.h"
|
||||
#include "streamexecutor/Platform.h"
|
||||
|
||||
#include "llvm/Support/Mutex.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace host {
|
||||
|
||||
/// Platform that performs work on the host rather than offloading to an
|
||||
/// accelerator.
|
||||
class HostPlatform : public Platform {
|
||||
public:
|
||||
size_t getDeviceCount() const override { return 1; }
|
||||
|
||||
Expected<Device> getDevice(size_t DeviceIndex) override {
|
||||
if (DeviceIndex != 0) {
|
||||
return make_error(
|
||||
"Requested device index " + llvm::Twine(DeviceIndex) +
|
||||
" from host platform which only supports device index 0");
|
||||
}
|
||||
llvm::sys::ScopedLock Lock(Mutex);
|
||||
if (!ThePlatformDevice)
|
||||
ThePlatformDevice = llvm::make_unique<HostPlatformDevice>();
|
||||
return Device(ThePlatformDevice.get());
|
||||
}
|
||||
|
||||
private:
|
||||
llvm::sys::Mutex Mutex;
|
||||
std::unique_ptr<HostPlatformDevice> ThePlatformDevice;
|
||||
};
|
||||
|
||||
} // namespace host
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
|
|
@ -1,161 +0,0 @@
|
|||
//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Declaration of the HostPlatformDevice class.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
|
||||
#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace host {
|
||||
|
||||
/// A concrete PlatformDevice subclass that performs its work on the host rather
|
||||
/// than offloading to an accelerator.
|
||||
class HostPlatformDevice : public PlatformDevice {
|
||||
public:
|
||||
std::string getName() const override { return "host"; }
|
||||
|
||||
std::string getPlatformName() const override { return "host"; }
|
||||
|
||||
Expected<const void *>
|
||||
createKernel(const MultiKernelLoaderSpec &Spec) override {
|
||||
if (!Spec.hasHostFunction()) {
|
||||
return make_error("no host implementation available for kernel " +
|
||||
Spec.getKernelName());
|
||||
}
|
||||
return static_cast<const void *>(&Spec.getHostFunction());
|
||||
}
|
||||
|
||||
Error destroyKernel(const void *Handle) override { return Error::success(); }
|
||||
|
||||
Expected<const void *> createStream() override {
|
||||
// TODO(jhen): Do something with threads to allow multiple streams.
|
||||
return this;
|
||||
}
|
||||
|
||||
Error destroyStream(const void *Handle) override { return Error::success(); }
|
||||
|
||||
Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
|
||||
GridDimensions GridSize, const void *PKernelHandle,
|
||||
const PackedKernelArgumentArrayBase &ArgumentArray) override {
|
||||
// TODO(jhen): Can we do something with BlockSize and GridSize?
|
||||
if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) {
|
||||
return make_error(
|
||||
"Block dimensions were (" + llvm::Twine(BlockSize.X) + "," +
|
||||
llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) +
|
||||
"), but only size (1,1,1) is permitted for this platform");
|
||||
}
|
||||
if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) {
|
||||
return make_error(
|
||||
"Grid dimensions were (" + llvm::Twine(GridSize.X) + "," +
|
||||
llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) +
|
||||
"), but only size (1,1,1) is permitted for this platform");
|
||||
}
|
||||
|
||||
(*static_cast<const std::function<void(const void *const *)> *>(
|
||||
PKernelHandle))(ArgumentArray.getAddresses());
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
|
||||
size_t ByteCount) override {
|
||||
std::memcpy(offset(HostDst, DstByteOffset),
|
||||
offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) override {
|
||||
std::memcpy(offset(DeviceDstHandle, DstByteOffset),
|
||||
offset(HostSrc, SrcByteOffset), ByteCount);
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
|
||||
size_t SrcByteOffset, const void *DeviceDstHandle,
|
||||
size_t DstByteOffset, size_t ByteCount) override {
|
||||
std::memcpy(offset(DeviceDstHandle, DstByteOffset),
|
||||
offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error blockHostUntilDone(const void *PlatformStreamHandle) override {
|
||||
// All host operations are synchronous anyway.
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
/// Allocates ByteCount bytes with std::malloc and returns the pointer as the
/// device memory handle.
///
/// Returns an error if a non-zero allocation fails. A zero-byte request
/// passes through whatever std::malloc returns, which may be null.
Expected<void *> allocateDeviceMemory(size_t ByteCount) override {
  void *Memory = std::malloc(ByteCount);
  // A null result only signals failure when bytes were actually requested.
  if (!Memory && ByteCount != 0)
    return make_error("failed to allocate " + llvm::Twine(ByteCount) +
                      " bytes of host memory");
  return Memory;
}
|
||||
|
||||
Error freeDeviceMemory(const void *Handle) override {
|
||||
std::free(const_cast<void *>(Handle));
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error registerHostMemory(void *Memory, size_t ByteCount) override {
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error unregisterHostMemory(const void *Memory) override {
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
void *HostDst, size_t DstByteOffset,
|
||||
size_t ByteCount) override {
|
||||
std::memcpy(offset(HostDst, DstByteOffset),
|
||||
offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
size_t ByteCount) override {
|
||||
std::memcpy(offset(DeviceDstHandle, DstByteOffset),
|
||||
offset(HostSrc, SrcByteOffset), ByteCount);
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
|
||||
const void *DeviceDstHandle, size_t DstByteOffset,
|
||||
size_t ByteCount) override {
|
||||
std::memcpy(offset(DeviceDstHandle, DstByteOffset),
|
||||
offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
/// Gets the value at the given index from a GlobalDeviceMemory<T> instance
|
||||
/// created by this class.
|
||||
template <typename T>
|
||||
static T getDeviceValue(const streamexecutor::GlobalDeviceMemory<T> &Memory,
|
||||
size_t Index) {
|
||||
return static_cast<const T *>(Memory.getHandle())[Index];
|
||||
}
|
||||
|
||||
private:
|
||||
/// Returns Base advanced by Offset bytes, with the const qualifier removed.
static void *offset(const void *Base, size_t Offset) {
  // Pointer arithmetic is not allowed on void pointers, so step in chars.
  const char *Bytes = static_cast<const char *>(Base);
  return const_cast<char *>(Bytes + Offset);
}
|
||||
};
|
||||
|
||||
} // namespace host
|
||||
} // namespace streamexecutor
|
||||
|
||||
#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
|
|
@ -1,25 +0,0 @@
|
|||
# Declares a StreamExecutor library target.
#
# Every argument is forwarded unchanged to add_llvm_library; the resulting
# target then has its FOLDER property set to "streamexecutor libraries".
#
# Example:
#   add_se_library(mylib Source.cpp LINK_LIBS otherlib)
macro(add_se_library name)
  add_llvm_library(${name} ${ARGN})
  set_target_properties(
    ${name}
    PROPERTIES
    FOLDER "streamexecutor libraries")
endmacro()
|
||||
|
||||
# Process the platform back ends before declaring the library itself.
# NOTE(review): presumably the platforms subdirectory is what provides
# STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT, consumed below -- confirm.
add_subdirectory(platforms)

# The core StreamExecutor library. The CUDA object files and the libcuda link
# libraries come from variables, which expand to nothing when they are unset.
add_se_library(
  streamexecutor
  Device.cpp
  DeviceMemory.cpp
  Error.cpp
  HostMemory.cpp
  Kernel.cpp
  KernelSpec.cpp
  PackedKernelArgumentArray.cpp
  Platform.cpp
  PlatformDevice.cpp
  PlatformManager.cpp
  Stream.cpp
  ${STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT}
  LINK_LIBS
  ${STREAM_EXECUTOR_LIBCUDA_LIBRARIES})

# Install the built library under <prefix>/lib.
install(TARGETS streamexecutor DESTINATION lib)
|
|
@ -1,37 +0,0 @@
|
|||
//===-- Device.cpp - Device implementation --------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of Device class internals.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/Device.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
#include "streamexecutor/Stream.h"
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Stores the pointer to the platform-specific device implementation.
Device::Device(PlatformDevice *PDevice) : PDevice(PDevice) {}
|
||||
|
||||
/// Defaulted destructor, defined out of line in this translation unit.
Device::~Device() = default;
|
||||
|
||||
/// Creates a new stream on this device.
///
/// Asks the platform device for a platform stream handle and wraps it in a
/// Stream object. If the platform device fails, its error is returned.
Expected<Stream> Device::createStream() {
  Expected<const void *> MaybeHandle = PDevice->createStream();
  if (MaybeHandle)
    return Stream(PDevice, *MaybeHandle);
  return MaybeHandle.takeError();
}
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,27 +0,0 @@
|
|||
//===-- DeviceMemory.cpp - DeviceMemory implementation --------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of DeviceMemory class internals.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/DeviceMemory.h"
|
||||
|
||||
#include "streamexecutor/Device.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Frees the underlying device memory through the owning device, if this
/// object still holds a handle. Any error from the free is discarded.
GlobalDeviceMemoryBase::~GlobalDeviceMemoryBase() {
  // A null handle means there is nothing to release.
  if (!Handle)
    return;
  // TODO(jhen): How to handle errors here.
  consumeError(TheDevice->freeDeviceMemory(*this));
}
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,70 +0,0 @@
|
|||
//===-- Error.cpp - Error handling ----------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Types for returning recoverable errors.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/Error.h"
|
||||
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// An error with a string message describing the cause.
|
||||
class StreamExecutorError : public llvm::ErrorInfo<StreamExecutorError> {
|
||||
public:
|
||||
StreamExecutorError(llvm::StringRef Message) : Message(Message.str()) {}
|
||||
|
||||
void log(llvm::raw_ostream &OS) const override { OS << Message; }
|
||||
|
||||
std::error_code convertToErrorCode() const override {
|
||||
llvm_unreachable(
|
||||
"StreamExecutorError does not support conversion to std::error_code");
|
||||
}
|
||||
|
||||
std::string getErrorMessage() const { return Message; }
|
||||
|
||||
static char ID;
|
||||
|
||||
private:
|
||||
std::string Message;
|
||||
};
|
||||
|
||||
char StreamExecutorError::ID = 0;
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Builds an Error whose payload is a StreamExecutorError holding the text of
/// Message.
Error make_error(const Twine &Message) {
  // Render the Twine to a string before handing it to the error object.
  std::string Text = Message.str();
  return llvm::make_error<StreamExecutorError>(Text);
}
|
||||
|
||||
/// Consumes E and returns a textual description of it.
///
/// Returns "success" when E holds no error. For a StreamExecutorError the
/// stored message is returned. Any other error type is described through
/// ErrorInfoBase::message(), so that an error created outside this file is
/// still consumed and reported rather than left unhandled by
/// handleAllErrors. If E contains several errors, the last one handled wins.
std::string consumeAndGetMessage(Error &&E) {
  if (!E)
    return "success";
  std::string Message;
  llvm::handleAllErrors(
      std::move(E),
      [&Message](const StreamExecutorError &SEE) {
        Message = SEE.getErrorMessage();
      },
      // Fallback for every payload type the handler above does not match.
      [&Message](const llvm::ErrorInfoBase &EIB) { Message = EIB.message(); });
  return Message;
}
|
||||
|
||||
/// Terminates the program if E holds an error.
///
/// On error, the message is printed to stderr and the process exits with
/// EXIT_FAILURE. When E is a success value the function simply returns.
void dieIfError(Error &&E) {
  if (!E)
    return;
  std::string Message = streamexecutor::consumeAndGetMessage(std::move(E));
  std::fprintf(stderr, "Error encountered: %s.\n", Message.c_str());
  std::exit(EXIT_FAILURE);
}
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,28 +0,0 @@
|
|||
//===-- HostMemory.cpp - HostMemory implementation ------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of HostMemory internals.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/HostMemory.h"
|
||||
#include "streamexecutor/Device.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace internal {
|
||||
|
||||
/// Unregisters Pointer from TheDevice when Pointer is non-null; any error
/// from the unregistration is discarded.
void destroyRegisteredHostMemoryInternals(Device *TheDevice, void *Pointer) {
  // Nothing was registered for a null pointer.
  if (!Pointer)
    return;
  // TODO(jhen): How to handle errors here?
  consumeError(TheDevice->unregisterHostMemory(Pointer));
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace streamexecutor
|
|
@ -1,60 +0,0 @@
|
|||
//===-- Kernel.cpp - General kernel implementation ------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains the implementation details for kernel types.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "streamexecutor/Device.h"
|
||||
#include "streamexecutor/Kernel.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Builds a kernel object from a platform device, a platform kernel handle
/// and the kernel's name.
///
/// The demangled form of the name is computed once here and stored. Both the
/// device and the handle are asserted to be non-null.
KernelBase::KernelBase(PlatformDevice *D, const void *PlatformKernelHandle,
                       llvm::StringRef Name)
    : PDevice(D), PlatformKernelHandle(PlatformKernelHandle), Name(Name),
      DemangledName(
          llvm::symbolize::LLVMSymbolizer::DemangleName(Name, nullptr)) {
  // Precondition checks; these are compiled out when assertions are disabled.
  assert(D != nullptr &&
         "cannot construct a kernel object with a null platform device");
  assert(PlatformKernelHandle != nullptr &&
         "cannot construct a kernel object with a null platform kernel handle");
}
|
||||
|
||||
/// Move constructor: takes over Other's device pointer, kernel handle and
/// names, then nulls Other's handle so its destructor destroys nothing.
KernelBase::KernelBase(KernelBase &&Other) noexcept
    : PDevice(Other.PDevice), PlatformKernelHandle(Other.PlatformKernelHandle),
      Name(std::move(Other.Name)),
      DemangledName(std::move(Other.DemangledName)) {
  // Leave the source empty; ~KernelBase only acts on a non-null handle.
  Other.PlatformKernelHandle = nullptr;
  Other.PDevice = nullptr;
}
|
||||
|
||||
/// Move assignment: releases the kernel currently owned by this object (if
/// any) and then takes over Other's device pointer, kernel handle and names.
///
/// Self-assignment is a no-op. Other is left with a null device and handle so
/// that its destructor destroys nothing.
KernelBase &KernelBase::operator=(KernelBase &&Other) noexcept {
  if (this == &Other)
    return *this;

  // Destroy the kernel we already own, mirroring the destructor; otherwise
  // overwriting the handle below would leak it.
  if (PlatformKernelHandle)
    // TODO(jhen): Handle the error here.
    consumeError(PDevice->destroyKernel(PlatformKernelHandle));

  PDevice = Other.PDevice;
  PlatformKernelHandle = Other.PlatformKernelHandle;
  Name = std::move(Other.Name);
  DemangledName = std::move(Other.DemangledName);
  Other.PDevice = nullptr;
  Other.PlatformKernelHandle = nullptr;
  return *this;
}
|
||||
|
||||
/// Destroys the platform kernel through the platform device when this object
/// still holds a handle. Any error from the destruction is discarded.
KernelBase::~KernelBase() {
  // A null handle means there is nothing left to destroy.
  if (!PlatformKernelHandle)
    return;
  // TODO(jhen): Handle the error here.
  consumeError(PDevice->destroyKernel(PlatformKernelHandle));
}
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,92 +0,0 @@
|
|||
//===-- KernelSpec.cpp - General kernel spec implementation ---------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains the implementation details for kernel loader specs.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/KernelSpec.h"
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Records the name of the kernel this loader spec describes.
KernelLoaderSpec::KernelLoaderSpec(llvm::StringRef KernelName)
    : KernelName(KernelName) {
}
|
||||
|
||||
/// Builds a PTX loader spec for the named kernel.
///
/// Each entry of SpecList is inserted into PTXByComputeCapability, keyed by
/// its compute capability. emplace does not overwrite, so when two entries
/// share a compute capability the first one is kept.
CUDAPTXInMemorySpec::CUDAPTXInMemorySpec(
    llvm::StringRef KernelName,
    const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList)
    : KernelLoaderSpec(KernelName) {
  for (const auto &Entry : SpecList) {
    PTXByComputeCapability.emplace(Entry.TheComputeCapability, Entry.PTXCode);
  }
}
|
||||
|
||||
const char *CUDAPTXInMemorySpec::getCode(int ComputeCapabilityMajor,
|
||||
int ComputeCapabilityMinor) const {
|
||||
auto Iterator =
|
||||
PTXByComputeCapability.upper_bound(CUDAPTXInMemorySpec::ComputeCapability{
|
||||
ComputeCapabilityMajor, ComputeCapabilityMinor});
|
||||
if (Iterator == PTXByComputeCapability.begin())
|
||||
return nullptr;
|
||||
--Iterator;
|
||||
return Iterator->second;
|
||||
}
|
||||
|
||||
/// Records the kernel name and the pointer to the fatbin bytes. The bytes are
/// not copied; only the pointer is stored.
CUDAFatbinInMemorySpec::CUDAFatbinInMemorySpec(llvm::StringRef KernelName,
                                               const void *Bytes)
    : KernelLoaderSpec(KernelName), Bytes(Bytes) {
}
|
||||
|
||||
/// Records the kernel name and the pointer to the OpenCL source text. The
/// text is not copied; only the pointer is stored.
OpenCLTextInMemorySpec::OpenCLTextInMemorySpec(llvm::StringRef KernelName,
                                               const char *Text)
    : KernelLoaderSpec(KernelName), Text(Text) {
}
|
||||
|
||||
/// Sets the kernel name the first time it is called; on later calls it only
/// asserts that the same name is being used again.
void MultiKernelLoaderSpec::setKernelName(llvm::StringRef KernelName) {
  if (!TheKernelName) {
    // First spec added: remember its kernel name.
    TheKernelName = llvm::make_unique<std::string>(KernelName);
    return;
  }
  assert(KernelName.equals(*TheKernelName) &&
         "different kernel names in one MultiKernelLoaderSpec");
}
|
||||
|
||||
/// Attaches an in-memory CUDA PTX spec to this loader spec.
///
/// Asserts that no PTX spec has been added before, records or checks the
/// kernel name, and returns *this so that calls can be chained.
MultiKernelLoaderSpec &MultiKernelLoaderSpec::addCUDAPTXInMemory(
    llvm::StringRef KernelName,
    llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList) {
  // Each kind of spec may be added at most once.
  assert((TheCUDAPTXInMemorySpec == nullptr) &&
         "illegal loader spec overwrite");
  setKernelName(KernelName);
  TheCUDAPTXInMemorySpec =
      llvm::make_unique<CUDAPTXInMemorySpec>(KernelName, SpecList);
  return *this;
}
|
||||
|
||||
/// Attaches an in-memory CUDA fatbin spec to this loader spec.
///
/// Asserts that no fatbin spec has been added before, records or checks the
/// kernel name, and returns *this so that calls can be chained.
MultiKernelLoaderSpec &
MultiKernelLoaderSpec::addCUDAFatbinInMemory(llvm::StringRef KernelName,
                                             const void *Bytes) {
  // Each kind of spec may be added at most once.
  assert((TheCUDAFatbinInMemorySpec == nullptr) &&
         "illegal loader spec overwrite");
  setKernelName(KernelName);
  TheCUDAFatbinInMemorySpec =
      llvm::make_unique<CUDAFatbinInMemorySpec>(KernelName, Bytes);
  return *this;
}
|
||||
|
||||
/// Attaches an in-memory OpenCL text spec to this loader spec.
///
/// Asserts that no OpenCL text spec has been added before, records or checks
/// the kernel name, and returns *this so that calls can be chained.
MultiKernelLoaderSpec &
MultiKernelLoaderSpec::addOpenCLTextInMemory(llvm::StringRef KernelName,
                                             const char *OpenCLText) {
  // Each kind of spec may be added at most once.
  assert((TheOpenCLTextInMemorySpec == nullptr) &&
         "illegal loader spec overwrite");
  setKernelName(KernelName);
  TheOpenCLTextInMemorySpec =
      llvm::make_unique<OpenCLTextInMemorySpec>(KernelName, OpenCLText);
  return *this;
}
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,21 +0,0 @@
|
|||
//===-- PackedKernelArgumentArray.cpp - Packed argument array impl --------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation details for classes from PackedKernelArgumentArray.h.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/PackedKernelArgumentArray.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Defaulted destructor, defined out of line in this translation unit.
PackedKernelArgumentArrayBase::~PackedKernelArgumentArrayBase() = default;
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,21 +0,0 @@
|
|||
//===-- Platform.cpp - Platform implementation ----------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of Platform class internals.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/Platform.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Defaulted destructor, defined out of line in this translation unit.
Platform::~Platform() = default;
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,21 +0,0 @@
|
|||
//===-- PlatformDevice.cpp - Platform interface implementations -----------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation file for PlatformDevice.h.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Defaulted destructor, defined out of line in this translation unit.
PlatformDevice::~PlatformDevice() = default;
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,49 +0,0 @@
|
|||
//===-- PlatformManager.cpp - PlatformManager implementation --------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of PlatformManager class internals.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/PlatformManager.h"
|
||||
|
||||
#include "streamexecutor/PlatformOptions.h"
|
||||
#include "streamexecutor/platforms/host/HostPlatform.h"
|
||||
|
||||
#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
|
||||
#endif
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Populates the name-to-platform table.
///
/// The "host" platform is always registered; the "cuda" platform is
/// registered only when STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM is defined.
PlatformManager::PlatformManager() {
  // TODO(jhen): Register known platforms by name.
  // We have a couple of options here:
  //  * Use build-system flags to set preprocessor macros that select the
  //    appropriate code to include here.
  //  * Use static initialization tricks to have platform libraries register
  //    themselves when they are loaded.

  // Platform names are stored in lower case.
  PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());

#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
  PlatformsByName.emplace("cuda", llvm::make_unique<cuda::CUDAPlatform>());
#endif
}
|
||||
|
||||
/// Returns the platform registered under Name.
///
/// Name is converted to lower case before the lookup. The manager is a
/// function-local static, constructed on the first call. An error is returned
/// when no platform is registered under that name.
Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {
  static PlatformManager Instance;
  auto &Registry = Instance.PlatformsByName;
  auto Found = Registry.find(Name.lower());
  if (Found == Registry.end())
    return make_error("no available platform with name " + Name);
  return Found->second.get();
}
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,54 +0,0 @@
|
|||
//===-- Stream.cpp - General stream implementation ------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains the implementation details for a general stream object.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "streamexecutor/Stream.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
|
||||
/// Builds a stream object from a platform device and a platform stream
/// handle, and allocates the mutex that guards the error message.
///
/// Both the device and the handle are asserted to be non-null.
Stream::Stream(PlatformDevice *D, const void *PlatformStreamHandle)
    : PDevice(D), PlatformStreamHandle(PlatformStreamHandle),
      ErrorMessageMutex(llvm::make_unique<llvm::sys::RWMutex>()) {
  // Precondition checks; these are compiled out when assertions are disabled.
  assert(D != nullptr &&
         "cannot construct a stream object with a null platform device");
  assert(PlatformStreamHandle != nullptr &&
         "cannot construct a stream object with a null platform stream handle");
}
|
||||
|
||||
/// Move constructor: takes over Other's device pointer, stream handle, mutex
/// and error message, then nulls Other's handle so its destructor destroys
/// nothing.
Stream::Stream(Stream &&Other) noexcept
    : PDevice(Other.PDevice), PlatformStreamHandle(Other.PlatformStreamHandle),
      ErrorMessageMutex(std::move(Other.ErrorMessageMutex)),
      ErrorMessage(std::move(Other.ErrorMessage)) {
  // Leave the source empty; ~Stream only acts on a non-null handle.
  Other.PlatformStreamHandle = nullptr;
  Other.PDevice = nullptr;
}
|
||||
|
||||
/// Move assignment: destroys the platform stream currently owned by this
/// object (if any) and then takes over Other's device pointer, stream handle,
/// mutex and error message.
///
/// Self-assignment is a no-op. Other is left with a null device and handle so
/// that its destructor destroys nothing.
Stream &Stream::operator=(Stream &&Other) noexcept {
  if (this == &Other)
    return *this;

  // Destroy the stream we already own, mirroring the destructor; otherwise
  // overwriting the handle below would leak it.
  if (PlatformStreamHandle)
    // TODO(jhen): Handle error condition here.
    consumeError(PDevice->destroyStream(PlatformStreamHandle));

  PDevice = Other.PDevice;
  PlatformStreamHandle = Other.PlatformStreamHandle;
  ErrorMessageMutex = std::move(Other.ErrorMessageMutex);
  ErrorMessage = std::move(Other.ErrorMessage);
  Other.PDevice = nullptr;
  Other.PlatformStreamHandle = nullptr;
  return *this;
}
|
||||
|
||||
/// Destroys the platform stream through the platform device when this object
/// still holds a handle. Any error from the destruction is discarded.
Stream::~Stream() {
  // A null handle means there is nothing left to destroy.
  if (!PlatformStreamHandle)
    return;
  // TODO(jhen): Handle error condition here.
  consumeError(PDevice->destroyStream(PlatformStreamHandle));
}
|
||||
|
||||
} // namespace streamexecutor
|
|
@ -1,3 +0,0 @@
|
|||
# The CUDA platform is optional: only descend into its directory when the
# STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM option is turned on.
if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
  add_subdirectory(cuda)
endif()
|
|
@ -1,5 +0,0 @@
|
|||
# Compile the CUDA platform sources into an OBJECT library: the sources are
# compiled, but no archive or shared library is produced for this target.
add_library(
  streamexecutor_cuda_platform
  OBJECT
  CUDAPlatform.cpp
  CUDAPlatformDevice.cpp)
|
|
@ -1,65 +0,0 @@
|
|||
//===-- CUDAPlatform.cpp - CUDA platform implementation -------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of CUDA platform internals.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
|
||||
#include "streamexecutor/Device.h"
|
||||
#include "streamexecutor/Platform.h"
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
|
||||
|
||||
#include "llvm/Support/Mutex.h"
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
/// Calls cuInit(0) on the first invocation and returns that same cached
/// result on every call.
static CUresult ensureCUDAInitialized() {
  // The initializer of a function-local static runs only once.
  static CUresult InitResult = cuInit(0);
  return InitResult;
}
|
||||
|
||||
size_t CUDAPlatform::getDeviceCount() const {
|
||||
if (ensureCUDAInitialized())
|
||||
// TODO(jhen): Log an error.
|
||||
return 0;
|
||||
|
||||
int DeviceCount = 0;
|
||||
CUresult Result = cuDeviceGetCount(&DeviceCount);
|
||||
(void)Result;
|
||||
// TODO(jhen): Log an error.
|
||||
|
||||
return DeviceCount;
|
||||
}
|
||||
|
||||
/// Returns a Device wrapping the platform device with the given index.
///
/// Platform devices are created on first request and cached in
/// PlatformDevices; the cache is accessed with Mutex held. An error is
/// returned when CUDA initialization failed or the platform device could not
/// be created.
Expected<Device> CUDAPlatform::getDevice(size_t DeviceIndex) {
  if (CUresult InitResult = ensureCUDAInitialized())
    return CUresultToError(InitResult, "cached cuInit return value");

  llvm::sys::ScopedLock Lock(Mutex);
  auto Entry = PlatformDevices.find(DeviceIndex);
  if (Entry == PlatformDevices.end()) {
    // Not cached yet: create the platform device and remember it.
    auto MaybePDevice = CUDAPlatformDevice::create(DeviceIndex);
    if (!MaybePDevice)
      return MaybePDevice.takeError();
    Entry =
        PlatformDevices.emplace(DeviceIndex, std::move(*MaybePDevice)).first;
  }
  return Device(&Entry->second);
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
|
@ -1,307 +0,0 @@
|
|||
//===-- CUDAPlatformDevice.cpp - CUDAPlatformDevice implementation --------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Implementation of CUDAPlatformDevice.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
|
||||
#include "cuda.h"
|
||||
|
||||
namespace streamexecutor {
|
||||
namespace cuda {
|
||||
|
||||
/// Returns Base advanced by Offset bytes, with the const qualifier removed.
static void *offset(const void *Base, size_t Offset) {
  // Pointer arithmetic is not allowed on void pointers, so step in chars.
  const char *Bytes = static_cast<const char *>(Base);
  return const_cast<char *>(Bytes + Offset);
}
|
||||
|
||||
/// Converts a CUDA driver result code into an Error.
///
/// A zero code yields success. Any other code yields an error whose text
/// contains Message, the numeric code, and the name and description reported
/// by the driver (or placeholder text when the driver cannot supply them).
Error CUresultToError(int CUResult, const llvm::Twine &Message) {
  CUresult Result = static_cast<CUresult>(CUResult);
  if (!Result)
    return Error::success();

  const char *ErrorName;
  if (cuGetErrorName(Result, &ErrorName))
    ErrorName = "UNKNOWN ERROR NAME";

  const char *ErrorString;
  if (cuGetErrorString(Result, &ErrorString))
    ErrorString = "UNKNOWN ERROR DESCRIPTION";

  return make_error("CUDA driver error: '" + Message + "', error code = " +
                    llvm::Twine(static_cast<int>(Result)) + ", name = " +
                    ErrorName + ", description = '" + ErrorString + "'");
}
|
||||
|
||||
std::string CUDAPlatformDevice::getName() const {
|
||||
static std::string CachedName = [](int DeviceIndex) {
|
||||
static constexpr size_t MAX_DRIVER_NAME_BYTES = 1024;
|
||||
std::string Name = "CUDA device " + std::to_string(DeviceIndex);
|
||||
char NameFromDriver[MAX_DRIVER_NAME_BYTES];
|
||||
if (!cuDeviceGetName(NameFromDriver, MAX_DRIVER_NAME_BYTES - 1,
|
||||
DeviceIndex)) {
|
||||
NameFromDriver[MAX_DRIVER_NAME_BYTES - 1] = '\0';
|
||||
Name.append(": ").append(NameFromDriver);
|
||||
}
|
||||
return Name;
|
||||
}(DeviceIndex);
|
||||
return CachedName;
|
||||
}
|
||||
|
||||
/// Creates the platform device for the given device index.
///
/// Retains the device's primary context and makes it current. If making the
/// context current fails, the reference taken on the primary context is
/// released again before the error is returned, so that it is not leaked.
Expected<CUDAPlatformDevice> CUDAPlatformDevice::create(size_t DeviceIndex) {
  CUdevice DeviceHandle;
  if (CUresult Result = cuDeviceGet(&DeviceHandle, DeviceIndex))
    return CUresultToError(Result, "cuDeviceGet");

  CUcontext ContextHandle;
  if (CUresult Result = cuDevicePrimaryCtxRetain(&ContextHandle, DeviceHandle))
    return CUresultToError(Result, "cuDevicePrimaryCtxRetain");

  if (CUresult Result = cuCtxSetCurrent(ContextHandle)) {
    // No CUDAPlatformDevice will be constructed to release the context later,
    // so undo the retain here. The original failure is the one reported.
    (void)cuDevicePrimaryCtxRelease(DeviceHandle);
    return CUresultToError(Result, "cuCtxSetCurrent");
  }

  return CUDAPlatformDevice(DeviceIndex);
}
|
||||
|
||||
/// Move constructor: takes over Other's device index and marks Other as
/// moved-from by setting its index to -1.
CUDAPlatformDevice::CUDAPlatformDevice(CUDAPlatformDevice &&Other) noexcept
    : DeviceIndex(Other.DeviceIndex) {
  // -1 is the sentinel for "no device".
  Other.DeviceIndex = -1;
}
|
||||
|
||||
/// Move assignment: takes over Other's device index and marks Other as
/// moved-from by setting its index to -1.
CUDAPlatformDevice &
CUDAPlatformDevice::operator=(CUDAPlatformDevice &&Other) noexcept {
  DeviceIndex = Other.DeviceIndex;
  // -1 is the sentinel for "no device".
  Other.DeviceIndex = -1;
  return *this;
}
|
||||
|
||||
/// Releases the reference this object holds on its device's primary context.
///
/// A moved-from object (device index -1, as set by the move operations) holds
/// no reference, so nothing is released for it.
CUDAPlatformDevice::~CUDAPlatformDevice() {
  // Compare in the member's own type so the check works whether the index is
  // stored as a signed or an unsigned integer.
  if (DeviceIndex == static_cast<decltype(DeviceIndex)>(-1))
    return;

  CUresult Result = cuDevicePrimaryCtxRelease(DeviceIndex);
  (void)Result;
  // TODO(jhen): Log error.
}
|
||||
|
||||
/// Creates a kernel from the in-memory PTX code stored in Spec.
///
/// The device's compute capability is queried, the PTX code registered for
/// that capability is selected and loaded as a module, and the function named
/// by the spec is looked up in it. The returned handle is that CUfunction.
///
/// An error is returned when the spec carries no PTX code, when no code is
/// registered for the device's compute capability, or when any driver call
/// fails.
Expected<const void *>
CUDAPlatformDevice::createKernel(const MultiKernelLoaderSpec &Spec) {
  // TODO(jhen): Maybe first check loaded modules?
  if (!Spec.hasCUDAPTXInMemory())
    return make_error("no CUDA code available to create kernel");

  CUdevice Device = static_cast<int>(DeviceIndex);

  int ComputeCapabilityMajor = 0;
  if (CUresult Result = cuDeviceGetAttribute(
          &ComputeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
          Device))
    return CUresultToError(
        Result,
        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR");

  // The minor version must be queried with the MINOR attribute; using MAJOR
  // here would report e.g. 3.3 for a 3.5 device and select the wrong PTX.
  int ComputeCapabilityMinor = 0;
  if (CUresult Result = cuDeviceGetAttribute(
          &ComputeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
          Device))
    return CUresultToError(
        Result,
        "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR");

  const char *Code = Spec.getCUDAPTXInMemory().getCode(ComputeCapabilityMajor,
                                                       ComputeCapabilityMinor);
  if (!Code)
    return make_error("no suitable CUDA source found for compute capability " +
                      llvm::Twine(ComputeCapabilityMajor) + "." +
                      llvm::Twine(ComputeCapabilityMinor));

  CUmodule Module;
  if (CUresult Result = cuModuleLoadData(&Module, Code))
    return CUresultToError(Result, "cuModuleLoadData");

  CUfunction Function;
  if (CUresult Result =
          cuModuleGetFunction(&Function, Module, Spec.getKernelName().c_str()))
    return CUresultToError(Result, "cuModuleGetFunction");

  // TODO(jhen): Should I save this function pointer in case someone asks for
  // it again?

  // TODO(jhen): Should I save the module pointer so I can unload it when I
  // destroy this device?

  return static_cast<const void *>(Function);
}
|
||||
|
||||
/// Destroying a kernel is currently a no-op: Handle is ignored and success is
/// always returned.
Error CUDAPlatformDevice::destroyKernel(const void *Handle) {
  // TODO(jhen): Maybe keep track of kernels for each module and unload the
  // module after they are all destroyed.
  return Error::success();
}
|
||||
|
||||
/// Creates a CUDA stream with the CU_STREAM_DEFAULT flag and returns it as an
/// opaque handle, or returns the driver error on failure.
Expected<const void *> CUDAPlatformDevice::createStream() {
  CUstream Stream;
  CUresult Result = cuStreamCreate(&Stream, CU_STREAM_DEFAULT);
  if (Result)
    return CUresultToError(Result, "cuStreamCreate");
  return Stream;
}
|
||||
|
||||
/// Destroys the CUDA stream identified by Handle and reports the driver's
/// result.
Error CUDAPlatformDevice::destroyStream(const void *Handle) {
  // The opaque handle is the CUstream itself; recover its real type.
  CUstream Stream = static_cast<CUstream>(const_cast<void *>(Handle));
  return CUresultToError(cuStreamDestroy(Stream), "cuStreamDestroy");
}
|
||||
|
||||
/// Launches a kernel on the given stream with the given block and grid sizes.
///
/// Arguments of type SHARED_DEVICE_MEMORY are not passed to the kernel as
/// addresses. Instead their sizes are summed and passed to cuLaunchKernel as
/// the shared memory byte count, and only the remaining argument addresses
/// are forwarded. When there are no such arguments, the address array is
/// forwarded unchanged with a shared memory size of zero.
Error CUDAPlatformDevice::launch(
    const void *PlatformStreamHandle, BlockDimensions BlockSize,
    GridDimensions GridSize, const void *PKernelHandle,
    const PackedKernelArgumentArrayBase &ArgumentArray) {
  CUfunction Function =
      reinterpret_cast<CUfunction>(const_cast<void *>(PKernelHandle));
  CUstream Stream =
      reinterpret_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));

  // Performs the actual driver call; shared by both paths below.
  auto Launch = [Function, Stream, BlockSize,
                 GridSize](size_t SharedMemoryBytes, void **ArgumentAddresses) {
    return CUresultToError(
        cuLaunchKernel(Function,                              //
                       GridSize.X, GridSize.Y, GridSize.Z,    //
                       BlockSize.X, BlockSize.Y, BlockSize.Z, //
                       SharedMemoryBytes, Stream, ArgumentAddresses, nullptr),
        "cuLaunchKernel");
  };

  void **ArgumentAddresses = const_cast<void **>(ArgumentArray.getAddresses());
  size_t SharedArgumentCount = ArgumentArray.getSharedCount();

  // Fast path: no shared memory arguments to filter out.
  if (!SharedArgumentCount)
    return Launch(0, ArgumentAddresses);

  // The argument handling in this case is not very efficient. We may need to
  // come back and optimize it later.
  //
  // Perhaps introduce another branch for the case where there is exactly one
  // shared memory argument and it is the first one. This is the only case
  // that will be used for compiler-generated CUDA kernels, and OpenCL users
  // can choose to take advantage of it by combining their dynamic shared
  // memory arguments and putting them first in the kernel signature.
  unsigned SharedMemoryBytes = 0;
  size_t ArgumentCount = ArgumentArray.getArgumentCount();
  llvm::SmallVector<void *, 16> NonSharedArgumentAddresses(
      ArgumentCount - SharedArgumentCount);
  size_t NonSharedIndex = 0;
  for (size_t I = 0; I < ArgumentCount; ++I) {
    if (ArgumentArray.getType(I) == KernelArgumentType::SHARED_DEVICE_MEMORY) {
      SharedMemoryBytes += ArgumentArray.getSize(I);
    } else {
      NonSharedArgumentAddresses[NonSharedIndex++] = ArgumentAddresses[I];
    }
  }
  return Launch(SharedMemoryBytes, NonSharedArgumentAddresses.data());
}
|
||||
|
||||
/// Enqueues a copy of ByteCount bytes from device memory
/// (DeviceSrcHandle + SrcByteOffset) to host memory (HostDst + DstByteOffset)
/// on the given stream, using cuMemcpyDtoHAsync.
Error CUDAPlatformDevice::copyD2H(const void *PlatformStreamHandle,
                                  const void *DeviceSrcHandle,
                                  size_t SrcByteOffset, void *HostDst,
                                  size_t DstByteOffset, size_t ByteCount) {
  void *Destination = offset(HostDst, DstByteOffset);
  CUdeviceptr Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(
      cuMemcpyDtoHAsync(Destination, Source, ByteCount, Stream),
      "cuMemcpyDtoHAsync");
}
|
||||
|
||||
/// Enqueues a copy of ByteCount bytes from host memory
/// (HostSrc + SrcByteOffset) to device memory
/// (DeviceDstHandle + DstByteOffset) on the given stream, using
/// cuMemcpyHtoDAsync.
Error CUDAPlatformDevice::copyH2D(const void *PlatformStreamHandle,
                                  const void *HostSrc, size_t SrcByteOffset,
                                  const void *DeviceDstHandle,
                                  size_t DstByteOffset, size_t ByteCount) {
  CUdeviceptr Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  void *Source = offset(HostSrc, SrcByteOffset);
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(
      cuMemcpyHtoDAsync(Destination, Source, ByteCount, Stream),
      "cuMemcpyHtoDAsync");
}
|
||||
|
||||
/// Enqueues a copy of ByteCount bytes from device memory
/// (DeviceSrcHandle + SrcByteOffset) to device memory
/// (DeviceDstHandle + DstByteOffset) on the given stream, using
/// cuMemcpyDtoDAsync.
Error CUDAPlatformDevice::copyD2D(const void *PlatformStreamHandle,
                                  const void *DeviceSrcHandle,
                                  size_t SrcByteOffset,
                                  const void *DeviceDstHandle,
                                  size_t DstByteOffset, size_t ByteCount) {
  CUdeviceptr Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  CUdeviceptr Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(
      cuMemcpyDtoDAsync(Destination, Source, ByteCount, Stream),
      "cuMemcpyDtoDAsync");
}
|
||||
|
||||
/// Calls cuStreamSynchronize on the given stream and reports the driver's
/// result.
Error CUDAPlatformDevice::blockHostUntilDone(const void *PlatformStreamHandle) {
  CUstream Stream =
      static_cast<CUstream>(const_cast<void *>(PlatformStreamHandle));
  return CUresultToError(cuStreamSynchronize(Stream), "cuStreamSynchronize");
}
|
||||
|
||||
/// Allocates ByteCount bytes with cuMemAlloc and returns the device pointer
/// as an opaque handle, or returns the driver error on failure.
Expected<void *> CUDAPlatformDevice::allocateDeviceMemory(size_t ByteCount) {
  CUdeviceptr Pointer;
  CUresult Result = cuMemAlloc(&Pointer, ByteCount);
  if (Result)
    return CUresultToError(Result, "cuMemAlloc");
  return reinterpret_cast<void *>(Pointer);
}
|
||||
|
||||
/// Frees the device memory identified by Handle with cuMemFree and reports
/// the driver's result.
Error CUDAPlatformDevice::freeDeviceMemory(const void *Handle) {
  // The opaque handle is the device pointer itself.
  CUdeviceptr Pointer = reinterpret_cast<CUdeviceptr>(Handle);
  return CUresultToError(cuMemFree(Pointer), "cuMemFree");
}
|
||||
|
||||
/// Registers ByteCount bytes of host memory starting at Memory with the
/// driver via cuMemHostRegister, passing no flags.
Error CUDAPlatformDevice::registerHostMemory(void *Memory, size_t ByteCount) {
  CUresult Result = cuMemHostRegister(Memory, ByteCount, 0u);
  return CUresultToError(Result, "cuMemHostRegister");
}
|
||||
|
||||
/// Unregisters the host memory starting at Memory via cuMemHostUnregister.
Error CUDAPlatformDevice::unregisterHostMemory(const void *Memory) {
  // The driver API takes a mutable pointer.
  void *Registered = const_cast<void *>(Memory);
  return CUresultToError(cuMemHostUnregister(Registered),
                         "cuMemHostUnregister");
}
|
||||
|
||||
/// Copies ByteCount bytes from device memory
/// (DeviceSrcHandle + SrcByteOffset) to host memory (HostDst + DstByteOffset)
/// using cuMemcpyDtoH.
Error CUDAPlatformDevice::synchronousCopyD2H(const void *DeviceSrcHandle,
                                             size_t SrcByteOffset,
                                             void *HostDst,
                                             size_t DstByteOffset,
                                             size_t ByteCount) {
  void *Destination = offset(HostDst, DstByteOffset);
  CUdeviceptr Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  return CUresultToError(cuMemcpyDtoH(Destination, Source, ByteCount),
                         "cuMemcpyDtoH");
}
|
||||
|
||||
/// Copies \p ByteCount bytes from host memory to device memory with
/// cuMemcpyHtoD.
///
/// Both addresses are produced by passing the base pointer and its byte offset
/// through offset(). The resulting CUresult is converted to an Error.
Error CUDAPlatformDevice::synchronousCopyH2D(const void *HostSrc,
                                             size_t SrcByteOffset,
                                             const void *DeviceDstHandle,
                                             size_t DstByteOffset,
                                             size_t ByteCount) {
  const auto Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  auto *Source = offset(HostSrc, SrcByteOffset);
  return CUresultToError(cuMemcpyHtoD(Destination, Source, ByteCount),
                         "cuMemcpyHtoD");
}
|
||||
|
||||
/// Copies \p ByteCount bytes between two device locations with cuMemcpyDtoD.
///
/// Note that this overload takes the destination handle and offset first.
/// Both addresses are produced by passing the handle and its byte offset
/// through offset(). The resulting CUresult is converted to an Error.
Error CUDAPlatformDevice::synchronousCopyD2D(const void *DeviceDstHandle,
                                             size_t DstByteOffset,
                                             const void *DeviceSrcHandle,
                                             size_t SrcByteOffset,
                                             size_t ByteCount) {
  const auto Destination =
      reinterpret_cast<CUdeviceptr>(offset(DeviceDstHandle, DstByteOffset));
  const auto Source =
      reinterpret_cast<CUdeviceptr>(offset(DeviceSrcHandle, SrcByteOffset));
  return CUresultToError(cuMemcpyDtoD(Destination, Source, ByteCount),
                         "cuMemcpyDtoD");
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace streamexecutor
|
|
@ -1,3 +0,0 @@
|
|||
# The streamexecutor-config template uses @PYTHON_EXECUTABLE@ for its shebang
# line, so a Python interpreter must be located first.
find_package(PythonInterp REQUIRED)

# Generate the script in the build tree. @ONLY restricts substitution to
# @VAR@ references so that any "${...}" text in the script is left untouched.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/streamexecutor-config.in"
  "${CMAKE_CURRENT_BINARY_DIR}/streamexecutor-config"
  @ONLY)

# Install the generated script as an executable program.
install(
  PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/streamexecutor-config"
  DESTINATION bin)
|
|
@ -1,231 +0,0 @@
|
|||
#!@PYTHON_EXECUTABLE@
|
||||
#
|
||||
#===- streamexecutor-config - Build config script for SE -----*- python -*--===#
|
||||
#
|
||||
# The LLVM Compiler Infrastructure
|
||||
#
|
||||
# This file is distributed under the University of Illinois Open Source
|
||||
# License. See LICENSE.TXT for details.
|
||||
#
|
||||
#===------------------------------------------------------------------------===#
|
||||
|
||||
r"""
|
||||
Get configuration info needed to compile programs which use StreamExecutor.
|
||||
|
||||
Runs llvm-config and adds StreamExecutor-specific flags to the output. Supports
|
||||
only the subset of llvm-config flags that are relevant for applications
|
||||
compiling against StreamExecutor.
|
||||
|
||||
This utility will typically be used to construct a compile command line for an
|
||||
application which depends on the StreamExecutor library.
|
||||
|
||||
For example:
|
||||
c++ example.cpp -o example \
|
||||
$(streamexecutor-config \
|
||||
--cppflags --cxxflags --ldflags --libs --system-libs)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import errno
|
||||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# The following functions are configured by cmake. They use raw triple-quoted
|
||||
# strings to surround values that are substituted by cmake at configure time.
|
||||
# This kind of quoting should allow for paths that contain spaces.
|
||||
|
||||
def get_llvm_config_dir():
  """Returns the directory expected to contain the llvm-config executable.

  The value is the "bin" directory under LLVM_BINARY_DIR, substituted by cmake
  at configure time.
  """
  llvm_config_dir = r"""@LLVM_BINARY_DIR@/bin"""
  return llvm_config_dir
|
||||
|
||||
def get_cmake_install_prefix():
  """Returns CMAKE_INSTALL_PREFIX as substituted by cmake at configure time."""
  install_prefix = r"""@CMAKE_INSTALL_PREFIX@"""
  return install_prefix
|
||||
|
||||
def convert_library_name(library_name):
  """Rewrites a '.framework' library path as a '-framework' linker flag.

  Names that do not end in '.framework' are returned unchanged. This is used
  to support OS X.

  >>> convert_library_name('')
  ''

  >>> convert_library_name('/usr/local/lib64/libcuda.so')
  '/usr/local/lib64/libcuda.so'

  >>> convert_library_name('/Library/Frameworks/cuda.framework')
  '-framework cuda'
  """
  suffix = '.framework'
  if not library_name.endswith(suffix):
    return library_name
  # Drop the directory part, then cut the suffix off the bundle name.
  bundle = os.path.basename(library_name)
  return '-framework ' + bundle[:len(bundle) - len(suffix)]
|
||||
|
||||
def get_cuda_driver_library():
  """Returns the configured CUDA_DRIVER_LIBRARY in linker-argument form.

  The raw value is substituted by cmake at configure time and then passed
  through convert_library_name.
  """
  configured_library = r"""@CUDA_DRIVER_LIBRARY@"""
  return convert_library_name(configured_library)
|
||||
|
||||
def cuddle_flag(flag, tokens):
  """Joins each stand-alone occurrence of flag with the token that follows it.

  The list is modified in place and nothing is returned. An occurrence of flag
  that is the last token is left alone.

  >>> tokens = ['-I', '/usr/include']
  >>> cuddle_flag('-I', tokens)
  >>> tokens
  ['-I/usr/include']

  >>> tokens = ['-L', '/usr/lib']
  >>> cuddle_flag('-L', tokens)
  >>> tokens
  ['-L/usr/lib']

  >>> tokens = ['-I']
  >>> cuddle_flag('-I', tokens)
  >>> tokens
  ['-I']

  >>> tokens = ['-I', '/usr/include', '-I', '/usr/local/include']
  >>> cuddle_flag('-I', tokens)
  >>> tokens
  ['-I/usr/include', '-I/usr/local/include']
  """
  position = 0
  while position < len(tokens):
    has_follower = position + 1 < len(tokens)
    if tokens[position] == flag and has_follower:
      # Absorb the next token; the merged token is not examined again.
      tokens[position] = flag + tokens.pop(position + 1)
    position += 1
|
||||
|
||||
def get_llvm_config_output_for_dir(llvm_config_dir, flags_string):
  """Runs llvm-config from the given directory and returns its output.

  Args:
    llvm_config_dir: Directory that contains the llvm-config executable.
    flags_string: Whitespace-separated arguments to pass to llvm-config.

  Returns:
    The llvm-config output as a single space-separated string in which
    stand-alone -I and -L flags are joined to the token that follows them.

  Raises:
    subprocess.CalledProcessError: If llvm-config exits with a non-zero status.
  """
  # universal_newlines=True makes check_output return text rather than bytes
  # under Python 3; shlex.split below cannot tokenize bytes.
  output = subprocess.check_output(
      ['%s/llvm-config' % llvm_config_dir] + flags_string.split(),
      universal_newlines=True).strip()
  tokens = shlex.split(output)
  cuddle_flag('-I', tokens)
  cuddle_flag('-L', tokens)
  return ' '.join(tokens)
|
||||
|
||||
def has_token(token, string):
  """Checks if the given token appears in the string.

  Both arguments are split with shell quoting rules, so quoted and unquoted
  spellings of the same token compare equal.

  Args:
    token: Text that must parse as exactly one shell token.
    string: Shell-style string to search.

  Returns:
    True if the parsed token is one of the tokens parsed from string.

  Raises:
    ValueError: If token parses as zero tokens or as more than one token.

  >>> string = '-I/usr/include -L"/usr/lib"'
  >>> has_token('-I/usr/include', string)
  True
  >>> has_token('-I/usr/local/include', string)
  False
  >>> has_token('-I"/usr/include"', string)
  True
  >>> has_token('-L"/usr/lib"', string)
  True
  >>> has_token('-L/usr/lib', string)
  True
  """
  split_token = shlex.split(token)
  if not split_token:
    # An empty or whitespace-only token would otherwise fail with an
    # IndexError on the lookup below.
    raise ValueError('has_token called with an empty token')
  if len(split_token) > 1:
    raise ValueError('has_token called with a multi-token token: ' + token)
  escaped_token = split_token[0]
  return escaped_token in shlex.split(string)
|
||||
|
||||
def main():
  """Parses the command line and prints the requested flags on one line."""
  parser = argparse.ArgumentParser(
      prog='streamexecutor-config',
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description=__doc__)

  # Boolean switches, registered in the order they appear in the help output.
  switches = (
      ('--cppflags',
       'C preprocessor flags for files that include StreamExecutor headers.'),
      ('--cxxflags',
       'C++ compiler flags for files that include StreamExecutor headers.'),
      ('--ldflags', 'Print linker flags.'),
      ('--libs', 'Libraries needed to link against StreamExecutor.'),
      ('--system-libs',
       'System libraries needed to link against StreamExecutor.'),
  )
  for switch, description in switches:
    parser.add_argument(switch, action='store_true', help=description)

  parser.add_argument(
      '--llvm-config-dir',
      default=get_llvm_config_dir(),
      help='Directory containing the llvm-config executable. '
      'If not specified, defaults to the cmake-configured location')

  args = parser.parse_args()

  # Without at least one flag switch there is nothing to print: show the help
  # text and exit with a failure status.
  requested = (args.cppflags, args.cxxflags, args.ldflags, args.libs,
               args.system_libs)
  if not any(requested):
    parser.print_help()
    sys.exit(1)

  # llvm-config must exist and be executable before any query is attempted.
  llvm_config = '%s/llvm-config' % args.llvm_config_dir
  if not os.path.isfile(llvm_config):
    sys.exit('llvm-config not found in: ' + args.llvm_config_dir)
  if not os.access(llvm_config, os.X_OK):
    sys.exit('llvm-config not executable in: ' + args.llvm_config_dir)

  def get_llvm_config_output(flags):
    # Every query goes to the llvm-config in the directory chosen above.
    return get_llvm_config_output_for_dir(args.llvm_config_dir, flags)

  all_flags = []

  if args.cppflags:
    llvm_flags = get_llvm_config_output('--cppflags')
    all_flags.append(llvm_flags)
    # Add the StreamExecutor include directory unless llvm-config already
    # reported the same one.
    se_flag = "-I%s/include" % get_cmake_install_prefix()
    if not has_token(se_flag, llvm_flags):
      all_flags.append(se_flag)

  if args.cxxflags:
    all_flags.append(get_llvm_config_output('--cxxflags'))

  if args.ldflags:
    llvm_flags = get_llvm_config_output('--ldflags')
    all_flags.append(llvm_flags)
    # Same de-duplication as above, for the library search path.
    se_flag = "-L%s/lib" % get_cmake_install_prefix()
    if not has_token(se_flag, llvm_flags):
      all_flags.append(se_flag)

  if args.libs:
    llvm_flags = get_llvm_config_output('--libs support symbolize')
    # StreamExecutor and the CUDA driver library are emitted before the LLVM
    # libraries.
    se_flag = '-lstreamexecutor'
    if not has_token(se_flag, llvm_flags):
      all_flags.append(se_flag)
    cuda_driver_library = get_cuda_driver_library()
    if cuda_driver_library:
      all_flags.append(cuda_driver_library)
    all_flags.append(llvm_flags)

  if args.system_libs:
    all_flags.append(get_llvm_config_output('--system-libs'))

  print(' '.join(all_flags))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,9 +0,0 @@
|
|||
# Umbrella target under which the StreamExecutor unit tests are registered.
add_custom_target(StreamExecutorUnitTests)
set_property(
  TARGET StreamExecutorUnitTests
  PROPERTY FOLDER "streamexecutor tests")

# add_se_unittest(<test-name> <source>...)
#
# Registers <test-name> under StreamExecutorUnitTests through add_unittest and
# links it against the streamexecutor library.
function(add_se_unittest testdir_name)
  add_unittest(
    StreamExecutorUnitTests
    ${testdir_name}
    ${ARGN})
  target_link_libraries(
    ${testdir_name}
    streamexecutor)
endfunction()

add_subdirectory(CoreTests)
|
|
@ -1,12 +0,0 @@
|
|||
# The CUDA test source is only added when the CUDA platform is enabled.
if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
  set(CUDA_TEST_SOURCES cuda/CUDATest.cpp)
endif()

# One test executable containing all of the core tests.
add_se_unittest(StreamExecutorCoreTests
  DeviceTest.cpp
  KernelSpecTest.cpp
  PackedKernelArgumentArrayTest.cpp
  StreamTest.cpp
  ${CUDA_TEST_SOURCES})
|
|
@ -1,378 +0,0 @@
|
|||
//===-- DeviceTest.cpp - Tests for Device ---------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains the unit tests for Device code.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include "streamexecutor/Device.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
#include "streamexecutor/platforms/host/HostPlatformDevice.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace se = ::streamexecutor;
|
||||
|
||||
// Short alias for HostPlatformDevice::getDeviceValue<int>. The tests below call
// it as getDeviceValue(Memory, Index) to inspect device buffer contents
// (presumably returns the int stored at that index -- confirm against
// HostPlatformDevice.h).
const auto &getDeviceValue = se::host::HostPlatformDevice::getDeviceValue<int>;
|
||||
|
||||
/// Test fixture to hold objects used by tests.
|
||||
class DeviceTest : public ::testing::Test {
|
||||
public:
|
||||
DeviceTest()
|
||||
: Device(&PDevice), HostA5{0, 1, 2, 3, 4}, HostB5{5, 6, 7, 8, 9},
|
||||
HostA7{10, 11, 12, 13, 14, 15, 16}, HostB7{17, 18, 19, 20, 21, 22, 23},
|
||||
DeviceA5(getOrDie(Device.allocateDeviceMemory<int>(5))),
|
||||
DeviceB5(getOrDie(Device.allocateDeviceMemory<int>(5))),
|
||||
DeviceA7(getOrDie(Device.allocateDeviceMemory<int>(7))),
|
||||
DeviceB7(getOrDie(Device.allocateDeviceMemory<int>(7))),
|
||||
Host5{24, 25, 26, 27, 28}, Host7{29, 30, 31, 32, 33, 34, 35} {
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostA5, DeviceA5));
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostB5, DeviceB5));
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostA7, DeviceA7));
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostB7, DeviceB7));
|
||||
}
|
||||
|
||||
se::host::HostPlatformDevice PDevice;
|
||||
se::Device Device;
|
||||
|
||||
// Device memory is backed by host arrays.
|
||||
int HostA5[5];
|
||||
int HostB5[5];
|
||||
int HostA7[7];
|
||||
int HostB7[7];
|
||||
se::GlobalDeviceMemory<int> DeviceA5;
|
||||
se::GlobalDeviceMemory<int> DeviceB5;
|
||||
se::GlobalDeviceMemory<int> DeviceA7;
|
||||
se::GlobalDeviceMemory<int> DeviceB7;
|
||||
|
||||
// Host memory to be used as actual host memory.
|
||||
int Host5[5];
|
||||
int Host7[7];
|
||||
};
|
||||
|
||||
// Expectation helpers for se::Error values.
//
// Both macros bind the error to a local, record the expectation, and then
// consume the error. Consuming it in EXPECT_NO_ERROR as well means that a
// failed expectation is reported through gtest and the error is never left
// unhandled when it goes out of scope (presumably se::Error follows
// llvm::Error's must-be-handled rule -- confirm against Error.h).
//
// Each macro expands to a single statement and must be followed by ';'.
#define EXPECT_NO_ERROR(E)                                                     \
  do {                                                                         \
    se::Error E__ = E;                                                         \
    EXPECT_FALSE(static_cast<bool>(E__));                                      \
    consumeError(std::move(E__));                                              \
  } while (false)
#define EXPECT_ERROR(E)                                                        \
  do {                                                                         \
    se::Error E__ = E;                                                         \
    EXPECT_TRUE(static_cast<bool>(E__));                                       \
    consumeError(std::move(E__));                                              \
  } while (false)
|
||||
|
||||
using llvm::ArrayRef;
|
||||
using llvm::MutableArrayRef;
|
||||
|
||||
// A Device wrapping the host platform device reports the name "host".
TEST_F(DeviceTest, GetName) {
  EXPECT_EQ(Device.getName(), "host");
}
|
||||
|
||||
// Allocating ten ints of device memory must succeed.
TEST_F(DeviceTest, AllocateAndFreeDeviceMemory) {
  se::Expected<se::GlobalDeviceMemory<int>> MaybeMemory(
      Device.allocateDeviceMemory<int>(10));
  const bool Allocated = static_cast<bool>(MaybeMemory);
  EXPECT_TRUE(Allocated);
}
|
||||
|
||||
// Registering a ten-element host vector must succeed.
TEST_F(DeviceTest, RegisterAndUnregisterHostMemory) {
  std::vector<int> Data(10);
  se::Expected<se::RegisteredHostMemory<int>> MaybeMemory(
      Device.registerHostMemory<int>(Data));
  const bool Registered = static_cast<bool>(MaybeMemory);
  EXPECT_TRUE(Registered);
}
|
||||
|
||||
// D2H tests
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2HToMutableArrayRefByCount) {
  // Copy all five elements.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host5), 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(HostA5[Index], Host5[Index]);
  }

  // Copy only the first two elements.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2H(DeviceB5, MutableArrayRef<int>(Host5), 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(HostB5[Index], Host5[Index]);
  }

  // A count of seven must be rejected when the destination, the source, or
  // both hold only five elements.
  EXPECT_ERROR(
      Device.synchronousCopyD2H(DeviceA7, MutableArrayRef<int>(Host5), 7));
  EXPECT_ERROR(
      Device.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host7), 7));
  EXPECT_ERROR(
      Device.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host5), 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2HToMutableArrayRef) {
  // Equal-sized source and destination: the copy succeeds.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host5)));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(HostA5[Index], Host5[Index]);
  }

  // Source and destination of different sizes are rejected either way round.
  EXPECT_ERROR(
      Device.synchronousCopyD2H(DeviceA7, MutableArrayRef<int>(Host5)));
  EXPECT_ERROR(
      Device.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host7)));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2HToPointer) {
  // Copy all five elements to a raw host pointer.
  EXPECT_NO_ERROR(Device.synchronousCopyD2H(DeviceA5, Host5, 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(HostA5[Index], Host5[Index]);
  }

  // Asking for seven elements from the five-element buffer must fail.
  EXPECT_ERROR(Device.synchronousCopyD2H(DeviceA5, Host7, 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2HSliceToMutableArrayRefByCount) {
  // Copy the last four elements into the last four host slots.
  EXPECT_NO_ERROR(Device.synchronousCopyD2H(
      DeviceA5.asSlice().slice(1), MutableArrayRef<int>(Host5 + 1, 4), 4));
  for (int Index = 1; Index != 5; ++Index) {
    EXPECT_EQ(HostA5[Index], Host5[Index]);
  }

  // Copy two elements out of a slice that drops the final element.
  EXPECT_NO_ERROR(Device.synchronousCopyD2H(DeviceB5.asSlice().drop_back(1),
                                            MutableArrayRef<int>(Host5), 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(HostB5[Index], Host5[Index]);
  }

  // A count of seven must be rejected when the destination, the source, or
  // both hold only five elements.
  EXPECT_ERROR(Device.synchronousCopyD2H(DeviceA7.asSlice(),
                                         MutableArrayRef<int>(Host5), 7));
  EXPECT_ERROR(Device.synchronousCopyD2H(DeviceA5.asSlice(),
                                         MutableArrayRef<int>(Host7), 7));
  EXPECT_ERROR(Device.synchronousCopyD2H(DeviceA5.asSlice(),
                                         MutableArrayRef<int>(Host5), 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2HSliceToMutableArrayRef) {
  // A five-element slice starting at element one fills the five-element
  // host array.
  EXPECT_NO_ERROR(Device.synchronousCopyD2H(DeviceA7.asSlice().slice(1, 5),
                                            MutableArrayRef<int>(Host5)));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(HostA7[Index + 1], Host5[Index]);
  }

  // Slice and destination of different sizes are rejected.
  EXPECT_ERROR(Device.synchronousCopyD2H(DeviceA7.asSlice().drop_back(1),
                                         MutableArrayRef<int>(Host5)));
  EXPECT_ERROR(Device.synchronousCopyD2H(DeviceA5.asSlice(),
                                         MutableArrayRef<int>(Host7)));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2HSliceToPointer) {
  // Copy the last four elements to a raw pointer offset by one.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2H(DeviceA5.asSlice().slice(1), Host5 + 1, 4));
  for (int Index = 1; Index != 5; ++Index) {
    EXPECT_EQ(HostA5[Index], Host5[Index]);
  }

  // Asking for seven elements from the five-element slice must fail.
  EXPECT_ERROR(Device.synchronousCopyD2H(DeviceA5.asSlice(), Host7, 7));
}
|
||||
|
||||
// H2D tests
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyH2DToArrayRefByCount) {
  // Copy all five elements.
  EXPECT_NO_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5, 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);
  }

  // Copy only the first two elements.
  EXPECT_NO_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceB5, 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceB5, Index), Host5[Index]);
  }

  // A count of seven must be rejected when the destination, the source, or
  // both hold only five elements.
  EXPECT_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5, 7));
  EXPECT_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7, 7));
  EXPECT_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5, 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyH2DToArrayRef) {
  // Equal-sized source and destination: the copy succeeds.
  EXPECT_NO_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);
  }

  // Source and destination of different sizes are rejected either way round.
  EXPECT_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7));
  EXPECT_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyH2DToPointer) {
  // Copy all five elements from a raw host pointer.
  EXPECT_NO_ERROR(Device.synchronousCopyH2D(Host5, DeviceA5, 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);
  }

  // Writing seven elements into the five-element buffer must fail.
  EXPECT_ERROR(Device.synchronousCopyH2D(Host7, DeviceA5, 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyH2DSliceToArrayRefByCount) {
  // Copy the last four host elements into the last four device slots.
  EXPECT_NO_ERROR(Device.synchronousCopyH2D(ArrayRef<int>(Host5 + 1, 4),
                                            DeviceA5.asSlice().slice(1), 4));
  for (int Index = 1; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);
  }

  // Copy two elements into a slice that drops the final element.
  EXPECT_NO_ERROR(Device.synchronousCopyH2D(
      ArrayRef<int>(Host5), DeviceB5.asSlice().drop_back(1), 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceB5, Index), Host5[Index]);
  }

  // A count of seven must be rejected when the destination, the source, or
  // both hold only five elements.
  EXPECT_ERROR(
      Device.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5.asSlice(), 7));
  EXPECT_ERROR(
      Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7.asSlice(), 7));
  EXPECT_ERROR(
      Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5.asSlice(), 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyH2DSliceToArrayRef) {
  // Equal-sized source and destination slice: the copy succeeds.
  EXPECT_NO_ERROR(
      Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5.asSlice()));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);
  }

  // Source and destination of different sizes are rejected either way round.
  EXPECT_ERROR(
      Device.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7.asSlice()));
  EXPECT_ERROR(
      Device.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5.asSlice()));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyH2DSliceToPointer) {
  // Copy all five elements from a raw host pointer into a device slice.
  EXPECT_NO_ERROR(Device.synchronousCopyH2D(Host5, DeviceA5.asSlice(), 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);
  }

  // Writing seven elements into the five-element slice must fail.
  EXPECT_ERROR(Device.synchronousCopyH2D(Host7, DeviceA5.asSlice(), 7));
}
|
||||
|
||||
// D2D tests
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2DByCount) {
  // After a five-element copy the two five-element buffers agree.
  EXPECT_NO_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB5, 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), getDeviceValue(DeviceB5, Index));
  }

  // After a two-element copy the first two elements agree.
  EXPECT_NO_ERROR(Device.synchronousCopyD2D(DeviceA7, DeviceB7, 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA7, Index), getDeviceValue(DeviceB7, Index));
  }

  // A count of seven must be rejected when either buffer holds only five
  // elements.
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB5, 7));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA7, DeviceB5, 7));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB7, 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2D) {
  // Equal-sized buffers: the copy succeeds and the contents agree.
  EXPECT_NO_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), getDeviceValue(DeviceB5, Index));
  }

  // Buffers of different sizes are rejected either way round.
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA7, DeviceB5));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopySliceD2DByCount) {
  // Four elements taken from a slice that starts at element one.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA5.asSlice().slice(1), DeviceB5, 4));
  for (int Index = 0; Index != 4; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index + 1),
              getDeviceValue(DeviceB5, Index));
  }

  // Two elements taken from a slice that drops the final element.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA7.asSlice().drop_back(1), DeviceB7, 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA7, Index), getDeviceValue(DeviceB7, Index));
  }

  // A count of seven must be rejected when either side holds only five
  // elements.
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5, 7));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB5, 7));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB7, 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopySliceD2D) {
  // Dropping two elements from the seven-element buffer leaves a slice that
  // matches the five-element buffer.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA7.asSlice().drop_back(2), DeviceB5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA7, Index), getDeviceValue(DeviceB5, Index));
  }

  // Slices whose size differs from the other buffer are rejected.
  EXPECT_ERROR(
      Device.synchronousCopyD2D(DeviceA7.asSlice().slice(1), DeviceB5));
  EXPECT_ERROR(
      Device.synchronousCopyD2D(DeviceA5.asSlice().drop_back(1), DeviceB7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2DSliceByCount) {
  // Five elements paired with a slice that starts at element two.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA5, DeviceB7.asSlice().slice(2), 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index),
              getDeviceValue(DeviceB7, Index + 2));
  }

  // Two elements paired with a slice that drops the last three elements.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA7, DeviceB7.asSlice().drop_back(3), 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA7, Index), getDeviceValue(DeviceB7, Index));
  }

  // A count of seven must be rejected when either side holds only five
  // elements.
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB5.asSlice(), 7));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA7, DeviceB5.asSlice(), 7));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB7.asSlice(), 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopyD2DSlice) {
  // Dropping two elements from the seven-element buffer leaves a slice that
  // matches the five-element buffer.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA5, DeviceB7.asSlice().drop_back(2)));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), getDeviceValue(DeviceB7, Index));
  }

  // A buffer and a slice of different sizes are rejected either way round.
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA7, DeviceB5.asSlice()));
  EXPECT_ERROR(Device.synchronousCopyD2D(DeviceA5, DeviceB7.asSlice()));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopySliceD2DSliceByCount) {
  // After a five-element copy the two five-element slices agree.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice(), 5));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), getDeviceValue(DeviceB5, Index));
  }

  // After a two-element copy the first two elements agree.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB7.asSlice(), 2));
  for (int Index = 0; Index != 2; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA7, Index), getDeviceValue(DeviceB7, Index));
  }

  // A count of seven must be rejected when either slice holds only five
  // elements.
  EXPECT_ERROR(
      Device.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice(), 7));
  EXPECT_ERROR(
      Device.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB5.asSlice(), 7));
  EXPECT_ERROR(
      Device.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB7.asSlice(), 7));
}
|
||||
|
||||
TEST_F(DeviceTest, SyncCopySliceD2DSlice) {
  // Equal-sized slices: the copy succeeds and the contents agree.
  EXPECT_NO_ERROR(
      Device.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice()));
  for (int Index = 0; Index != 5; ++Index) {
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), getDeviceValue(DeviceB5, Index));
  }

  // Slices of different sizes are rejected either way round.
  EXPECT_ERROR(
      Device.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB5.asSlice()));
  EXPECT_ERROR(
      Device.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB7.asSlice()));
}
|
||||
|
||||
} // namespace
|
|
@ -1,135 +0,0 @@
|
|||
//===-- KernelSpecTest.cpp - Tests for KernelSpec -------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains the unit tests for the code in KernelSpec.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/KernelSpec.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace se = ::streamexecutor;
|
||||
|
||||
// A spec built with an empty code list keeps its kernel name and has no code
// to offer for compute capability 1.0.
TEST(CUDAPTXInMemorySpec, NoCode) {
  se::CUDAPTXInMemorySpec Spec("KernelName", {});

  EXPECT_EQ("KernelName", Spec.getKernelName());
  EXPECT_EQ(nullptr, Spec.getCode(1, 0));
}
|
||||
|
||||
// With code registered only for compute capability 1.0, lookups for 1.0 and
// 2.0 return that code, while a lookup for 0.5 returns nothing.
TEST(CUDAPTXInMemorySpec, SingleComputeCapability) {
  const char *PTXCodeString = "Dummy PTX code";
  se::CUDAPTXInMemorySpec Spec("KernelName", {{{1, 0}, PTXCodeString}});

  EXPECT_EQ("KernelName", Spec.getKernelName());

  EXPECT_EQ(nullptr, Spec.getCode(0, 5));
  EXPECT_EQ(PTXCodeString, Spec.getCode(1, 0));
  EXPECT_EQ(PTXCodeString, Spec.getCode(2, 0));
}
|
||||
|
||||
// With code registered for compute capabilities 1.0 and 3.0, exact lookups
// return the matching code, a lookup for 2.0 returns the 1.0 code, and a
// lookup for 0.5 returns nothing.
TEST(CUDAPTXInMemorySpec, TwoComputeCapabilities) {
  const char *PTXCodeString10 = "Dummy PTX code 10";
  const char *PTXCodeString30 = "Dummy PTX code 30";
  se::CUDAPTXInMemorySpec Spec(
      "KernelName", {{{1, 0}, PTXCodeString10}, {{3, 0}, PTXCodeString30}});

  EXPECT_EQ("KernelName", Spec.getKernelName());

  EXPECT_EQ(nullptr, Spec.getCode(0, 5));
  EXPECT_EQ(PTXCodeString10, Spec.getCode(1, 0));
  EXPECT_EQ(PTXCodeString30, Spec.getCode(3, 0));
  EXPECT_EQ(PTXCodeString10, Spec.getCode(2, 0));
}
|
||||
|
||||
// The spec hands back the kernel name and the fatbin bytes it was built with.
TEST(CUDAFatbinInMemorySpec, BasicUsage) {
  const char *FatbinBytes = "Dummy fatbin bytes";
  se::CUDAFatbinInMemorySpec Spec("KernelName", FatbinBytes);

  EXPECT_EQ("KernelName", Spec.getKernelName());
  EXPECT_EQ(FatbinBytes, Spec.getBytes());
}
|
||||
|
||||
// The spec hands back the kernel name and the OpenCL text it was built with.
TEST(OpenCLTextInMemorySpec, BasicUsage) {
  const char *OpenCLText = "Dummy OpenCL text";
  se::OpenCLTextInMemorySpec Spec("KernelName", OpenCLText);

  EXPECT_EQ("KernelName", Spec.getKernelName());
  EXPECT_EQ(OpenCLText, Spec.getText());
}
|
||||
|
||||
// A default-constructed loader spec reports no specs of any kind, and each
// getter is expected to die (in debug builds) when its spec is absent.
TEST(MultiKernelLoaderSpec, NoCode) {
  se::MultiKernelLoaderSpec MultiSpec;

  EXPECT_FALSE(MultiSpec.hasCUDAPTXInMemory());
  EXPECT_FALSE(MultiSpec.hasCUDAFatbinInMemory());
  EXPECT_FALSE(MultiSpec.hasOpenCLTextInMemory());

  EXPECT_DEBUG_DEATH(MultiSpec.getCUDAPTXInMemory(),
                     "getting spec that is not present");
  EXPECT_DEBUG_DEATH(MultiSpec.getCUDAFatbinInMemory(),
                     "getting spec that is not present");
  EXPECT_DEBUG_DEATH(MultiSpec.getOpenCLTextInMemory(),
                     "getting spec that is not present");
}
|
||||
|
||||
// Registers one spec of each kind under the same kernel name, then checks
// that every spec is reported as present and returns what was registered.
TEST(MultiKernelLoaderSpec, Registration) {
  const char *KernelName = "KernelName";
  const char *PTXCodeString = "Dummy PTX code";
  const char *FatbinBytes = "Dummy fatbin bytes";
  const char *OpenCLText = "Dummy OpenCL text";

  se::MultiKernelLoaderSpec MultiSpec;
  MultiSpec.addCUDAPTXInMemory(KernelName, {{{1, 0}, PTXCodeString}})
      .addCUDAFatbinInMemory(KernelName, FatbinBytes)
      .addOpenCLTextInMemory(KernelName, OpenCLText);

  EXPECT_TRUE(MultiSpec.hasCUDAPTXInMemory());
  EXPECT_TRUE(MultiSpec.hasCUDAFatbinInMemory());
  EXPECT_TRUE(MultiSpec.hasOpenCLTextInMemory());

  // PTX spec.
  EXPECT_EQ(KernelName, MultiSpec.getCUDAPTXInMemory().getKernelName());
  EXPECT_EQ(nullptr, MultiSpec.getCUDAPTXInMemory().getCode(0, 5));
  EXPECT_EQ(PTXCodeString, MultiSpec.getCUDAPTXInMemory().getCode(1, 0));
  EXPECT_EQ(PTXCodeString, MultiSpec.getCUDAPTXInMemory().getCode(2, 0));

  // Fatbin spec.
  EXPECT_EQ(KernelName, MultiSpec.getCUDAFatbinInMemory().getKernelName());
  EXPECT_EQ(FatbinBytes, MultiSpec.getCUDAFatbinInMemory().getBytes());

  // OpenCL text spec.
  EXPECT_EQ(KernelName, MultiSpec.getOpenCLTextInMemory().getKernelName());
  EXPECT_EQ(OpenCLText, MultiSpec.getOpenCLTextInMemory().getText());
}
|
||||
|
||||
// Adding the same kind of spec a second time is expected to die (in debug
// builds) with an overwrite message.
TEST(MultiKernelLoaderSpec, RegisterTwice) {
  const char *KernelName = "KernelName";
  const char *FatbinBytes = "Dummy fatbin bytes";

  se::MultiKernelLoaderSpec MultiSpec;
  MultiSpec.addCUDAFatbinInMemory(KernelName, FatbinBytes);

  EXPECT_DEBUG_DEATH(MultiSpec.addCUDAFatbinInMemory(KernelName, FatbinBytes),
                     "illegal loader spec overwrite");
}
|
||||
|
||||
// Kernel names are compared by content: an equal name stored elsewhere is
// accepted, while a different name is expected to die (in debug builds).
TEST(MultiKernelLoaderSpec, ConflictingKernelNames) {
  const char *KernelNameA = "KernelName";
  std::string KernelNameB = KernelNameA;
  const char *PTXCodeString = "Dummy PTX code";
  const char *FatbinBytes = "Dummy fatbin bytes";

  se::MultiKernelLoaderSpec MultiSpec;

  // Equal strings held in different locations must not be seen as a conflict.
  MultiSpec.addCUDAPTXInMemory(KernelNameA, {{{1, 0}, PTXCodeString}})
      .addCUDAFatbinInMemory(KernelNameB, FatbinBytes);

  // A different kernel name must be refused.
  const char *OtherKernelName = "OtherKernelName";
  const char *OpenCLText = "Dummy OpenCL text";
  EXPECT_DEBUG_DEATH(
      MultiSpec.addOpenCLTextInMemory(OtherKernelName, OpenCLText),
      "different kernel names in one MultiKernelLoaderSpec");
}
|
||||
|
||||
} // namespace
|
|
@ -1,150 +0,0 @@
|
|||
//===-- PackedKernelArgumentArrayTest.cpp - tests for kernel arg packing --===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// Unit tests for kernel argument packing.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/Device.h"
|
||||
#include "streamexecutor/DeviceMemory.h"
|
||||
#include "streamexecutor/PackedKernelArgumentArray.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
#include "streamexecutor/platforms/host/HostPlatformDevice.h"
|
||||
|
||||
#include "llvm/ADT/Twine.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace se = ::streamexecutor;
|
||||
|
||||
using Type = se::KernelArgumentType;
|
||||
|
||||
// Test fixture class for testing argument packing.
|
||||
//
|
||||
// Basically defines a bunch of types to be packed so they don't have to be
|
||||
// defined separately in each test.
|
||||
class DeviceMemoryPackingTest : public ::testing::Test {
|
||||
public:
|
||||
DeviceMemoryPackingTest()
|
||||
: Device(&PDevice), Value(42), Handle(&Value), ByteCount(15),
|
||||
ElementCount(5),
|
||||
TypedGlobal(getOrDie(Device.allocateDeviceMemory<int>(ElementCount))),
|
||||
TypedShared(
|
||||
se::SharedDeviceMemory<int>::makeFromElementCount(ElementCount)) {}
|
||||
|
||||
se::host::HostPlatformDevice PDevice;
|
||||
se::Device Device;
|
||||
int Value;
|
||||
void *Handle;
|
||||
size_t ByteCount;
|
||||
size_t ElementCount;
|
||||
se::GlobalDeviceMemory<int> TypedGlobal;
|
||||
se::SharedDeviceMemory<int> TypedShared;
|
||||
};
|
||||
|
||||
// Utility method to check the expected address, size, and type for a packed
|
||||
// argument at the given index of a PackedKernelArgumentArray.
|
||||
template <typename... ParameterTs>
|
||||
static void
|
||||
ExpectEqual(const void *ExpectedAddress, size_t ExpectedSize, Type ExpectedType,
|
||||
const se::PackedKernelArgumentArray<ParameterTs...> &Observed,
|
||||
size_t Index) {
|
||||
SCOPED_TRACE(("Index = " + llvm::Twine(Index)).str());
|
||||
EXPECT_EQ(ExpectedAddress, Observed.getAddress(Index));
|
||||
EXPECT_EQ(ExpectedAddress, Observed.getAddresses()[Index]);
|
||||
EXPECT_EQ(ExpectedSize, Observed.getSize(Index));
|
||||
EXPECT_EQ(ExpectedSize, Observed.getSizes()[Index]);
|
||||
EXPECT_EQ(ExpectedType, Observed.getType(Index));
|
||||
EXPECT_EQ(ExpectedType, Observed.getTypes()[Index]);
|
||||
}
|
||||
|
||||
// A lone plain value is expected to pack as one VALUE argument that points at
// the value itself, with no shared-memory arguments.
TEST_F(DeviceMemoryPackingTest, SingleValue) {
  auto Packed = se::make_kernel_argument_pack(Value);

  EXPECT_EQ(1u, Packed.getArgumentCount());
  EXPECT_EQ(0u, Packed.getSharedCount());
  ExpectEqual(&Value, sizeof(Value), Type::VALUE, Packed, 0);
}
|
||||
|
||||
// Global device memory passed by reference is expected to pack as a
// pointer-sized GLOBAL_DEVICE_MEMORY argument located at the handle address.
TEST_F(DeviceMemoryPackingTest, SingleTypedGlobal) {
  auto Packed = se::make_kernel_argument_pack(TypedGlobal);

  EXPECT_EQ(1u, Packed.getArgumentCount());
  EXPECT_EQ(0u, Packed.getSharedCount());
  ExpectEqual(TypedGlobal.getHandleAddress(), sizeof(void *),
              Type::GLOBAL_DEVICE_MEMORY, Packed, 0);
}
|
||||
|
||||
// Passing a pointer to global device memory is expected to pack exactly like
// passing the memory object itself.
TEST_F(DeviceMemoryPackingTest, SingleTypedGlobalPointer) {
  auto Packed = se::make_kernel_argument_pack(&TypedGlobal);

  EXPECT_EQ(1u, Packed.getArgumentCount());
  EXPECT_EQ(0u, Packed.getSharedCount());
  ExpectEqual(TypedGlobal.getHandleAddress(), sizeof(void *),
              Type::GLOBAL_DEVICE_MEMORY, Packed, 0);
}
|
||||
|
||||
// A pointer-to-const to global device memory is expected to pack the same way
// as the non-const pointer.
TEST_F(DeviceMemoryPackingTest, SingleConstTypedGlobalPointer) {
  const se::GlobalDeviceMemory<int> *ConstGlobal = &TypedGlobal;
  auto Packed = se::make_kernel_argument_pack(ConstGlobal);

  EXPECT_EQ(1u, Packed.getArgumentCount());
  EXPECT_EQ(0u, Packed.getSharedCount());
  ExpectEqual(TypedGlobal.getHandleAddress(), sizeof(void *),
              Type::GLOBAL_DEVICE_MEMORY, Packed, 0);
}
|
||||
|
||||
// Shared device memory is expected to pack with a null address, the memory's
// byte count as its size, and to be counted as one shared argument.
TEST_F(DeviceMemoryPackingTest, SingleTypedShared) {
  auto Packed = se::make_kernel_argument_pack(TypedShared);

  EXPECT_EQ(1u, Packed.getArgumentCount());
  EXPECT_EQ(1u, Packed.getSharedCount());
  ExpectEqual(nullptr, TypedShared.getByteCount(), Type::SHARED_DEVICE_MEMORY,
              Packed, 0);
}
|
||||
|
||||
// Passing a pointer to shared device memory is expected to pack exactly like
// passing the memory object itself.
TEST_F(DeviceMemoryPackingTest, SingleTypedSharedPointer) {
  auto Packed = se::make_kernel_argument_pack(&TypedShared);

  EXPECT_EQ(1u, Packed.getArgumentCount());
  EXPECT_EQ(1u, Packed.getSharedCount());
  ExpectEqual(nullptr, TypedShared.getByteCount(), Type::SHARED_DEVICE_MEMORY,
              Packed, 0);
}
|
||||
|
||||
// A pointer-to-const to shared device memory is expected to pack the same way
// as the non-const pointer.
TEST_F(DeviceMemoryPackingTest, SingleConstTypedSharedPointer) {
  const se::SharedDeviceMemory<int> *ConstShared = &TypedShared;
  auto Packed = se::make_kernel_argument_pack(ConstShared);

  EXPECT_EQ(1u, Packed.getArgumentCount());
  EXPECT_EQ(1u, Packed.getSharedCount());
  ExpectEqual(nullptr, TypedShared.getByteCount(), Type::SHARED_DEVICE_MEMORY,
              Packed, 0);
}
|
||||
|
||||
// Packs one argument of every supported form at once and checks each slot:
// index 0 is the plain value, indices 1-3 are the three spellings of global
// memory, and indices 4-6 are the three spellings of shared memory.
TEST_F(DeviceMemoryPackingTest, PackSeveralArguments) {
  const se::GlobalDeviceMemory<int> *ConstGlobal = &TypedGlobal;
  const se::SharedDeviceMemory<int> *ConstShared = &TypedShared;
  auto Packed = se::make_kernel_argument_pack(Value, TypedGlobal, &TypedGlobal,
                                              ConstGlobal, TypedShared,
                                              &TypedShared, ConstShared);

  ExpectEqual(&Value, sizeof(Value), Type::VALUE, Packed, 0);

  // Object, pointer, and pointer-to-const all refer to the same global memory.
  for (size_t Index = 1; Index <= 3; ++Index)
    ExpectEqual(TypedGlobal.getHandleAddress(), sizeof(void *),
                Type::GLOBAL_DEVICE_MEMORY, Packed, Index);

  // Object, pointer, and pointer-to-const all refer to the same shared memory.
  for (size_t Index = 4; Index <= 6; ++Index)
    ExpectEqual(nullptr, TypedShared.getByteCount(),
                Type::SHARED_DEVICE_MEMORY, Packed, Index);

  EXPECT_EQ(7u, Packed.getArgumentCount());
  EXPECT_EQ(3u, Packed.getSharedCount());
}
|
||||
|
||||
} // namespace
|
|
@ -1,290 +0,0 @@
|
|||
//===-- StreamTest.cpp - Tests for Stream ---------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains the unit tests for Stream code.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include "streamexecutor/Device.h"
|
||||
#include "streamexecutor/Kernel.h"
|
||||
#include "streamexecutor/KernelSpec.h"
|
||||
#include "streamexecutor/PlatformDevice.h"
|
||||
#include "streamexecutor/Stream.h"
|
||||
#include "streamexecutor/platforms/host/HostPlatformDevice.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace se = ::streamexecutor;
|
||||
|
||||
// Short local name for HostPlatformDevice::getDeviceValue<int>; the tests below
// call it as getDeviceValue(Memory, Index) to inspect device-side elements.
const auto &getDeviceValue = se::host::HostPlatformDevice::getDeviceValue<int>;
|
||||
|
||||
/// Test fixture to hold objects used by tests.
|
||||
class StreamTest : public ::testing::Test {
|
||||
public:
|
||||
StreamTest()
|
||||
: DummyPlatformStream(1), Device(&PDevice),
|
||||
Stream(&PDevice, &DummyPlatformStream), HostA5{0, 1, 2, 3, 4},
|
||||
HostB5{5, 6, 7, 8, 9}, HostA7{10, 11, 12, 13, 14, 15, 16},
|
||||
HostB7{17, 18, 19, 20, 21, 22, 23}, Host5{24, 25, 26, 27, 28},
|
||||
Host7{29, 30, 31, 32, 33, 34, 35},
|
||||
RegisteredHost5(getOrDie(
|
||||
Device.registerHostMemory(llvm::MutableArrayRef<int>(Host5)))),
|
||||
RegisteredHost7(getOrDie(
|
||||
Device.registerHostMemory(llvm::MutableArrayRef<int>(Host7)))),
|
||||
DeviceA5(getOrDie(Device.allocateDeviceMemory<int>(5))),
|
||||
DeviceB5(getOrDie(Device.allocateDeviceMemory<int>(5))),
|
||||
DeviceA7(getOrDie(Device.allocateDeviceMemory<int>(7))),
|
||||
DeviceB7(getOrDie(Device.allocateDeviceMemory<int>(7))) {
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostA5, DeviceA5));
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostB5, DeviceB5));
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostA7, DeviceA7));
|
||||
se::dieIfError(Device.synchronousCopyH2D<int>(HostB7, DeviceB7));
|
||||
}
|
||||
|
||||
protected:
|
||||
int DummyPlatformStream; // Mimicking a platform where the platform stream
|
||||
// handle is just a stream number.
|
||||
se::host::HostPlatformDevice PDevice;
|
||||
se::Device Device;
|
||||
se::Stream Stream;
|
||||
|
||||
// Device memory is matched by host arrays.
|
||||
int HostA5[5];
|
||||
int HostB5[5];
|
||||
int HostA7[7];
|
||||
int HostB7[7];
|
||||
|
||||
// Host memory to be used as actual host memory.
|
||||
int Host5[5];
|
||||
int Host7[7];
|
||||
|
||||
se::RegisteredHostMemory<int> RegisteredHost5;
|
||||
se::RegisteredHostMemory<int> RegisteredHost7;
|
||||
|
||||
// Device memory.
|
||||
se::GlobalDeviceMemory<int> DeviceA5;
|
||||
se::GlobalDeviceMemory<int> DeviceB5;
|
||||
se::GlobalDeviceMemory<int> DeviceA7;
|
||||
se::GlobalDeviceMemory<int> DeviceB7;
|
||||
};
|
||||
|
||||
// D2H tests
|
||||
|
||||
// Device-to-host copies into registered memory with an explicit element count.
TEST_F(StreamTest, CopyD2HToRegisteredRefByCount) {
  // Copy all five elements.
  Stream.thenCopyD2H(DeviceA5, RegisteredHost5, 5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(HostA5[Index], Host5[Index]);

  // Copy only the first two elements.
  Stream.thenCopyD2H(DeviceB5, RegisteredHost5, 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(HostB5[Index], Host5[Index]);

  // Seven elements do not fit in the five-element destination.
  Stream.thenCopyD2H(DeviceA7, RegisteredHost5, 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-host copies into registered memory without an element count.
TEST_F(StreamTest, CopyD2HToRegistered) {
  // Source and destination both hold five elements.
  Stream.thenCopyD2H(DeviceA5, RegisteredHost5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(HostA5[Index], Host5[Index]);

  // A five-element source and a seven-element destination should be rejected.
  Stream.thenCopyD2H(DeviceA5, RegisteredHost7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-slice-to-host copies with an explicit element count.
TEST_F(StreamTest, CopyD2HSliceToRegiseredSliceByCount) {
  // Slice to slice: host elements 1..4 should end up mirroring the source.
  Stream.thenCopyD2H(DeviceA5.asSlice().slice(1),
                     RegisteredHost5.asSlice().slice(1, 4), 4);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 1; Index != 5; ++Index)
    EXPECT_EQ(HostA5[Index], Host5[Index]);

  // Shortened device slice into the whole registered buffer, two elements.
  Stream.thenCopyD2H(DeviceB5.asSlice().drop_back(1), RegisteredHost5, 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(HostB5[Index], Host5[Index]);

  // Asking for seven elements from a five-element source should fail.
  Stream.thenCopyD2H(DeviceA5.asSlice(), RegisteredHost7, 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-slice-to-host copies without an element count.
TEST_F(StreamTest, CopyD2HSliceToRegistered) {
  // Host element I is expected to receive device element I + 1.
  Stream.thenCopyD2H(DeviceA7.asSlice().slice(1, 5), RegisteredHost5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(HostA7[Index + 1], Host5[Index]);

  // A five-element slice and a seven-element destination should be rejected.
  Stream.thenCopyD2H(DeviceA5.asSlice(), RegisteredHost7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// H2D tests
|
||||
|
||||
// Host-to-device copies from registered memory with an explicit element count.
TEST_F(StreamTest, CopyH2DFromRegisterdByCount) {
  // Copy all five elements.
  Stream.thenCopyH2D(RegisteredHost5, DeviceA5, 5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);

  // Copy only the first two elements.
  Stream.thenCopyH2D(RegisteredHost5, DeviceB5, 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceB5, Index), Host5[Index]);

  // Seven elements do not fit in the five-element destination.
  Stream.thenCopyH2D(RegisteredHost7, DeviceA5, 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Host-to-device copies from registered memory without an element count.
TEST_F(StreamTest, CopyH2DFromRegistered) {
  // Source and destination both hold five elements.
  Stream.thenCopyH2D(RegisteredHost5, DeviceA5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);

  // A seven-element source and a five-element destination should be rejected.
  Stream.thenCopyH2D(RegisteredHost7, DeviceA5);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Host-to-device copies between slices with an explicit element count.
TEST_F(StreamTest, CopyH2DFromRegisteredSliceToSlice) {
  // Slice to slice: device elements 1..4 should end up mirroring the host.
  Stream.thenCopyH2D(RegisteredHost5.asSlice().slice(1, 4),
                     DeviceA5.asSlice().slice(1), 4);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 1; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);

  // Whole registered buffer into a shortened device slice, two elements.
  Stream.thenCopyH2D(RegisteredHost5, DeviceB5.asSlice().drop_back(1), 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceB5, Index), Host5[Index]);

  // Asking for seven elements from five-element buffers should fail.
  Stream.thenCopyH2D(RegisteredHost5, DeviceA5.asSlice(), 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Host-to-device copies from registered memory into a device slice, without an
// element count.
TEST_F(StreamTest, CopyH2DRegisteredToSlice) {
  // Source and destination both hold five elements.
  Stream.thenCopyH2D(RegisteredHost5, DeviceA5.asSlice());
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index), Host5[Index]);

  // A seven-element source and a five-element slice should be rejected.
  Stream.thenCopyH2D(RegisteredHost7, DeviceA5.asSlice());
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// D2D tests
|
||||
|
||||
// Device-to-device copies between whole buffers with an explicit element count.
TEST_F(StreamTest, CopyD2DByCount) {
  // Copy all five elements.
  Stream.thenCopyD2D(DeviceA5, DeviceB5, 5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index),
              getDeviceValue(DeviceB5, Index));

  // Copy only the first two elements.
  Stream.thenCopyD2D(DeviceA7, DeviceB7, 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA7, Index),
              getDeviceValue(DeviceB7, Index));

  // Seven elements do not fit in the five-element destination.
  Stream.thenCopyD2D(DeviceA7, DeviceB5, 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-device copies between whole buffers without an element count.
TEST_F(StreamTest, CopyD2D) {
  // Source and destination both hold five elements.
  Stream.thenCopyD2D(DeviceA5, DeviceB5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index),
              getDeviceValue(DeviceB5, Index));

  // A seven-element source and a five-element destination should be rejected.
  Stream.thenCopyD2D(DeviceA7, DeviceB5);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-device copies from a slice into a whole buffer, with an explicit
// element count.
TEST_F(StreamTest, CopySliceD2DByCount) {
  // Destination element I is expected to receive source element I + 1.
  Stream.thenCopyD2D(DeviceA5.asSlice().slice(1), DeviceB5, 4);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 4; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index + 1),
              getDeviceValue(DeviceB5, Index));

  // Shortened source slice, first two elements only.
  Stream.thenCopyD2D(DeviceA7.asSlice().drop_back(1), DeviceB7, 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA7, Index),
              getDeviceValue(DeviceB7, Index));

  // Asking for seven elements from five-element buffers should fail.
  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB5, 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-device copies from a slice into a whole buffer, without an element
// count.
TEST_F(StreamTest, CopySliceD2D) {
  // Seven-element source with two dropped from the back, five-element target.
  Stream.thenCopyD2D(DeviceA7.asSlice().drop_back(2), DeviceB5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA7, Index),
              getDeviceValue(DeviceB5, Index));

  // A shortened five-element source and a seven-element target should be
  // rejected.
  Stream.thenCopyD2D(DeviceA5.asSlice().drop_back(1), DeviceB7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-device copies from a whole buffer into a slice, with an explicit
// element count.
TEST_F(StreamTest, CopyD2DSliceByCount) {
  // Destination element I + 2 is expected to receive source element I.
  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice().slice(2), 5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index),
              getDeviceValue(DeviceB7, Index + 2));

  // Shortened destination slice, first two elements only.
  Stream.thenCopyD2D(DeviceA7, DeviceB7.asSlice().drop_back(3), 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA7, Index),
              getDeviceValue(DeviceB7, Index));

  // Asking for seven elements from a five-element source should fail.
  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice(), 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-device copies from a whole buffer into a slice, without an element
// count.
TEST_F(StreamTest, CopyD2DSlice) {
  // Five-element source, seven-element target with two dropped from the back.
  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice().drop_back(2));
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index),
              getDeviceValue(DeviceB7, Index));

  // A five-element source and a full seven-element slice should be rejected.
  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice());
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-device copies between two slices with an explicit element count.
TEST_F(StreamTest, CopySliceD2DSliceByCount) {
  // Copy all five elements.
  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice(), 5);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index),
              getDeviceValue(DeviceB5, Index));

  // Copy only the first two elements.
  Stream.thenCopyD2D(DeviceA7.asSlice(), DeviceB7.asSlice(), 2);
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 2; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA7, Index),
              getDeviceValue(DeviceB7, Index));

  // Seven elements do not fit in the five-element destination.
  Stream.thenCopyD2D(DeviceA7.asSlice(), DeviceB5.asSlice(), 7);
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
// Device-to-device copies between two slices without an element count.
TEST_F(StreamTest, CopySliceD2DSlice) {
  // Source and destination slices both hold five elements.
  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice());
  EXPECT_TRUE(Stream.isOK());
  for (int Index = 0; Index != 5; ++Index)
    EXPECT_EQ(getDeviceValue(DeviceA5, Index),
              getDeviceValue(DeviceB5, Index));

  // A five-element slice and a seven-element slice should be rejected.
  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB7.asSlice());
  EXPECT_FALSE(Stream.isOK());
}
|
||||
|
||||
} // namespace
|
|
@ -1,215 +0,0 @@
|
|||
//===-- CUDATest.cpp - Tests for CUDA platform ----------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains the unit tests for CUDA platform code.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "streamexecutor/StreamExecutor.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace {
|
||||
|
||||
namespace compilergen {
|
||||
using SaxpyKernel =
|
||||
streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
|
||||
streamexecutor::GlobalDeviceMemory<float>>;
|
||||
|
||||
const char *SaxpyPTX = R"(
|
||||
.version 4.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
|
||||
.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
|
||||
.reg .f32 %AValue;
|
||||
.reg .f32 %XValue;
|
||||
.reg .f32 %YValue;
|
||||
.reg .f32 %Result;
|
||||
|
||||
.reg .b64 %XBaseAddrGeneric;
|
||||
.reg .b64 %YBaseAddrGeneric;
|
||||
.reg .b64 %XBaseAddrGlobal;
|
||||
.reg .b64 %YBaseAddrGlobal;
|
||||
.reg .b64 %XAddr;
|
||||
.reg .b64 %YAddr;
|
||||
.reg .b64 %ThreadByteOffset;
|
||||
|
||||
.reg .b32 %TID;
|
||||
|
||||
ld.param.f32 %AValue, [A];
|
||||
ld.param.u64 %XBaseAddrGeneric, [X];
|
||||
ld.param.u64 %YBaseAddrGeneric, [Y];
|
||||
cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
|
||||
cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
|
||||
mov.u32 %TID, %tid.x;
|
||||
mul.wide.u32 %ThreadByteOffset, %TID, 4;
|
||||
add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
|
||||
add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
|
||||
ld.global.f32 %XValue, [%XAddr];
|
||||
ld.global.f32 %YValue, [%YAddr];
|
||||
fma.rn.f32 %Result, %AValue, %XValue, %YValue;
|
||||
st.global.f32 [%XAddr], %Result;
|
||||
ret;
|
||||
}
|
||||
)";
|
||||
|
||||
// Loader spec for the "saxpy" kernel, built once at static-initialization time
// by an immediately invoked lambda. It registers the in-memory PTX above under
// the {2, 0} key.
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = [] {
  streamexecutor::MultiKernelLoaderSpec LoaderSpec;
  LoaderSpec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
  return LoaderSpec;
}();
|
||||
|
||||
using SwapPairsKernel =
|
||||
streamexecutor::Kernel<streamexecutor::SharedDeviceMemory<int>,
|
||||
streamexecutor::GlobalDeviceMemory<int>, int>;
|
||||
|
||||
const char *SwapPairsPTX = R"(
|
||||
.version 4.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
|
||||
.extern .shared .align 4 .b8 SwapSpace[];
|
||||
|
||||
.visible .entry SwapPairs(.param .u64 InOut, .param .u32 InOutSize) {
|
||||
.reg .b64 %InOutGeneric;
|
||||
.reg .b32 %InOutSizeValue;
|
||||
|
||||
.reg .b32 %LocalIndex;
|
||||
.reg .b32 %PartnerIndex;
|
||||
.reg .b32 %ThreadsPerBlock;
|
||||
.reg .b32 %BlockIndex;
|
||||
.reg .b32 %GlobalIndex;
|
||||
|
||||
.reg .b32 %GlobalIndexBound;
|
||||
.reg .pred %GlobalIndexTooHigh;
|
||||
|
||||
.reg .b64 %InOutGlobal;
|
||||
.reg .b64 %GlobalByteOffset;
|
||||
.reg .b64 %GlobalAddress;
|
||||
|
||||
.reg .b32 %InitialValue;
|
||||
.reg .b32 %SwappedValue;
|
||||
|
||||
.reg .b64 %SharedBaseAddr;
|
||||
.reg .b64 %LocalWriteByteOffset;
|
||||
.reg .b64 %LocalReadByteOffset;
|
||||
.reg .b64 %SharedWriteAddr;
|
||||
.reg .b64 %SharedReadAddr;
|
||||
|
||||
ld.param.u64 %InOutGeneric, [InOut];
|
||||
ld.param.u32 %InOutSizeValue, [InOutSize];
|
||||
mov.u32 %LocalIndex, %tid.x;
|
||||
mov.u32 %ThreadsPerBlock, %ntid.x;
|
||||
mov.u32 %BlockIndex, %ctaid.x;
|
||||
mad.lo.s32 %GlobalIndex, %ThreadsPerBlock, %BlockIndex, %LocalIndex;
|
||||
and.b32 %GlobalIndexBound, %InOutSizeValue, -2;
|
||||
setp.ge.s32 %GlobalIndexTooHigh, %GlobalIndex, %GlobalIndexBound;
|
||||
@%GlobalIndexTooHigh bra END;
|
||||
|
||||
cvta.to.global.u64 %InOutGlobal, %InOutGeneric;
|
||||
mul.wide.s32 %GlobalByteOffset, %GlobalIndex, 4;
|
||||
add.s64 %GlobalAddress, %InOutGlobal, %GlobalByteOffset;
|
||||
ld.global.u32 %InitialValue, [%GlobalAddress];
|
||||
mul.wide.s32 %LocalWriteByteOffset, %LocalIndex, 4;
|
||||
mov.u64 %SharedBaseAddr, SwapSpace;
|
||||
add.s64 %SharedWriteAddr, %SharedBaseAddr, %LocalWriteByteOffset;
|
||||
st.shared.u32 [%SharedWriteAddr], %InitialValue;
|
||||
bar.sync 0;
|
||||
xor.b32 %PartnerIndex, %LocalIndex, 1;
|
||||
mul.wide.s32 %LocalReadByteOffset, %PartnerIndex, 4;
|
||||
add.s64 %SharedReadAddr, %SharedBaseAddr, %LocalReadByteOffset;
|
||||
ld.shared.u32 %SwappedValue, [%SharedReadAddr];
|
||||
st.global.u32 [%GlobalAddress], %SwappedValue;
|
||||
|
||||
END:
|
||||
ret;
|
||||
}
|
||||
)";
|
||||
|
||||
// Loader spec for the "SwapPairs" kernel, built once at static-initialization
// time by an immediately invoked lambda. It registers the in-memory PTX above
// under the {2, 0} key.
static streamexecutor::MultiKernelLoaderSpec SwapPairsLoaderSpec = [] {
  streamexecutor::MultiKernelLoaderSpec LoaderSpec;
  LoaderSpec.addCUDAPTXInMemory("SwapPairs", {{{2, 0}, SwapPairsPTX}});
  return LoaderSpec;
}();
|
||||
} // namespace compilergen
|
||||
|
||||
namespace se = ::streamexecutor;
|
||||
namespace cg = ::compilergen;
|
||||
|
||||
class CUDATest : public ::testing::Test {
|
||||
public:
|
||||
CUDATest()
|
||||
: Platform(getOrDie(se::PlatformManager::getPlatformByName("CUDA"))),
|
||||
Device(getOrDie(Platform->getDevice(0))),
|
||||
Stream(getOrDie(Device.createStream())) {}
|
||||
|
||||
se::Platform *Platform;
|
||||
se::Device Device;
|
||||
se::Stream Stream;
|
||||
};
|
||||
|
||||
// Runs the saxpy kernel on four elements and checks that X is overwritten with
// A * X + Y.
TEST_F(CUDATest, Saxpy) {
  float A = 42.0f;
  std::vector<float> HostX{0, 1, 2, 3};
  std::vector<float> HostY{4, 5, 6, 7};
  size_t ArraySize = HostX.size();

  cg::SaxpyKernel Kernel =
      getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));

  // Host buffers are registered before being used in stream copies.
  se::RegisteredHostMemory<float> RegisteredX =
      getOrDie(Device.registerHostMemory<float>(HostX));
  se::RegisteredHostMemory<float> RegisteredY =
      getOrDie(Device.registerHostMemory<float>(HostY));

  // Device-side operands.
  se::GlobalDeviceMemory<float> X =
      getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
  se::GlobalDeviceMemory<float> Y =
      getOrDie(Device.allocateDeviceMemory<float>(ArraySize));

  // Upload both inputs, launch the kernel, then download X.
  Stream.thenCopyH2D(RegisteredX, X)
      .thenCopyH2D(RegisteredY, Y)
      .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
      .thenCopyD2H(X, RegisteredX);
  se::dieIfError(Stream.blockHostUntilDone());

  // 42 * {0, 1, 2, 3} + {4, 5, 6, 7}.
  std::vector<float> ExpectedX{4, 47, 90, 133};
  EXPECT_EQ(ExpectedX, HostX);
}
|
||||
|
||||
// Runs the SwapPairs kernel, which is handed a shared-memory argument, and
// checks that each adjacent pair of input elements comes back swapped.
TEST_F(CUDATest, DynamicSharedMemory) {
  std::vector<int> HostPairs{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  std::vector<int> HostResult(HostPairs.size(), 0);
  // The kernel's size parameter is declared as int.
  int ArraySize = static_cast<int>(HostPairs.size());

  cg::SwapPairsKernel Kernel = getOrDie(
      Device.createKernel<cg::SwapPairsKernel>(cg::SwapPairsLoaderSpec));

  // Host buffers are registered before being used in stream copies.
  se::RegisteredHostMemory<int> RegisteredPairs =
      getOrDie(Device.registerHostMemory<int>(HostPairs));
  se::RegisteredHostMemory<int> RegisteredResult =
      getOrDie(Device.registerHostMemory<int>(HostResult));

  // Device-side buffer plus a shared-memory argument sized for ArraySize ints.
  se::GlobalDeviceMemory<int> Pairs =
      getOrDie(Device.allocateDeviceMemory<int>(ArraySize));
  auto SharedMemory =
      se::SharedDeviceMemory<int>::makeFromElementCount(ArraySize);

  // Upload the input, launch the kernel, then download into the result buffer.
  Stream.thenCopyH2D(RegisteredPairs, Pairs)
      .thenLaunch(ArraySize, 1, Kernel, SharedMemory, Pairs, ArraySize)
      .thenCopyD2H(Pairs, RegisteredResult);
  se::dieIfError(Stream.blockHostUntilDone());

  std::vector<int> ExpectedPairs{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10};
  EXPECT_EQ(ExpectedPairs, HostResult);
}
|
||||
|
||||
} // namespace
|
Loading…
Reference in New Issue