mirror of https://github.com/microsoft/clang.git
Add action builder for HIP
To support separate compile/link and linking across device IR in different source files, a new HIP action builder is introduced. Basically, it compiles and links host and device code separately, and embeds the fat binary in the host linking stage through a linker script. Differential Revision: https://reviews.llvm.org/D46476 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@333483 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
722b9995ca
commit
68d42ff0b5
|
@ -2151,9 +2151,10 @@ class OffloadingActionBuilder final {
|
|||
}
|
||||
};
|
||||
|
||||
/// CUDA action builder. It injects device code in the host backend
|
||||
/// action.
|
||||
class CudaActionBuilder final : public DeviceActionBuilder {
|
||||
/// Base class for CUDA/HIP action builder. It injects device code in
|
||||
/// the host backend action.
|
||||
class CudaActionBuilderBase : public DeviceActionBuilder {
|
||||
protected:
|
||||
/// Flags to signal if the user requested host-only or device-only
|
||||
/// compilation.
|
||||
bool CompileHostOnly = false;
|
||||
|
@ -2170,11 +2171,185 @@ class OffloadingActionBuilder final {
|
|||
|
||||
/// Flag that is set to true if this builder acted on the current input.
|
||||
bool IsActive = false;
|
||||
public:
|
||||
CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
|
||||
const Driver::InputList &Inputs,
|
||||
Action::OffloadKind OFKind)
|
||||
: DeviceActionBuilder(C, Args, Inputs, OFKind) {}
|
||||
|
||||
ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
|
||||
// While generating code for CUDA, we only depend on the host input action
|
||||
// to trigger the creation of all the CUDA device actions.
|
||||
|
||||
// If we are dealing with an input action, replicate it for each GPU
|
||||
// architecture. If we are in host-only mode we return 'success' so that
|
||||
// the host uses the CUDA offload kind.
|
||||
if (auto *IA = dyn_cast<InputAction>(HostAction)) {
|
||||
assert(!GpuArchList.empty() &&
|
||||
"We should have at least one GPU architecture.");
|
||||
|
||||
// If the host input is not CUDA or HIP, we don't need to bother about
|
||||
// this input.
|
||||
if (IA->getType() != types::TY_CUDA &&
|
||||
IA->getType() != types::TY_HIP) {
|
||||
// The builder will ignore this input.
|
||||
IsActive = false;
|
||||
return ABRT_Inactive;
|
||||
}
|
||||
|
||||
// Set the flag to true, so that the builder acts on the current input.
|
||||
IsActive = true;
|
||||
|
||||
if (CompileHostOnly)
|
||||
return ABRT_Success;
|
||||
|
||||
// Replicate inputs for each GPU architecture.
|
||||
auto Ty = IA->getType() == types::TY_HIP ? types::TY_HIP_DEVICE
|
||||
: types::TY_CUDA_DEVICE;
|
||||
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
|
||||
CudaDeviceActions.push_back(
|
||||
C.MakeAction<InputAction>(IA->getInputArg(), Ty));
|
||||
}
|
||||
|
||||
return ABRT_Success;
|
||||
}
|
||||
|
||||
// If this is an unbundling action use it as is for each CUDA toolchain.
|
||||
if (auto *UA = dyn_cast<OffloadUnbundlingJobAction>(HostAction)) {
|
||||
CudaDeviceActions.clear();
|
||||
for (auto Arch : GpuArchList) {
|
||||
CudaDeviceActions.push_back(UA);
|
||||
UA->registerDependentActionInfo(ToolChains[0], CudaArchToString(Arch),
|
||||
AssociatedOffloadKind);
|
||||
}
|
||||
return ABRT_Success;
|
||||
}
|
||||
|
||||
return IsActive ? ABRT_Success : ABRT_Inactive;
|
||||
}
|
||||
|
||||
void appendTopLevelActions(ActionList &AL) override {
|
||||
// Utility to append actions to the top level list.
|
||||
auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
|
||||
OffloadAction::DeviceDependences Dep;
|
||||
Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
|
||||
AssociatedOffloadKind);
|
||||
AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
|
||||
};
|
||||
|
||||
// If we have a fat binary, add it to the list.
|
||||
if (CudaFatBinary) {
|
||||
AddTopLevel(CudaFatBinary, CudaArch::UNKNOWN);
|
||||
CudaDeviceActions.clear();
|
||||
CudaFatBinary = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
if (CudaDeviceActions.empty())
|
||||
return;
|
||||
|
||||
// If we have CUDA actions at this point, that's because we have a
|
||||
// partial compilation, so we should have an action for each GPU
|
||||
// architecture.
|
||||
assert(CudaDeviceActions.size() == GpuArchList.size() &&
|
||||
"Expecting one action per GPU architecture.");
|
||||
assert(ToolChains.size() == 1 &&
|
||||
"Expecting to have a sing CUDA toolchain.");
|
||||
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
|
||||
AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
|
||||
|
||||
CudaDeviceActions.clear();
|
||||
}
|
||||
|
||||
bool initialize() override {
|
||||
assert(AssociatedOffloadKind == Action::OFK_Cuda ||
|
||||
AssociatedOffloadKind == Action::OFK_HIP);
|
||||
|
||||
// Nothing to set up if the compilation has no CUDA offload toolchain.
|
||||
if (AssociatedOffloadKind == Action::OFK_Cuda &&
|
||||
!C.hasOffloadToolChain<Action::OFK_Cuda>())
|
||||
return false;
|
||||
|
||||
// Nothing to set up if the compilation has no HIP offload toolchain.
|
||||
if (AssociatedOffloadKind == Action::OFK_HIP &&
|
||||
!C.hasOffloadToolChain<Action::OFK_HIP>())
|
||||
return false;
|
||||
|
||||
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
|
||||
assert(HostTC && "No toolchain for host compilation.");
|
||||
if (HostTC->getTriple().isNVPTX() ||
|
||||
HostTC->getTriple().getArch() == llvm::Triple::amdgcn) {
|
||||
// We do not support targeting NVPTX/AMDGCN for host compilation. Throw
|
||||
// an error and abort pipeline construction early so we don't trip
|
||||
// asserts that assume device-side compilation.
|
||||
C.getDriver().Diag(diag::err_drv_cuda_host_arch)
|
||||
<< HostTC->getTriple().getArchName();
|
||||
return true;
|
||||
}
|
||||
|
||||
ToolChains.push_back(
|
||||
AssociatedOffloadKind == Action::OFK_Cuda
|
||||
? C.getSingleOffloadToolChain<Action::OFK_Cuda>()
|
||||
: C.getSingleOffloadToolChain<Action::OFK_HIP>());
|
||||
|
||||
Arg *PartialCompilationArg = Args.getLastArg(
|
||||
options::OPT_cuda_host_only, options::OPT_cuda_device_only,
|
||||
options::OPT_cuda_compile_host_device);
|
||||
CompileHostOnly = PartialCompilationArg &&
|
||||
PartialCompilationArg->getOption().matches(
|
||||
options::OPT_cuda_host_only);
|
||||
CompileDeviceOnly = PartialCompilationArg &&
|
||||
PartialCompilationArg->getOption().matches(
|
||||
options::OPT_cuda_device_only);
|
||||
|
||||
// Collect all cuda_gpu_arch parameters, removing duplicates.
|
||||
std::set<CudaArch> GpuArchs;
|
||||
bool Error = false;
|
||||
for (Arg *A : Args) {
|
||||
if (!(A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) ||
|
||||
A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ)))
|
||||
continue;
|
||||
A->claim();
|
||||
|
||||
const StringRef ArchStr = A->getValue();
|
||||
if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ) &&
|
||||
ArchStr == "all") {
|
||||
GpuArchs.clear();
|
||||
continue;
|
||||
}
|
||||
CudaArch Arch = StringToCudaArch(ArchStr);
|
||||
if (Arch == CudaArch::UNKNOWN) {
|
||||
C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
|
||||
Error = true;
|
||||
} else if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
|
||||
GpuArchs.insert(Arch);
|
||||
else if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ))
|
||||
GpuArchs.erase(Arch);
|
||||
else
|
||||
llvm_unreachable("Unexpected option.");
|
||||
}
|
||||
|
||||
// Collect list of GPUs remaining in the set.
|
||||
for (CudaArch Arch : GpuArchs)
|
||||
GpuArchList.push_back(Arch);
|
||||
|
||||
// Default to sm_20 which is the lowest common denominator for
|
||||
// supported GPUs. sm_20 code should work correctly, if
|
||||
// suboptimally, on all newer GPUs.
|
||||
if (GpuArchList.empty())
|
||||
GpuArchList.push_back(CudaArch::SM_20);
|
||||
|
||||
return Error;
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief CUDA action builder. It injects device code in the host backend
|
||||
/// action.
|
||||
class CudaActionBuilder final : public CudaActionBuilderBase {
|
||||
public:
|
||||
CudaActionBuilder(Compilation &C, DerivedArgList &Args,
|
||||
const Driver::InputList &Inputs)
|
||||
: DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}
|
||||
: CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {}
|
||||
|
||||
ActionBuilderReturnCode
|
||||
getDeviceDependences(OffloadAction::DeviceDependences &DA,
|
||||
|
@ -2279,147 +2454,73 @@ class OffloadingActionBuilder final {
|
|||
|
||||
return ABRT_Success;
|
||||
}
|
||||
};
|
||||
/// \brief HIP action builder. It injects device code in the host backend
|
||||
/// action.
|
||||
class HIPActionBuilder final : public CudaActionBuilderBase {
|
||||
/// The linker inputs obtained for each device arch.
|
||||
SmallVector<ActionList, 8> DeviceLinkerInputs;
|
||||
|
||||
ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
|
||||
// While generating code for CUDA, we only depend on the host input action
|
||||
// to trigger the creation of all the CUDA device actions.
|
||||
public:
|
||||
HIPActionBuilder(Compilation &C, DerivedArgList &Args,
|
||||
const Driver::InputList &Inputs)
|
||||
: CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
|
||||
|
||||
// If we are dealing with an input action, replicate it for each GPU
|
||||
// architecture. If we are in host-only mode we return 'success' so that
|
||||
// the host uses the CUDA offload kind.
|
||||
if (auto *IA = dyn_cast<InputAction>(HostAction)) {
|
||||
assert(!GpuArchList.empty() &&
|
||||
"We should have at least one GPU architecture.");
|
||||
bool canUseBundlerUnbundler() const override { return true; }
|
||||
|
||||
// If the host input is not CUDA or HIP, we don't need to bother about
|
||||
// this input.
|
||||
if (IA->getType() != types::TY_CUDA &&
|
||||
IA->getType() != types::TY_HIP) {
|
||||
// The builder will ignore this input.
|
||||
IsActive = false;
|
||||
return ABRT_Inactive;
|
||||
}
|
||||
|
||||
// Set the flag to true, so that the builder acts on the current input.
|
||||
IsActive = true;
|
||||
|
||||
if (CompileHostOnly)
|
||||
return ABRT_Success;
|
||||
|
||||
// Replicate inputs for each GPU architecture.
|
||||
auto Ty = IA->getType() == types::TY_HIP ? types::TY_HIP_DEVICE
|
||||
: types::TY_CUDA_DEVICE;
|
||||
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
|
||||
CudaDeviceActions.push_back(
|
||||
C.MakeAction<InputAction>(IA->getInputArg(), Ty));
|
||||
ActionBuilderReturnCode
|
||||
getDeviceDependences(OffloadAction::DeviceDependences &DA,
|
||||
phases::ID CurPhase, phases::ID FinalPhase,
|
||||
PhasesTy &Phases) override {
|
||||
// amdgcn does not support linking of object files, therefore we skip
|
||||
// backend and assemble phases to output LLVM IR.
|
||||
if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
|
||||
CurPhase == phases::Assemble)
|
||||
return ABRT_Success;
|
||||
|
||||
assert((CurPhase == phases::Link ||
|
||||
CudaDeviceActions.size() == GpuArchList.size()) &&
|
||||
"Expecting one action per GPU architecture.");
|
||||
assert(!CompileHostOnly &&
|
||||
"Not expecting CUDA actions in host-only compilation.");
|
||||
|
||||
// Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
|
||||
// This happens to each device action originated from each input file.
|
||||
// Later on, device actions in DeviceLinkerInputs are used to create
|
||||
// device link actions in appendLinkDependences and the created device
|
||||
// link actions are passed to the offload action as device dependence.
|
||||
if (CurPhase == phases::Link) {
|
||||
DeviceLinkerInputs.resize(CudaDeviceActions.size());
|
||||
auto LI = DeviceLinkerInputs.begin();
|
||||
for (auto *A : CudaDeviceActions) {
|
||||
LI->push_back(A);
|
||||
++LI;
|
||||
}
|
||||
|
||||
// We will pass the device action as a host dependence, so we don't
|
||||
// need to do anything else with them.
|
||||
CudaDeviceActions.clear();
|
||||
return ABRT_Success;
|
||||
}
|
||||
|
||||
return IsActive ? ABRT_Success : ABRT_Inactive;
|
||||
// By default, we produce an action for each device arch.
|
||||
for (Action *&A : CudaDeviceActions)
|
||||
A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
|
||||
AssociatedOffloadKind);
|
||||
|
||||
return ABRT_Success;
|
||||
}
|
||||
|
||||
void appendTopLevelActions(ActionList &AL) override {
|
||||
// Utility to append actions to the top level list.
|
||||
auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
|
||||
OffloadAction::DeviceDependences Dep;
|
||||
Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
|
||||
Action::OFK_Cuda);
|
||||
AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
|
||||
};
|
||||
|
||||
// If we have a fat binary, add it to the list.
|
||||
if (CudaFatBinary) {
|
||||
AddTopLevel(CudaFatBinary, CudaArch::UNKNOWN);
|
||||
CudaDeviceActions.clear();
|
||||
CudaFatBinary = nullptr;
|
||||
return;
|
||||
void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
|
||||
// Append a new link action for each device.
|
||||
unsigned I = 0;
|
||||
for (auto &LI : DeviceLinkerInputs) {
|
||||
auto *DeviceLinkAction =
|
||||
C.MakeAction<LinkJobAction>(LI, types::TY_Image);
|
||||
DA.add(*DeviceLinkAction, *ToolChains[0],
|
||||
CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
|
||||
++I;
|
||||
}
|
||||
|
||||
if (CudaDeviceActions.empty())
|
||||
return;
|
||||
|
||||
// If we have CUDA actions at this point, that's because we have a
|
||||
// partial compilation, so we should have an action for each GPU
|
||||
// architecture.
|
||||
assert(CudaDeviceActions.size() == GpuArchList.size() &&
|
||||
"Expecting one action per GPU architecture.");
|
||||
assert(ToolChains.size() == 1 &&
|
||||
"Expecting to have a sing CUDA toolchain.");
|
||||
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
|
||||
AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
|
||||
|
||||
CudaDeviceActions.clear();
|
||||
}
|
||||
|
||||
bool initialize() override {
|
||||
// We don't need to support CUDA.
|
||||
if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
|
||||
return false;
|
||||
|
||||
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
|
||||
assert(HostTC && "No toolchain for host compilation.");
|
||||
if (HostTC->getTriple().isNVPTX() ||
|
||||
HostTC->getTriple().getArch() == llvm::Triple::amdgcn) {
|
||||
// We do not support targeting NVPTX/AMDGCN for host compilation. Throw
|
||||
// an error and abort pipeline construction early so we don't trip
|
||||
// asserts that assume device-side compilation.
|
||||
C.getDriver().Diag(diag::err_drv_cuda_host_arch)
|
||||
<< HostTC->getTriple().getArchName();
|
||||
return true;
|
||||
}
|
||||
|
||||
ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
|
||||
|
||||
Arg *PartialCompilationArg = Args.getLastArg(
|
||||
options::OPT_cuda_host_only, options::OPT_cuda_device_only,
|
||||
options::OPT_cuda_compile_host_device);
|
||||
CompileHostOnly = PartialCompilationArg &&
|
||||
PartialCompilationArg->getOption().matches(
|
||||
options::OPT_cuda_host_only);
|
||||
CompileDeviceOnly = PartialCompilationArg &&
|
||||
PartialCompilationArg->getOption().matches(
|
||||
options::OPT_cuda_device_only);
|
||||
|
||||
// Collect all cuda_gpu_arch parameters, removing duplicates.
|
||||
std::set<CudaArch> GpuArchs;
|
||||
bool Error = false;
|
||||
for (Arg *A : Args) {
|
||||
if (!(A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) ||
|
||||
A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ)))
|
||||
continue;
|
||||
A->claim();
|
||||
|
||||
const StringRef ArchStr = A->getValue();
|
||||
if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ) &&
|
||||
ArchStr == "all") {
|
||||
GpuArchs.clear();
|
||||
continue;
|
||||
}
|
||||
CudaArch Arch = StringToCudaArch(ArchStr);
|
||||
if (Arch == CudaArch::UNKNOWN) {
|
||||
C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
|
||||
Error = true;
|
||||
} else if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
|
||||
GpuArchs.insert(Arch);
|
||||
else if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ))
|
||||
GpuArchs.erase(Arch);
|
||||
else
|
||||
llvm_unreachable("Unexpected option.");
|
||||
}
|
||||
|
||||
// Collect list of GPUs remaining in the set.
|
||||
for (CudaArch Arch : GpuArchs)
|
||||
GpuArchList.push_back(Arch);
|
||||
|
||||
// Default to sm_20 which is the lowest common denominator for
|
||||
// supported GPUs. sm_20 code should work correctly, if
|
||||
// suboptimally, on all newer GPUs.
|
||||
if (GpuArchList.empty())
|
||||
GpuArchList.push_back(CudaArch::SM_20);
|
||||
|
||||
return Error;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2589,6 +2690,9 @@ public:
|
|||
// Create a specialized builder for CUDA.
|
||||
SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));
|
||||
|
||||
// Create a specialized builder for HIP.
|
||||
SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs));
|
||||
|
||||
// Create a specialized builder for OpenMP.
|
||||
SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs));
|
||||
|
||||
|
|
|
@ -7,195 +7,267 @@
|
|||
// REQUIRES: clang-driver
|
||||
// REQUIRES: powerpc-registered-target
|
||||
// REQUIRES: nvptx-registered-target
|
||||
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
//
|
||||
// Test single gpu architecture with complete compilation.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=BIN %s
|
||||
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
|
||||
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
|
||||
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
|
||||
// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
|
||||
// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
|
||||
// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
|
||||
// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
|
||||
// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda)
|
||||
// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir
|
||||
// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda)
|
||||
// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda)
|
||||
// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
|
||||
// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
|
||||
// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
|
||||
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
|
||||
// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:sm_30]])
|
||||
// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:gfx803]])
|
||||
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
|
||||
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
|
||||
// BIN_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
|
||||
// BIN_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
|
||||
// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object
|
||||
// BIN_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler
|
||||
// BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
|
||||
// BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
|
||||
// BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
|
||||
// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
|
||||
// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
|
||||
// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
|
||||
// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
|
||||
// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
|
||||
// BIN_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
|
||||
|
||||
//
|
||||
// Test single gpu architecture up to the assemble phase.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=ASM %s
|
||||
// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
|
||||
// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
|
||||
// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
|
||||
// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda)
|
||||
// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda)
|
||||
// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
|
||||
// ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
|
||||
// ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
|
||||
// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
|
||||
// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
|
||||
// ASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
|
||||
// ASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
|
||||
// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
|
||||
// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-[[T]])
|
||||
// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-[[T]])
|
||||
|
||||
//
|
||||
// Test two gpu architectures with complete compilation.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=BIN2 %s
|
||||
// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
|
||||
// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
|
||||
// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
|
||||
// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
|
||||
// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
|
||||
// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
|
||||
// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
|
||||
// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
|
||||
// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35)
|
||||
// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35)
|
||||
// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35)
|
||||
// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35)
|
||||
// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object
|
||||
// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler
|
||||
// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda)
|
||||
// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir
|
||||
// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda)
|
||||
// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda)
|
||||
// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
|
||||
// BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
|
||||
// BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
|
||||
// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
|
||||
// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH1:sm_30|gfx803]])
|
||||
// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
|
||||
// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]])
|
||||
// BIN2_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]])
|
||||
// BIN2_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]])
|
||||
// BIN2_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH1]])" {[[P7]]}, object
|
||||
// BIN2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler
|
||||
// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
|
||||
// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
|
||||
// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]])
|
||||
// BIN2_NV-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]])
|
||||
// BIN2_NV-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]])
|
||||
// BIN2_NV-DAG: [[P15:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object
|
||||
// BIN2_NV-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler
|
||||
// BIN2_NV-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-[[T]])
|
||||
// BIN2_NV-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P17]]}, ir
|
||||
// BIN2_NV-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]])
|
||||
// BIN2_AMD-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
|
||||
// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
|
||||
// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]])
|
||||
// BIN2_AMD-DAG: [[P22:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH1]])
|
||||
// BIN2_AMD-DAG: [[P23:[0-9]+]]: linker, {[[P12]]}, image, (device-[[T]], [[ARCH2]])
|
||||
// BIN2_AMD-DAG: [[P24:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P21]]},
|
||||
// BIN2_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P22]]},
|
||||
// BIN2_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH2]])" {[[P23]]}, object
|
||||
|
||||
//
|
||||
// Test two gpu architectures up to the assemble phase.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=ASM2 %s
|
||||
// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
|
||||
// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
|
||||
// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
|
||||
// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
|
||||
// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
|
||||
// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
|
||||
// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
|
||||
// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
|
||||
// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda)
|
||||
// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda)
|
||||
// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
|
||||
// ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
|
||||
// ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
|
||||
// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
|
||||
// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH1]])
|
||||
// ASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH1]])
|
||||
// ASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler
|
||||
// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
|
||||
// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
|
||||
// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
|
||||
// ASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
|
||||
// ASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
|
||||
// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
|
||||
// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-[[T]])
|
||||
// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-[[T]])
|
||||
|
||||
//
|
||||
// Test single gpu architecture with complete compilation in host-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=HBIN %s
|
||||
// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
|
||||
// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
|
||||
// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
|
||||
// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
|
||||
// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=HBIN,HBIN_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=HBIN,HBIN_AMD %s
|
||||
// HBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
|
||||
// HBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
|
||||
// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
|
||||
// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
|
||||
// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
|
||||
// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
|
||||
//
|
||||
// Test single gpu architecture up to the assemble phase in host-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=HASM %s
|
||||
// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
|
||||
// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
|
||||
// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=HASM,HASM_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=HASM,HASM_AMD %s
|
||||
// HASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
|
||||
// HASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
|
||||
// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
|
||||
// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
|
||||
|
||||
//
|
||||
// Test two gpu architectures with complete compilation in host-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=HBIN2 %s
|
||||
// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
|
||||
// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
|
||||
// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
|
||||
// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
|
||||
// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_AMD %s
|
||||
// HBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
|
||||
// HBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
|
||||
// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
|
||||
// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
|
||||
// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
|
||||
// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
|
||||
|
||||
//
|
||||
// Test two gpu architectures up to the assemble phase in host-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=HASM2 %s
|
||||
// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
|
||||
// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
|
||||
// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
|
||||
// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S \
|
||||
// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S \
|
||||
// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_AMD %s
|
||||
// HASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
|
||||
// HASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
|
||||
// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
|
||||
// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
|
||||
// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
|
||||
|
||||
//
|
||||
// Test single gpu architecture with complete compilation in device-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=DBIN %s
|
||||
// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
|
||||
// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
|
||||
// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
|
||||
// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=DBIN,DBIN_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=DBIN,DBIN_AMD %s
|
||||
// DBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
|
||||
// DBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
|
||||
// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
|
||||
// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
|
||||
// DBIN_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
|
||||
// DBIN_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
|
||||
// DBIN_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object
|
||||
|
||||
//
|
||||
// Test single gpu architecture up to the assemble phase in device-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=DASM %s
|
||||
// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
|
||||
// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
|
||||
// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=DASM,DASM_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=DASM,DASM_AMD %s
|
||||
// DASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
|
||||
// DASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
|
||||
// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
|
||||
// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
|
||||
// DASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
|
||||
// DASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
|
||||
|
||||
//
|
||||
// Test two gpu architectures with complete compilation in device-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=DBIN2 %s
|
||||
// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
|
||||
// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
|
||||
// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
|
||||
// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
|
||||
// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
|
||||
// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35)
|
||||
// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35)
|
||||
// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35)
|
||||
// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35)
|
||||
// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=DBIN2,DBIN2_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only \
|
||||
// RUN: 2>&1 | FileCheck -check-prefixes=DBIN2,DBIN2_AMD %s
|
||||
// DBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
|
||||
// DBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
|
||||
// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
|
||||
// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
|
||||
// DBIN2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
|
||||
// DBIN2_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
|
||||
// DBIN2_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P4]]}, object
|
||||
// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
|
||||
// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
|
||||
// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
|
||||
// DBIN2_NV-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]])
|
||||
// DBIN2_NV-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]])
|
||||
// DBIN2_NV-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object
|
||||
|
||||
//
|
||||
// Test two gpu architectures up to the assemble phase in device-only
|
||||
// compilation mode.
|
||||
//
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefix=DASM2 %s
|
||||
// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
|
||||
// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
|
||||
// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
|
||||
// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
|
||||
// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
|
||||
// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
|
||||
// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
|
||||
// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
|
||||
// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
|
||||
// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
|
||||
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
|
||||
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S \
|
||||
// RUN: 2>&1 | FileCheck -check-prefixes=DASM2,DASM2_NV %s
|
||||
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu \
|
||||
// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
|
||||
// RUN: --cuda-device-only -S 2>&1 \
|
||||
// RUN: | FileCheck -check-prefixes=DASM2,DASM2_AMD %s
|
||||
// DASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
|
||||
// DASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
|
||||
// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
|
||||
// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
|
||||
// DASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
|
||||
// DASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
|
||||
// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
|
||||
// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
|
||||
// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
|
||||
// DASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
|
||||
// DASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
|
||||
|
|
Loading…
Reference in New Issue