Add action builder for HIP

To support separate compile/link and linking across device IR in different source files, a new HIP action builder is introduced. Basically it compiles/links host and device code separately, and embed fat binary in host linking stage through linker script. Differential Revision: https://reviews.llvm.org/D46476 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@333483 91177308-0d34-0410-b5e6-96231b3b80d8
2018-05-30 00:49:10 +00:00 · 2018-05-30 00:49:10 +00:00 · 68d42ff0b5
parent 722b9995ca
commit 68d42ff0b5
2 changed files with 448 additions and 272 deletions
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@ -2151,9 +2151,10 @@ class OffloadingActionBuilder final {
    }
  };

-  /// CUDA action builder. It injects device code in the host backend
-  /// action.
-  class CudaActionBuilder final : public DeviceActionBuilder {
+  /// Base class for CUDA/HIP action builder. It injects device code in
+  /// the host backend action.
+  class CudaActionBuilderBase : public DeviceActionBuilder {
+  protected:
    /// Flags to signal if the user requested host-only or device-only
    /// compilation.
    bool CompileHostOnly = false;
@ -2170,11 +2171,185 @@ class OffloadingActionBuilder final {

    /// Flag that is set to true if this builder acted on the current input.
    bool IsActive = false;
+  public:
+    CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
+                          const Driver::InputList &Inputs,
+                          Action::OffloadKind OFKind)
+        : DeviceActionBuilder(C, Args, Inputs, OFKind) {}

+    ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
+      // While generating code for CUDA, we only depend on the host input action
+      // to trigger the creation of all the CUDA device actions.
+
+      // If we are dealing with an input action, replicate it for each GPU
+      // architecture. If we are in host-only mode we return 'success' so that
+      // the host uses the CUDA offload kind.
+      if (auto *IA = dyn_cast<InputAction>(HostAction)) {
+        assert(!GpuArchList.empty() &&
+               "We should have at least one GPU architecture.");
+
+        // If the host input is not CUDA or HIP, we don't need to bother about
+        // this input.
+        if (IA->getType() != types::TY_CUDA &&
+            IA->getType() != types::TY_HIP) {
+          // The builder will ignore this input.
+          IsActive = false;
+          return ABRT_Inactive;
+        }
+
+        // Set the flag to true, so that the builder acts on the current input.
+        IsActive = true;
+
+        if (CompileHostOnly)
+          return ABRT_Success;
+
+        // Replicate inputs for each GPU architecture.
+        auto Ty = IA->getType() == types::TY_HIP ? types::TY_HIP_DEVICE
+                                                 : types::TY_CUDA_DEVICE;
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          CudaDeviceActions.push_back(
+              C.MakeAction<InputAction>(IA->getInputArg(), Ty));
+        }
+
+        return ABRT_Success;
+      }
+
+      // If this is an unbundling action use it as is for each CUDA toolchain.
+      if (auto *UA = dyn_cast<OffloadUnbundlingJobAction>(HostAction)) {
+        CudaDeviceActions.clear();
+        for (auto Arch : GpuArchList) {
+          CudaDeviceActions.push_back(UA);
+          UA->registerDependentActionInfo(ToolChains[0], CudaArchToString(Arch),
+                                          AssociatedOffloadKind);
+        }
+        return ABRT_Success;
+      }
+
+      return IsActive ? ABRT_Success : ABRT_Inactive;
+    }
+
+    void appendTopLevelActions(ActionList &AL) override {
+      // Utility to append actions to the top level list.
+      auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
+        OffloadAction::DeviceDependences Dep;
+        Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
+                AssociatedOffloadKind);
+        AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
+      };
+
+      // If we have a fat binary, add it to the list.
+      if (CudaFatBinary) {
+        AddTopLevel(CudaFatBinary, CudaArch::UNKNOWN);
+        CudaDeviceActions.clear();
+        CudaFatBinary = nullptr;
+        return;
+      }
+
+      if (CudaDeviceActions.empty())
+        return;
+
+      // If we have CUDA actions at this point, that's because we have a have
+      // partial compilation, so we should have an action for each GPU
+      // architecture.
+      assert(CudaDeviceActions.size() == GpuArchList.size() &&
+             "Expecting one action per GPU architecture.");
+      assert(ToolChains.size() == 1 &&
+             "Expecting to have a sing CUDA toolchain.");
+      for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+        AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
+
+      CudaDeviceActions.clear();
+    }
+
+    bool initialize() override {
+      assert(AssociatedOffloadKind == Action::OFK_Cuda ||
+             AssociatedOffloadKind == Action::OFK_HIP);
+
+      // We don't need to support CUDA.
+      if (AssociatedOffloadKind == Action::OFK_Cuda &&
+          !C.hasOffloadToolChain<Action::OFK_Cuda>())
+        return false;
+
+      // We don't need to support HIP.
+      if (AssociatedOffloadKind == Action::OFK_HIP &&
+          !C.hasOffloadToolChain<Action::OFK_HIP>())
+        return false;
+
+      const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+      assert(HostTC && "No toolchain for host compilation.");
+      if (HostTC->getTriple().isNVPTX() ||
+          HostTC->getTriple().getArch() == llvm::Triple::amdgcn) {
+        // We do not support targeting NVPTX/AMDGCN for host compilation. Throw
+        // an error and abort pipeline construction early so we don't trip
+        // asserts that assume device-side compilation.
+        C.getDriver().Diag(diag::err_drv_cuda_host_arch)
+            << HostTC->getTriple().getArchName();
+        return true;
+      }
+
+      ToolChains.push_back(
+          AssociatedOffloadKind == Action::OFK_Cuda
+              ? C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+              : C.getSingleOffloadToolChain<Action::OFK_HIP>());
+
+      Arg *PartialCompilationArg = Args.getLastArg(
+          options::OPT_cuda_host_only, options::OPT_cuda_device_only,
+          options::OPT_cuda_compile_host_device);
+      CompileHostOnly = PartialCompilationArg &&
+                        PartialCompilationArg->getOption().matches(
+                            options::OPT_cuda_host_only);
+      CompileDeviceOnly = PartialCompilationArg &&
+                          PartialCompilationArg->getOption().matches(
+                              options::OPT_cuda_device_only);
+
+      // Collect all cuda_gpu_arch parameters, removing duplicates.
+      std::set<CudaArch> GpuArchs;
+      bool Error = false;
+      for (Arg *A : Args) {
+        if (!(A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) ||
+              A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ)))
+          continue;
+        A->claim();
+
+        const StringRef ArchStr = A->getValue();
+        if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ) &&
+            ArchStr == "all") {
+          GpuArchs.clear();
+          continue;
+        }
+        CudaArch Arch = StringToCudaArch(ArchStr);
+        if (Arch == CudaArch::UNKNOWN) {
+          C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
+          Error = true;
+        } else if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
+          GpuArchs.insert(Arch);
+        else if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ))
+          GpuArchs.erase(Arch);
+        else
+          llvm_unreachable("Unexpected option.");
+      }
+
+      // Collect list of GPUs remaining in the set.
+      for (CudaArch Arch : GpuArchs)
+        GpuArchList.push_back(Arch);
+
+      // Default to sm_20 which is the lowest common denominator for
+      // supported GPUs.  sm_20 code should work correctly, if
+      // suboptimally, on all newer GPUs.
+      if (GpuArchList.empty())
+        GpuArchList.push_back(CudaArch::SM_20);
+
+      return Error;
+    }
+  };
+
+  /// \brief CUDA action builder. It injects device code in the host backend
+  /// action.
+  class CudaActionBuilder final : public CudaActionBuilderBase {
  public:
    CudaActionBuilder(Compilation &C, DerivedArgList &Args,
                      const Driver::InputList &Inputs)
-        : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {}

    ActionBuilderReturnCode
    getDeviceDependences(OffloadAction::DeviceDependences &DA,
@ -2279,147 +2454,73 @@ class OffloadingActionBuilder final {

      return ABRT_Success;
    }
+  };
+  /// \brief HIP action builder. It injects device code in the host backend
+  /// action.
+  class HIPActionBuilder final : public CudaActionBuilderBase {
+    /// The linker inputs obtained for each device arch.
+    SmallVector<ActionList, 8> DeviceLinkerInputs;

-    ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
-      // While generating code for CUDA, we only depend on the host input action
-      // to trigger the creation of all the CUDA device actions.
+  public:
+    HIPActionBuilder(Compilation &C, DerivedArgList &Args,
+                     const Driver::InputList &Inputs)
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}

-      // If we are dealing with an input action, replicate it for each GPU
-      // architecture. If we are in host-only mode we return 'success' so that
-      // the host uses the CUDA offload kind.
-      if (auto *IA = dyn_cast<InputAction>(HostAction)) {
-        assert(!GpuArchList.empty() &&
-               "We should have at least one GPU architecture.");
+    bool canUseBundlerUnbundler() const override { return true; }

-        // If the host input is not CUDA or HIP, we don't need to bother about
-        // this input.
-        if (IA->getType() != types::TY_CUDA &&
-            IA->getType() != types::TY_HIP) {
-          // The builder will ignore this input.
-          IsActive = false;
-          return ABRT_Inactive;
-        }
-
-        // Set the flag to true, so that the builder acts on the current input.
-        IsActive = true;
-
-        if (CompileHostOnly)
-          return ABRT_Success;
-
-        // Replicate inputs for each GPU architecture.
-        auto Ty = IA->getType() == types::TY_HIP ? types::TY_HIP_DEVICE
-                                                 : types::TY_CUDA_DEVICE;
-        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
-          CudaDeviceActions.push_back(
-              C.MakeAction<InputAction>(IA->getInputArg(), Ty));
+    ActionBuilderReturnCode
+    getDeviceDependences(OffloadAction::DeviceDependences &DA,
+                         phases::ID CurPhase, phases::ID FinalPhase,
+                         PhasesTy &Phases) override {
+      // amdgcn does not support linking of object files, therefore we skip
+      // backend and assemble phases to output LLVM IR.
+      if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+          CurPhase == phases::Assemble)
+        return ABRT_Success;
+
+      assert((CurPhase == phases::Link ||
+              CudaDeviceActions.size() == GpuArchList.size()) &&
+             "Expecting one action per GPU architecture.");
+      assert(!CompileHostOnly &&
+             "Not expecting CUDA actions in host-only compilation.");
+
+      // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+      // This happens to each device action originated from each input file.
+      // Later on, device actions in DeviceLinkerInputs are used to create
+      // device link actions in appendLinkDependences and the created device
+      // link actions are passed to the offload action as device dependence.
+      if (CurPhase == phases::Link) {
+        DeviceLinkerInputs.resize(CudaDeviceActions.size());
+        auto LI = DeviceLinkerInputs.begin();
+        for (auto *A : CudaDeviceActions) {
+          LI->push_back(A);
+          ++LI;
        }

+        // We will pass the device action as a host dependence, so we don't
+        // need to do anything else with them.
+        CudaDeviceActions.clear();
        return ABRT_Success;
      }

-      return IsActive ? ABRT_Success : ABRT_Inactive;
+      // By default, we produce an action for each device arch.
+      for (Action *&A : CudaDeviceActions)
+        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
+                                               AssociatedOffloadKind);
+
+      return ABRT_Success;
    }

-    void appendTopLevelActions(ActionList &AL) override {
-      // Utility to append actions to the top level list.
-      auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
-        OffloadAction::DeviceDependences Dep;
-        Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
-                Action::OFK_Cuda);
-        AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
-      };
-
-      // If we have a fat binary, add it to the list.
-      if (CudaFatBinary) {
-        AddTopLevel(CudaFatBinary, CudaArch::UNKNOWN);
-        CudaDeviceActions.clear();
-        CudaFatBinary = nullptr;
-        return;
+    void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
+      // Append a new link action for each device.
+      unsigned I = 0;
+      for (auto &LI : DeviceLinkerInputs) {
+        auto *DeviceLinkAction =
+            C.MakeAction<LinkJobAction>(LI, types::TY_Image);
+        DA.add(*DeviceLinkAction, *ToolChains[0],
+               CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+        ++I;
      }
-
-      if (CudaDeviceActions.empty())
-        return;
-
-      // If we have CUDA actions at this point, that's because we have a have
-      // partial compilation, so we should have an action for each GPU
-      // architecture.
-      assert(CudaDeviceActions.size() == GpuArchList.size() &&
-             "Expecting one action per GPU architecture.");
-      assert(ToolChains.size() == 1 &&
-             "Expecting to have a sing CUDA toolchain.");
-      for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-        AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
-
-      CudaDeviceActions.clear();
-    }
-
-    bool initialize() override {
-      // We don't need to support CUDA.
-      if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
-        return false;
-
-      const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
-      assert(HostTC && "No toolchain for host compilation.");
-      if (HostTC->getTriple().isNVPTX() ||
-          HostTC->getTriple().getArch() == llvm::Triple::amdgcn) {
-        // We do not support targeting NVPTX/AMDGCN for host compilation. Throw
-        // an error and abort pipeline construction early so we don't trip
-        // asserts that assume device-side compilation.
-        C.getDriver().Diag(diag::err_drv_cuda_host_arch)
-            << HostTC->getTriple().getArchName();
-        return true;
-      }
-
-      ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
-
-      Arg *PartialCompilationArg = Args.getLastArg(
-          options::OPT_cuda_host_only, options::OPT_cuda_device_only,
-          options::OPT_cuda_compile_host_device);
-      CompileHostOnly = PartialCompilationArg &&
-                        PartialCompilationArg->getOption().matches(
-                            options::OPT_cuda_host_only);
-      CompileDeviceOnly = PartialCompilationArg &&
-                          PartialCompilationArg->getOption().matches(
-                              options::OPT_cuda_device_only);
-
-      // Collect all cuda_gpu_arch parameters, removing duplicates.
-      std::set<CudaArch> GpuArchs;
-      bool Error = false;
-      for (Arg *A : Args) {
-        if (!(A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) ||
-              A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ)))
-          continue;
-        A->claim();
-
-        const StringRef ArchStr = A->getValue();
-        if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ) &&
-            ArchStr == "all") {
-          GpuArchs.clear();
-          continue;
-        }
-        CudaArch Arch = StringToCudaArch(ArchStr);
-        if (Arch == CudaArch::UNKNOWN) {
-          C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
-          Error = true;
-        } else if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
-          GpuArchs.insert(Arch);
-        else if (A->getOption().matches(options::OPT_no_cuda_gpu_arch_EQ))
-          GpuArchs.erase(Arch);
-        else
-          llvm_unreachable("Unexpected option.");
-      }
-
-      // Collect list of GPUs remaining in the set.
-      for (CudaArch Arch : GpuArchs)
-        GpuArchList.push_back(Arch);
-
-      // Default to sm_20 which is the lowest common denominator for
-      // supported GPUs.  sm_20 code should work correctly, if
-      // suboptimally, on all newer GPUs.
-      if (GpuArchList.empty())
-        GpuArchList.push_back(CudaArch::SM_20);
-
-      return Error;
    }
  };

@ -2589,6 +2690,9 @@ public:
    // Create a specialized builder for CUDA.
    SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));

+    // Create a specialized builder for HIP.
+    SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs));
+
    // Create a specialized builder for OpenMP.
    SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs));

--- a/test/Driver/cuda-phases.cu
+++ b/test/Driver/cuda-phases.cu
@ -7,195 +7,267 @@
 // REQUIRES: clang-driver
 // REQUIRES: powerpc-registered-target
 // REQUIRES: nvptx-registered-target
-
+// REQUIRES: amdgpu-registered-target
 //
 // Test single gpu architecture with complete compilation.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
-// RUN: | FileCheck -check-prefix=BIN %s
-// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
-// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda)
-// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir
-// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda)
-// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda)
-// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:sm_30]])
+// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:gfx803]])
+// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object
+// BIN_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler
+// BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
+// BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
+// BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
+// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
+// BIN_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object

 //
 // Test single gpu architecture up to the assemble phase.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
-// RUN: | FileCheck -check-prefix=ASM %s
-// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda)
-// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda)
-// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
+// ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// ASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// ASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
+// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-[[T]])
+// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-[[T]])
+// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-[[T]])

 //
 // Test two gpu architectures with complete compilation.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
-// RUN: | FileCheck -check-prefix=BIN2 %s
-// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
-// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35)
-// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35)
-// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35)
-// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35)
-// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object
-// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler
-// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda)
-// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir
-// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda)
-// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda)
-// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
+// BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH1:sm_30|gfx803]])
+// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH1]])" {[[P7]]}, object
+// BIN2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler
+// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P15:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object
+// BIN2_NV-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler
+// BIN2_NV-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-[[T]])
+// BIN2_NV-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P17]]}, ir
+// BIN2_NV-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]])
+// BIN2_AMD-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
+// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]])
+// BIN2_AMD-DAG: [[P22:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH1]])
+// BIN2_AMD-DAG: [[P23:[0-9]+]]: linker, {[[P12]]}, image, (device-[[T]], [[ARCH2]])
+// BIN2_AMD-DAG: [[P24:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P21]]},
+// BIN2_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P22]]},
+// BIN2_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH2]])" {[[P23]]}, object

 //
 // Test two gpu architecturess up to the assemble phase.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
-// RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
-// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda)
-// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda)
-// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
+// ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
+// ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
+// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH1]])
+// ASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH1]])
+// ASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler
+// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// ASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
+// ASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
+// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
+// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-[[T]])
+// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-[[T]])
+// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-[[T]])

 //
 // Test single gpu architecture with complete compilation in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
-// RUN: | FileCheck -check-prefix=HBIN %s
-// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
-// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
-// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN,HBIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN,HBIN_AMD %s
+// HBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
+// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
 //
 // Test single gpu architecture up to the assemble phase in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=HASM %s
-// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=HASM,HASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=HASM,HASM_AMD %s
+// HASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])

 //
 // Test two gpu architectures with complete compilation in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
-// RUN: | FileCheck -check-prefix=HBIN2 %s
-// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
-// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
-// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_AMD %s
+// HBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
+// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])

 //
 // Test two gpu architectures up to the assemble phase in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=HASM2 %s
-// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_AMD %s
+// HASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])

 //
 // Test single gpu architecture with complete compilation in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
-// RUN: | FileCheck -check-prefix=DBIN %s
-// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN,DBIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN,DBIN_AMD %s
+// DBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object

 //
 // Test single gpu architecture up to the assemble phase in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=DASM %s
-// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM,DASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM,DASM_AMD %s
+// DASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler

 //
 // Test two gpu architectures with complete compilation in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
-// RUN: | FileCheck -check-prefix=DBIN2 %s
-// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
-// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35)
-// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35)
-// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35)
-// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN2,DBIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only \
+// RUN: 2>&1 | FileCheck -check-prefixes=DBIN2,DBIN2_AMD %s
+// DBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P4]]}, object
+// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object

 //
 // Test two gpu architectures up to the assemble phase in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=DASM2 %s
-// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASM2,DASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2_AMD %s
+// DASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// DASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
+// DASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler