[CUDA] Driver changes to support CUDA compilation on MacOS.

Summary: Compiling CUDA device code requires us to know the host toolchain, because CUDA device-side compiles pull in e.g. host headers. When we only supported Linux compilation, this worked because CudaToolChain, which is responsible for device-side CUDA compilation, inherited from the Linux toolchain. But in order to support MacOS, CudaToolChain needs to take a HostToolChain pointer. Because a CUDA toolchain now requires a host TC, we no longer will create a CUDA toolchain from Driver::getToolChain -- you have to go through CreateOffloadingDeviceToolChains. I am *pretty* sure this is correct, and that previously any attempt to create a CUDA toolchain through getToolChain() would eventually have resulted in us throwing "error: unsupported use of NVPTX for host compilation". In any case hacking getToolChain to create a CUDA+host toolchain would be wrong, because a Driver can be reused for multiple compilations, potentially with different host TCs, and getToolChain will cache the result, causing us to potentially use a stale host TC. So that's the main change in this patch. In addition, we have to pull CudaInstallationDetector out of Generic_GCC and into a top-level class. It's now used by the Generic_GCC and MachO toolchains. Reviewers: tra Subscribers: rryan, hfinkel, sfantao Differential Revision: https://reviews.llvm.org/D26774 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@287285 91177308-0d34-0410-b5e6-96231b3b80d8
2016-11-18 00:41:22 +00:00 · 2016-11-18 00:41:22 +00:00 · 29df057cd3
parent c2d61705d6
commit 29df057cd3
13 changed files with 247 additions and 113 deletions
--- a/include/clang/Driver/ToolChain.h
+++ b/include/clang/Driver/ToolChain.h
@ -38,6 +38,7 @@ class FileSystem;

 namespace driver {
  class Compilation;
+  class CudaInstallationDetector;
  class Driver;
  class JobAction;
  class RegisterEffectiveTriple;
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@ -473,14 +473,18 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
  if (llvm::any_of(Inputs, [](std::pair<types::ID, const llvm::opt::Arg *> &I) {
        return types::isCuda(I.first);
      })) {
-    const ToolChain &TC = getToolChain(
-        C.getInputArgs(),
-        llvm::Triple(C.getSingleOffloadToolChain<Action::OFK_Host>()
-                             ->getTriple()
-                             .isArch64Bit()
-                         ? "nvptx64-nvidia-cuda"
-                         : "nvptx-nvidia-cuda"));
-    C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda);
+    const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+    const llvm::Triple &HostTriple = HostTC->getTriple();
+    llvm::Triple CudaTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda"
+                                                     : "nvptx-nvidia-cuda");
+    // Use the CUDA and host triples as the key into the ToolChains map, because
+    // the device toolchain we create depends on both.
+    ToolChain *&CudaTC = ToolChains[CudaTriple.str() + "/" + HostTriple.str()];
+    if (!CudaTC) {
+      CudaTC = new toolchains::CudaToolChain(*this, CudaTriple, *HostTC,
+                                             C.getInputArgs());
+    }
+    C.addOffloadDeviceToolChain(CudaTC, Action::OFK_Cuda);
  }

  //
@ -3717,9 +3721,6 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
        break;
      }
      break;
-    case llvm::Triple::CUDA:
-      TC = new toolchains::CudaToolChain(*this, Target, Args);
-      break;
    case llvm::Triple::PS4:
      TC = new toolchains::PS4CPU(*this, Target, Args);
      break;
@ -3761,6 +3762,12 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
      }
    }
  }
+
+  // Intentionally omitted from the switch above: llvm::Triple::CUDA.  CUDA
+  // compiles always need two toolchains, the CUDA toolchain and the host
+  // toolchain.  So the only valid way to create a CUDA toolchain is via
+  // CreateOffloadingDeviceToolChains.
+
  return *TC;
 }

--- a/lib/Driver/ToolChains.cpp
+++ b/lib/Driver/ToolChains.cpp
@ -52,7 +52,8 @@ MachO::MachO(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)

 /// Darwin - Darwin tool chain for i386 and x86_64.
 Darwin::Darwin(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
-    : MachO(D, Triple, Args), TargetInitialized(false) {}
+    : MachO(D, Triple, Args), TargetInitialized(false),
+      CudaInstallation(D, Triple, Args) {}

 types::ID MachO::LookupTypeForExtension(StringRef Ext) const {
  types::ID Ty = types::lookupTypeForExtension(Ext);
@ -99,6 +100,11 @@ bool Darwin::hasBlocksRuntime() const {
  }
 }

+void Darwin::AddCudaIncludeArgs(const ArgList &DriverArgs,
+                                ArgStringList &CC1Args) const {
+  CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
+}
+
 // This is just a MachO name translation routine and there's no
 // way to join this into ARMTargetParser without breaking all
 // other assumptions. Maybe MachO should consider standardising
@ -1296,6 +1302,10 @@ SanitizerMask Darwin::getSupportedSanitizers() const {
  return Res;
 }

+void Darwin::printVerboseInfo(raw_ostream &OS) const {
+  CudaInstallation.print(OS);
+}
+
 /// Generic_GCC - A tool chain using the 'gcc' command to perform
 /// all subcommands; this relies on gcc translating the majority of
 /// command line options.
@ -1811,10 +1821,10 @@ static CudaVersion ParseCudaVersionFile(llvm::StringRef V) {
  return CudaVersion::UNKNOWN;
 }

-// \brief -- try common CUDA installation paths looking for files we need for
-// CUDA compilation.
-void Generic_GCC::CudaInstallationDetector::init(
-    const llvm::Triple &TargetTriple, const llvm::opt::ArgList &Args) {
+CudaInstallationDetector::CudaInstallationDetector(
+    const Driver &D, const llvm::Triple &TargetTriple,
+    const llvm::opt::ArgList &Args)
+    : D(D) {
  SmallVector<std::string, 4> CudaPathCandidates;

  if (Args.hasArg(options::OPT_cuda_path_EQ))
@ -1835,13 +1845,25 @@ void Generic_GCC::CudaInstallationDetector::init(
    BinPath = CudaPath + "/bin";
    IncludePath = InstallPath + "/include";
    LibDevicePath = InstallPath + "/nvvm/libdevice";
-    LibPath = InstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");

    auto &FS = D.getVFS();
-    if (!(FS.exists(IncludePath) && FS.exists(BinPath) && FS.exists(LibPath) &&
+    if (!(FS.exists(IncludePath) && FS.exists(BinPath) &&
          FS.exists(LibDevicePath)))
      continue;

+    // On Linux, we have both lib and lib64 directories, and we need to choose
+    // based on our triple.  On MacOS, we have only a lib directory.
+    //
+    // It's sufficient for our purposes to be flexible: If both lib and lib64
+    // exist, we choose whichever one matches our triple.  Otherwise, if only
+    // lib exists, we use it.
+    if (TargetTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64"))
+      LibPath = InstallPath + "/lib64";
+    else if (FS.exists(InstallPath + "/lib"))
+      LibPath = InstallPath + "/lib";
+    else
+      continue;
+
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> VersionFile =
        FS.getBufferForFile(InstallPath + "/version.txt");
    if (!VersionFile) {
@ -1898,7 +1920,33 @@ void Generic_GCC::CudaInstallationDetector::init(
  }
 }

-void Generic_GCC::CudaInstallationDetector::CheckCudaVersionSupportsArch(
+void CudaInstallationDetector::AddCudaIncludeArgs(
+    const ArgList &DriverArgs, ArgStringList &CC1Args) const {
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    // Add cuda_wrappers/* to our system include path.  This lets us wrap
+    // standard library headers.
+    SmallString<128> P(D.ResourceDir);
+    llvm::sys::path::append(P, "include");
+    llvm::sys::path::append(P, "cuda_wrappers");
+    CC1Args.push_back("-internal-isystem");
+    CC1Args.push_back(DriverArgs.MakeArgString(P));
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nocudainc))
+    return;
+
+  if (!isValid()) {
+    D.Diag(diag::err_drv_no_cuda_installation);
+    return;
+  }
+
+  CC1Args.push_back("-internal-isystem");
+  CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
+  CC1Args.push_back("-include");
+  CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
+}
+
+void CudaInstallationDetector::CheckCudaVersionSupportsArch(
    CudaArch Arch) const {
  if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
      ArchsWithVersionTooLowErrors.count(Arch) > 0)
@ -1913,7 +1961,7 @@ void Generic_GCC::CudaInstallationDetector::CheckCudaVersionSupportsArch(
  }
 }

-void Generic_GCC::CudaInstallationDetector::print(raw_ostream &OS) const {
+void CudaInstallationDetector::print(raw_ostream &OS) const {
  if (isValid())
    OS << "Found CUDA installation: " << InstallPath << ", version "
       << CudaVersionToString(Version) << "\n";
@ -2756,7 +2804,8 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(

 Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple &Triple,
                         const ArgList &Args)
-    : ToolChain(D, Triple, Args), GCCInstallation(D), CudaInstallation(D) {
+    : ToolChain(D, Triple, Args), GCCInstallation(D),
+      CudaInstallation(D, Triple, Args) {
  getProgramPaths().push_back(getDriver().getInstalledDir());
  if (getDriver().getInstalledDir() != getDriver().Dir)
    getProgramPaths().push_back(getDriver().Dir);
@ -4162,7 +4211,6 @@ static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
 Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
    : Generic_ELF(D, Triple, Args) {
  GCCInstallation.init(Triple, Args);
-  CudaInstallation.init(Triple, Args);
  Multilibs = GCCInstallation.getMultilibs();
  llvm::Triple::ArchType Arch = Triple.getArch();
  std::string SysRoot = computeSysRoot();
@ -4767,26 +4815,7 @@ void Linux::addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,

 void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs,
                               ArgStringList &CC1Args) const {
-  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
-    // Add cuda_wrappers/* to our system include path.  This lets us wrap
-    // standard library headers.
-    SmallString<128> P(getDriver().ResourceDir);
-    llvm::sys::path::append(P, "include");
-    llvm::sys::path::append(P, "cuda_wrappers");
-    addSystemInclude(DriverArgs, CC1Args, P);
-  }
-
-  if (DriverArgs.hasArg(options::OPT_nocudainc))
-    return;
-
-  if (!CudaInstallation.isValid()) {
-    getDriver().Diag(diag::err_drv_no_cuda_installation);
-    return;
-  }
-
-  addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath());
-  CC1Args.push_back("-include");
-  CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
+  CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }

 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
@ -4968,16 +4997,18 @@ Tool *DragonFly::buildLinker() const {
 /// together object files from the assembler into a single blob.

 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                             const ArgList &Args)
-    : Linux(D, Triple, Args) {
+                             const ToolChain &HostTC, const ArgList &Args)
+    : ToolChain(D, Triple, Args), HostTC(HostTC),
+      CudaInstallation(D, Triple, Args) {
  if (CudaInstallation.isValid())
    getProgramPaths().push_back(CudaInstallation.getBinPath());
 }

-void
-CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
-                                     llvm::opt::ArgStringList &CC1Args) const {
-  Linux::addClangTargetOptions(DriverArgs, CC1Args);
+void CudaToolChain::addClangTargetOptions(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  HostTC.addClangTargetOptions(DriverArgs, CC1Args);
+
  CC1Args.push_back("-fcuda-is-device");

  if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
@ -5019,13 +5050,18 @@ void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
    assert(!Arch.empty() && "Must have an explicit GPU arch.");
    CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch));
  }
-  Linux::AddCudaIncludeArgs(DriverArgs, CC1Args);
+  CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }

 llvm::opt::DerivedArgList *
 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
-                             StringRef BoundArch, Action::OffloadKind) const {
-  DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
+                             StringRef BoundArch,
+                             Action::OffloadKind DeviceOffloadKind) const {
+  DerivedArgList *DAL =
+      HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind);
+  if (!DAL)
+    DAL = new DerivedArgList(Args.getBaseArgs());
+
  const OptTable &Opts = getDriver().getOpts();

  for (Arg *A : Args) {
@ -5077,6 +5113,30 @@ Tool *CudaToolChain::buildLinker() const {
  return new tools::NVPTX::Linker(*this);
 }

+void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
+  HostTC.addClangWarningOptions(CC1Args);
+}
+
+ToolChain::CXXStdlibType
+CudaToolChain::GetCXXStdlibType(const ArgList &Args) const {
+  return HostTC.GetCXXStdlibType(Args);
+}
+
+void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
+                                              ArgStringList &CC1Args) const {
+  HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
+}
+
+void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,
+                                                 ArgStringList &CC1Args) const {
+  HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
+}
+
+void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
+                                        ArgStringList &CC1Args) const {
+  HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
+}
+
 /// XCore tool chain
 XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple,
                               const ArgList &Args)
--- a/lib/Driver/ToolChains.h
+++ b/lib/Driver/ToolChains.h
@ -24,6 +24,60 @@

 namespace clang {
 namespace driver {
+
+/// A class to find a viable CUDA installation
+class CudaInstallationDetector {
+private:
+  const Driver &D;
+  bool IsValid = false;
+  CudaVersion Version = CudaVersion::UNKNOWN;
+  std::string InstallPath;
+  std::string BinPath;
+  std::string LibPath;
+  std::string LibDevicePath;
+  std::string IncludePath;
+  llvm::StringMap<std::string> LibDeviceMap;
+
+  // CUDA architectures for which we have raised an error in
+  // CheckCudaVersionSupportsArch.
+  mutable llvm::SmallSet<CudaArch, 4> ArchsWithVersionTooLowErrors;
+
+public:
+  CudaInstallationDetector(const Driver &D, const llvm::Triple &Triple,
+                           const llvm::opt::ArgList &Args);
+
+  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const;
+
+  /// \brief Emit an error if Version does not support the given Arch.
+  ///
+  /// If either Version or Arch is unknown, does not emit an error.  Emits at
+  /// most one error per Arch.
+  void CheckCudaVersionSupportsArch(CudaArch Arch) const;
+
+  /// \brief Check whether we detected a valid Cuda install.
+  bool isValid() const { return IsValid; }
+  /// \brief Print information about the detected CUDA installation.
+  void print(raw_ostream &OS) const;
+
+  /// \brief Get the detected Cuda install's version.
+  CudaVersion version() const { return Version; }
+  /// \brief Get the detected Cuda installation path.
+  StringRef getInstallPath() const { return InstallPath; }
+  /// \brief Get the detected path to Cuda's bin directory.
+  StringRef getBinPath() const { return BinPath; }
+  /// \brief Get the detected Cuda Include path.
+  StringRef getIncludePath() const { return IncludePath; }
+  /// \brief Get the detected Cuda library path.
+  StringRef getLibPath() const { return LibPath; }
+  /// \brief Get the detected Cuda device library path.
+  StringRef getLibDevicePath() const { return LibDevicePath; }
+  /// \brief Get libdevice file for given architecture
+  std::string getLibDeviceFile(StringRef Gpu) const {
+    return LibDeviceMap.lookup(Gpu);
+  }
+};
+
 namespace toolchains {

 /// Generic_GCC - A tool chain using the 'gcc' command to perform
@ -157,57 +211,6 @@ public:

 protected:
  GCCInstallationDetector GCCInstallation;
-
-  // \brief A class to find a viable CUDA installation
-  class CudaInstallationDetector {
-  private:
-    const Driver &D;
-    bool IsValid = false;
-    CudaVersion Version = CudaVersion::UNKNOWN;
-    std::string InstallPath;
-    std::string BinPath;
-    std::string LibPath;
-    std::string LibDevicePath;
-    std::string IncludePath;
-    llvm::StringMap<std::string> LibDeviceMap;
-
-    // CUDA architectures for which we have raised an error in
-    // CheckCudaVersionSupportsArch.
-    mutable llvm::SmallSet<CudaArch, 4> ArchsWithVersionTooLowErrors;
-
-  public:
-    CudaInstallationDetector(const Driver &D) : D(D) {}
-    void init(const llvm::Triple &TargetTriple, const llvm::opt::ArgList &Args);
-
-    /// \brief Emit an error if Version does not support the given Arch.
-    ///
-    /// If either Version or Arch is unknown, does not emit an error.  Emits at
-    /// most one error per Arch.
-    void CheckCudaVersionSupportsArch(CudaArch Arch) const;
-
-    /// \brief Check whether we detected a valid Cuda install.
-    bool isValid() const { return IsValid; }
-    /// \brief Print information about the detected CUDA installation.
-    void print(raw_ostream &OS) const;
-
-    /// \brief Get the detected Cuda install's version.
-    CudaVersion version() const { return Version; }
-    /// \brief Get the detected Cuda installation path.
-    StringRef getInstallPath() const { return InstallPath; }
-    /// \brief Get the detected path to Cuda's bin directory.
-    StringRef getBinPath() const { return BinPath; }
-    /// \brief Get the detected Cuda Include path.
-    StringRef getIncludePath() const { return IncludePath; }
-    /// \brief Get the detected Cuda library path.
-    StringRef getLibPath() const { return LibPath; }
-    /// \brief Get the detected Cuda device library path.
-    StringRef getLibDevicePath() const { return LibDevicePath; }
-    /// \brief Get libdevice file for given architecture
-    std::string getLibDeviceFile(StringRef Gpu) const {
-      return LibDeviceMap.lookup(Gpu);
-    }
-  };
-
  CudaInstallationDetector CudaInstallation;

 public:
@ -403,6 +406,8 @@ public:
  /// The OS version we are targeting.
  mutable VersionTuple TargetVersion;

+  CudaInstallationDetector CudaInstallation;
+
 private:
  void AddDeploymentTarget(llvm::opt::DerivedArgList &Args) const;

@ -543,6 +548,9 @@ public:
  ObjCRuntime getDefaultObjCRuntime(bool isNonFragile) const override;
  bool hasBlocksRuntime() const override;

+  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+
  bool UseObjCMixedDispatch() const override {
    // This is only used with the non-fragile ABI and non-legacy dispatch.

@ -572,6 +580,8 @@ public:
  bool SupportsEmbeddedBitcode() const override;

  SanitizerMask getSupportedSanitizers() const override;
+
+  void printVerboseInfo(raw_ostream &OS) const override;
 };

 /// DarwinClang - The Darwin toolchain used by Clang.
@ -867,10 +877,10 @@ protected:
  Tool *buildLinker() const override;
 };

-class LLVM_LIBRARY_VISIBILITY CudaToolChain : public Linux {
+class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
 public:
  CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                const llvm::opt::ArgList &Args);
+                const ToolChain &HostTC, const llvm::opt::ArgList &Args);

  llvm::opt::DerivedArgList *
  TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
@ -881,16 +891,29 @@ public:
  // Never try to use the integrated assembler with CUDA; always fork out to
  // ptxas.
  bool useIntegratedAs() const override { return false; }
+  bool isCrossCompiling() const override { return true; }
+  bool isPICDefault() const override { return false; }
+  bool isPIEDefault() const override { return false; }
+  bool isPICDefaultForced() const override { return false; }
+  bool SupportsProfiling() const override { return false; }
+  bool SupportsObjCGC() const override { return false; }

  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                          llvm::opt::ArgStringList &CC1Args) const override;

-  const Generic_GCC::CudaInstallationDetector &cudaInstallation() const {
-    return CudaInstallation;
-  }
-  Generic_GCC::CudaInstallationDetector &cudaInstallation() {
-    return CudaInstallation;
-  }
+  void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const override;
+  CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override;
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+  void AddClangCXXStdlibIncludeArgs(
+      const llvm::opt::ArgList &Args,
+      llvm::opt::ArgStringList &CC1Args) const override;
+  void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
+
+  const ToolChain &HostTC;
+  CudaInstallationDetector CudaInstallation;

 protected:
  Tool *buildAssembler() const override;  // ptxas
--- a/lib/Driver/Tools.cpp
+++ b/lib/Driver/Tools.cpp
@ -11981,7 +11981,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,

  // Check that our installation's ptxas supports gpu_arch.
  if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
-    TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch);
+    TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch);
  }

  ArgStringList CmdArgs;
--- a/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/bin/.keep
+++ b/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/bin/.keep
--- a/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/include/.keep
+++ b/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/include/.keep
--- a/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/lib/.keep
+++ b/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/lib/.keep
--- a/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc
+++ b/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc
--- a/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
+++ b/test/Driver/Inputs/CUDA-macosx/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
--- a/test/Driver/cuda-detect.cu
+++ b/test/Driver/cuda-detect.cu
@ -5,10 +5,18 @@
 // # Check that we properly detect CUDA installation.
 // RUN: %clang -v --target=i386-unknown-linux \
 // RUN:   --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN: %clang -v --target=i386-apple-macosx \
+// RUN:   --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
+
 // RUN: %clang -v --target=i386-unknown-linux \
 // RUN:   --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s
+// RUN: %clang -v --target=i386-apple-macosx \
+// RUN:   --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s
+
 // RUN: %clang -v --target=i386-unknown-linux \
 // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s
+// RUN: %clang -v --target=i386-apple-macosx \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s

 // Make sure we map libdevice bitcode files to proper GPUs. These
 // tests use Inputs/CUDA_80 which has full set of libdevice files.
@ -51,33 +59,51 @@
 // RUN:   | FileCheck %s -check-prefix COMMON \
 // RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE50

-
 // Verify that -nocudainc prevents adding include path to CUDA headers.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC \
 // RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE35
+// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \
+// RUN:   -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC \
+// RUN:     -check-prefix LIBDEVICE -check-prefix LIBDEVICE35
+
 // We should not add any CUDA include paths if there's no valid CUDA installation
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   --cuda-path=%S/no-cuda-there %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC
+// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \
+// RUN:   --cuda-path=%S/no-cuda-there %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC

 // Verify that we get an error if there's no libdevice library to link with.
 // NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_20  for this purpose.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_20 \
 // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE
+// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_20 \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE

 // Verify that  -nocudalib prevents linking libdevice bitcode in.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOLIBDEVICE
+// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \
+// RUN:   -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON -check-prefix NOLIBDEVICE
+
 // Verify that we don't add include paths, link with libdevice or
 // -include __clang_cuda_runtime_wrapper.h without valid CUDA installation.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   --cuda-path=%S/no-cuda-there %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
 // RUN:     -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE
+// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \
+// RUN:   --cuda-path=%S/no-cuda-there %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix COMMON \
+// RUN:     -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE

 // Verify that C++ include paths are passed for both host and device frontends.
 // RUN: %clang -### -no-canonical-prefixes -target x86_64-linux-gnu %s \
--- a/test/Driver/cuda-external-tools.cu
+++ b/test/Driver/cuda-external-tools.cu
@ -1,4 +1,5 @@
-// Tests that ptxas and fatbinary are correctly during CUDA compilation.
+// Tests that ptxas and fatbinary are invoked correctly during CUDA
+// compilation.
 //
 // REQUIRES: clang-driver
 // REQUIRES: x86-registered-target
@ -56,6 +57,14 @@
 // RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \
 // RUN:   -check-prefix FATBINARY-EXTRA %s

+// MacOS spot-checks
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+// RUN: %clang -### -target x86_32-apple-macosx -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s
+
 // Match clang job that produces PTX assembly.
 // CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // SM20: "-target-cpu" "sm_20"
--- a/test/Driver/cuda-macosx.cu
+++ b/test/Driver/cuda-macosx.cu
@ -0,0 +1,8 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+//
+// RUN: %clang -v --target=i386-apple-macosx \
+// RUN:   --sysroot=%S/Inputs/CUDA-macosx 2>&1 | FileCheck %s
+
+// CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA-macosx/usr/local/cuda