# jittor/python/jittor/compile_extern.py
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os, sys, shutil
from .compiler import *
from jittor_utils import run_cmd, get_version, get_int_version
from jittor.utils.misc import download_url_to_local

def search_file(dirs, name, prefer_version=()):
    for d in dirs:
        fname = os.path.join(d, name)
        for i in range(len(prefer_version),-1,-1):
            vname = ".".join((fname,)+prefer_version[:i])
            if os.path.isfile(vname):
                LOG.i(f"found {vname}")
                return vname
    LOG.f(f"file {name} not found in {dirs}")

def install_mkl(root_folder):
    # original url:
    # url = "https://github.com/intel/mkl-dnn/releases/download/v1.0.2/mkldnn_lnx_1.0.2_cpu_gomp.tgz"
    url = "https://cloud.tsinghua.edu.cn/f/da02bf62b55b4aa3b8ee/?dl=1"
    filename = "mkldnn_lnx_1.0.2_cpu_gomp.tgz"
    # newest version for oneDNN:
    # url = "https://github.com/oneapi-src/oneDNN/releases/download/v2.2/dnnl_lnx_2.2.0_cpu_gomp.tgz"
    # filename = "dnnl_lnx_2.2.0_cpu_gomp.tgz"
    fullname = os.path.join(root_folder, filename)
    dirname = os.path.join(root_folder, filename.replace(".tgz",""))

    if not os.path.isfile(os.path.join(dirname, "examples", "test")):
        LOG.i("Downloading mkl...")
        download_url_to_local(url, filename, root_folder, "47187284ede27ad3bd64b5f0e7d5e730")
        # newest version for oneDNN:
        # download_url_to_local(url, filename, root_folder, "35bbbdf550a9d8ad54db798e372000f6")
        import tarfile
        with tarfile.open(fullname, "r") as tar:
            tar.extractall(root_folder)
        # build and run the bundled example to verify the toolchain and the library
        assert 0 == os.system(f"cd {dirname}/examples && "
            f"{cc_path} -std=c++14 cpu_cnn_inference_f32.cpp -Ofast -lmkldnn -I ../include -L ../lib -o test && LD_LIBRARY_PATH=../lib/ ./test")
        # newest version for oneDNN:
        # assert 0 == os.system(f"cd {dirname}/examples && "
        #     f"{cc_path} -std=c++14 cnn_inference_f32.cpp -Ofast -lmkldnn -I ../include -L ../lib -o test && LD_LIBRARY_PATH=../lib/ ./test")

def setup_mkl():
    global mkl_ops, use_mkl
    use_mkl = os.environ.get("use_mkl", "1")=="1"
    mkl_ops = None
    if not use_mkl: return
    mkl_include_path = os.environ.get("mkl_include_path")
    mkl_lib_path = os.environ.get("mkl_lib_path")

    if mkl_lib_path is None or mkl_include_path is None:
        mkl_install_sh = os.path.join(jittor_path, "script", "install_mkl.sh")
        LOG.v("setup mkl...")
        # mkl_path = os.path.join(cache_path, "mkl")
        # mkl_path is decoupled from cc_path
        from pathlib import Path
        mkl_path = os.path.join(str(Path.home()), ".cache", "jittor", "mkl")
        make_cache_dir(mkl_path)
        install_mkl(mkl_path)
        mkl_home = ""
        for name in os.listdir(mkl_path):
            if name.startswith("mkldnn_lnx") and os.path.isdir(os.path.join(mkl_path, name)):
                mkl_home = os.path.join(mkl_path, name)
                break
        assert mkl_home!=""
        mkl_include_path = os.path.join(mkl_home, "include")
        mkl_lib_path = os.path.join(mkl_home, "lib")

    mkl_lib_name = os.path.join(mkl_lib_path, "libmkldnn.so")
    assert os.path.isdir(mkl_include_path)
    assert os.path.isdir(mkl_lib_path)
    assert os.path.isfile(mkl_lib_name)
    LOG.v(f"mkl_include_path: {mkl_include_path}")
    LOG.v(f"mkl_lib_path: {mkl_lib_path}")
    LOG.v(f"mkl_lib_name: {mkl_lib_name}")
    # We do not link manually here; the library is linked in the custom ops below.
    # ctypes.CDLL(mkl_lib_name, dlopen_flags)
    mkl_op_dir = os.path.join(jittor_path, "extern", "mkl", "ops")
    mkl_op_files = [os.path.join(mkl_op_dir, name) for name in os.listdir(mkl_op_dir)]
    mkl_ops = compile_custom_ops(mkl_op_files,
        extra_flags=f" -I'{mkl_include_path}' -L'{mkl_lib_path}' -lmkldnn -Wl,-rpath='{mkl_lib_path}' ")
    LOG.vv("Get mkl_ops: "+str(dir(mkl_ops)))

def install_cub(root_folder):
    url = "https://github.com/NVIDIA/cub/archive/1.11.0.tar.gz"
    url = "https://codeload.github.com/NVIDIA/cub/tar.gz/1.11.0"
    filename = "cub-1.11.0.tgz"
    md5 = "97196a885598e40592100e1caaf3d5ea"
    fullname = os.path.join(root_folder, filename)
    dirname = os.path.join(root_folder, filename.replace(".tgz",""))

    if not os.path.isfile(os.path.join(dirname, "examples", "test")):
        LOG.i("Downloading cub...")
        download_url_to_local(url, filename, root_folder, md5)
        import tarfile
        with tarfile.open(fullname, "r") as tar:
            tar.extractall(root_folder)
        assert 0 == os.system(f"cd {dirname}/examples && "
            f"{nvcc_path} device/example_device_radix_sort.cu -O2 -I.. -std=c++14 -o test")
        if core.get_device_count():
            assert 0 == os.system(f"cd {dirname}/examples && ./test")
    return dirname

def setup_cub():
    global cub_home
    cub_home = ""
    from pathlib import Path
    cub_path = os.path.join(str(Path.home()), ".cache", "jittor", "cub")
    cuda_version = int(get_version(nvcc_path)[1:-1].split('.')[0])
    extra_flags = ""
    if cuda_version < 11:
        cub_home = install_cub(cub_path)
        extra_flags = f"-I{cub_home}"
        cub_home += "/"
    setup_cuda_lib("cub", link=False, extra_flags=extra_flags)

def setup_cuda_extern():
    if not has_cuda: return
    LOG.vv("setup cuda extern...")
    cache_path_cuda = os.path.join(cache_path, "cuda")
    cuda_include = os.path.join(jittor_path, "extern", "cuda", "inc")
    make_cache_dir(cache_path_cuda)
    cuda_extern_src = os.path.join(jittor_path, "extern", "cuda", "src")
    cuda_extern_files = [os.path.join(cuda_extern_src, name)
        for name in os.listdir(cuda_extern_src)]
    so_name = os.path.join(cache_path_cuda, "cuda_extern.so")
    compile(cc_path, cc_flags+f" -I'{cuda_include}' ", cuda_extern_files, so_name)
    ctypes.CDLL(so_name, dlopen_flags)

    try:
        setup_cub()
    except Exception as e:
        import traceback
        line = traceback.format_exc()
        LOG.w(f"CUDA found but cub is not loaded:\n{line}")

    libs = ["cublas", "cudnn", "curand"]
    for lib_name in libs:
        try:
            setup_cuda_lib(lib_name)
        except Exception as e:
            import traceback
            line = traceback.format_exc()
            LOG.w(f"CUDA found but {lib_name} is not loaded:\n{line}")
            if lib_name == "cudnn":
                LOG.w("Development version of cuDNN not found, "
                    "please refer to the official CUDA/cuDNN tar file installation guide: "
                    "https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar")

def setup_cuda_lib(lib_name, link=True, extra_flags=""):
    globals()[lib_name+"_ops"] = None
    globals()[lib_name] = None
    if not has_cuda: return
    LOG.v(f"setup {lib_name}...")
    culib_path = os.path.join(cuda_lib, f"lib{lib_name}.so")
    jt_cuda_include = os.path.join(jittor_path, "extern", "cuda", "inc")
    jt_culib_include = os.path.join(jittor_path, "extern", "cuda", lib_name, "inc")

    link_flags = ""
    if link:
        extra_include_path = os.path.abspath(os.path.join(cuda_include, "..", "targets/x86_64-linux/include"))
        extra_lib_path = os.path.abspath(os.path.join(cuda_lib, "..", "targets/x86_64-linux/lib"))
        cuda_include_name = search_file([cuda_include, extra_include_path, "/usr/include"], lib_name+".h")
        # cuda 11 prefers cudnn 8
        nvcc_version = get_int_version(nvcc_path)
        prefer_version = ()
        if nvcc_version[0] == 11:
            prefer_version = ("8",)
        culib_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu"], f"lib{lib_name}.so", prefer_version)

        if lib_name == "cudnn":
            # cudnn cannot find libcudnn_cnn_train.so.8 on its own, so we link it manually.
            if nvcc_version >= (11,0,0):
                libs = ["libcudnn_ops_infer.so", "libcudnn_ops_train.so", "libcudnn_cnn_infer.so", "libcudnn_cnn_train.so"]
                for l in libs:
                    ex_cudnn_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu"], l, prefer_version)
                    ctypes.CDLL(ex_cudnn_path, dlopen_flags)

        # dynamically link the cuda library
        ctypes.CDLL(culib_path, dlopen_flags)
        link_flags = f"-l{lib_name} -L'{cuda_lib}'"

    # find all source files
    culib_src_dir = os.path.join(jittor_path, "extern", "cuda", lib_name)
    culib_src_files = []
    for r, _, f in os.walk(culib_src_dir):
        for fname in f:
            culib_src_files.append(os.path.join(r, fname))
    if len(culib_src_files) == 0:
        return

    # compile and get operators
    culib = compile_custom_ops(culib_src_files, return_module=True,
        extra_flags=f" -I'{jt_cuda_include}' -I'{jt_culib_include}' {link_flags} {extra_flags} ")
    culib_ops = culib.ops
    globals()[lib_name+"_ops"] = culib_ops
    globals()[lib_name] = culib
    LOG.vv(f"Get {lib_name}_ops: "+str(dir(culib_ops)))

def install_cutt(root_folder):
    # Modified from: https://github.com/ap-hynninen/cutt
    url = "https://codeload.github.com/Jittor/cutt/zip/v1.1"
    filename = "cutt-1.1.zip"
    fullname = os.path.join(root_folder, filename)
    dirname = os.path.join(root_folder, filename.replace(".zip",""))
    true_md5 = "7bb71cf7c49dbe57772539bf043778f7"

    if os.path.exists(fullname):
        md5 = run_cmd('md5sum '+fullname).split()[0]
        if md5 != true_md5:
            os.remove(fullname)
            if os.path.isdir(dirname):
                shutil.rmtree(dirname)
    if not os.path.isfile(os.path.join(dirname, "bin", "cutt_test")):
        LOG.i("Downloading cutt...")
        download_url_to_local(url, filename, root_folder, true_md5)

        import zipfile
        zf = zipfile.ZipFile(fullname)
        try:
            zf.extractall(path=root_folder)
        except RuntimeError as e:
            print(e)
            raise
        zf.close()

        LOG.i("installing cutt...")
        arch_flag = ""
        if len(flags.cuda_archs):
            arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
            arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
        run_cmd(f"make NVCC_GENCODE='{arch_flag}' nvcc_path='{nvcc_path}'", cwd=dirname)
    return dirname
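# Worked example of the arch flag construction above: with flags.cuda_archs == [61, 75]
# the generated NVCC_GENCODE would be " -arch=compute_61  -code=sm_61  -code=sm_75 ",
# i.e. compile against the compute_61 virtual architecture and emit SASS for both
# sm_61 and sm_75.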

def setup_cutt():
    global cutt_ops, use_cutt
    if not has_cuda:
        use_cutt = False
        return
    use_cutt = os.environ.get("use_cutt", "1")=="1"
    cutt_ops = None
    if not use_cutt: return
    cutt_include_path = os.environ.get("cutt_include_path")
    cutt_lib_path = os.environ.get("cutt_lib_path")

    if cutt_lib_path is None or cutt_include_path is None:
        LOG.v("setup cutt...")
        # cutt_path is decoupled from cc_path
        from pathlib import Path
        cutt_path = os.path.join(str(Path.home()), ".cache", "jittor", "cutt")
        make_cache_dir(cutt_path)
        install_cutt(cutt_path)
        cutt_home = os.path.join(cutt_path, "cutt-1.1")
        cutt_include_path = os.path.join(cutt_home, "src")
        cutt_lib_path = os.path.join(cutt_home, "lib")

    cutt_lib_name = os.path.join(cutt_lib_path, "libcutt.so")
    assert os.path.isdir(cutt_include_path)
    assert os.path.isdir(cutt_lib_path)
    assert os.path.isfile(cutt_lib_name), cutt_lib_name
    LOG.v(f"cutt_include_path: {cutt_include_path}")
    LOG.v(f"cutt_lib_path: {cutt_lib_path}")
    LOG.v(f"cutt_lib_name: {cutt_lib_name}")
    # Preload the cutt library; the custom ops below are compiled without -lcutt and
    # rely on these symbols already being loaded.
    ctypes.CDLL(cutt_lib_name, dlopen_flags)
    cutt_op_dir = os.path.join(jittor_path, "extern", "cuda", "cutt", "ops")
    cutt_op_files = [os.path.join(cutt_op_dir, name) for name in os.listdir(cutt_op_dir)]
    cutt_ops = compile_custom_ops(cutt_op_files,
        extra_flags=f" -I'{cutt_include_path}'")
    LOG.vv("Get cutt_ops: "+str(dir(cutt_ops)))

def install_nccl(root_folder):
    url = "https://github.com/NVIDIA/nccl/archive/v2.8.4-1.tar.gz"
    url = "https://codeload.github.com/NVIDIA/nccl/tar.gz/v2.8.4-1"
    filename = "nccl.tgz"
    fullname = os.path.join(root_folder, filename)
    dirname = os.path.join(root_folder, "nccl-2.8.4-1")
    true_md5 = "900666558c5bc43e0a5e84045b88a06f"

    if os.path.exists(fullname):
        md5 = run_cmd('md5sum '+fullname).split()[0]
        if md5 != true_md5:
            os.remove(fullname)
            if os.path.isdir(dirname):
                shutil.rmtree(dirname)
    if not os.path.isfile(os.path.join(dirname, "build", "lib", "libnccl.so")):
        LOG.i("Downloading nccl...")
        download_url_to_local(url, filename, root_folder, true_md5)

        if core.get_device_count() == 0:
            return
        if not inside_mpi():
            return

        import tarfile
        with tarfile.open(fullname, "r") as tar:
            tar.extractall(root_folder)

        LOG.i("installing nccl...")
        arch_flag = ""
        if len(flags.cuda_archs):
            arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
            arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
        run_cmd(f"make -j8 src.build CUDA_HOME='{cuda_home}' NVCC_GENCODE='{arch_flag}' ", cwd=dirname)
    return dirname

def setup_nccl():
    global nccl_ops, use_nccl
    use_nccl = os.environ.get("use_nccl", "1")=="1"
    nccl_ops = None
    if not has_cuda or not has_mpi:
        use_nccl = False
        return
    if not use_nccl: return
    nccl_include_path = os.environ.get("nccl_include_path")
    nccl_lib_path = os.environ.get("nccl_lib_path")

    if nccl_lib_path is None or nccl_include_path is None:
        LOG.v("setup nccl...")
        # nccl_path is decoupled from cc_path
        from pathlib import Path
        nccl_path = os.path.join(str(Path.home()), ".cache", "jittor", "nccl")
        make_cache_dir(nccl_path)
        nccl_home = install_nccl(nccl_path)
        if nccl_home is None: return
        nccl_include_path = os.path.join(nccl_home, "build", "include")
        nccl_lib_path = os.path.join(nccl_home, "build", "lib")

    if not inside_mpi():
        return
    nccl_lib_name = os.path.join(nccl_lib_path, "libnccl.so")
    assert os.path.isdir(nccl_include_path)
    assert os.path.isdir(nccl_lib_path)
    assert os.path.isfile(nccl_lib_name), nccl_lib_name
    LOG.v(f"nccl_include_path: {nccl_include_path}")
    LOG.v(f"nccl_lib_path: {nccl_lib_path}")
    LOG.v(f"nccl_lib_name: {nccl_lib_name}")
    # Preload the nccl library; the custom ops below rely on its symbols being loaded.
    ctypes.CDLL(nccl_lib_name, dlopen_flags)
    nccl_src_dir = os.path.join(jittor_path, "extern", "cuda", "nccl")
    nccl_src_files = []
    for r, _, f in os.walk(nccl_src_dir):
        for fname in f:
            nccl_src_files.append(os.path.join(r, fname))

    nccl_ops = compile_custom_ops(nccl_src_files,
        extra_flags=f" -I'{nccl_include_path}' {mpi_compile_flags} ")
    LOG.vv("Get nccl_ops: "+str(dir(nccl_ops)))

def manual_link(flags):
    lib_dirs = []
    libs = []
    for f in flags.split():
        if f.startswith("-l"):
            libs.append(f[2:])
        elif f.startswith("-L"):
            lib_dirs.append(f[2:])
    LOG.v("manual_link:", flags)
    LOG.v("lib_dirs:", lib_dirs)
    LOG.v("libs:", libs)
    for lib in libs:
        for d in lib_dirs:
            libname = os.path.join(d, f"lib{lib}.so")
            if os.path.isfile(libname):
                LOG.v("link:", libname)
                ctypes.CDLL(libname, dlopen_flags)
                break
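# Example (hypothetical linker flags): manual_link("-L/usr/lib/x86_64-linux-gnu -lmpi")
# would dlopen /usr/lib/x86_64-linux-gnu/libmpi.so if that file exists, taking the
# first matching directory for each -l library.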

def inside_mpi():
    return "OMPI_COMM_WORLD_SIZE" in os.environ

def setup_mpi():
    global mpi_ops, mpi, use_mpi
    global mpicc_path, has_mpi
    use_mpi = os.environ.get("use_mpi", "1")=="1"
    mpi_ops = None
    mpi = None
    has_mpi = False
    mpicc_path = env_or_try_find('mpicc_path', 'mpicc')
    if mpicc_path == "":
        LOG.i("mpicc not found, distributed mode disabled.")
        use_mpi = False
    else:
        use_mpi = True
        has_mpi = True
    if not use_mpi:
        return

    global mpi_compile_flags, mpi_link_flags, mpi_flags
    mpi_compile_flags = run_cmd(mpicc_path+" --showme:compile")
    mpi_link_flags = run_cmd(mpicc_path+" --showme:link")
    mpi_flags = mpi_compile_flags + " " + mpi_link_flags
    LOG.v("mpi_flags: "+mpi_flags)

    # find all source files
    mpi_src_dir = os.path.join(jittor_path, "extern", "mpi")
    mpi_src_files = []
    for r, _, f in os.walk(mpi_src_dir):
        for fname in f:
            mpi_src_files.append(os.path.join(r, fname))

    # add the mpi include dir to the compile flags (also reused by nccl)
    mpi_compile_flags += f" -I'{os.path.join(mpi_src_dir, 'inc')}' "
    mpi_compile_flags = mpi_compile_flags.replace("-pthread", "")

    mpi_version = get_version(mpicc_path)
    if mpi_version.startswith("(1.") or mpi_version.startswith("(2."):
        # mpi 1.x/2.x needs to be linked manually like this
        manual_link(mpi_flags)
    # mpi (4.x) cannot use deepbind, it needs to
    # share the 'environ' symbol.
    mpi = compile_custom_ops(mpi_src_files,
        extra_flags=f" {mpi_flags} ", return_module=True,
        dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW, gen_name_="jittor_mpi_core")
    mpi_ops = mpi.ops
    LOG.vv("Get mpi: "+str(mpi.__dict__.keys()))
    LOG.vv("Get mpi_ops: "+str(mpi_ops.__dict__.keys()))

    def wrapper(func):
        def inner(self, *args, **kw):
            return func(self, *args, **kw)
        inner.__doc__ = func.__doc__
        return inner
    for k in mpi_ops.__dict__:
        if not k.startswith("mpi_"): continue
        if k == "mpi_test": continue
        setattr(core.Var, k, wrapper(mpi_ops.__dict__[k]))

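# Note on the loop above: every compiled operator whose name starts with "mpi_"
# (except mpi_test) is also attached to core.Var as a method, so inside an mpirun
# job a Var can call it directly, e.g. x.mpi_all_reduce(), assuming an op with that
# name is present among the sources under extern/mpi; the exact op names are not
# guaranteed here.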
setup_mpi()
in_mpi = inside_mpi()
rank = mpi.world_rank() if in_mpi else 0
setup_nccl()
setup_cutt()
setup_mkl()
setup_cuda_extern()