From 3b9a49b0f46e4570bbad6bebb1ced40d19564578 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Wed, 16 Oct 2024 21:49:03 +0900 Subject: [PATCH] win32+clang support * based on Windows support PR #2465 by @andreigh - https://github.com/openai/triton/pull/2465 * manually applied, rebased, fix lint errors * use sysconfig.get_config_var() to get the path of python*.lib * clang fix for windows * remove '-fPIC' for windows clang * fix download_and_copy() to support windows * add "exe" extension for windows * use "pyd" extension for windows to make importlib work * third_party/nvidia: fix for windows * win32 fix _path_to_binary() * add library_dir, include_dir for win32 * backend/compiler lazy remove temp files to support win * additional works done by @mantaionut (2024/05/31) * rework for latest triton and cleanup (2024/10/14) * extract minimal fixes to support win32+clang (2024/10/16) Original-author-by: Andrei Gheorghe Signed-off-by: Won-Kyu Park --- python/setup.py | 70 +++++++++++++++----------- python/triton/backends/compiler.py | 10 ++-- python/triton/compiler/compiler.py | 3 +- python/triton/runtime/build.py | 1 + third_party/nvidia/backend/compiler.py | 14 +++--- third_party/nvidia/backend/driver.py | 15 +++++- 6 files changed, 71 insertions(+), 42 deletions(-) diff --git a/python/setup.py b/python/setup.py index 5810b18075ee..0c2d905a6098 100644 --- a/python/setup.py +++ b/python/setup.py @@ -167,7 +167,7 @@ def get_json_package_info(): def get_llvm_package_info(): system = platform.system() try: - arch = {"x86_64": "x64", "arm64": "arm64", "aarch64": "arm64"}[platform.machine()] + arch = {"x86_64": "x64", "AMD64": "x64", "arm64": "arm64", "aarch64": "arm64"}[platform.machine()] except KeyError: arch = platform.machine() if system == "Darwin": @@ -196,6 +196,8 @@ def get_llvm_package_info(): f"LLVM pre-compiled image is not available for {system}-{arch}. Proceeding with user-configured LLVM from source build." ) return Package("llvm", "LLVM-C.lib", "", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH") + elif system == "Windows": + system_suffix = f"windows-{arch}" else: print( f"LLVM pre-compiled image is not available for {system}-{arch}. Proceeding with user-configured LLVM from source build." @@ -281,17 +283,20 @@ def download_and_copy(name, src_path, dst_path, variable, version, url_func): base_dir = os.path.dirname(__file__) system = platform.system() try: - arch = {"x86_64": "64", "arm64": "aarch64", "aarch64": "aarch64"}[platform.machine()] + arch = {"x86_64": "64", "AMD64": "64", "arm64": "aarch64", "aarch64": "aarch64"}[platform.machine()] except KeyError: arch = platform.machine() - url = url_func(arch, version) + supported = {"Linux": "linux", "Windows": "win"} + is_supported = system in supported + if is_supported: + url = url_func(supported[system], arch, version) tmp_path = os.path.join(triton_cache_path, "nvidia", name) # path to cache the download dst_path = os.path.join(base_dir, os.pardir, "third_party", "nvidia", "backend", dst_path) # final binary path platform_name = "sbsa-linux" if arch == "aarch64" else "x86_64-linux" src_path = src_path(platform_name, version) if callable(src_path) else src_path src_path = os.path.join(tmp_path, src_path) download = not os.path.exists(src_path) - if os.path.exists(dst_path) and system == "Linux" and shutil.which(dst_path) is not None: + if os.path.exists(dst_path) and is_supported and shutil.which(dst_path) is not None: curr_version = subprocess.check_output([dst_path, "--version"]).decode("utf-8").strip() curr_version = re.search(r"V([.|\d]+)", curr_version).group(1) download = download or curr_version != version @@ -420,6 +425,10 @@ def build_extension(self, ext): "-DTRITON_CODEGEN_BACKENDS=" + ';'.join([b.name for b in backends if not b.is_external]), "-DTRITON_PLUGIN_DIRS=" + ';'.join([b.src_dir for b in backends if b.is_external]) ] + if platform.system() == "Windows": + installed_base = sysconfig.get_config_var('installed_base') + py_lib_dirs = os.getenv("PYTHON_LIB_DIRS", os.path.join(installed_base, "libs")) + cmake_args.append("-DPYTHON_LIB_DIRS=" + py_lib_dirs) if lit_dir is not None: cmake_args.append("-DLLVM_EXTERNAL_LIT=" + lit_dir) cmake_args.extend(thirdparty_cmake_args) @@ -429,9 +438,8 @@ def build_extension(self, ext): build_args = ["--config", cfg] if platform.system() == "Windows": + cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] cmake_args += [f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] - if sys.maxsize > 2**32: - cmake_args += ["-A", "x64"] else: cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count())) @@ -498,63 +506,65 @@ def get_platform_dependent_src_path(subdir): if int(version_major) >= 12 and int(version_minor1) >= 5 else subdir)(*version.split('.'))) +exe = ".exe" if os.name == "nt" else "" + download_and_copy( - name="ptxas", src_path="bin/ptxas", dst_path="bin/ptxas", variable="TRITON_PTXAS_PATH", - version=NVIDIA_TOOLCHAIN_VERSION["ptxas"], url_func=lambda arch, version: + name="ptxas", src_path=f"bin/ptxas{exe}", dst_path=f"bin/ptxas{exe}", variable="TRITON_PTXAS_PATH", + version=NVIDIA_TOOLCHAIN_VERSION["ptxas"], url_func=lambda system, arch, version: ((lambda version_major, version_minor1, version_minor2: - f"https://anaconda.org/nvidia/cuda-nvcc-tools/{version}/download/linux-{arch}/cuda-nvcc-tools-{version}-0.tar.bz2" + f"https://anaconda.org/nvidia/cuda-nvcc-tools/{version}/download/{system}-{arch}/cuda-nvcc-tools-{version}-0.tar.bz2" if int(version_major) >= 12 and int(version_minor1) >= 5 else - f"https://anaconda.org/nvidia/cuda-nvcc/{version}/download/linux-{arch}/cuda-nvcc-{version}-0.tar.bz2") + f"https://anaconda.org/nvidia/cuda-nvcc/{version}/download/{system}-{arch}/cuda-nvcc-{version}-0.tar.bz2") (*version.split('.')))) download_and_copy( name="cuobjdump", - src_path="bin/cuobjdump", - dst_path="bin/cuobjdump", + src_path=f"bin/cuobjdump{exe}", + dst_path=f"bin/cuobjdump{exe}", variable="TRITON_CUOBJDUMP_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cuobjdump"], - url_func=lambda arch, version: - f"https://anaconda.org/nvidia/cuda-cuobjdump/{version}/download/linux-{arch}/cuda-cuobjdump-{version}-0.tar.bz2", + url_func=lambda system, arch, version: + f"https://anaconda.org/nvidia/cuda-cuobjdump/{version}/download/{system}-{arch}/cuda-cuobjdump-{version}-0.tar.bz2", ) download_and_copy( name="nvdisasm", - src_path="bin/nvdisasm", - dst_path="bin/nvdisasm", + src_path=f"bin/nvdisasm{exe}", + dst_path=f"bin/nvdisasm{exe}", variable="TRITON_NVDISASM_PATH", version=NVIDIA_TOOLCHAIN_VERSION["nvdisasm"], - url_func=lambda arch, version: - f"https://anaconda.org/nvidia/cuda-nvdisasm/{version}/download/linux-{arch}/cuda-nvdisasm-{version}-0.tar.bz2", + url_func=lambda system, arch, version: + f"https://anaconda.org/nvidia/cuda-nvdisasm/{version}/download/{system}-{arch}/cuda-nvdisasm-{version}-0.tar.bz2", ) download_and_copy( name="cudacrt", src_path=get_platform_dependent_src_path("include"), dst_path="include", - variable="TRITON_CUDACRT_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cudacrt"], url_func=lambda arch, version: + variable="TRITON_CUDACRT_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cudacrt"], url_func=lambda system, arch, version: ((lambda version_major, version_minor1, version_minor2: - f"https://anaconda.org/nvidia/cuda-crt-dev_linux-{arch}/{version}/download/noarch/cuda-crt-dev_linux-{arch}-{version}-0.tar.bz2" + f"https://anaconda.org/nvidia/cuda-crt-dev_{system}-{arch}/{version}/download/noarch/cuda-crt-dev_{system}-{arch}-{version}-0.tar.bz2" if int(version_major) >= 12 and int(version_minor1) >= 5 else - f"https://anaconda.org/nvidia/cuda-nvcc/{version}/download/linux-{arch}/cuda-nvcc-{version}-0.tar.bz2") + f"https://anaconda.org/nvidia/cuda-nvcc/{version}/download/{system}-{arch}/cuda-nvcc-{version}-0.tar.bz2") (*version.split('.')))) download_and_copy( name="cudart", src_path=get_platform_dependent_src_path("include"), dst_path="include", - variable="TRITON_CUDART_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cudart"], url_func=lambda arch, version: + variable="TRITON_CUDART_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cudart"], url_func=lambda system, arch, version: ((lambda version_major, version_minor1, version_minor2: - f"https://anaconda.org/nvidia/cuda-cudart-dev_linux-{arch}/{version}/download/noarch/cuda-cudart-dev_linux-{arch}-{version}-0.tar.bz2" + f"https://anaconda.org/nvidia/cuda-cudart-dev_{system}-{arch}/{version}/download/noarch/cuda-cudart-dev_{system}-{arch}-{version}-0.tar.bz2" if int(version_major) >= 12 and int(version_minor1) >= 5 else - f"https://anaconda.org/nvidia/cuda-cudart-dev/{version}/download/linux-{arch}/cuda-cudart-dev-{version}-0.tar.bz2" + f"https://anaconda.org/nvidia/cuda-cudart-dev/{version}/download/{system}-{arch}/cuda-cudart-dev-{version}-0.tar.bz2" )(*version.split('.')))) download_and_copy( name="cupti", src_path=get_platform_dependent_src_path("include"), dst_path="include", - variable="TRITON_CUPTI_INCLUDE_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cupti"], url_func=lambda arch, version: + variable="TRITON_CUPTI_INCLUDE_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cupti"], url_func=lambda system, arch, version: ((lambda version_major, version_minor1, version_minor2: - f"https://anaconda.org/nvidia/cuda-cupti-dev/{version}/download/linux-{arch}/cuda-cupti-dev-{version}-0.tar.bz2" + f"https://anaconda.org/nvidia/cuda-cupti-dev/{version}/download/{system}-{arch}/cuda-cupti-dev-{version}-0.tar.bz2" if int(version_major) >= 12 and int(version_minor1) >= 5 else - f"https://anaconda.org/nvidia/cuda-cupti/{version}/download/linux-{arch}/cuda-cupti-{version}-0.tar.bz2") + f"https://anaconda.org/nvidia/cuda-cupti/{version}/download/{system}-{arch}/cuda-cupti-{version}-0.tar.bz2") (*version.split('.')))) download_and_copy( name="cupti", src_path=get_platform_dependent_src_path("lib"), dst_path="lib/cupti", - variable="TRITON_CUPTI_LIB_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cupti"], url_func=lambda arch, version: + variable="TRITON_CUPTI_LIB_PATH", version=NVIDIA_TOOLCHAIN_VERSION["cupti"], url_func=lambda system, arch, version: ((lambda version_major, version_minor1, version_minor2: - f"https://anaconda.org/nvidia/cuda-cupti-dev/{version}/download/linux-{arch}/cuda-cupti-dev-{version}-0.tar.bz2" + f"https://anaconda.org/nvidia/cuda-cupti-dev/{version}/download/{system}-{arch}/cuda-cupti-dev-{version}-0.tar.bz2" if int(version_major) >= 12 and int(version_minor1) >= 5 else - f"https://anaconda.org/nvidia/cuda-cupti/{version}/download/linux-{arch}/cuda-cupti-{version}-0.tar.bz2") + f"https://anaconda.org/nvidia/cuda-cupti/{version}/download/{system}-{arch}/cuda-cupti-{version}-0.tar.bz2") (*version.split('.')))) backends = [*BackendInstaller.copy(["nvidia", "amd"]), *BackendInstaller.copy_externals()] diff --git a/python/triton/backends/compiler.py b/python/triton/backends/compiler.py index 037cd1b597e3..0517bb814461 100644 --- a/python/triton/backends/compiler.py +++ b/python/triton/backends/compiler.py @@ -204,20 +204,24 @@ def __init__(self, target: GPUTarget) -> None: @staticmethod def _path_to_binary(binary: str): + exe = ".exe" if os.name == "nt" else "" base_dir = os.path.join(os.path.dirname(__file__), os.pardir) paths = [ os.environ.get(f"TRITON_{binary.upper()}_PATH", ""), - os.path.join(base_dir, "third_party", "cuda", "bin", binary), + os.path.join(base_dir, "third_party", "cuda", "bin", f"{binary}{exe}"), ] for p in paths: - bin = p.split(" ")[0] + if os.name != "nt": + bin = p.split(" ")[0] + else: + bin = p if os.path.exists(bin) and os.path.isfile(bin): result = subprocess.check_output([bin, "--version"], stderr=subprocess.STDOUT) if result is not None: version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE) if version is not None: return p, version.group(1) - raise RuntimeError(f"Cannot find {binary}") + raise RuntimeError(f"Cannot find {binary}{exe}") @abstractclassmethod def supports_target(target: GPUTarget): diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py index 8ca1f8b326d0..29c4eb217251 100644 --- a/python/triton/compiler/compiler.py +++ b/python/triton/compiler/compiler.py @@ -151,7 +151,8 @@ def triton_key(): # backend libtriton_hash = hashlib.sha256() - with open(os.path.join(TRITON_PATH, "_C/libtriton.so"), "rb") as f: + ext = "so" if os.name != "nt" else "pyd" + with open(os.path.join(TRITON_PATH, "_C", "libtriton." + ext), "rb") as f: while True: chunk = f.read(1024**2) if not chunk: diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py index 20da2bc25790..0534dd0d2839 100644 --- a/python/triton/runtime/build.py +++ b/python/triton/runtime/build.py @@ -47,6 +47,7 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries): cc_cmd += [f'-l{lib}' for lib in libraries] cc_cmd += [f"-L{dir}" for dir in library_dirs] cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None] + if os.name == "nt": cc_cmd.pop(cc_cmd.index("-fPIC")) ret = subprocess.check_call(cc_cmd) if ret == 0: return so diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py index 36e73d6b882d..b027f332b7dc 100644 --- a/third_party/nvidia/backend/compiler.py +++ b/third_party/nvidia/backend/compiler.py @@ -20,9 +20,10 @@ def min_dot_size(target: GPUTarget): @functools.lru_cache() def _path_to_binary(binary: str): + exe = ".exe" if os.name == "nt" else "" paths = [ os.environ.get(f"TRITON_{binary.upper()}_PATH", ""), - os.path.join(os.path.dirname(__file__), "bin", binary), + os.path.join(os.path.dirname(__file__), "bin", f"{binary}{exe}"), ] for bin in paths: @@ -32,7 +33,7 @@ def _path_to_binary(binary: str): version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE) if version is not None: return bin, version.group(1) - raise RuntimeError(f"Cannot find {binary}") + raise RuntimeError(f"Cannot find {binary}{exe}") @functools.lru_cache() @@ -340,10 +341,6 @@ def make_cubin(src, metadata, opt, capability): ] try: subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog) - if os.path.exists(fsrc.name): - os.remove(fsrc.name) - if os.path.exists(flog.name): - os.remove(flog.name) except subprocess.CalledProcessError as e: with open(flog.name) as log_file: log = log_file.read() @@ -365,6 +362,11 @@ def make_cubin(src, metadata, opt, capability): cubin = f.read() if os.path.exists(fbin): os.remove(fbin) + + if os.path.exists(fsrc.name): + os.remove(fsrc.name) + if os.path.exists(flog.name): + os.remove(flog.name) return cubin def add_stages(self, stages, options): diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py index c5f6419afca7..e767409da887 100644 --- a/third_party/nvidia/backend/driver.py +++ b/third_party/nvidia/backend/driver.py @@ -1,6 +1,7 @@ import functools import os import hashlib +import sysconfig import subprocess import tempfile from pathlib import Path @@ -14,12 +15,20 @@ libdevice_dir = os.path.join(dirname, "lib") libraries = ['cuda'] +if os.name == "nt": + include_dir += [os.path.join(os.environ.get("CUDA_PATH"), "include")] + @functools.lru_cache() def libcuda_dirs(): env_libcuda_path = os.getenv("TRITON_LIBCUDA_PATH") if env_libcuda_path: return [env_libcuda_path] + if os.name == "nt": + installed_base = sysconfig.get_config_var('installed_base') + dirs = [os.path.join(os.environ.get("CUDA_PATH"), "lib", "x64")] + dirs += [os.getenv("PYTHON_LIB_DIRS", os.path.join(installed_base, "libs"))] + return dirs libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode() # each line looks like the following: @@ -48,7 +57,9 @@ def library_dirs(): def compile_module_from_src(src, name): key = hashlib.sha256(src.encode("utf-8")).hexdigest() cache = get_cache_manager(key) - cache_path = cache.get_file(f"{name}.so") + ext = "so" if os.name != "nt" else "pyd" + so_name = f'{name}.{ext}' + cache_path = cache.get_file(so_name) if cache_path is None: with tempfile.TemporaryDirectory() as tmpdir: src_path = os.path.join(tmpdir, "main.c") @@ -56,7 +67,7 @@ def compile_module_from_src(src, name): f.write(src) so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries) with open(so, "rb") as f: - cache_path = cache.put(f.read(), f"{name}.so", binary=True) + cache_path = cache.put(f.read(), so_name, binary=True) import importlib.util spec = importlib.util.spec_from_file_location(name, cache_path) mod = importlib.util.module_from_spec(spec)