From 779782d64702bb794d58ffc8d77e133690099936 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 6 Dec 2024 19:13:05 +0800 Subject: [PATCH] Allow CUDA source inputs compiled to LTOIR, and enable pynvjitlinker to link inputs that contains LTOIR (#62) Adds functionality supporting kernel and FFI functions being JIT-compiled to LTOIR and link with LTO, allowing better optimization when foreign functions are used in Numba-cuda. --------- Co-authored-by: Graham Markall --- ci/test_conda_pynvjitlink.sh | 2 +- numba_cuda/numba/cuda/codegen.py | 50 ++++++--- numba_cuda/numba/cuda/cudadrv/driver.py | 105 +++++++++++++++++- numba_cuda/numba/cuda/cudadrv/nvrtc.py | 41 ++++++- .../cuda/tests/cudadrv/test_nvjitlink.py | 92 ++++++++++++--- .../tests/test_binary_generation/Makefile | 7 ++ .../tests/test_binary_generation/build.bat | 8 ++ 7 files changed, 266 insertions(+), 39 deletions(-) diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh index 22c0d67..3f18e30 100755 --- a/ci/test_conda_pynvjitlink.sh +++ b/ci/test_conda_pynvjitlink.sh @@ -8,7 +8,7 @@ set -euo pipefail if [ "${CUDA_VER%.*.*}" = "11" ]; then CTK_PACKAGES="cudatoolkit" else - CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc" + CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc cuda-cuobjdump" fi rapids-logger "Install testing dependencies" diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index 3610ade..426eb82 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -9,7 +9,6 @@ import subprocess import tempfile - CUDA_TRIPLE = 'nvptx64-nvidia-cuda' @@ -181,17 +180,7 @@ def get_ltoir(self, cc=None): return ltoir - def get_cubin(self, cc=None): - cc = self._ensure_cc(cc) - - cubin = self._cubin_cache.get(cc, None) - if cubin: - return cubin - - linker = driver.Linker.new( - max_registers=self._max_registers, cc=cc, lto=self._lto - ) - + def _link_all(self, linker, cc, ignore_nonlto=False): if linker.lto: ltoir = self.get_ltoir(cc=cc) linker.add_ltoir(ltoir) @@ -200,11 +189,44 @@ def get_cubin(self, cc=None): linker.add_ptx(ptx.encode()) for path in self._linking_files: - linker.add_file_guess_ext(path) + linker.add_file_guess_ext(path, ignore_nonlto) if self.needs_cudadevrt: - linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True)) + linker.add_file_guess_ext( + get_cudalib('cudadevrt', static=True), ignore_nonlto + ) + + def get_cubin(self, cc=None): + cc = self._ensure_cc(cc) + cubin = self._cubin_cache.get(cc, None) + if cubin: + return cubin + + if self._lto and config.DUMP_ASSEMBLY: + linker = driver.Linker.new( + max_registers=self._max_registers, + cc=cc, + additional_flags=["-ptx"], + lto=self._lto + ) + # `-ptx` flag is meant to view the optimized PTX for LTO objects. + # Non-LTO objects are not passed to linker. + self._link_all(linker, cc, ignore_nonlto=True) + + ptx = linker.get_linked_ptx().decode('utf-8') + + print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-')) + print(ptx) + print('=' * 80) + + linker = driver.Linker.new( + max_registers=self._max_registers, + cc=cc, + lto=self._lto + ) + self._link_all(linker, cc, ignore_nonlto=False) cubin = linker.complete() + self._cubin_cache[cc] = cubin self._linkerinfo_cache[cc] = linker.info_log diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index ccb34a8..465d33d 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -21,6 +21,9 @@ import traceback import asyncio import pathlib +import subprocess +import tempfile +import re from itertools import product from abc import ABCMeta, abstractmethod from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof, @@ -36,7 +39,7 @@ from .drvapi import API_PROTOTYPES from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid from .mappings import FILE_EXTENSION_MAP -from .linkable_code import LinkableCode +from .linkable_code import LinkableCode, LTOIR, Fatbin, Object from numba.cuda.cudadrv import enums, drvapi, nvrtc USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING @@ -2683,12 +2686,18 @@ def add_cu_file(self, path): cu = f.read() self.add_cu(cu, os.path.basename(path)) - def add_file_guess_ext(self, path_or_code): + def add_file_guess_ext(self, path_or_code, ignore_nonlto=False): """ Add a file or LinkableCode object to the link. If a file is passed, the type will be inferred from the extension. A LinkableCode object represents a file already in memory. + + When `ignore_nonlto` is set to true, do not add code that will not + be LTO-ed in the linking process. This is useful in inspecting the + LTO-ed portion of the PTX when linker is added with objects that can be + both LTO-ed and not LTO-ed. """ + if isinstance(path_or_code, str): ext = pathlib.Path(path_or_code).suffix if ext == '': @@ -2704,6 +2713,26 @@ def add_file_guess_ext(self, path_or_code): "Don't know how to link file with extension " f"{ext}" ) + + if ignore_nonlto: + warn_and_return = False + if kind in ( + FILE_EXTENSION_MAP["fatbin"], FILE_EXTENSION_MAP["o"] + ): + entry_types = inspect_obj_content(path_or_code) + if "nvvm" not in entry_types: + warn_and_return = True + elif kind != FILE_EXTENSION_MAP["ltoir"]: + warn_and_return = True + + if warn_and_return: + warnings.warn( + f"Not adding {path_or_code} as it is not " + "optimizable at link time, and `ignore_nonlto == " + "True`." + ) + return + self.add_file(path_or_code, kind) return else: @@ -2716,6 +2745,25 @@ def add_file_guess_ext(self, path_or_code): if path_or_code.kind == "cu": self.add_cu(path_or_code.data, path_or_code.name) else: + if ignore_nonlto: + warn_and_return = False + if isinstance(path_or_code, (Fatbin, Object)): + with tempfile.NamedTemporaryFile("w") as fp: + fp.write(path_or_code.data) + entry_types = inspect_obj_content(fp.name) + if "nvvm" not in entry_types: + warn_and_return = True + elif not isinstance(path_or_code, LTOIR): + warn_and_return = True + + if warn_and_return: + warnings.warn( + f"Not adding {path_or_code.name} as it is not " + "optimizable at link time, and `ignore_nonlto == " + "True`." + ) + return + self.add_data( path_or_code.data, path_or_code.kind, path_or_code.name ) @@ -3065,6 +3113,28 @@ def add_file(self, path, kind): name = pathlib.Path(path).name self.add_data(data, kind, name) + def add_cu(self, cu, name): + """Add CUDA source in a string to the link. The name of the source + file should be specified in `name`.""" + with driver.get_active_context() as ac: + dev = driver.get_device(ac.devnum) + cc = dev.compute_capability + + program, log = nvrtc.compile(cu, name, cc, ltoir=self.lto) + + if not self.lto and config.DUMP_ASSEMBLY: + print(("ASSEMBLY %s" % name).center(80, "-")) + print(program) + print("=" * 80) + + suffix = ".ltoir" if self.lto else ".ptx" + program_name = os.path.splitext(name)[0] + suffix + # Link the program's PTX or LTOIR using the normal linker mechanism + if self.lto: + self.add_ltoir(program, program_name) + else: + self.add_ptx(program.encode(), program_name) + def add_data(self, data, kind, name): if kind == FILE_EXTENSION_MAP["cubin"]: fn = self._linker.add_cubin @@ -3086,6 +3156,12 @@ def add_data(self, data, kind, name): except NvJitLinkError as e: raise LinkerError from e + def get_linked_ptx(self): + try: + return self._linker.get_linked_ptx() + except NvJitLinkError as e: + raise LinkerError from e + def complete(self): try: return self._linker.get_linked_cubin() @@ -3361,3 +3437,28 @@ def get_version(): Return the driver version as a tuple of (major, minor) """ return driver.get_version() + + +def inspect_obj_content(objpath: str): + """ + Given path to a fatbin or object, use `cuobjdump` to examine its content + Return the set of entries in the object. + """ + code_types :set[str] = set() + + try: + out = subprocess.run(["cuobjdump", objpath], check=True, + capture_output=True) + except FileNotFoundError as e: + msg = ("cuobjdump has not been found. You may need " + "to install the CUDA toolkit and ensure that " + "it is available on your PATH.\n") + raise RuntimeError(msg) from e + + objtable = out.stdout.decode('utf-8') + entry_pattern = r"Fatbin (.*) code" + for line in objtable.split("\n"): + if match := re.match(entry_pattern, line): + code_types.add(match.group(1)) + + return code_types diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index 82dd62e..706c25d 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -61,6 +61,14 @@ class NVRTC: NVVM interface. Initialization is protected by a lock and uses the standard (for Numba) open_cudalib function to load the NVRTC library. """ + + _CU12ONLY_PROTOTYPES = { + # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet); + "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), + # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto); + "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p) + } + _PROTOTYPES = { # nvrtcResult nvrtcVersion(int *major, int *minor) 'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)), @@ -110,6 +118,10 @@ def __new__(cls): cls.__INSTANCE = None raise NvrtcSupportError("NVRTC cannot be loaded") from e + from numba.cuda.cudadrv.runtime import get_version + if get_version() >= (12, 0): + inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES + # Find & populate functions for name, proto in inst._PROTOTYPES.items(): func = getattr(lib, name) @@ -208,10 +220,22 @@ def get_ptx(self, program): return ptx.value.decode() + def get_lto(self, program): + """ + Get the compiled LTOIR as a Python bytes object. + """ + lto_size = c_size_t() + self.nvrtcGetLTOIRSize(program.handle, byref(lto_size)) + + lto = b" " * lto_size.value + self.nvrtcGetLTOIR(program.handle, lto) + + return lto -def compile(src, name, cc): + +def compile(src, name, cc, ltoir=False): """ - Compile a CUDA C/C++ source to PTX for a given compute capability. + Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability. :param src: The source code to compile :type src: str @@ -219,6 +243,8 @@ def compile(src, name, cc): :type name: str :param cc: A tuple ``(major, minor)`` of the compute capability :type cc: tuple + :param ltoir: Compile into LTOIR if True, otherwise into PTX + :type ltoir: bool :return: The compiled PTX and compilation log :rtype: tuple """ @@ -242,6 +268,9 @@ def compile(src, name, cc): numba_include = f'-I{numba_cuda_path}' options = [arch, *cuda_include, numba_include, '-rdc', 'true'] + if ltoir: + options.append("-dlto") + if nvrtc.get_version() < (12, 0): options += ["-std=c++17"] @@ -261,5 +290,9 @@ def compile(src, name, cc): msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}") warnings.warn(msg) - ptx = nvrtc.get_ptx(program) - return ptx, log + if ltoir: + ltoir = nvrtc.get_lto(program) + return ltoir, log + else: + ptx = nvrtc.get_ptx(program) + return ptx, log diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py index 8d4e085..044895c 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py @@ -5,6 +5,10 @@ import itertools import os +import io +import contextlib +import warnings + from numba.cuda import get_current_device from numba import cuda from numba import config @@ -23,6 +27,9 @@ test_device_functions_fatbin = os.path.join( TEST_BIN_DIR, "test_device_functions.fatbin" ) + test_device_functions_fatbin_multi = os.path.join( + TEST_BIN_DIR, "test_device_functions_multi.fatbin" + ) test_device_functions_o = os.path.join( TEST_BIN_DIR, "test_device_functions.o" ) @@ -156,32 +163,81 @@ def test_nvjitlink_jit_with_linkable_code(self): test_device_functions_o, test_device_functions_ptx, ) + for lto in [True, False]: + for file in files: + with self.subTest(file=file): + sig = "uint32(uint32, uint32)" + add_from_numba = cuda.declare_device("add_from_numba", sig) + + @cuda.jit(link=[file], lto=lto) + def kernel(result): + result[0] = add_from_numba(1, 2) + + result = cuda.device_array(1) + kernel[1, 1](result) + assert result[0] == 3 + + def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self): + files = [ + test_device_functions_cu, + test_device_functions_ltoir, + test_device_functions_fatbin_multi + ] + + config.DUMP_ASSEMBLY = True + for file in files: with self.subTest(file=file): - sig = "uint32(uint32, uint32)" - add_from_numba = cuda.declare_device("add_from_numba", sig) + f = io.StringIO() + with contextlib.redirect_stdout(f): + sig = "uint32(uint32, uint32)" + add_from_numba = cuda.declare_device("add_from_numba", sig) - @cuda.jit(link=[file]) - def kernel(result): - result[0] = add_from_numba(1, 2) + @cuda.jit(link=[file], lto=True) + def kernel(result): + result[0] = add_from_numba(1, 2) - result = cuda.device_array(1) - kernel[1, 1](result) - assert result[0] == 3 + result = cuda.device_array(1) + kernel[1, 1](result) + assert result[0] == 3 - def test_nvjitlink_jit_with_linkable_code_lto(self): - file = test_device_functions_ltoir + self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue()) - sig = "uint32(uint32, uint32)" - add_from_numba = cuda.declare_device("add_from_numba", sig) + config.DUMP_ASSEMBLY = False - @cuda.jit(link=[file], lto=True) - def kernel(result): - result[0] = add_from_numba(1, 2) + def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self): + files = [ + test_device_functions_a, + test_device_functions_cubin, + test_device_functions_fatbin, + test_device_functions_o, + test_device_functions_ptx, + ] - result = cuda.device_array(1) - kernel[1, 1](result) - assert result[0] == 3 + config.DUMP_ASSEMBLY = True + + for file in files: + with self.subTest(file=file): + with warnings.catch_warnings(record=True) as w: + with contextlib.redirect_stdout(None): # suppress other PTX + sig = "uint32(uint32, uint32)" + add_from_numba = cuda.declare_device( + "add_from_numba", sig + ) + + @cuda.jit(link=[file], lto=True) + def kernel(result): + result[0] = add_from_numba(1, 2) + + result = cuda.device_array(1) + kernel[1, 1](result) + assert result[0] == 3 + + assert len(w) == 1 + self.assertIn("it is not optimizable at link time, and " + "`ignore_nonlto == True`", str(w[0].message)) + + config.DUMP_ASSEMBLY = False def test_nvjitlink_jit_with_invalid_linkable_code(self): with open(test_device_functions_cubin, "rb") as f: diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile index 85921c7..145f8b0 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile @@ -14,9 +14,14 @@ endif # Gencode flags suitable for most tests GENCODE := -gencode arch=compute_$(GPU_CC),code=sm_$(GPU_CC) +MULTI_GENCODE := -gencode arch=compute_$(GPU_CC),code=[sm_$(GPU_CC),lto_$(GPU_CC)] + # Fatbin tests need to generate code for an additional compute capability FATBIN_GENCODE := $(GENCODE) -gencode arch=compute_$(ALT_CC),code=sm_$(ALT_CC) +# Fatbin that contains both LTO, SASS for multiple architectures +MULTI_FATBIN_GENCODE := $(MULTI_GENCODE) -gencode arch=compute_$(ALT_CC),code=[sm_$(ALT_CC),lto_$(ALT_CC)] + # LTO-IR tests need to generate for the LTO "architecture" instead LTOIR_GENCODE := -gencode arch=lto_$(GPU_CC),code=lto_$(GPU_CC) @@ -30,6 +35,7 @@ PTX_FLAGS := $(GENCODE) -ptx OBJECT_FLAGS := $(GENCODE) -dc LIBRARY_FLAGS := $(GENCODE) -lib FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin +MULTI_FATBIN_FLAGS := $(MULTI_FATBIN_GENCODE) --fatbin LTOIR_FLAGS := $(LTOIR_GENCODE) -dc OUTPUT_DIR := ./ @@ -41,6 +47,7 @@ all: nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/undefined_extern.cubin undefined_extern.cu nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.cubin test_device_functions.cu nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.fatbin test_device_functions.cu + nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions_multi.fatbin test_device_functions.cu nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ptx test_device_functions.cu nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat index 403fca8..9d47a33 100644 --- a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat +++ b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat @@ -13,9 +13,15 @@ if "%GPU_CC:~0,1%"=="7" ( REM Gencode flags suitable for most tests set GENCODE=-gencode arch=compute_%GPU_CC%,code=sm_%GPU_CC% +REM Gencode flags for a fatbin with SASS and LTO +set MULTI_GENCODE=-gencode arch=compute_%GPU_CC%,code=[sm_%GPU_CC%,lto_%GPU_CC%] + REM Fatbin tests need to generate code for an additional compute capability set FATBIN_GENCODE=%GENCODE% -gencode arch=compute_%ALT_CC%,code=sm_%ALT_CC% +REM Fatbin that contains both LTO, SASS for multiple architectures +set MULTI_FATBIN_GENCODE=%MULTI_GENCODE% -gencode arch=compute_%ALT_CC%,code=[sm_%ALT_CC%,lto_%ALT_CC%] + REM LTO-IR tests need to generate for the LTO "architecture" instead set LTOIR_GENCODE=-gencode arch=lto_%GPU_CC%,code=lto_%GPU_CC% @@ -29,6 +35,7 @@ set PTX_FLAGS=%GENCODE% -ptx set OBJECT_FLAGS=%GENCODE% -dc set LIBRARY_FLAGS=%GENCODE% -lib set FATBIN_FLAGS=%FATBIN_GENCODE% --fatbin +set MULTI_FATBIN_FLAGS=%MULTI_FATBIN_GENCODE% --fatbin set LTOIR_FLAGS=%LTOIR_GENCODE% -dc set OUTPUT_DIR=. @@ -42,6 +49,7 @@ echo Alternative CC: %ALT_CC% nvcc %NVCC_FLAGS% %CUBIN_FLAGS% -o %OUTPUT_DIR%\undefined_extern.cubin undefined_extern.cu nvcc %NVCC_FLAGS% %CUBIN_FLAGS% -o %OUTPUT_DIR%\test_device_functions.cubin test_device_functions.cu nvcc %NVCC_FLAGS% %FATBIN_FLAGS% -o %OUTPUT_DIR%\test_device_functions.fatbin test_device_functions.cu +nvcc %NVCC_FLAGS% %MULTI_FATBIN_FLAGS% -o %OUTPUT_DIR%\test_device_functions_multi.fatbin test_device_functions.cu nvcc %NVCC_FLAGS% %PTX_FLAGS% -o %OUTPUT_DIR%\test_device_functions.ptx test_device_functions.cu nvcc %NVCC_FLAGS% %OBJECT_FLAGS% -o %OUTPUT_DIR%\test_device_functions.o test_device_functions.cu nvcc %NVCC_FLAGS% %LIBRARY_FLAGS% -o %OUTPUT_DIR%\test_device_functions.a test_device_functions.cu