Conditionally add LTO-able objects for PTX prints

NVIDIA · Oct 30, 2024 · afcce87 · afcce87
1 parent b42c67d
commit afcce87
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 25 deletions.
diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py
@@ -8,7 +8,6 @@
 import os
 import subprocess
 import tempfile
-from warnings import warn
 
 CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
 
@@ -179,7 +178,7 @@ def get_ltoir(self, cc=None):
 
         return ltoir
 
-    def _link_all(self, linker, cc):
+    def _link_all(self, linker, cc, ignore_nonlto=False):
         if linker.lto:
             ltoir = self.get_ltoir(cc=cc)
             linker.add_ltoir(ltoir)
@@ -188,9 +187,11 @@ def _link_all(self, linker, cc):
             linker.add_ptx(ptx.encode())
 
         for path in self._linking_files:
-            linker.add_file_guess_ext(path)
+            linker.add_file_guess_ext(path, ignore_nonlto)
         if self.needs_cudadevrt:
-            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
+            linker.add_file_guess_ext(
+                get_cudalib('cudadevrt', static=True), ignore_nonlto
+            )
 
     def get_cubin(self, cc=None):
         cc = self._ensure_cc(cc)
@@ -206,30 +207,22 @@ def get_cubin(self, cc=None):
                 additional_flags=["-ptx"],
                 lto=self._lto
             )
-            self._link_all(linker, cc)
-
-            try:
-                ptx = linker.get_linked_ptx().decode('utf-8')
-
-                print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-'))
-                print(ptx)
-                print('=' * 80)
-            except driver.LinkerError as e:
-                if linkererr_cause := getattr(e, "__cause__", None):
-                    if "-ptx requires that all inputs have LTOIR" in str(
-                        linkererr_cause
-                    ):
-                        warn(
-                            "Linker input contains non-LTOIR objects, nvjitlink"
-                            " cannot generate LTO-ed PTX."
-                        )
+            # `-ptx` flag is meant to view the optimized PTX for LTO objects.
+            # Non-LTO objects are not passed to linker.
+            self._link_all(linker, cc, ignore_nonlto=True)
+
+            ptx = linker.get_linked_ptx().decode('utf-8')
+
+            print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-'))
+            print(ptx)
+            print('=' * 80)
 
         linker = driver.Linker.new(
             max_registers=self._max_registers,
             cc=cc,
             lto=self._lto
         )
-        self._link_all(linker, cc)
+        self._link_all(linker, cc, ignore_nonlto=False)
         cubin = linker.complete()
 
         self._cubin_cache[cc] = cubin

diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -36,7 +36,7 @@
 from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
-from .linkable_code import LinkableCode
+from .linkable_code import LinkableCode, LTOIR
 from numba.cuda.cudadrv import enums, drvapi, nvrtc
 
 USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
@@ -2683,12 +2683,18 @@ def add_cu_file(self, path):
             cu = f.read()
         self.add_cu(cu, os.path.basename(path))
 
-    def add_file_guess_ext(self, path_or_code):
+    def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
         """
         Add a file or LinkableCode object to the link. If a file is
         passed, the type will be inferred from the extension. A LinkableCode
         object represents a file already in memory.
+
+        When `ignore_nonlto` is set to true, do not add code that will not
+        be LTO-ed in the linking process. This is useful for inspecting the
+        LTO-ed portion of the PTX when the linker is given a mix of objects,
+        some of which can be LTO-ed and some of which cannot.
         """
+
         if isinstance(path_or_code, str):
             ext = pathlib.Path(path_or_code).suffix
             if ext == '':
@@ -2704,6 +2710,13 @@ def add_file_guess_ext(self, path_or_code):
                         "Don't know how to link file with extension "
                         f"{ext}"
                     )
+                if ignore_nonlto and kind != FILE_EXTENSION_MAP["ltoir"]:
+                    warnings.warn(
+                        f"Not adding {path_or_code} as it is not optimizable "
+                        "at link time, and `ignore_nonlto == True`."
+                    )
+                    return
+
                 self.add_file(path_or_code, kind)
             return
         else:
@@ -2716,6 +2729,13 @@ def add_file_guess_ext(self, path_or_code):
             if path_or_code.kind == "cu":
                 self.add_cu(path_or_code.data, path_or_code.name)
             else:
+                if ignore_nonlto and not isinstance(path_or_code.kind, LTOIR):
+                    warnings.warn(
+                        f"Not adding {path_or_code.name} as it is not "
+                        "optimizable at link time, and `ignore_nonlto == True`."
+                    )
+                    return
+
                 self.add_data(
                     path_or_code.data, path_or_code.kind, path_or_code.name
                 )

diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -230,7 +230,8 @@ def kernel(result):
                         assert result[0] == 3
 
                 assert len(w) == 1
-                self.assertIn("cannot generate LTO-ed PTX", str(w[0].message))
+                self.assertIn("it is not optimizable at link time, and "
+                              "`ignore_nonlto == True`", str(w[0].message))
 
         config.DUMP_ASSEMBLY = False