From 779782d64702bb794d58ffc8d77e133690099936 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Fri, 6 Dec 2024 19:13:05 +0800
Subject: [PATCH] Allow CUDA source inputs to be compiled to LTOIR, and enable
 pynvjitlinker to link inputs that contain LTOIR (#62)

Adds support for JIT-compiling kernels and FFI functions to LTOIR and linking them with LTO, allowing better optimization when foreign functions are used in numba-cuda.

---------

Co-authored-by: Graham Markall <gmarkall@nvidia.com>
---
 ci/test_conda_pynvjitlink.sh                  |   2 +-
 numba_cuda/numba/cuda/codegen.py              |  50 ++++++---
 numba_cuda/numba/cuda/cudadrv/driver.py       | 105 +++++++++++++++++-
 numba_cuda/numba/cuda/cudadrv/nvrtc.py        |  41 ++++++-
 .../cuda/tests/cudadrv/test_nvjitlink.py      |  92 ++++++++++++---
 .../tests/test_binary_generation/Makefile     |   7 ++
 .../tests/test_binary_generation/build.bat    |   8 ++
 7 files changed, 266 insertions(+), 39 deletions(-)

diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh
index 22c0d67..3f18e30 100755
--- a/ci/test_conda_pynvjitlink.sh
+++ b/ci/test_conda_pynvjitlink.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 if [ "${CUDA_VER%.*.*}" = "11" ]; then
   CTK_PACKAGES="cudatoolkit"
 else
-  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc"
+  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc cuda-cuobjdump"
 fi
 
 rapids-logger "Install testing dependencies"
diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py
index 3610ade..426eb82 100644
--- a/numba_cuda/numba/cuda/codegen.py
+++ b/numba_cuda/numba/cuda/codegen.py
@@ -9,7 +9,6 @@
 import subprocess
 import tempfile
 
-
 CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
 
 
@@ -181,17 +180,7 @@ def get_ltoir(self, cc=None):
 
         return ltoir
 
-    def get_cubin(self, cc=None):
-        cc = self._ensure_cc(cc)
-
-        cubin = self._cubin_cache.get(cc, None)
-        if cubin:
-            return cubin
-
-        linker = driver.Linker.new(
-            max_registers=self._max_registers, cc=cc, lto=self._lto
-        )
-
+    def _link_all(self, linker, cc, ignore_nonlto=False):
         if linker.lto:
             ltoir = self.get_ltoir(cc=cc)
             linker.add_ltoir(ltoir)
@@ -200,11 +189,44 @@ def get_cubin(self, cc=None):
             linker.add_ptx(ptx.encode())
 
         for path in self._linking_files:
-            linker.add_file_guess_ext(path)
+            linker.add_file_guess_ext(path, ignore_nonlto)
         if self.needs_cudadevrt:
-            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
+            linker.add_file_guess_ext(
+                get_cudalib('cudadevrt', static=True), ignore_nonlto
+            )
+
+    def get_cubin(self, cc=None):
+        cc = self._ensure_cc(cc)
 
+        cubin = self._cubin_cache.get(cc, None)
+        if cubin:
+            return cubin
+
+        if self._lto and config.DUMP_ASSEMBLY:
+            linker = driver.Linker.new(
+                max_registers=self._max_registers,
+                cc=cc,
+                additional_flags=["-ptx"],
+                lto=self._lto
+            )
+            # `-ptx` flag is meant to view the optimized PTX for LTO objects.
+            # Non-LTO objects are not passed to linker.
+            self._link_all(linker, cc, ignore_nonlto=True)
+
+            ptx = linker.get_linked_ptx().decode('utf-8')
+
+            print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-'))
+            print(ptx)
+            print('=' * 80)
+
+        linker = driver.Linker.new(
+            max_registers=self._max_registers,
+            cc=cc,
+            lto=self._lto
+        )
+        self._link_all(linker, cc, ignore_nonlto=False)
         cubin = linker.complete()
+
         self._cubin_cache[cc] = cubin
         self._linkerinfo_cache[cc] = linker.info_log
 
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index ccb34a8..465d33d 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -21,6 +21,9 @@
 import traceback
 import asyncio
 import pathlib
+import subprocess
+import tempfile
+import re
 from itertools import product
 from abc import ABCMeta, abstractmethod
 from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof,
@@ -36,7 +39,7 @@
 from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
-from .linkable_code import LinkableCode
+from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
 from numba.cuda.cudadrv import enums, drvapi, nvrtc
 
 USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
@@ -2683,12 +2686,18 @@ def add_cu_file(self, path):
             cu = f.read()
         self.add_cu(cu, os.path.basename(path))
 
-    def add_file_guess_ext(self, path_or_code):
+    def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
         """
         Add a file or LinkableCode object to the link. If a file is
         passed, the type will be inferred from the extension. A LinkableCode
         object represents a file already in memory.
+
+        When `ignore_nonlto` is true, skip any input that cannot take part in
+        link-time optimization (a warning is emitted instead). This is useful
+        for inspecting only the LTO-ed portion of the PTX when the inputs are
+        a mix of LTO-able and non-LTO-able code.
         """
+
         if isinstance(path_or_code, str):
             ext = pathlib.Path(path_or_code).suffix
             if ext == '':
@@ -2704,6 +2713,26 @@ def add_file_guess_ext(self, path_or_code):
                         "Don't know how to link file with extension "
                         f"{ext}"
                     )
+
+                if ignore_nonlto:
+                    # Objects and fatbins are inspected for an "nvvm" entry;
+                    # of the remaining kinds only LTO-IR is kept.
+                    if kind in (
+                        FILE_EXTENSION_MAP["fatbin"], FILE_EXTENSION_MAP["o"]
+                    ):
+                        found = inspect_obj_content(path_or_code)
+                        skip_input = "nvvm" not in found
+                    else:
+                        skip_input = kind != FILE_EXTENSION_MAP["ltoir"]
+
+                    if skip_input:
+                        warnings.warn(
+                            f"Not adding {path_or_code} as it is not "
+                            "optimizable at link time, and `ignore_nonlto == "
+                            "True`."
+                        )
+                        return
+
                 self.add_file(path_or_code, kind)
             return
         else:
@@ -2716,6 +2745,25 @@ def add_file_guess_ext(self, path_or_code):
             if path_or_code.kind == "cu":
                 self.add_cu(path_or_code.data, path_or_code.name)
             else:
+                if ignore_nonlto:
+                    if isinstance(path_or_code, (Fatbin, Object)):
+                        # Bytes payload; flush it before cuobjdump reads.
+                        with tempfile.NamedTemporaryFile("wb") as fp:
+                            fp.write(path_or_code.data)
+                            fp.flush()
+                            entry_types = inspect_obj_content(fp.name)
+                        warn_and_return = "nvvm" not in entry_types
+                    else:
+                        warn_and_return = not isinstance(path_or_code, LTOIR)
+
+                    if warn_and_return:
+                        warnings.warn(
+                            f"Not adding {path_or_code.name} as it is not "
+                            "optimizable at link time, and `ignore_nonlto == "
+                            "True`."
+                        )
+                        return
+
                 self.add_data(
                     path_or_code.data, path_or_code.kind, path_or_code.name
                 )
@@ -3065,6 +3113,28 @@ def add_file(self, path, kind):
         name = pathlib.Path(path).name
         self.add_data(data, kind, name)
 
+    def add_cu(self, cu, name):
+        """Compile the CUDA source string `cu` with NVRTC and add the result
+        to the link; `name` is the name of the source file."""
+        with driver.get_active_context() as ac:
+            cc = driver.get_device(ac.devnum).compute_capability
+
+        program, log = nvrtc.compile(cu, name, cc, ltoir=self.lto)
+        stem = os.path.splitext(name)[0]
+
+        # NVRTC produced LTOIR: pass it on through the normal linker mechanism
+        if self.lto:
+            self.add_ltoir(program, stem + ".ltoir")
+            return
+
+        # NVRTC produced PTX: optionally dump it, then link it the same way
+        if config.DUMP_ASSEMBLY:
+            print(("ASSEMBLY %s" % name).center(80, "-"))
+            print(program)
+            print("=" * 80)
+
+        self.add_ptx(program.encode(), stem + ".ptx")
+
     def add_data(self, data, kind, name):
         if kind == FILE_EXTENSION_MAP["cubin"]:
             fn = self._linker.add_cubin
@@ -3086,6 +3156,12 @@ def add_data(self, data, kind, name):
         except NvJitLinkError as e:
             raise LinkerError from e
 
+    def get_linked_ptx(self):
+        try:
+            return self._linker.get_linked_ptx()
+        except NvJitLinkError as e:
+            raise LinkerError from e
+
     def complete(self):
         try:
             return self._linker.get_linked_cubin()
@@ -3361,3 +3437,28 @@ def get_version():
     Return the driver version as a tuple of (major, minor)
     """
     return driver.get_version()
+
+
+def inspect_obj_content(objpath: str):
+    """
+    List the kinds of code held in a fatbin or object file.
+
+    `cuobjdump` is run on `objpath`, and every line of its output that begins
+    with "Fatbin <kind> code" contributes <kind> to the returned set.
+
+    Raises RuntimeError when the `cuobjdump` executable cannot be found.
+    """
+    try:
+        proc = subprocess.run(["cuobjdump", objpath], check=True,
+                              capture_output=True)
+    except FileNotFoundError as e:
+        msg = ("cuobjdump has not been found. You may need "
+               "to install the CUDA toolkit and ensure that "
+               "it is available on your PATH.\n")
+        raise RuntimeError(msg) from e
+
+    # For example, a "Fatbin nvvm code" line yields the entry "nvvm"
+    entry_re = re.compile(r"Fatbin (.*) code")
+    listing = proc.stdout.decode('utf-8').split("\n")
+
+    return {m.group(1) for m in map(entry_re.match, listing) if m}
diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py
index 82dd62e..706c25d 100644
--- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py
+++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -61,6 +61,14 @@ class NVRTC:
     NVVM interface. Initialization is protected by a lock and uses the standard
     (for Numba) open_cudalib function to load the NVRTC library.
     """
+
+    _CU12ONLY_PROTOTYPES = {
+        # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet);
+        "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
+        # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto);
+        "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p)
+    }
+
     _PROTOTYPES = {
         # nvrtcResult nvrtcVersion(int *major, int *minor)
         'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
@@ -110,6 +118,10 @@ def __new__(cls):
                     cls.__INSTANCE = None
                     raise NvrtcSupportError("NVRTC cannot be loaded") from e
 
+                from numba.cuda.cudadrv.runtime import get_version
+                if get_version() >= (12, 0):
+                    inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES
+
                 # Find & populate functions
                 for name, proto in inst._PROTOTYPES.items():
                     func = getattr(lib, name)
@@ -208,10 +220,22 @@ def get_ptx(self, program):
 
         return ptx.value.decode()
 
+    def get_lto(self, program):
+        """
+        Get the compiled LTOIR of `program` as a Python bytes object.
+        """
+        from ctypes import create_string_buffer
+        lto_size = c_size_t()
+        self.nvrtcGetLTOIRSize(program.handle, byref(lto_size))
+        # NVRTC writes through this pointer; bytes objects are immutable
+        lto = create_string_buffer(lto_size.value)
+        self.nvrtcGetLTOIR(program.handle, lto)
+        return lto.raw
 
-def compile(src, name, cc):
+
+def compile(src, name, cc, ltoir=False):
     """
-    Compile a CUDA C/C++ source to PTX for a given compute capability.
+    Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability.
 
     :param src: The source code to compile
     :type src: str
@@ -219,6 +243,8 @@ def compile(src, name, cc):
     :type name: str
     :param cc: A tuple ``(major, minor)`` of the compute capability
     :type cc: tuple
+    :param ltoir: Compile into LTOIR if True, otherwise into PTX
+    :type ltoir: bool
-    :return: The compiled PTX and compilation log
+    :return: The compiled PTX or LTOIR and compilation log
     :rtype: tuple
     """
@@ -242,6 +268,9 @@ def compile(src, name, cc):
     numba_include = f'-I{numba_cuda_path}'
     options = [arch, *cuda_include, numba_include, '-rdc', 'true']
 
+    if ltoir:
+        options.append("-dlto")
+
     if nvrtc.get_version() < (12, 0):
         options += ["-std=c++17"]
 
@@ -261,5 +290,9 @@ def compile(src, name, cc):
         msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
         warnings.warn(msg)
 
-    ptx = nvrtc.get_ptx(program)
-    return ptx, log
+    if ltoir:
+        ltoir = nvrtc.get_lto(program)
+        return ltoir, log
+    else:
+        ptx = nvrtc.get_ptx(program)
+        return ptx, log
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
index 8d4e085..044895c 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -5,6 +5,10 @@
 
 import itertools
 import os
+import io
+import contextlib
+import warnings
+
 from numba.cuda import get_current_device
 from numba import cuda
 from numba import config
@@ -23,6 +27,9 @@
     test_device_functions_fatbin = os.path.join(
         TEST_BIN_DIR, "test_device_functions.fatbin"
     )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
     test_device_functions_o = os.path.join(
         TEST_BIN_DIR, "test_device_functions.o"
     )
@@ -156,32 +163,81 @@ def test_nvjitlink_jit_with_linkable_code(self):
             test_device_functions_o,
             test_device_functions_ptx,
         )
+        # Each linkable input has to work both with and without LTO
+        for lto, file in itertools.product([True, False], files):
+            with self.subTest(lto=lto, file=file):
+                sig = "uint32(uint32, uint32)"
+                add_from_numba = cuda.declare_device("add_from_numba", sig)
+
+                @cuda.jit(link=[file], lto=lto)
+                def kernel(result):
+                    result[0] = add_from_numba(1, 2)
+
+                result = cuda.device_array(1)
+                kernel[1, 1](result)
+                assert result[0] == 3
+
+    def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
+        files = [
+            test_device_functions_cu,
+            test_device_functions_ltoir,
+            test_device_functions_fatbin_multi
+        ]
+
+        config.DUMP_ASSEMBLY = True
+
         for file in files:
             with self.subTest(file=file):
-                sig = "uint32(uint32, uint32)"
-                add_from_numba = cuda.declare_device("add_from_numba", sig)
+                f = io.StringIO()
+                with contextlib.redirect_stdout(f):
+                    sig = "uint32(uint32, uint32)"
+                    add_from_numba = cuda.declare_device("add_from_numba", sig)
 
-                @cuda.jit(link=[file])
-                def kernel(result):
-                    result[0] = add_from_numba(1, 2)
+                    @cuda.jit(link=[file], lto=True)
+                    def kernel(result):
+                        result[0] = add_from_numba(1, 2)
 
-                result = cuda.device_array(1)
-                kernel[1, 1](result)
-                assert result[0] == 3
+                    result = cuda.device_array(1)
+                    kernel[1, 1](result)
+                    assert result[0] == 3
 
-    def test_nvjitlink_jit_with_linkable_code_lto(self):
-        file = test_device_functions_ltoir
+                self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
-        sig = "uint32(uint32, uint32)"
-        add_from_numba = cuda.declare_device("add_from_numba", sig)
+        config.DUMP_ASSEMBLY = False
 
-        @cuda.jit(link=[file], lto=True)
-        def kernel(result):
-            result[0] = add_from_numba(1, 2)
+    def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
+        files = [
+            test_device_functions_a,
+            test_device_functions_cubin,
+            test_device_functions_fatbin,
+            test_device_functions_o,
+            test_device_functions_ptx,
+        ]
 
-        result = cuda.device_array(1)
-        kernel[1, 1](result)
-        assert result[0] == 3
+        config.DUMP_ASSEMBLY = True
+
+        for file in files:
+            with self.subTest(file=file):
+                with warnings.catch_warnings(record=True) as w:
+                    with contextlib.redirect_stdout(None): # suppress other PTX
+                        sig = "uint32(uint32, uint32)"
+                        add_from_numba = cuda.declare_device(
+                            "add_from_numba", sig
+                        )
+
+                        @cuda.jit(link=[file], lto=True)
+                        def kernel(result):
+                            result[0] = add_from_numba(1, 2)
+
+                        result = cuda.device_array(1)
+                        kernel[1, 1](result)
+                        assert result[0] == 3
+
+                assert len(w) == 1
+                self.assertIn("it is not optimizable at link time, and "
+                              "`ignore_nonlto == True`", str(w[0].message))
+
+        config.DUMP_ASSEMBLY = False
 
     def test_nvjitlink_jit_with_invalid_linkable_code(self):
         with open(test_device_functions_cubin, "rb") as f:
diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile
index 85921c7..145f8b0 100644
--- a/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile
+++ b/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile
@@ -14,9 +14,14 @@ endif
 # Gencode flags suitable for most tests
 GENCODE := -gencode arch=compute_$(GPU_CC),code=sm_$(GPU_CC)
 
+MULTI_GENCODE := -gencode arch=compute_$(GPU_CC),code=[sm_$(GPU_CC),lto_$(GPU_CC)]
+
 # Fatbin tests need to generate code for an additional compute capability
 FATBIN_GENCODE := $(GENCODE) -gencode arch=compute_$(ALT_CC),code=sm_$(ALT_CC)
 
+# Fatbin that contains both LTO-IR and SASS for multiple architectures
+MULTI_FATBIN_GENCODE := $(MULTI_GENCODE) -gencode arch=compute_$(ALT_CC),code=[sm_$(ALT_CC),lto_$(ALT_CC)]
+
 # LTO-IR tests need to generate for the LTO "architecture" instead
 LTOIR_GENCODE := -gencode arch=lto_$(GPU_CC),code=lto_$(GPU_CC)
 
@@ -30,6 +35,7 @@ PTX_FLAGS := $(GENCODE) -ptx
 OBJECT_FLAGS := $(GENCODE) -dc
 LIBRARY_FLAGS := $(GENCODE) -lib
 FATBIN_FLAGS := $(FATBIN_GENCODE) --fatbin
+MULTI_FATBIN_FLAGS := $(MULTI_FATBIN_GENCODE) --fatbin
 LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
 
 OUTPUT_DIR := ./
@@ -41,6 +47,7 @@ all:
 	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/undefined_extern.cubin undefined_extern.cu
 	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.cubin test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.fatbin test_device_functions.cu
+	nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/test_device_functions_multi.fatbin test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ptx test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu
diff --git a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat
index 403fca8..9d47a33 100644
--- a/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat
+++ b/numba_cuda/numba/cuda/tests/test_binary_generation/build.bat
@@ -13,9 +13,15 @@ if "%GPU_CC:~0,1%"=="7" (
 REM Gencode flags suitable for most tests
 set GENCODE=-gencode arch=compute_%GPU_CC%,code=sm_%GPU_CC%
 
+REM Gencode flags for a fatbin with SASS and LTO
+set MULTI_GENCODE=-gencode arch=compute_%GPU_CC%,code=[sm_%GPU_CC%,lto_%GPU_CC%]
+
 REM Fatbin tests need to generate code for an additional compute capability
 set FATBIN_GENCODE=%GENCODE% -gencode arch=compute_%ALT_CC%,code=sm_%ALT_CC%
 
+REM Fatbin that contains both LTO-IR and SASS for multiple architectures
+set MULTI_FATBIN_GENCODE=%MULTI_GENCODE% -gencode arch=compute_%ALT_CC%,code=[sm_%ALT_CC%,lto_%ALT_CC%]
+
 REM LTO-IR tests need to generate for the LTO "architecture" instead
 set LTOIR_GENCODE=-gencode arch=lto_%GPU_CC%,code=lto_%GPU_CC%
 
@@ -29,6 +35,7 @@ set PTX_FLAGS=%GENCODE% -ptx
 set OBJECT_FLAGS=%GENCODE% -dc
 set LIBRARY_FLAGS=%GENCODE% -lib
 set FATBIN_FLAGS=%FATBIN_GENCODE% --fatbin
+set MULTI_FATBIN_FLAGS=%MULTI_FATBIN_GENCODE% --fatbin
 set LTOIR_FLAGS=%LTOIR_GENCODE% -dc
 
 set OUTPUT_DIR=.
@@ -42,6 +49,7 @@ echo Alternative CC: %ALT_CC%
 nvcc %NVCC_FLAGS% %CUBIN_FLAGS% -o %OUTPUT_DIR%\undefined_extern.cubin undefined_extern.cu
 nvcc %NVCC_FLAGS% %CUBIN_FLAGS% -o %OUTPUT_DIR%\test_device_functions.cubin test_device_functions.cu
 nvcc %NVCC_FLAGS% %FATBIN_FLAGS% -o %OUTPUT_DIR%\test_device_functions.fatbin test_device_functions.cu
+nvcc %NVCC_FLAGS% %MULTI_FATBIN_FLAGS% -o %OUTPUT_DIR%\test_device_functions_multi.fatbin test_device_functions.cu
 nvcc %NVCC_FLAGS% %PTX_FLAGS% -o %OUTPUT_DIR%\test_device_functions.ptx test_device_functions.cu
 nvcc %NVCC_FLAGS% %OBJECT_FLAGS% -o %OUTPUT_DIR%\test_device_functions.o test_device_functions.cu
 nvcc %NVCC_FLAGS% %LIBRARY_FLAGS% -o %OUTPUT_DIR%\test_device_functions.a test_device_functions.cu