-
Notifications
You must be signed in to change notification settings - Fork 83
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
fdc76e8
commit 9d8ecbc
Showing
7 changed files
with
406 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,285 @@ | ||
from dataclasses import dataclass | ||
from typing import List, Optional | ||
|
||
from cuda.bindings import nvjitlink | ||
from cuda.core.experimental._module import ObjectCode | ||
from cuda.core.experimental._utils import check_or_create_options | ||
|
||
|
||
@dataclass
class LinkerOptions:
    """Customizable :obj:`LinkerOptions` for nvJitLink.

    Every field except ``arch`` defaults to ``None``, in which case no flag is
    emitted for it; the "Default" listed for each attribute below is then
    presumably the behavior the linker itself applies when the option is absent.

    After construction, ``formatted_options`` holds the list of command-line
    style option strings derived from the fields. Valueless switches (e.g.
    ``-lto``, ``-ptx``) are emitted only when the field is truthy, so passing
    ``False`` explicitly is equivalent to leaving the field unset. Options that
    carry a value (e.g. ``-ftz=<n>``, ``-O<N>``) are emitted whenever the field
    is not ``None``.

    Attributes
    ----------
    arch : str
        Pass SM architecture value. Can use compute_<N> value instead if only generating PTX.
        This is a required option.
        Acceptable value type: str
        Maps to: -arch=sm_<N>
    max_register_count : int, optional
        Maximum register count.
        Default: None
        Acceptable value type: int
        Maps to: -maxrregcount=<N>
    time : bool, optional
        Print timing information to InfoLog.
        Default: False
        Acceptable value type: bool
        Maps to: -time
    verbose : bool, optional
        Print verbose messages to InfoLog.
        Default: False
        Acceptable value type: bool
        Maps to: -verbose
    link_time_optimization : bool, optional
        Perform link time optimization.
        Default: False
        Acceptable value type: bool
        Maps to: -lto
    ptx : bool, optional
        Emit PTX after linking instead of CUBIN; only supported with -lto.
        Default: False
        Acceptable value type: bool
        Maps to: -ptx
    optimization_level : int, optional
        Set optimization level. Only 0 and 3 are accepted.
        Default: None
        Acceptable value type: int
        Maps to: -O<N>
    debug : bool, optional
        Generate debug information.
        Default: False
        Acceptable value type: bool
        Maps to: -g
    lineinfo : bool, optional
        Generate line information.
        Default: False
        Acceptable value type: bool
        Maps to: -lineinfo
    ftz : bool, optional
        Flush denormal values to zero.
        Default: False
        Acceptable value type: bool
        Maps to: -ftz=<n>
    prec_div : bool, optional
        Use precise division.
        Default: True
        Acceptable value type: bool
        Maps to: -prec-div=<n>
    prec_sqrt : bool, optional
        Use precise square root.
        Default: True
        Acceptable value type: bool
        Maps to: -prec-sqrt=<n>
    fma : bool, optional
        Use fast multiply-add.
        Default: True
        Acceptable value type: bool
        Maps to: -fma=<n>
    kernels_used : List[str], optional
        Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple
        times.
        Default: None
        Acceptable value type: list of str
        Maps to: -kernels-used=<name>
    variables_used : List[str], optional
        Pass list of variables that are used; any not in the list can be removed. This option can be specified multiple
        times.
        Default: None
        Acceptable value type: list of str
        Maps to: -variables-used=<name>
    optimize_unused_variables : bool, optional
        Assume that if a variable is not referenced in device code, it can be removed.
        Default: False
        Acceptable value type: bool
        Maps to: -optimize-unused-variables
    xptxas : List[str], optional
        Pass options to PTXAS. This option can be called multiple times.
        Default: None
        Acceptable value type: list of str
        Maps to: -Xptxas=<opt>
    split_compile : int, optional
        Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
        compilation (default).
        Default: 1
        Acceptable value type: int
        Maps to: -split-compile=<N>
    split_compile_extended : int, optional
        A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value.
        Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This
        option can potentially impact performance of the compiled binary.
        Default: 1
        Acceptable value type: int
        Maps to: -split-compile-extended=<N>
    jump_table_density : int, optional
        When doing LTO, specify the case density percentage in switch statements, and use it as a minimal threshold to
        determine whether jump table (brx.idx instruction) will be used to implement a switch statement. Default value
        is 101. The percentage ranges from 0 to 101 inclusively.
        Default: 101
        Acceptable value type: int
        Maps to: -jump-table-density=<N>
    no_cache : bool, optional
        Do not cache the intermediate steps of nvJitLink.
        Default: False
        Acceptable value type: bool
        Maps to: -no-cache
    device_stack_protector : bool, optional
        Enable stack canaries in device code. Stack canaries make it more difficult to exploit certain types of memory
        safety bugs involving stack-local variables. The compiler uses heuristics to assess the risk of such a bug in
        each function. Only those functions which are deemed high-risk make use of a stack canary.
        Default: False
        Acceptable value type: bool
        Maps to: -device-stack-protector
    """

    arch: str
    max_register_count: Optional[int] = None
    time: Optional[bool] = None
    verbose: Optional[bool] = None
    link_time_optimization: Optional[bool] = None
    ptx: Optional[bool] = None
    optimization_level: Optional[int] = None
    debug: Optional[bool] = None
    lineinfo: Optional[bool] = None
    ftz: Optional[bool] = None
    prec_div: Optional[bool] = None
    prec_sqrt: Optional[bool] = None
    fma: Optional[bool] = None
    kernels_used: Optional[List[str]] = None
    variables_used: Optional[List[str]] = None
    optimize_unused_variables: Optional[bool] = None
    xptxas: Optional[List[str]] = None
    split_compile: Optional[int] = None
    split_compile_extended: Optional[int] = None
    jump_table_density: Optional[int] = None
    no_cache: Optional[bool] = None
    device_stack_protector: Optional[bool] = None

    def __post_init__(self):
        """Translate the populated fields into ``self.formatted_options``.

        The emission order is fixed (it follows the field declaration order)
        so the resulting list is deterministic for a given set of fields.
        """
        opts = []

        def add_switch(enabled, switch):
            # Valueless flag: its mere presence enables the feature, so it
            # must NOT be emitted for an explicit ``False``.
            if enabled:
                opts.append(switch)

        def add_value(value, prefix):
            # ``is not None`` (not truthiness) so that 0 is still emitted,
            # e.g. ``-O0`` or ``-split-compile=0``.
            if value is not None:
                opts.append(f"{prefix}{value}")

        def add_bool(value, prefix):
            # Boolean option that carries an explicit true/false value, so
            # ``False`` is meaningful and has to be passed through.
            if value is not None:
                opts.append(f"{prefix}{'true' if value else 'false'}")

        def add_each(values, prefix):
            # Repeatable option: one occurrence per list element.
            if values is not None:
                opts.extend(f"{prefix}{item}" for item in values)

        add_value(self.arch, "-arch=")
        add_value(self.max_register_count, "-maxrregcount=")
        add_switch(self.time, "-time")
        add_switch(self.verbose, "-verbose")
        add_switch(self.link_time_optimization, "-lto")
        add_switch(self.ptx, "-ptx")
        add_value(self.optimization_level, "-O")
        add_switch(self.debug, "-g")
        add_switch(self.lineinfo, "-lineinfo")
        add_bool(self.ftz, "-ftz=")
        add_bool(self.prec_div, "-prec-div=")
        add_bool(self.prec_sqrt, "-prec-sqrt=")
        add_bool(self.fma, "-fma=")
        add_each(self.kernels_used, "-kernels-used=")
        add_each(self.variables_used, "-variables-used=")
        add_switch(self.optimize_unused_variables, "-optimize-unused-variables")
        add_each(self.xptxas, "-Xptxas=")
        add_value(self.split_compile, "-split-compile=")
        add_value(self.split_compile_extended, "-split-compile-extended=")
        add_value(self.jump_table_density, "-jump-table-density=")
        add_switch(self.no_cache, "-no-cache")
        add_switch(self.device_stack_protector, "-device-stack-protector")

        self.formatted_options = opts
|
||
|
||
class Linker:
    """Object-oriented wrapper around an nvJitLink handle.

    A handle is created from the formatted :obj:`LinkerOptions` at construction
    time, every supplied :obj:`ObjectCode` is added to it as link input, and
    :meth:`link` completes the link and returns the linked image. The handle is
    destroyed when the instance is garbage collected.
    """

    __slots__ = ("_handle",)

    def __init__(self, *object_codes: ObjectCode, options: Optional[LinkerOptions] = None):
        """Create the nvJitLink handle and register the inputs.

        Parameters
        ----------
        *object_codes : ObjectCode
            Code objects to link together.
        options : LinkerOptions, optional
            Linker options; passed through ``check_or_create_options`` before use.

        Raises
        ------
        TypeError
            If any positional argument is not an :obj:`ObjectCode`.
        """
        # Assigned first so that __del__ is safe even if anything below raises.
        self._handle = None
        options = check_or_create_options(LinkerOptions, options, "Linker options")

        # Reject bad inputs before a native handle is created.
        for code in object_codes:
            if not isinstance(code, ObjectCode):
                raise TypeError(f"Expected ObjectCode, got {type(code).__name__}")

        self._handle = nvjitlink.create(len(options.formatted_options), options.formatted_options)

        for code in object_codes:
            self._add_code_object(code)

    def _add_code_object(self, object_code: ObjectCode):
        """Add the raw bytes of ``object_code`` to the link as one input.

        Raises
        ------
        TypeError
            If the module held by ``object_code`` is not a ``bytes`` object.
        ValueError
            If the code type of ``object_code`` has no nvJitLink input type.
        """
        data = object_code._module
        if not isinstance(data, bytes):
            raise TypeError(f"Expected ObjectCode data to be bytes, got {type(data).__name__}")
        nvjitlink.add_data(
            self._handle,
            self._input_type_from_code_type(object_code._code_type),
            data,
            len(data),
            # Name attached to this input, built from the object's handle and code type.
            f"{object_code._handle}_{object_code._code_type}",
        )

    def link(self, target_type) -> ObjectCode:
        """Complete the link and return the linked image.

        Parameters
        ----------
        target_type : str
            Either ``"cubin"`` or ``"ptx"``.

        Returns
        -------
        ObjectCode
            The linked image wrapped with ``target_type`` as its code type.

        Raises
        ------
        ValueError
            If ``target_type`` is not supported. This is checked before any
            linking work is started.
        """
        if target_type not in ("cubin", "ptx"):
            raise ValueError(f"Unsupported target type: {target_type}")

        nvjitlink.complete(self._handle)

        # Query the size first, then let nvJitLink fill a buffer of that size.
        if target_type == "cubin":
            code = bytearray(nvjitlink.get_linked_cubin_size(self._handle))
            nvjitlink.get_linked_cubin(self._handle, code)
        else:
            code = bytearray(nvjitlink.get_linked_ptx_size(self._handle))
            nvjitlink.get_linked_ptx(self._handle, code)

        return ObjectCode(bytes(code), target_type)

    def get_error_log(self) -> str:
        """Return the error log of this handle, decoded to ``str``."""
        log = bytearray(nvjitlink.get_error_log_size(self._handle))
        nvjitlink.get_error_log(self._handle, log)
        return log.decode()

    def get_info_log(self) -> str:
        """Return the info log of this handle, decoded to ``str``."""
        log = bytearray(nvjitlink.get_info_log_size(self._handle))
        nvjitlink.get_info_log(self._handle, log)
        return log.decode()

    def _input_type_from_code_type(self, code_type: str) -> nvjitlink.InputType:
        """Map an :obj:`ObjectCode` code type string to an nvJitLink input type.

        Raises
        ------
        ValueError
            If ``code_type`` is not one of the handled values.
        """
        # this list is based on the supported values for code_type in the ObjectCode class definition.
        # nvjitlink supports other options for input type
        if code_type == "ptx":
            return nvjitlink.InputType.PTX
        if code_type == "cubin":
            return nvjitlink.InputType.CUBIN
        if code_type == "fatbin":
            return nvjitlink.InputType.FATBIN
        if code_type == "ltoir":
            return nvjitlink.InputType.LTOIR
        if code_type == "object":
            return nvjitlink.InputType.OBJECT
        raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}")

    @property
    def handle(self) -> int:
        """The underlying nvJitLink handle (``None`` once destroyed)."""
        return self._handle

    def __del__(self):
        # getattr: with __slots__ the attribute does not exist at all if
        # __init__ never ran, and __del__ must not raise in that case.
        handle = getattr(self, "_handle", None)
        if handle is not None:
            nvjitlink.destroy(handle)
            self._handle = None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,3 +31,8 @@ CUDA compilation toolchain | |
:toctree: generated/ | ||
|
||
Program | ||
Linker | ||
|
||
:template: dataclass.rst | ||
|
||
LinkerOptions |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,5 @@ maxdepth: 3 | |
--- | ||
0.1.0 <release/0.1.0-notes> | ||
0.2.0 <release/0.2.0-notes> | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# `cuda.core` Release notes | ||
|
||
Released on Nov <TODO>, 2024 | ||
|
||
## Highlights | ||
- Addition of the `Linker` class, which gives object-oriented and Pythonic access to the nvJitLink API. | ||
|
||
## Limitations | ||
|
||
- The `Linker` class only supports CUDA >= 12. For CUDA < 12, use the low-level cuLink API. | ||
<TODO> |
Oops, something went wrong.