diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index e19686929..44679465a 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -20,12 +20,16 @@ THE SOFTWARE. """ -from typing import Any, Sequence, Mapping, Tuple, Optional +from typing import Any, Sequence, Mapping, Tuple, Optional, TYPE_CHECKING from dataclasses import dataclass, replace import islpy as isl +if TYPE_CHECKING: + from loopy.codegen import CodeGenerationState + + def process_preambles(preambles: Sequence[Tuple[int, str]]) -> Sequence[str]: seen_preamble_tags = set() dedup_preambles = [] @@ -170,7 +174,8 @@ def all_code(self): + "\n\n" + str(self.host_program.ast)) - def current_program(self, codegen_state): + def current_program( + self, codegen_state: "CodeGenerationState") -> GeneratedProgram: if codegen_state.is_generating_device_code: if self.device_programs: result = self.device_programs[-1] @@ -329,13 +334,23 @@ def generate_host_or_device_program(codegen_state, schedule_index): cur_prog = codegen_result.current_program(codegen_state) body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( + fdef_preambles, fdecl_ast = ast_builder.get_function_declaration( codegen_state, codegen_result, schedule_index) fdef_ast = ast_builder.get_function_definition( codegen_state, codegen_result, schedule_index, fdecl_ast, body_ast) + if fdef_preambles: + if codegen_state.is_generating_device_code: + codegen_result = codegen_result.copy( + device_preambles=( + codegen_result.device_preambles + tuple(fdef_preambles))) + else: + codegen_result = codegen_result.copy( + host_preambles=( + codegen_result.host_preambles + tuple(fdef_preambles))) + codegen_result = codegen_result.with_new_program( codegen_state, cur_prog.copy( diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index e38624b43..446a007c4 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -203,7 +203,8 @@ def get_function_definition( def get_function_declaration( self, codegen_state: CodeGenerationState, codegen_result: CodeGenerationResult, schedule_index: int - ) -> ASTType: + ) -> Tuple[Sequence[Tuple[str, str]], ASTType]: + """Returns preambles and the AST for the function declaration.""" raise NotImplementedError def generate_top_of_body( @@ -289,14 +290,16 @@ def __str__(self): return "" -class DummyHostASTBuilder(ASTBuilderBase): +class DummyHostASTBuilder(ASTBuilderBase[None]): def get_function_definition(self, codegen_state, codegen_result, schedule_index, function_decl, function_body): return function_body - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): - return None + def get_function_declaration( + self, codegen_state, codegen_result, + schedule_index, + ) -> Tuple[Sequence[Tuple[str, str]], None]: + return [], None def get_temporary_decls(self, codegen_state, schedule_index): return [] diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index f3f644845..750b5686e 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -23,7 +23,7 @@ THE SOFTWARE. """ -from typing import cast, Tuple, Optional +from typing import cast, Tuple, Optional, Sequence import re import numpy as np # noqa @@ -817,8 +817,10 @@ def get_function_definition( else: return Collection(result+[Line(), fbody]) - def get_function_declaration(self, codegen_state: CodeGenerationState, - codegen_result: CodeGenerationResult, schedule_index: int) -> Generable: + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> Tuple[Sequence[Tuple[str, str]], Generable]: kernel = codegen_state.kernel assert codegen_state.kernel.linearization is not None @@ -846,7 +848,7 @@ def get_function_declaration(self, codegen_state: CodeGenerationState, passed_names = [arg.name for arg in kernel.args] written_names = kernel.get_written_variables() - return FunctionDeclarationWrapper( + return [], FunctionDeclarationWrapper( FunctionDeclaration( name, [self.arg_to_cgen_declarator( diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index f95dea681..fbbbd297c 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -23,10 +23,12 @@ THE SOFTWARE. """ +from typing import Tuple, Sequence + import numpy as np from pymbolic import var from pytools import memoize_method -from cgen import Declarator, Const +from cgen import Declarator, Const, Generable from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -35,6 +37,8 @@ from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg, ArrayArg from loopy.kernel.function_interface import ScalarCallable +from loopy.codegen.result import CodeGenerationResult +from loopy.codegen import CodeGenerationState # {{{ vector types @@ -320,9 +324,11 @@ def known_callables(self): # {{{ top-level codegen - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): - fdecl = super().get_function_declaration( + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> Tuple[Sequence[Tuple[str, str]], Generable]: + preambles, fdecl = super().get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper @@ -352,7 +358,7 @@ def get_function_declaration(self, codegen_state, codegen_result, fdecl = CudaLaunchBounds(nthreads, fdecl) - return FunctionDeclarationWrapper(fdecl) + return preambles, FunctionDeclarationWrapper(fdecl) def preamble_generators(self): diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 9974803c5..2fbd6bcf8 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -24,7 +24,7 @@ """ -from typing import cast, Tuple +from typing import cast, Tuple, Sequence import numpy as np # noqa import pymbolic.primitives as p @@ -202,8 +202,10 @@ def get_dtype_registry(self): class ISPCASTBuilder(CFamilyASTBuilder): # {{{ top-level codegen - def get_function_declaration(self, codegen_state: CodeGenerationState, - codegen_result: CodeGenerationResult, schedule_index: int) -> Generable: + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> Tuple[Sequence[Tuple[str, str]], Generable]: name = codegen_result.current_program(codegen_state).name kernel = codegen_state.kernel @@ -243,7 +245,7 @@ def get_function_declaration(self, codegen_state: CodeGenerationState, arg_decls)) from loopy.target.c import FunctionDeclarationWrapper - return FunctionDeclarationWrapper(result) + return [], FunctionDeclarationWrapper(result) def get_kernel_call(self, codegen_state: CodeGenerationState, subkernel_name: str, diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d9b23670e..380311237 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -23,10 +23,12 @@ THE SOFTWARE. """ +from typing import Tuple, Sequence + import numpy as np from pymbolic import var from pytools import memoize_method -from cgen import Declarator +from cgen import Declarator, Generable from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -36,6 +38,8 @@ from loopy.kernel.array import VectorArrayDimTag, FixedStrideArrayDimTag, ArrayBase from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg from loopy.kernel.function_interface import ScalarCallable +from loopy.codegen import CodeGenerationState +from loopy.codegen.result import CodeGenerationResult # {{{ dtype registry wrappers @@ -624,9 +628,11 @@ def preamble_generators(self): # {{{ top-level codegen - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): - fdecl = super().get_function_declaration( + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> Tuple[Sequence[Tuple[str, str]], Generable]: + preambles, fdecl = super().get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper @@ -634,10 +640,14 @@ def get_function_declaration(self, codegen_state, codegen_result, if not codegen_state.is_entrypoint: # auxiliary kernels need not mention opencl speicific qualifiers # for a functions signature - return fdecl + return preambles, fdecl - fdecl = fdecl.subdecl + return preambles, FunctionDeclarationWrapper( + self._wrap_kernel_decl(codegen_state, schedule_index, fdecl.subdecl)) + def _wrap_kernel_decl( + self, codegen_state: CodeGenerationState, schedule_index: int, + fdecl: Declarator) -> Declarator: from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) @@ -654,7 +664,7 @@ def get_function_declaration(self, codegen_state, codegen_result, fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl) - return FunctionDeclarationWrapper(fdecl) + return fdecl def generate_top_of_body(self, codegen_state): from loopy.kernel.data import ImageArg diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 66dd9ae3f..60d3c84bb 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -22,20 +22,28 @@ THE SOFTWARE. """ -from typing import Sequence, Mapping, Tuple, Dict, List, Union +from warnings import warn +from typing import Sequence, Tuple, List, Union, Optional, cast import numpy as np import pymbolic.primitives as p import genpy +from cgen import (Generable, Pointer, Const, FunctionBody, Collection, Initializer, + Line, Block) +from cgen.opencl import CLGlobal from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder, ExpressionToOpenCLCExpressionMapper) from loopy.target.python import PythonASTBuilderBase from loopy.kernel import LoopKernel from loopy.types import NumpyType +from loopy.typing import ExpressionT from loopy.diagnostic import LoopyError, LoopyTypeError -from warnings import warn from loopy.kernel.function_interface import ScalarCallable +from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, ConstantArg +from loopy.schedule import CallKernel +from loopy.codegen import CodeGenerationState +from loopy.codegen.result import CodeGenerationResult import logging logger = logging.getLogger(__name__) @@ -388,8 +396,16 @@ class PyOpenCLTarget(OpenCLTarget): host_program_name_prefix = "_lpy_host_" host_program_name_suffix = "" - def __init__(self, device=None, pyopencl_module_name="_lpy_cl", - atomics_flavor=None, use_int8_for_bool=True): + # FIXME Not yet complete + limit_arg_size_nbytes: Optional[int] + pointer_size_nbytes: int + + def __init__( + self, device=None, *, pyopencl_module_name: str = "_lpy_cl", + atomics_flavor=None, use_int8_for_bool: bool = True, + limit_arg_size_nbytes: Optional[int] = None, + pointer_size_nbytes: Optional[int] = None + ) -> None: # This ensures the dtype registry is populated. import pyopencl.tools # noqa @@ -409,6 +425,12 @@ def __init__(self, device=None, pyopencl_module_name="_lpy_cl", self.pyopencl_module_name = pyopencl_module_name + if pointer_size_nbytes is None: + pointer_size_nbytes = tuple.__itemsize__ + + self.limit_arg_size_nbytes = limit_arg_size_nbytes + self.pointer_size_nbytes = pointer_size_nbytes + @property def device(self): warn("PyOpenCLTarget.device is deprecated, it will stop working in 2022.", @@ -533,17 +555,12 @@ def with_device(self, device): def generate_value_arg_setup( kernel: LoopKernel, passed_names: Sequence[str] - ) -> Tuple[genpy.Generable, Mapping[int, int], int]: + ) -> genpy.Suite: options = kernel.options import loopy as lp from loopy.kernel.array import ArrayBase - cl_arg_idx = 0 - arg_idx_to_cl_arg_idx: Dict[int, int] = {} - - fp_arg_count = 0 - from genpy import If, Raise, Statement as S, Suite result: List[str] = [] @@ -564,11 +581,8 @@ def add_buf_arg(arg_idx, typechar, expr_str): buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") for arg_idx, passed_name in enumerate(passed_names): - arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx - if passed_name in kernel.all_inames(): - add_buf_arg(cl_arg_idx, kernel.index_dtype.numpy_dtype.char, passed_name) - cl_arg_idx += 1 + add_buf_arg(arg_idx, kernel.index_dtype.numpy_dtype.char, passed_name) continue var_descr = kernel.get_var_descriptor(passed_name) @@ -576,9 +590,6 @@ def add_buf_arg(arg_idx, typechar, expr_str): if not isinstance(var_descr, lp.ValueArg): assert isinstance(var_descr, ArrayBase) - # assume each of those generates exactly one... - cl_arg_idx += 1 - continue if not options.skip_arg_checks: @@ -587,11 +598,9 @@ def add_buf_arg(arg_idx, typechar, expr_str): 'must be supplied")'))) if var_descr.dtype.is_composite(): - buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append(arg_idx) buf_indices_and_args.append(f"{passed_name}") - cl_arg_idx += 1 - elif var_descr.dtype.is_complex(): assert isinstance(var_descr.dtype, NumpyType) @@ -604,20 +613,12 @@ def add_buf_arg(arg_idx, typechar, expr_str): else: raise TypeError("unexpected complex type: %s" % dtype) - buf_indices_and_args.append(cl_arg_idx) buf_indices_and_args.append( f"_lpy_pack('{arg_char}{arg_char}', " f"{passed_name}.real, {passed_name}.imag)") - cl_arg_idx += 1 - - fp_arg_count += 2 elif isinstance(var_descr.dtype, NumpyType): - if var_descr.dtype.dtype.kind == "f": - fp_arg_count += 1 - - add_buf_arg(cl_arg_idx, var_descr.dtype.dtype.char, passed_name) - cl_arg_idx += 1 + add_buf_arg(arg_idx, var_descr.dtype.dtype.char, passed_name) else: raise LoopyError("do not know how to pass argument of type '%s'" @@ -633,13 +634,14 @@ def add_buf_arg(arg_idx, typechar, expr_str): f"({', '.join(str(i) for i in args_and_indices)},), " ")")) - return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx + return Suite(result) # }}} -def generate_array_arg_setup(kernel: LoopKernel, passed_names: Sequence[str], - arg_idx_to_cl_arg_idx: Mapping[int, int]) -> genpy.Generable: +def generate_array_arg_setup( + kernel: LoopKernel, passed_names: Sequence[str], + ) -> genpy.Generable: from loopy.kernel.array import ArrayBase from genpy import Statement as S, Suite @@ -653,7 +655,7 @@ def generate_array_arg_setup(kernel: LoopKernel, passed_names: Sequence[str], var_descr = kernel.get_var_descriptor(passed_name) if isinstance(var_descr, ArrayBase): - cl_indices_and_args.append(arg_idx_to_cl_arg_idx[arg_idx]) + cl_indices_and_args.append(arg_idx) cl_indices_and_args.append(passed_name) if cl_indices_and_args: @@ -710,10 +712,12 @@ def get_function_definition( Return("_lpy_evt"), ])) - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> Tuple[Sequence[Tuple[str, str]], genpy.Generable]: # no such thing in Python - return None + return [], None def _get_global_temporaries(self, codegen_state): from loopy.kernel.data import AddressSpace @@ -755,11 +759,17 @@ def get_temporary_decls(self, codegen_state, schedule_index): return code_lines - def get_kernel_call(self, - codegen_state, subkernel_name, gsize, lsize): + def get_kernel_call( + self, codegen_state: CodeGenerationState, + subkernel_name: str, + gsize: Tuple[ExpressionT, ...], lsize: Tuple[ExpressionT, ...] + ) -> genpy.Suite: + from genpy import Suite, Assign, Assert, Line, Comment + + kernel = codegen_state.kernel + from loopy.schedule.tools import get_subkernel_arg_info - skai = get_subkernel_arg_info( - codegen_state.kernel, subkernel_name) + skai = get_subkernel_arg_info(kernel, subkernel_name) ecm = self.get_expression_to_code_mapper(codegen_state) @@ -768,12 +778,67 @@ def get_kernel_call(self, if not lsize: lsize = (1,) - value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \ - generate_value_arg_setup(codegen_state.kernel, skai.passed_names) + from loopy.target.pyopencl import split_args_for_overflow + + assert isinstance(kernel.target, PyOpenCLTarget) + regular_arg_names, struct_overflow_arg_names = split_args_for_overflow( + kernel, skai.passed_names, + limit_arg_size_nbytes=kernel.target.limit_arg_size_nbytes, + pointer_size_nbytes=kernel.target.pointer_size_nbytes) + + value_arg_code = generate_value_arg_setup( + codegen_state.kernel, regular_arg_names) arry_arg_code = generate_array_arg_setup( - codegen_state.kernel, skai.passed_names, arg_idx_to_cl_arg_idx) + codegen_state.kernel, regular_arg_names) + + if struct_overflow_arg_names: + regular_arg_names_set = frozenset(regular_arg_names) + struct_overflow_arg_names_set = frozenset( + struct_overflow_arg_names) + + py_passed_args = [] + struct_pack_types = [] + struct_pack_args = [] + + for arg_name in skai.passed_names: + if arg_name in regular_arg_names_set: + py_passed_args.append(arg_name) + else: + assert arg_name in struct_overflow_arg_names_set + + arg = kernel.arg_dict[arg_name] + if isinstance(arg, ValueArg): + struct_pack_types.append(arg.dtype.numpy_dtype.char) + struct_pack_args.append(arg_name) + elif isinstance(arg, (ArrayArg, ConstantArg)): + struct_pack_types.append("P") + struct_pack_args.append(f"{arg_name}._ptr_as_int()") + elif isinstance(arg, ImageArg): + raise AssertionError() + else: + raise ValueError(f"unrecognized arg type: '{type(arg)}'") + + cl_arg_count = len(regular_arg_names) + overflow_args_code = Suite([ + # It's important for _lpy_overflow_args_buf to be in a variable. + # Otherwise, no reference to it will survive until the kernel + # launch and the buffer may be released. + Assign("_lpy_overflow_args_buf", + "_lpy_cl.Buffer(queue.context, " + "_lpy_cl.mem_flags.READ_ONLY " + "| _lpy_cl.mem_flags.COPY_HOST_PTR, " + "hostbuf=" + f"_lpy_pack({repr(''.join(struct_pack_types))}, " + f"{', '.join(struct_pack_args)}))"), + Line(f"_lpy_knl.set_arg({cl_arg_count}, _lpy_overflow_args_buf)") + ]) + + cl_arg_count += 1 + + else: + cl_arg_count = len(skai.passed_names) + overflow_args_code = Suite([]) - from genpy import Suite, Assign, Assert, Line, Comment from pymbolic.mapper.stringifier import PREC_NONE import pyopencl.version as cl_ver @@ -797,6 +862,7 @@ def get_kernel_call(self, Line(), value_arg_code, arry_arg_code, + overflow_args_code, Assign("_lpy_evt", "%(pyopencl_module_name)s.enqueue_nd_range_kernel(" "queue, _lpy_knl, " "%(gsize)s, %(lsize)s, " @@ -821,12 +887,201 @@ def get_kernel_call(self, # }}} +# {{{ split_args_for_overflow + +def split_args_for_overflow( + kernel: LoopKernel, passed_names: Sequence[str], + *, limit_arg_size_nbytes: Optional[int], pointer_size_nbytes: int + ) -> Tuple[Sequence[str], Sequence[str]]: + if limit_arg_size_nbytes is None: + return passed_names, [] + + regular_arg_names = [] + overflow_arg_names = [] + + # Consider that the pointer to the arg overflow struct also occupies + # argument space. + running_arg_size = pointer_size_nbytes + + for arg_name in passed_names: + arg = kernel.arg_dict[arg_name] + if isinstance(arg, (ValueArg, ArrayArg, ConstantArg)): + if isinstance(arg, ValueArg): + arg_size = arg.dtype.numpy_dtype.itemsize + else: + arg_size = pointer_size_nbytes + + if running_arg_size + arg_size > limit_arg_size_nbytes: + overflow_arg_names.append(arg_name) + else: + regular_arg_names.append(arg_name) + + running_arg_size += arg_size + + elif isinstance(arg, ImageArg): + regular_arg_names.append(arg_name) + else: + raise ValueError(f"unrecognized arg type: '{type(arg)}'") + + return regular_arg_names, overflow_arg_names + +# }}} + + # {{{ device ast builder class PyOpenCLCASTBuilder(OpenCLCASTBuilder): """A C device AST builder for integration with PyOpenCL. """ + # {{{ function decl/def, with arg overflow handling + + def get_function_definition( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, + schedule_index: int, function_decl: Generable, function_body: Generable, + ) -> Tuple[Sequence[Tuple[str, str]], Generable]: + assert isinstance(function_body, Block) + kernel = codegen_state.kernel + assert kernel.linearization is not None + + subkernel_name = cast(CallKernel, + kernel.linearization[schedule_index]).kernel_name + + result = [] + + from loopy.kernel.data import AddressSpace + # We only need to write declarations for global variables with + # the first device program. `is_first_dev_prog` determines + # whether this is the first device program in the schedule. + is_first_dev_prog = codegen_state.is_generating_device_code + for i in range(schedule_index): + if isinstance(kernel.linearization[i], CallKernel): + is_first_dev_prog = False + break + if is_first_dev_prog: + for tv in sorted( + kernel.temporary_variables.values(), + key=lambda key_tv: key_tv.name): + + if tv.address_space == AddressSpace.GLOBAL and ( + tv.initializer is not None): + assert tv.read_only + + decl = self.wrap_global_constant( + self.get_temporary_var_declarator(codegen_state, tv)) + + if tv.initializer is not None: + from loopy.target.c import generate_array_literal + decl = Initializer(decl, generate_array_literal( + codegen_state, tv, tv.initializer)) + + result.append(decl) + + # {{{ unpack overflow args + + if codegen_state.is_entrypoint: + from loopy.schedule.tools import get_subkernel_arg_info + skai = get_subkernel_arg_info(kernel, subkernel_name) + + _, struct_overflow_arg_names = split_args_for_overflow( + kernel, skai.passed_names, + limit_arg_size_nbytes=self.target.limit_arg_size_nbytes, + pointer_size_nbytes=self.target.pointer_size_nbytes) + + arg_unpack_code = [ + Initializer( + self.arg_to_cgen_declarator( + kernel, arg_name, + is_written=arg_name in skai.written_names), + f"_lpy_overflow_args->{arg_name}") + for arg_name in struct_overflow_arg_names + ] + ([Line()] if struct_overflow_arg_names else []) + + function_body = Block(arg_unpack_code + function_body.contents) + + # }}} + + fbody = FunctionBody(function_decl, function_body) + if not result: + return fbody + else: + return Collection(result+[Line(), fbody]) + + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> Tuple[Sequence[Tuple[str, str]], Generable]: + kernel = codegen_state.kernel + + assert codegen_state.kernel.linearization is not None + subkernel_name = cast( + CallKernel, + codegen_state.kernel.linearization[schedule_index] + ).kernel_name + + from cgen import FunctionDeclaration, Value, Struct + + name = codegen_result.current_program(codegen_state).name + if self.target.fortran_abi: + name += "_" + + from loopy.target.c import FunctionDeclarationWrapper + + if codegen_state.is_entrypoint: + name = Value("void", name) + + # subkernel launches occur only as part of entrypoint kernels for now + from loopy.schedule.tools import get_subkernel_arg_info + skai = get_subkernel_arg_info(kernel, subkernel_name) + passed_names = skai.passed_names + written_names = skai.written_names + + regular_arg_names, struct_overflow_arg_names = split_args_for_overflow( + kernel, passed_names, + limit_arg_size_nbytes=self.target.limit_arg_size_nbytes, + pointer_size_nbytes=self.target.pointer_size_nbytes) + + arg_overflow_struct_name = f"_lpy_arg_struct_{subkernel_name}" + arg_overflow_struct = Struct( + arg_overflow_struct_name, [ + self.arg_to_cgen_declarator( + kernel, arg_name, + is_written=arg_name in written_names) + for arg_name in struct_overflow_arg_names]) + + arg_struct_preambles = [ + (f"declare-{arg_overflow_struct_name}", + str(arg_overflow_struct)) + ] if struct_overflow_arg_names else [] + + return arg_struct_preambles, FunctionDeclarationWrapper( + self._wrap_kernel_decl( + codegen_state, schedule_index, + FunctionDeclaration( + name, + [self.arg_to_cgen_declarator( + kernel, arg_name, + is_written=arg_name in written_names) + for arg_name in regular_arg_names] + + [CLGlobal(Const(Pointer(Value( + f"struct {arg_overflow_struct_name}", + "_lpy_overflow_args"))))] + ))) + else: + name = Value("static void", name) + passed_names = [arg.name for arg in kernel.args] + written_names = kernel.get_written_variables() + + return [], FunctionDeclarationWrapper( + FunctionDeclaration( + name, + [self.arg_to_cgen_declarator( + kernel, arg_name, + is_written=arg_name in written_names) + for arg_name in passed_names])) + # }}} + # {{{ library @property diff --git a/loopy/target/python.py b/loopy/target/python.py index 4cba32c1c..cbf6aca24 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -23,13 +23,18 @@ THE SOFTWARE. """ +from typing import Tuple, Sequence + from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper +from genpy import Generable, Suite, Collection + from loopy.type_inference import TypeReader from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase -from genpy import Suite, Collection +from loopy.codegen import CodeGenerationState +from loopy.codegen.result import CodeGenerationResult # {{{ expression to code @@ -137,7 +142,7 @@ def _base_python_preamble_generator(preamble_info): """) -class PythonASTBuilderBase(ASTBuilderBase): +class PythonASTBuilderBase(ASTBuilderBase[Generable]): """A Python host AST builder for integration with PyOpenCL. """ @@ -161,9 +166,11 @@ def ast_module(self): import genpy return genpy - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): - return None + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> Tuple[Sequence[Tuple[str, str]], None]: + return [], None def get_function_definition(self, codegen_state, codegen_result, schedule_index, diff --git a/test/test_target.py b/test/test_target.py index b6acd0902..00c284293 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -25,8 +25,9 @@ import numpy as np import loopy as lp import pyopencl as cl -import pyopencl.clmath # noqa -import pyopencl.clrandom # noqa +import pyopencl.clmath +import pyopencl.clrandom +import pyopencl.tools import pytest from loopy.target.c import CTarget @@ -719,6 +720,46 @@ def test_empty_array_stride_check_fortran(ctx_factory): knl(queue, input=a_f) +def test_passing_bajillions_of_svm_args(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from pyopencl.characterize import has_coarse_grain_buffer_svm + if not has_coarse_grain_buffer_svm(queue.device): + pytest.skip("device does not support SVM, which is required for this test") + + nargsets = 300 + knl = lp.make_kernel( + "{[i]: 0<=i 1: exec(sys.argv[1])