diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index e6bb6d9a50..c502a47376 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -178,11 +178,6 @@ def generate_code(sdfg, validate=True) -> List[CodeObject]: shutil.move(f"{tmp_dir}/test2.sdfg", "test2.sdfg") raise RuntimeError('SDFG serialization failed - files do not match') - # Run with the deserialized version - # NOTE: This means that all subsequent modifications to `sdfg` - # are not reflected outside of this function (e.g., library - # node expansion). - sdfg = sdfg2 # Before generating the code, run type inference on the SDFG connectors infer_types.infer_connector_types(sdfg) diff --git a/dace/codegen/common.py b/dace/codegen/common.py index 5dafc696cf..37cfb864eb 100644 --- a/dace/codegen/common.py +++ b/dace/codegen/common.py @@ -74,7 +74,7 @@ def update_persistent_desc(desc: data.Data, sdfg: SDFG): Replaces the symbols used in a persistent data descriptor according to NestedSDFG's symbol mapping. The replacement happens recursively up to the top-level SDFG. 
""" - if (desc.lifetime == dtypes.AllocationLifetime.Persistent and sdfg.parent + if (desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External) and sdfg.parent and any(str(s) in sdfg.parent_nsdfg_node.symbol_mapping for s in desc.free_symbols)): newdesc = deepcopy(desc) csdfg = sdfg @@ -155,7 +155,7 @@ def get_gpu_runtime() -> gpu_runtime.GPURuntime: backend = get_gpu_backend() if backend == 'cuda': libpath = ctypes.util.find_library('cudart') - if os.name == 'nt' and not libpath: # Windows-based search + if os.name == 'nt' and not libpath: # Windows-based search for version in (12, 11, 10, 9): libpath = ctypes.util.find_library(f'cudart64_{version}0') if libpath: diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index ea1b9e9cb8..d0d29cfa1e 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -147,21 +147,20 @@ def __exit__(self, *args, **kwargs): self.unload() -def _array_interface_ptr(array: Any, array_type: dt.Array) -> int: +def _array_interface_ptr(array: Any, storage: dtypes.StorageType) -> int: """ If the given array implements ``__array_interface__`` (see ``dtypes.is_array``), returns the base host or device pointer to the array's allocated memory. :param array: Array object that implements NumPy's array interface. - :param array_type: Data descriptor of the array (used to get storage - location to determine whether it's a host or GPU device - pointer). + :param storage: Storage location of the array, used to determine whether + it is a host or device pointer (e.g. GPU). :return: A pointer to the base location of the allocated buffer. 
""" if hasattr(array, 'data_ptr'): return array.data_ptr() - if array_type.storage == dtypes.StorageType.GPU_Global: + if storage == dtypes.StorageType.GPU_Global: return array.__cuda_array_interface__['data'][0] return array.__array_interface__['data'][0] @@ -200,10 +199,13 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None): self.argnames = argnames self.has_gpu_code = False + self.external_memory_types = set() for _, _, aval in self._sdfg.arrays_recursive(): if aval.storage in dtypes.GPU_STORAGES: self.has_gpu_code = True break + if aval.lifetime == dtypes.AllocationLifetime.External: + self.external_memory_types.add(aval.storage) if not self.has_gpu_code: for node, _ in self._sdfg.all_nodes_recursive(): if getattr(node, 'schedule', False) in dtypes.GPU_SCHEDULES: @@ -271,6 +273,42 @@ class State(ctypes.Structure): return State + def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]: + """ + Returns the total external memory size to be allocated for this SDFG. + + :return: A dictionary mapping storage types to the number of bytes necessary + to allocate for the SDFG to work properly. + """ + if not self._initialized: + raise ValueError('Compiled SDFG is uninitialized, please call ``initialize`` prior to ' + 'querying external memory size.') + + result: Dict[dtypes.StorageType, int] = {} + for storage in self.external_memory_types: + func = self._lib.get_symbol(f'__dace_get_external_memory_size_{storage.name}') + result[storage] = func(self._libhandle, *self._lastargs[1]) + + return result + + def set_workspace(self, storage: dtypes.StorageType, workspace: Any): + """ + Sets the workspace for the given storage type to the given buffer. + + :param storage: The storage type to fill. + :param workspace: An array-convertible object (through ``__[cuda_]array_interface__``, + see ``_array_interface_ptr``) to use for the workspace. 
+ """ + if not self._initialized: + raise ValueError('Compiled SDFG is uninitialized, please call ``initialize`` prior to ' + 'setting external memory.') + if storage not in self.external_memory_types: + raise ValueError(f'Compiled SDFG does not specify external memory of {storage}') + + func = self._lib.get_symbol(f'__dace_set_external_memory_{storage.name}', None) + ptr = _array_interface_ptr(workspace, storage) + func(self._libhandle, ctypes.c_void_p(ptr), *self._lastargs[1]) + @property def filename(self): return self._lib._library_filename @@ -487,7 +525,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: for arg, actype, atype, aname in callparams if aname in symbols) # Replace arrays with their base host/device pointers - newargs = tuple((ctypes.c_void_p(_array_interface_ptr(arg, atype)), actype, + newargs = tuple((ctypes.c_void_p(_array_interface_ptr(arg, atype.storage)), actype, atype) if dtypes.is_array(arg) else (arg, actype, atype) for arg, actype, atype, _ in callparams) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 103cb5fa2e..0b4f58d5ef 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -32,6 +32,7 @@ class DefinedMemlets: referenced correctly in nested scopes and SDFGs. The ones defined in the first (top) scope, refer to global variables. """ + def __init__(self): self._scopes = [(None, {}, True), (None, {}, True)] @@ -142,6 +143,7 @@ def remove(self, name: str, ancestor: int = 0, is_global: bool = False) -> Tuple class TargetDispatcher(object): """ Dispatches sub-SDFG generation (according to scope), storage<->storage copies, and storage<->tasklet copies to targets. 
""" + def __init__(self, framecode): # Avoid import loop from dace.codegen.targets import framecode as fc @@ -215,7 +217,8 @@ def register_state_dispatcher(self, dispatcher, predicate=None): """ if not hasattr(dispatcher, "generate_state"): - raise TypeError("State dispatcher \"{}\" does not " "implement \"generate_state\"".format(dispatcher)) + raise TypeError("State dispatcher \"{}\" does not " + "implement \"generate_state\"".format(dispatcher)) if predicate is None: self._generic_state_dispatcher = dispatcher else: @@ -241,7 +244,8 @@ def register_node_dispatcher(self, dispatcher, predicate=None): :see: TargetCodeGenerator """ if not hasattr(dispatcher, "generate_node"): - raise TypeError("Node dispatcher must " "implement \"generate_node\"") + raise TypeError("Node dispatcher must " + "implement \"generate_node\"") if predicate is None: self._generic_node_dispatcher = dispatcher else: @@ -448,9 +452,12 @@ def dispatch_allocate(self, """ Dispatches a code generator for data allocation. """ self._used_targets.add(self._array_dispatchers[datadesc.storage]) - if datadesc.lifetime is dtypes.AllocationLifetime.Persistent: + if datadesc.lifetime == dtypes.AllocationLifetime.Persistent: declaration_stream = CodeIOStream() callsite_stream = self.frame._initcode + elif datadesc.lifetime == dtypes.AllocationLifetime.External: + declaration_stream = CodeIOStream() + callsite_stream = CodeIOStream() else: declaration_stream = callsite_stream @@ -468,8 +475,10 @@ def dispatch_deallocate(self, sdfg: SDFG, dfg: ScopeSubgraphView, state_id: int, """ Dispatches a code generator for a data deallocation. 
""" self._used_targets.add(self._array_dispatchers[datadesc.storage]) - if datadesc.lifetime is dtypes.AllocationLifetime.Persistent: + if datadesc.lifetime == dtypes.AllocationLifetime.Persistent: callsite_stream = self.frame._exitcode + elif datadesc.lifetime == dtypes.AllocationLifetime.External: + return self._array_dispatchers[datadesc.storage].deallocate_array(sdfg, dfg, state_id, node, datadesc, function_stream, callsite_stream) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index d5e7cacc53..295bf21310 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -62,7 +62,8 @@ def copy_expr( offset_cppstr = "0" dt = "" - is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) defined_types = None # Non-free symbol dependent Arrays due to their shape dependent_shape = (isinstance(data_desc, data.Array) and not isinstance(data_desc, data.View) and any( @@ -219,7 +220,7 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: # Special case: If memory is persistent and defined in this SDFG, add state # struct to name - if (desc.transient and desc.lifetime is dtypes.AllocationLifetime.Persistent): + if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): from dace.codegen.targets.cuda import CUDACodeGen # Avoid import loop if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays @@ -1252,7 +1253,7 @@ def visit_BinOp(self, node: ast.BinOp): if isinstance(node.op, ast.Pow): from dace.frontend.python import astutils try: - evaluated_node = astutils.evalnode(node.right, {**self.constants, 'dace': dace,'math': math}) + evaluated_node = astutils.evalnode(node.right, {**self.constants, 'dace': dace, 
'math': math}) unparsed = symbolic.pystr_to_symbolic(evaluated_node) evaluated_constant = symbolic.evaluate(unparsed, self.constants) evaluated = symbolic.symstr(evaluated_constant, cpp_mode=True) @@ -1356,8 +1357,8 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, if isinstance(desc, data.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {sym2cpp(desc.start_offset)})' if Config.get_bool('compiler', 'cuda', 'syncdebug'): - callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg, - state_id, scope_exit) + callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg, state_id, + scope_exit) callsite_stream.write(f'DACE_GPU_CHECK({backend}DeviceSynchronize());') else: callsite_stream.write(f'{backend}FreeAsync({ptrname}, {cudastream});\n', sdfg, state_id, scope_exit) @@ -1381,7 +1382,8 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, and edge.dst._cuda_stream != node._cuda_stream): callsite_stream.write( """DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream})); -DACE_GPU_CHECK({backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0));""".format( +DACE_GPU_CHECK({backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0));""" + .format( ev=edge._cuda_event if hasattr(edge, "_cuda_event") else 0, src_stream=cudastream, dst_stream=edge.dst._cuda_stream, diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 83f178c538..eb7d232966 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -222,7 +222,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de # We add the `dfg is not None` check because the `sdutils.is_nonfree_sym_dependent` check will fail if # `nodedesc` is a View and `dfg` is None. 
if dfg and not sdutils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): - raise NotImplementedError("The declare_array method should only be used for variables " + raise NotImplementedError("The declare_array method should only be used for variables " "that must have their declaration and allocation separate.") name = node.data @@ -278,7 +278,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d declared = self._dispatcher.declared_arrays.has(alloc_name) define_var = self._dispatcher.defined_vars.add - if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): define_var = self._dispatcher.defined_vars.add_global nodedesc = update_persistent_desc(nodedesc, sdfg) @@ -449,7 +449,8 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, alloc_name = f'({alloc_name} - {cpp.sym2cpp(nodedesc.start_offset)})' if self._dispatcher.declared_arrays.has(alloc_name): - is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) self._dispatcher.declared_arrays.remove(alloc_name, is_global=is_global) if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)): @@ -932,7 +933,8 @@ def process_out_memlets(self, desc = sdfg.arrays[memlet.data] ptrname = cpp.ptr(memlet.data, desc, sdfg, self._frame) is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, - dtypes.AllocationLifetime.Persistent) + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) try: defined_type, _ = self._dispatcher.declared_arrays.get(ptrname, is_global=is_global) except KeyError: @@ -1430,7 +1432,8 @@ def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, # If pointer, also 
point to output desc = sdfg.arrays[edge.data.data] ptrname = cpp.ptr(edge.data.data, desc, sdfg, self._frame) - is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) defined_type, _ = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) base_ptr = cpp.cpp_ptr_expr(sdfg, edge.data, defined_type, codegen=self._frame) callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', sdfg, state_id, src_node) @@ -1448,18 +1451,22 @@ def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, # Add "__restrict__" keywords to arguments that do not alias with others in the context of this SDFG restrict_args = [] for atype, aname, _ in memlet_references: + def make_restrict(expr: str) -> str: # Check whether "restrict" has already been added before and can be added if expr.strip().endswith('*'): return '__restrict__' else: return '' + if aname in node.sdfg.arrays and not node.sdfg.arrays[aname].may_alias: restrict_args.append(make_restrict(atype)) else: restrict_args.append('') - arguments += [f'{atype} {restrict} {aname}' for (atype, aname, _), restrict in zip(memlet_references, restrict_args)] + arguments += [ + f'{atype} {restrict} {aname}' for (atype, aname, _), restrict in zip(memlet_references, restrict_args) + ] arguments += [ f'{node.sdfg.symbols[aname].as_arg(aname)}' for aname in sorted(node.symbol_mapping.keys()) if aname not in sdfg.constants diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index f4db868730..1e06a1d3ef 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -497,7 +497,9 @@ def cmake_options(): hip_arch = [ha for ha in hip_arch if ha is not None and len(ha) > 0] flags = Config.get("compiler", "cuda", "hip_args") - flags += ' ' + ' 
'.join('--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) for arch in hip_arch) + flags += ' ' + ' '.join( + '--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) + for arch in hip_arch) options.append("-DEXTRA_HIP_FLAGS=\"{}\"".format(flags)) if Config.get('compiler', 'cpu', 'executable'): @@ -568,7 +570,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d return self._cpu_codegen.allocate_reference(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) - if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) result_decl = StringIO() @@ -717,7 +719,8 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, dataname = f'({dataname} - {cpp.sym2cpp(nodedesc.start_offset)})' if self._dispatcher.declared_arrays.has(dataname): - is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent) + is_global = nodedesc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) self._dispatcher.declared_arrays.remove(dataname, is_global=is_global) if isinstance(nodedesc, dace.data.Stream): @@ -1449,7 +1452,8 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st if aname in sdfg.arrays: data_desc = sdfg.arrays[aname] is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, - dtypes.AllocationLifetime.Persistent) + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) # Non-free symbol dependent Arrays due to their shape dependent_shape = (isinstance(data_desc, dt.Array) and not isinstance(data_desc, dt.View) and any( str(s) not in self._frame.symbols_and_constants(sdfg) @@ -1482,7 +1486,8 @@ def 
generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st data_desc = sdfg.arrays[aname] ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, - dtypes.AllocationLifetime.Persistent) + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) defined_type, ctype = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) CUDACodeGen._in_device_code = True inner_ptrname = cpp.ptr(aname, data_desc, sdfg, self._frame) diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index b920b0e9d5..413cb751d6 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -1171,7 +1171,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d # NOTE: The code below fixes symbol-related issues with transient data originally defined in a NestedSDFG scope # but promoted to be persistent. These data must have their free symbols replaced with the corresponding # top-level SDFG symbols. 
- if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) result_decl = StringIO() diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 09bbd30ab8..6f302c11ba 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -333,6 +333,57 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write('delete __state;\n', sdfg) callsite_stream.write('return __err;\n}\n', sdfg) + def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeIOStream): + """ + If external data descriptors are found in the SDFG (or any nested SDFGs), + this function will generate exported functions to (1) get the required memory size + per storage location (``__dace_get_external_memory_size_<STORAGE>``, where ``<STORAGE>`` + can be ``CPU_Heap`` or any other ``dtypes.StorageType``); and (2) set the externally-allocated + pointer to the generated code's internal state (``__dace_set_external_memory_<STORAGE>``). 
+ """ + + # Collect external arrays + ext_arrays: Dict[dtypes.StorageType, List[Tuple[SDFG, str, data.Data]]] = collections.defaultdict(list) + for subsdfg, aname, arr in sdfg.arrays_recursive(): + if arr.lifetime == dtypes.AllocationLifetime.External: + ext_arrays[arr.storage].append((subsdfg, aname, arr)) + + # Only generate functions as necessary + if not ext_arrays: + return + + initparams = sdfg.init_signature(free_symbols=self.free_symbols(sdfg)) + initparams_comma = (', ' + initparams) if initparams else '' + + for storage, arrays in ext_arrays.items(): + size = 0 + for subsdfg, aname, arr in arrays: + size += arr.total_size * arr.dtype.bytes + + # Size query functions + callsite_stream.write( + f''' +DACE_EXPORTED size_t __dace_get_external_memory_size_{storage.name}({sdfg.name}_t *__state{initparams_comma}) +{{ + return {sym2cpp(size)}; +}} +''', sdfg) + + # Pointer set functions + callsite_stream.write( + f''' +DACE_EXPORTED void __dace_set_external_memory_{storage.name}({sdfg.name}_t *__state, char *ptr{initparams_comma}) +{{''', sdfg) + + offset = 0 + for subsdfg, aname, arr in arrays: + allocname = f'__state->__{subsdfg.sdfg_id}_{aname}' + callsite_stream.write(f'{allocname} = decltype({allocname})(ptr + {sym2cpp(offset)});', subsdfg) + offset += arr.total_size * arr.dtype.bytes + + # Footer + callsite_stream.write('}', sdfg) + def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): sid = sdfg.node_id(state) @@ -525,7 +576,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): access_instances[sdfg.sdfg_id].get(name, [(None, None)])[-1] # Cases - if desc.lifetime is dtypes.AllocationLifetime.Persistent: + if desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): # Persistent memory is allocated in initialization code and # exists in the library state structure @@ -872,6 +923,7 @@ def generate_code(self, function_signature = ('void __program_%s_internal(%s_t 
*__state%s)\n{\n' % (sdfg.name, sdfg.name, params)) self.generate_footer(sdfg, footer_global_stream, footer_stream) + self.generate_external_memory_management(sdfg, footer_stream) header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) diff --git a/dace/codegen/targets/snitch.py b/dace/codegen/targets/snitch.py index 1c4ba8f821..1eb6f68a2a 100644 --- a/dace/codegen/targets/snitch.py +++ b/dace/codegen/targets/snitch.py @@ -366,7 +366,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, global_stream, function_stre # NOTE: The code below fixes symbol-related issues with transient data originally defined in a NestedSDFG scope # but promoted to be persistent. These data must have their free symbols replaced with the corresponding # top-level SDFG symbols. - if nodedesc.lifetime == dtypes.AllocationLifetime.Persistent: + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) # Compute array size @@ -411,7 +411,8 @@ def allocate_array(self, sdfg, dfg, state_id, node, global_stream, function_stre elif not symbolic.issymbolic(arrsize, sdfg.constants): # static allocation declaration_stream.write(f'// static allocate storage "{nodedesc.storage}"') - if node.desc(sdfg).lifetime == dace.AllocationLifetime.Persistent: + if node.desc(sdfg).lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): # Don't put a static if it is declared in the state struct for C compliance declaration_stream.write(f'{nodedesc.dtype.ctype} {name}[{cpp.sym2cpp(arrsize)}];\n', sdfg, state_id, node) diff --git a/dace/dtypes.py b/dace/dtypes.py index a86a746884..dee2283f25 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -132,6 +132,7 @@ class AllocationLifetime(aenum.AutoNumberEnum): SDFG = () #: Allocated throughout the innermost SDFG (possibly nested) Global = () #: Allocated throughout the entire program 
(outer SDFG) Persistent = () #: Allocated throughout multiple invocations (init/exit) + External = () #: Allocated and managed outside the generated code @undefined_safe_enum diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 260360776f..3abef05dc9 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -45,6 +45,7 @@ if TYPE_CHECKING: from dace.codegen.instrumentation.report import InstrumentationReport from dace.codegen.instrumentation.data.data_report import InstrumentedDataReport + from dace.codegen.compiled_sdfg import CompiledSDFG def _arrays_to_json(arrays): @@ -2189,8 +2190,7 @@ def is_loaded(self) -> bool: dll = cs.ReloadableDLL(binary_filename, self.name) return dll.is_loaded() - def compile(self, output_file=None, validate=True) -> \ - 'dace.codegen.compiler.CompiledSDFG': + def compile(self, output_file=None, validate=True) -> 'CompiledSDFG': """ Compiles a runnable binary from this SDFG. :param output_file: If not None, copies the output library file to diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index fa86163063..abad1e7907 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -76,9 +76,10 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context if name is not None and not dtypes.validate_name(name): raise InvalidSDFGError("Invalid array name %s" % name, sdfg, None) # Allocation lifetime checks - if (desc.lifetime is dtypes.AllocationLifetime.Persistent and desc.storage is dtypes.StorageType.Register): + if (desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External) + and desc.storage == dtypes.StorageType.Register): raise InvalidSDFGError( - "Array %s cannot be both persistent and use Register as " + "Array %s cannot be both persistent/external and use Register as " "storage type. Please use a different storage location." 
% name, sdfg, None) # Check for valid bank assignments @@ -320,7 +321,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', raise InvalidSDFGError("Invalid state name", sdfg, state_id) if state._parent != sdfg: - raise InvalidSDFGError("State does not point to the correct " "parent", sdfg, state_id) + raise InvalidSDFGError("State does not point to the correct " + "parent", sdfg, state_id) # Unreachable ######################################## @@ -736,6 +738,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', class InvalidSDFGError(Exception): """ A class of exceptions thrown when SDFG validation fails. """ + def __init__(self, message: str, sdfg: 'SDFG', state_id: int): self.message = message self.sdfg = sdfg @@ -759,7 +762,8 @@ def _getlineinfo(self, obj) -> str: if lineinfo.start_line >= 0: if lineinfo.start_column > 0: - return (f'File "{lineinfo.filename}", line {lineinfo.start_line}, ' f'column {lineinfo.start_column}') + return (f'File "{lineinfo.filename}", line {lineinfo.start_line}, ' + f'column {lineinfo.start_column}') return f'File "{lineinfo.filename}", line {lineinfo.start_line}' return f'File "{lineinfo.filename}"' @@ -790,6 +794,7 @@ def __str__(self): class InvalidSDFGInterstateEdgeError(InvalidSDFGError): """ Exceptions of invalid inter-state edges in an SDFG. """ + def __init__(self, message: str, sdfg: 'SDFG', edge_id: int): self.message = message self.sdfg = sdfg @@ -835,6 +840,7 @@ def __str__(self): class InvalidSDFGNodeError(InvalidSDFGError): """ Exceptions of invalid nodes in an SDFG state. """ + def __init__(self, message: str, sdfg: 'SDFG', state_id: int, node_id: int): self.message = message self.sdfg = sdfg @@ -872,12 +878,14 @@ class NodeNotExpandedError(InvalidSDFGNodeError): Exception that is raised whenever a library node was not expanded before code generation. 
""" + def __init__(self, sdfg: 'SDFG', state_id: int, node_id: int): super().__init__('Library node not expanded', sdfg, state_id, node_id) class InvalidSDFGEdgeError(InvalidSDFGError): """ Exceptions of invalid edges in an SDFG state. """ + def __init__(self, message: str, sdfg: 'SDFG', state_id: int, edge_id: int): self.message = message self.sdfg = sdfg diff --git a/dace/transformation/interstate/sdfg_nesting.py b/dace/transformation/interstate/sdfg_nesting.py index a63b37aa19..b33ad43a3b 100644 --- a/dace/transformation/interstate/sdfg_nesting.py +++ b/dace/transformation/interstate/sdfg_nesting.py @@ -590,7 +590,7 @@ def apply(self, state: SDFGState, sdfg: SDFG): for dnode in state.data_nodes(): if state.degree(dnode) == 0 and dnode not in isolated_nodes: state.remove_node(dnode) - + sdfg._sdfg_list = sdfg.reset_sdfg_list() def _modify_access_to_access(self, @@ -764,8 +764,8 @@ def _candidates(sdfg: SDFG, graph: SDFGState, nsdfg: nodes.NestedSDFG) -> Dict[s if not desc.transient: continue # Needs to be allocated in "Scope" or "Persistent" lifetime - if (desc.lifetime != dtypes.AllocationLifetime.Scope - and desc.lifetime != dtypes.AllocationLifetime.Persistent): + if (desc.lifetime not in (dtypes.AllocationLifetime.Scope, dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External)): continue # If same transient is connected with multiple connectors, bail # for now diff --git a/dace/transformation/passes/dead_dataflow_elimination.py b/dace/transformation/passes/dead_dataflow_elimination.py index 7c0949ce4d..aeaf1cdbd1 100644 --- a/dace/transformation/passes/dead_dataflow_elimination.py +++ b/dace/transformation/passes/dead_dataflow_elimination.py @@ -222,7 +222,7 @@ def _is_node_dead(self, node: nodes.Node, sdfg: SDFG, state: SDFGState, dead_nod # If access node is persistent, mark as dead only if self.remove_persistent_memory is set if not self.remove_persistent_memory: - if desc.lifetime == dtypes.AllocationLifetime.Persistent: + if 
desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): return False # If data will be used later, cannot remove diff --git a/dace/transformation/passes/scalar_to_symbol.py b/dace/transformation/passes/scalar_to_symbol.py index f751ebc271..124efdaae1 100644 --- a/dace/transformation/passes/scalar_to_symbol.py +++ b/dace/transformation/passes/scalar_to_symbol.py @@ -89,7 +89,7 @@ def find_promotable_scalars(sdfg: sd.SDFG, transients_only: bool = True, integer continue if desc.total_size != 1: continue - if desc.lifetime is dtypes.AllocationLifetime.Persistent: + if desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): continue candidates.add(aname) @@ -589,9 +589,7 @@ class ScalarToSymbolPromotion(passes.Pass): CATEGORY: str = 'Simplification' - ignore = props.SetProperty(element_type=str, - default=set(), - desc='Fields that should not be promoted.') + ignore = props.SetProperty(element_type=str, default=set(), desc='Fields that should not be promoted.') transients_only = props.Property(dtype=bool, default=True, desc='Promote only transients.') integers_only = props.Property(dtype=bool, default=True, desc='Allow promotion of integer scalars only.') diff --git a/tests/codegen/external_memory_test.py b/tests/codegen/external_memory_test.py new file mode 100644 index 0000000000..c72c574806 --- /dev/null +++ b/tests/codegen/external_memory_test.py @@ -0,0 +1,98 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +""" +Tests external memory allocation. 
+""" +import dace +import numpy as np +import pytest + + +@pytest.mark.parametrize('symbolic', (False, True)) +def test_external_mem(symbolic): + N = dace.symbol('N') if symbolic else 20 + + @dace.program + def tester(a: dace.float64[N]): + workspace = dace.ndarray([N], dace.float64, lifetime=dace.AllocationLifetime.External) + + workspace[:] = a + workspace += 1 + a[:] = workspace + + sdfg = tester.to_sdfg() + + # Test that there is no allocation + code = sdfg.generate_code()[0].clean_code + assert 'new double' not in code + assert 'delete[]' not in code + assert 'set_external_memory' in code + + a = np.random.rand(20) + + if symbolic: + extra_args = dict(a=a, N=20) + else: + extra_args = {} + + # Test workspace size + csdfg = sdfg.compile() + csdfg.initialize(**extra_args) + sizes = csdfg.get_workspace_sizes() + assert sizes == {dace.StorageType.CPU_Heap: 20 * 8} + + # Test setting the workspace + wsp = np.random.rand(20) + csdfg.set_workspace(dace.StorageType.CPU_Heap, wsp) + + ref = a + 1 + + csdfg(a, **extra_args) + + assert np.allclose(a, ref) + assert np.allclose(wsp, ref) + + +def test_external_twobuffers(): + N = dace.symbol('N') + + @dace.program + def tester(a: dace.float64[N]): + workspace = dace.ndarray([N], dace.float64, lifetime=dace.AllocationLifetime.External) + workspace2 = dace.ndarray([2], dace.float64, lifetime=dace.AllocationLifetime.External) + + workspace[:] = a + workspace += 1 + workspace2[0] = np.sum(workspace) + workspace2[1] = np.mean(workspace) + a[0] = workspace2[0] + workspace2[1] + + sdfg = tester.to_sdfg() + csdfg = sdfg.compile() + + # Test workspace size + a = np.random.rand(20) + csdfg.initialize(a=a, N=20) + sizes = csdfg.get_workspace_sizes() + assert sizes == {dace.StorageType.CPU_Heap: 22 * 8} + + # Test setting the workspace + wsp = np.random.rand(22) + csdfg.set_workspace(dace.StorageType.CPU_Heap, wsp) + + ref = a + 1 + ref2 = np.copy(a) + s, m = np.sum(ref), np.mean(ref) + ref2[0] = s + m + + csdfg(a=a, N=20) + + assert 
np.allclose(a, ref2) + assert np.allclose(wsp[:-2], ref) + assert np.allclose(wsp[-2], s) + assert np.allclose(wsp[-1], m) + + +if __name__ == '__main__': + test_external_mem(False) + test_external_mem(True) + test_external_twobuffers()