Merge pull request #1283 from spcl/gpu-ux
GPU User Experience Improvements
tbennun authored Jun 29, 2023
2 parents 672fc30 + 5d2ce3e commit 81b3e4e
Showing 35 changed files with 1,146 additions and 524 deletions.
7 changes: 4 additions & 3 deletions dace/codegen/codegen.py
@@ -31,7 +31,7 @@ def generate_headers(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str:
exit_params = (sdfg.name, sdfg.name)
proto += 'typedef void * %sHandle_t;\n' % sdfg.name
proto += 'extern "C" %sHandle_t __dace_init_%s(%s);\n' % init_params
proto += 'extern "C" void __dace_exit_%s(%sHandle_t handle);\n' % exit_params
proto += 'extern "C" int __dace_exit_%s(%sHandle_t handle);\n' % exit_params
proto += 'extern "C" void __program_%s(%sHandle_t handle%s);\n' % params
return proto

@@ -69,15 +69,16 @@ def generate_dummy(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str:
int main(int argc, char **argv) {{
{sdfg.name}Handle_t handle;
int err;
{allocations}
handle = __dace_init_{sdfg.name}({init_params});
__program_{sdfg.name}(handle{params});
__dace_exit_{sdfg.name}(handle);
err = __dace_exit_{sdfg.name}(handle);
{deallocations}
return 0;
return err;
}}
'''

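With this change, the generated __dace_exit_<program> entry point reports a status code that callers can propagate, and the generated dummy main returns it from main(). A minimal sketch of driving the compiled library by hand and checking that status (illustrative only, not part of the commit; the library path and the argument-less program name "myprog" are hypothetical):

import ctypes

lib = ctypes.CDLL('./libmyprog.so')
lib.__dace_init_myprog.restype = ctypes.c_void_p
lib.__dace_exit_myprog.restype = ctypes.c_int

handle = lib.__dace_init_myprog()
lib.__program_myprog(ctypes.c_void_p(handle))
err = lib.__dace_exit_myprog(ctypes.c_void_p(handle))
if err != 0:
    raise RuntimeError(f'__dace_exit_myprog returned error code {err}')
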
45 changes: 45 additions & 0 deletions dace/codegen/common.py
@@ -144,3 +144,48 @@ def _try_execute(cmd: str) -> bool:
'set the DaCe configuration entry ``compiler.cuda.backend`` '
'or the ``DACE_compiler_cuda_backend`` environment variable '
'to either "cuda" or "hip".')


def get_gpu_runtime_library() -> ctypes.CDLL:
backend = get_gpu_backend()
if backend == 'cuda':
libpath = ctypes.util.find_library('cudart')
if os.name == 'nt' and not libpath: # Windows-based search
for version in (12, 11, 10, 9):
libpath = ctypes.util.find_library(f'cudart64_{version}0')
if libpath:
break
elif backend == 'hip':
libpath = ctypes.util.find_library('amdhip64')
else:
raise RuntimeError(f'Cannot obtain GPU runtime library for backend {backend}')

if not libpath:
envname = 'PATH' if os.name == 'nt' else 'LD_LIBRARY_PATH'
raise RuntimeError(f'GPU runtime library for {backend} not found. Please set the {envname} '
'environment variable to point to the libraries.')

return ctypes.CDLL(libpath)


def get_gpu_runtime_error_string(err: int) -> str:
lib = get_gpu_runtime_library()

# Obtain the error string
geterrorstring = getattr(lib, f'{get_gpu_backend()}GetErrorString')
geterrorstring.restype = ctypes.c_char_p
return geterrorstring(err).decode('utf-8')


def get_gpu_runtime_last_error() -> str:
lib = get_gpu_runtime_library()

getlasterror = getattr(lib, f'{get_gpu_backend()}GetLastError')
res: int = getlasterror()
if res == 0:
return None

# Obtain the error string
geterrorstring = getattr(lib, f'{get_gpu_backend()}GetErrorString')
geterrorstring.restype = ctypes.c_char_p
return geterrorstring(res).decode('utf-8')
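
A minimal usage sketch of the new helpers (illustrative only, not part of the commit; assumes a CUDA or HIP runtime library is installed and discoverable):

from dace.codegen import common

# None is returned when the last GPU runtime call succeeded
pending = common.get_gpu_runtime_last_error()
if pending is not None:
    print(f'Pending GPU runtime error: {pending}')

# Translate a raw status code into the runtime's error string
# (for example, code 700 corresponds to an illegal memory access on CUDA)
print(common.get_gpu_runtime_error_string(700))
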
46 changes: 41 additions & 5 deletions dace/codegen/compiled_sdfg.py
@@ -5,14 +5,14 @@
import re
import shutil
import subprocess
from typing import Any, Callable, Dict, List, Tuple, Optional, Type
from typing import Any, Callable, Dict, List, Tuple, Optional, Type, Union
import warnings

import numpy as np
import sympy as sp

from dace import data as dt, dtypes, hooks, symbolic
from dace.codegen import exceptions as cgx
from dace.codegen import exceptions as cgx, common
from dace.config import Config
from dace.frontend import operations

@@ -22,6 +22,7 @@ class ReloadableDLL(object):
A reloadable shared object (or dynamically linked library), which
bypasses Python's dynamic library reloading issues.
"""

def __init__(self, library_filename, program_name):
"""
Creates a new reloadable shared object.
@@ -181,6 +182,7 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None):
self._init = lib.get_symbol('__dace_init_{}'.format(sdfg.name))
self._init.restype = ctypes.c_void_p
self._exit = lib.get_symbol('__dace_exit_{}'.format(sdfg.name))
self._exit.restype = ctypes.c_int
self._cfunc = lib.get_symbol('__program_{}'.format(sdfg.name))

# Cache SDFG return values
@@ -197,6 +199,17 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None):
self._free_symbols = self._sdfg.free_symbols
self.argnames = argnames

self.has_gpu_code = False
for _, _, aval in self._sdfg.arrays_recursive():
if aval.storage in dtypes.GPU_STORAGES:
self.has_gpu_code = True
break
if not self.has_gpu_code:
for node, _ in self._sdfg.all_nodes_recursive():
if getattr(node, 'schedule', False) in dtypes.GPU_SCHEDULES:
self.has_gpu_code = True
break

def get_exported_function(self, name: str, restype=None) -> Optional[Callable[..., Any]]:
"""
Tries to find a symbol by name in the compiled SDFG, and convert it to a callable function
Expand Down Expand Up @@ -297,8 +310,20 @@ def initialize(self, *args, **kwargs):

def finalize(self):
if self._exit is not None:
self._exit(self._libhandle)
res: int = self._exit(self._libhandle)
self._initialized = False
if res != 0:
raise RuntimeError(
f'An error was detected after running "{self._sdfg.name}": {self._get_error_text(res)}')

def _get_error_text(self, result: Union[str, int]) -> str:
if self.has_gpu_code:
if isinstance(result, int):
result = common.get_gpu_runtime_error_string(result)
return (f'{result}. Consider enabling synchronous debugging mode (environment variable: '
'DACE_compiler_cuda_syncdebug=1) to see where the issue originates from.')
else:
return result

def __call__(self, *args, **kwargs):
# Update arguments from ordered list
@@ -312,11 +337,23 @@ def __call__(self, *args, **kwargs):
if self._initialized is False:
self._lib.load()
self._initialize(initargtuple)

with hooks.invoke_compiled_sdfg_call_hooks(self, argtuple):
if self.do_not_execute is False:
self._cfunc(self._libhandle, *argtuple)

if self.has_gpu_code:
# Optionally get errors from call
try:
lasterror = common.get_gpu_runtime_last_error()
except RuntimeError as ex:
warnings.warn(f'Could not get last error from GPU runtime: {ex}')
lasterror = None

if lasterror is not None:
raise RuntimeError(
f'An error was detected when calling "{self._sdfg.name}": {self._get_error_text(lasterror)}')

return self._convert_return_values()
except (RuntimeError, TypeError, UnboundLocalError, KeyError, cgx.DuplicateDLLError, ReferenceError):
self._lib.unload()
@@ -545,7 +582,6 @@ def _initialize_return_values(self, kwargs):
arr = self._create_array(*shape_desc)
self._return_arrays.append(arr)


def _convert_return_values(self):
# Return the values as they would be from a Python function
if self._return_arrays is None or len(self._return_arrays) == 0:
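
With these changes, GPU runtime failures surface as Python exceptions when a compiled SDFG is called or finalized, instead of passing silently. A minimal end-to-end sketch (illustrative only, not part of the commit; the program below is hypothetical and requires a GPU-enabled build):

import numpy as np
import dace

@dace.program
def doubler(a: dace.float64[1024]):
    a[:] = a * 2

sdfg = doubler.to_sdfg()
sdfg.apply_gpu_transformations()
compiled = sdfg.compile()

try:
    compiled(a=np.random.rand(1024))
except RuntimeError as ex:
    # The message now contains the runtime's error string and suggests
    # setting DACE_compiler_cuda_syncdebug=1 to locate the failing call.
    print(ex)
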
10 changes: 5 additions & 5 deletions dace/codegen/targets/cpp.py
@@ -1318,7 +1318,7 @@ def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream):
if hasattr(e.src, "_cuda_stream") and e.src._cuda_stream != 'nullptr':
cudastream = "__state->gpu_context->streams[%d]" % e.src._cuda_stream
callsite_stream.write(
"%sStreamSynchronize(%s);" % (common.get_gpu_backend(), cudastream),
"DACE_GPU_CHECK(%sStreamSynchronize(%s));" % (common.get_gpu_backend(), cudastream),
sdfg,
state_id,
[e.src, e.dst],
@@ -1356,9 +1356,9 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream,
if isinstance(desc, data.Array) and desc.start_offset != 0:
ptrname = f'({ptrname} - {sym2cpp(desc.start_offset)})'
if Config.get_bool('compiler', 'cuda', 'syncdebug'):
callsite_stream.write(f'DACE_CUDA_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg,
callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg,
state_id, scope_exit)
callsite_stream.write(f'DACE_CUDA_CHECK({backend}DeviceSynchronize());')
callsite_stream.write(f'DACE_GPU_CHECK({backend}DeviceSynchronize());')
else:
callsite_stream.write(f'{backend}FreeAsync({ptrname}, {cudastream});\n', sdfg, state_id, scope_exit)
to_remove.add((sd, name))
@@ -1380,8 +1380,8 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream,
if (isinstance(edge.dst, nodes.AccessNode) and hasattr(edge.dst, '_cuda_stream')
and edge.dst._cuda_stream != node._cuda_stream):
callsite_stream.write(
"""{backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream});
{backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0);""".format(
"""DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream}));
DACE_GPU_CHECK({backend}StreamWaitEvent(__state->gpu_context->streams[{dst_stream}], __state->gpu_context->events[{ev}], 0));""".format(
ev=edge._cuda_event if hasattr(edge, "_cuda_event") else 0,
src_stream=cudastream,
dst_stream=edge.dst._cuda_stream,
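
In these hunks the generated stream-synchronization, event, and deallocation calls are wrapped in DACE_GPU_CHECK (replacing DACE_CUDA_CHECK, and added where calls were previously unchecked), matching the backend-agnostic naming used elsewhere in this PR. A short sketch of turning on synchronous debugging from Python so the wrapped calls report the failing call site immediately (illustrative only; the DACE_compiler_cuda_syncdebug=1 environment variable mentioned in the error message above is the equivalent setting):

from dace.config import Config

# Assumption for illustration: Config.set accepts the same key hierarchy
# that Config.get_bool uses in the diff above.
Config.set('compiler', 'cuda', 'syncdebug', value=True)
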
(Diffs for the remaining changed files are not shown.)
