diff --git a/benchexec/container.py b/benchexec/container.py
index efdfcf2eb..0beebe659 100644
--- a/benchexec/container.py
+++ b/benchexec/container.py
@@ -182,8 +182,9 @@ def execute_in_namespace(func, use_network_ns=True):
         flags |= libc.CLONE_NEWNET
 
     # We use the syscall clone() here, which is similar to fork().
-    # Calling it without letting Python know about it is dangerous (especially because
-    # we want to execute Python code in the child, too), but so far it seems to work.
+    # Calling it directly without going through a Python API is somewhat dangerous
+    # (especially because we want to execute Python code in the child, too), but so far
+    # it seems to work well enough (cf. explanation in _clone_child_callback).
     # Basically we attempt to do (almost) the same that os.fork() does (cf. os_fork_impl
     # in https://github.com/python/cpython/blob/master/Modules/posixmodule.c).
     # On Python >= 3.7 we can call appropriate functions before and after fork that
@@ -210,6 +211,16 @@ def execute_in_namespace(func, use_network_ns=True):
 @libc.CLONE_CALLBACK
 def _clone_child_callback(func_p):
     """Used as callback for clone, calls the passed function pointer."""
+    # Strictly speaking, PyOS_AfterFork_Child should be called immediately after
+    # clone calls our callback before executing any Python code because the
+    # interpreter state is inconsistent, but here we are already in the Python
+    # world, so it could be too late. A safe way would use a C function to do this.
+    # Common problems are deadlocks if there is high thread contention in the
+    # Python interpreter (https://github.com/sosy-lab/benchexec/issues/435).
+    # For users of benchexec we avoid them in localexecution.py with
+    # sys.setswitchinterval(). Other users of ContainerExecutor should be safe as
+    # long as they do not use many threads. We cannot do anything before cloning
+    # because it might be too late anyway (gil_drop_request could be set already).
     ctypes.pythonapi.PyOS_AfterFork_Child()
     return _CLONE_NESTED_CALLBACK(func_p)()
 
diff --git a/benchexec/localexecution.py b/benchexec/localexecution.py
index b2cbaad85..f38f6010b 100644
--- a/benchexec/localexecution.py
+++ b/benchexec/localexecution.py
@@ -215,6 +215,10 @@ def run_finished():
         with unfinished_runs_lock:
             unfinished_runs -= 1
 
+    # Avoid https://github.com/sosy-lab/benchexec/issues/435
+    py_switch_interval = sys.getswitchinterval()
+    sys.setswitchinterval(1000)
+
     # create some workers
     for i in range(benchmark.num_of_threads):
         cores = coreAssignment[i] if coreAssignment else None
@@ -242,6 +246,8 @@ def run_finished():
     if energy and cpu_packages:
         energy = {pkg: energy[pkg] for pkg in energy if pkg in cpu_packages}
 
+    sys.setswitchinterval(py_switch_interval)
+
     if STOPPED_BY_INTERRUPT:
         output_handler.set_error("interrupted", runSet)
         output_handler.output_after_run_set(
diff --git a/doc/container.md b/doc/container.md
index 1fa38d97a..8a9d87f12 100644
--- a/doc/container.md
+++ b/doc/container.md
@@ -245,3 +245,12 @@ with several versions of the Linux kernel, including at least kernel versions 4.
 ([bug report](https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1566471)).
 If a kernel upgrade does not help,
 please use a different access mode for NFS-mounted directories, such as `--hidden-dir` or `--read-only-dir`.
+
+### BenchExec sometimes hangs if many parallel runs are executed
+This happens if we clone the Python process while it is in an inconsistent state.
+Make sure to use BenchExec 1.22 or newer,
+where [#435](https://github.com/sosy-lab/benchexec/issues/435) is fixed.
+If it still occurs, please attach to all child processes of BenchExec
+with `sudo gdb -p <PID>`, get a stack trace with `bt`,
+and [report an issue](https://github.com/sosy-lab/benchexec/issues/new) with as much information as possible.
+BenchExec will usually be able to continue if the hanging child process is killed.
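
Note: the localexecution.py hunks above contain the actual workaround for #435: the interpreter's thread switch interval is raised while the worker threads are active (so a thread is practically never preempted mid-operation while another thread calls clone()) and restored afterwards. Below is a minimal standalone sketch of that pattern; `run_all`, `num_threads`, and `do_run` are hypothetical placeholders, not BenchExec APIs.

```python
import sys
import threading


def run_all(num_threads, do_run):
    """Execute do_run() in num_threads parallel threads with rare GIL hand-offs.

    A very large switch interval means the interpreter practically never
    preempts a running thread, so it is unlikely that clone() happens while
    another thread holds the GIL in an inconsistent state. Threads that block
    in system calls (e.g. waiting for a benchmarked process) still release the
    GIL, so the parallelism of the actual runs is not reduced.
    """
    old_interval = sys.getswitchinterval()
    sys.setswitchinterval(1000)  # interval in seconds, default is 0.005
    try:
        workers = [threading.Thread(target=do_run) for _ in range(num_threads)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    finally:
        # Restore normal scheduling behavior for the rest of the program.
        sys.setswitchinterval(old_interval)
```

The try/finally here is only defensive; whether it is needed depends on the surrounding error handling, and the patch above simply restores the interval after all runs have completed.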