Skip to content

Commit

Permalink
Merge pull request #148 from PixelgenTechnologies/feature/exe-1777-ha…
Browse files Browse the repository at this point in the history
…ndle-oom-on-collapse-gracefully

Handle when child process is killed by OOM killer
  • Loading branch information
johandahlberg authored Jun 4, 2024
2 parents 206c806 + 902117f commit 482a2c7
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 3 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

* Fix a bug where `a_pixels_per_b_pixel` summary statistics were equal to the `b_pixels_per_a_pixel` statistics.
* `collapse` will return exit code 137 when one of the child processes is killed by the system (e.g. because it is
using too much memory). This allows e.g. Nextflow to retry the process with more memory automatically.

## [0.17.1] - 2024-05-27

Expand Down
39 changes: 36 additions & 3 deletions src/pixelator/cli/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Copyright © 2022 Pixelgen Technologies AB.
"""

import sys
from collections import defaultdict
from concurrent import futures
from pathlib import Path
Expand All @@ -25,6 +26,38 @@
)


def _handle_errors(jobs, executor):
    """Surface the first failed job, preferring a kill-signal exit code.

    Each job is asked for its exception. On the first job that has one, the
    exit codes of the executor's worker processes are inspected. If any
    worker terminated with exit code 9 or -9 (``multiprocessing`` reports
    death by signal N as ``-N``, so -9 is SIGKILL, which is what the
    out-of-memory killer sends), the parent exits with code 137. Otherwise
    the job's original exception is re-raised.

    :param jobs: iterable of future-like objects exposing ``exception()``
    :param executor: the pool the jobs ran in; its private ``_processes``
                     mapping (pid -> process) is read when available
    :raises SystemExit: with code 137 when a worker was killed
    :raises Exception: the original exception of the first failed job
    :returns: None when no job raised an exception
    """
    for job in jobs:
        exception = job.exception()
        if exception is None:
            continue

        logger.error(
            "Found an issue in the process pool. Trying to determine what went wrong and set the correct exit code. Exception was: %s",
            exception,
        )
        # ``_processes`` is private to the executor and may be None (e.g. once
        # the pool has been shut down) or missing on other executor types.
        # Fall back to an empty mapping so that the original exception is
        # still raised instead of being masked by an AttributeError.
        process_map = getattr(executor, "_processes", None) or {}
        # Iterate over a snapshot: the executor may still be modifying the
        # mapping from its own management thread while we look at it.
        for process in list(process_map.values()):
            exit_code = process.exitcode
            if exit_code is None or exit_code == 0:
                continue

            logger.error(
                "The child process in the process pool returned a non-zero exit code: %s.",
                exit_code,
            )
            # If we have an out of memory exception, make sure we exit with that.
            if abs(exit_code) == 9:
                logger.error(
                    "One of the child processes was killed (exit code: 9). "
                    "Usually this is caused by the out-of-memory killer terminating the process. "
                    "The parent process will return an exit code of 137 to indicate that it terminated because of a kill signal in the child process."
                )
                sys.exit(137)

        logger.error(
            "Was unable to determine what when wrong in process pool. Will raise original exception."
        )
        raise exception


@click.command(
"collapse",
short_help=(
Expand Down Expand Up @@ -238,12 +271,12 @@ def collapse(
min_count=min_count,
)
)
jobs = list(futures.as_completed(jobs))
_handle_errors(jobs, executor)

total_input_reads = 0
tmp_files = []
for job in futures.as_completed(jobs):
if job.exception() is not None:
raise job.exception()
for job in jobs:
# the worker returns a path to a file (temp antibody edge list)
tmp_file, input_reads_count = job.result()
if tmp_file is not None:
Expand Down
5 changes: 5 additions & 0 deletions src/pixelator/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ def log_exception(exc_type, exc_value, traceback_obj):
# and thus we ignore them here.
return False

if issubclass(exc_type, SystemExit):
# SystemExit is raised when the application has been explicitly
# directed to exit, so we don't want a trace dumped for that.
return False

self._root_logger.critical(
"Unhandled exception of type: {}".format(exc_type.__name__)
)
Expand Down

0 comments on commit 482a2c7

Please sign in to comment.