diff --git a/CHANGELOG.md b/CHANGELOG.md index 62a07f3a..f8a08d99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - If demultiplexing has a success rate lower than 50% the command will exit with a status of 1. This prevents further pipeline stages to be run on what is probably bad data. - Clarify that `--min-size` and `--max-size` in the `annotate` stage should not be used at the same time as `--dynamic-filter`. +- Set a lower threshold of 300 edges when `--dynamic-filter` is used in the `annotate` stage; components smaller than that will always + be filtered. Note that this can still be overridden by setting `--min-size` explicitly. +- Clarify error message when all components are filtered out in the `annotate` stage. ### Added diff --git a/src/pixelator/annotate/__init__.py b/src/pixelator/annotate/__init__.py index 747abf33..5b5a6bfe 100644 --- a/src/pixelator/annotate/__init__.py +++ b/src/pixelator/annotate/__init__.py @@ -22,6 +22,7 @@ MINIMUM_NBR_OF_CELLS_FOR_ANNOTATION, ) from pixelator.config import AntibodyPanel +from pixelator.exceptions import PixelatorBaseException from pixelator.graph.utils import components_metrics, edgelist_metrics from pixelator.pixeldataset import SIZE_DEFINITION, PixelDataset from pixelator.pixeldataset.utils import edgelist_to_anndata @@ -37,6 +38,12 @@ logger = logging.getLogger(__name__) +class NoCellsFoundException(PixelatorBaseException): + """Raised when no cells are found in the edge list.""" + + pass + + def filter_components_sizes( component_sizes: np.ndarray, min_size: Optional[int], @@ -81,7 +88,9 @@ def filter_components_sizes( # check if none of the components pass the filters n_components = filter_arr.sum() if n_components == 0: - raise RuntimeError("None of the components pass the filters") + raise NoCellsFoundException( + "All cells were filtered by the size filters. 
Consider either setting different size filters or disabling them." + ) logger.debug( "Filtering resulted in %i components that pass the filters", @@ -199,8 +208,11 @@ def annotate_components( ].sum() # save the components metrics (raw) + component_info_file_path = ( + Path(output) / f"{output_prefix}.raw_components_metrics.csv.gz" + ) component_metrics.to_csv( - Path(output) / f"{output_prefix}.raw_components_metrics.csv.gz", + component_info_file_path, header=True, index=True, sep=",", diff --git a/src/pixelator/annotate/cell_calling.py b/src/pixelator/annotate/cell_calling.py index dd7aed91..a2009a21 100644 --- a/src/pixelator/annotate/cell_calling.py +++ b/src/pixelator/annotate/cell_calling.py @@ -14,6 +14,7 @@ CELL_MAX_SIZE_SMOOTHING_FACTOR, CELL_MIN_SIZE_SMOOTHING_FACTOR, DISTANCE_DEVIATION_FACTOR, + MINIMUM_N_EDGES_CELL_SIZE, MINIMUM_NBR_OF_CELLS_FOR_SIZE_LIMIT, PRE_FILTER_LIMIT, ) @@ -210,4 +211,14 @@ def minimum_der2(df: pd.DataFrame) -> pd.Series: bound = potential_bounds[0] if potential_bounds else None logger.debug("Size limit of %i found", bound) + + if direction == "lower" and bound is not None: + if bound < MINIMUM_N_EDGES_CELL_SIZE: + logger.warning( + "Dynamic minimum component size found is below the minimum size threshold of %s edges. 
" + "Using that as the minimum size to avoid downstream problems.", + MINIMUM_N_EDGES_CELL_SIZE, + ) + bound = MINIMUM_N_EDGES_CELL_SIZE + return bound diff --git a/src/pixelator/annotate/constants.py b/src/pixelator/annotate/constants.py index c3c05844..b02a56dd 100644 --- a/src/pixelator/annotate/constants.py +++ b/src/pixelator/annotate/constants.py @@ -6,6 +6,7 @@ MINIMUM_NBR_OF_CELLS_FOR_ANNOTATION = 20 # size limit constants +MINIMUM_N_EDGES_CELL_SIZE = 300 MINIMUM_NBR_OF_CELLS_FOR_SIZE_LIMIT = 20 CELL_MIN_SIZE_SMOOTHING_FACTOR = 0.8 CELL_MAX_SIZE_SMOOTHING_FACTOR = 0 diff --git a/src/pixelator/cli/annotate.py b/src/pixelator/cli/annotate.py index 6b0760bc..38e0da03 100644 --- a/src/pixelator/cli/annotate.py +++ b/src/pixelator/cli/annotate.py @@ -6,6 +6,7 @@ import click from pixelator.annotate import annotate_components +from pixelator.annotate.constants import MINIMUM_N_EDGES_CELL_SIZE from pixelator.cli.common import logger, output_option from pixelator.config import config, load_antibody_panel from pixelator.utils import ( @@ -62,8 +63,9 @@ type=click.Choice(["both", "min", "max"]), help=( "Enable the dynamic component size filters. The following modes are available: " - "both/max/min. both: estimates both min and max size, min: estimates min size, max: estimates max size. " - "Note that this cannot be set at the same time as --min-size or --max-size." + "both/max/min. both: estimates both minimum and maximum component size, min: estimates the minimum component " + f"size (or uses {MINIMUM_N_EDGES_CELL_SIZE} edges, whichever is larger), " + "max: estimates the maximum component size. Note that this cannot be set at the same time as --min-size or --max-size." 
), ) @click.option( diff --git a/tests/annotate/test_annotate.py b/tests/annotate/test_annotate.py index 22563200..954c7711 100644 --- a/tests/annotate/test_annotate.py +++ b/tests/annotate/test_annotate.py @@ -10,7 +10,11 @@ import pytest from anndata import AnnData -from pixelator.annotate import cluster_components, filter_components_sizes +from pixelator.annotate import ( + NoCellsFoundException, + cluster_components, + filter_components_sizes, +) from pixelator.cli.annotate import annotate_components from pixelator.config import AntibodyPanel from pixelator.pixeldataset.utils import read_anndata @@ -135,3 +139,28 @@ def test_annotate_adata(edgelist: pd.DataFrame, tmp_path: Path, panel: AntibodyP assert (tmp_path / f"{output_prefix}.raw_components_metrics.csv.gz").is_file() assert (tmp_path / f"{output_prefix}.annotate.dataset.pxl").is_file() assert metrics_file.is_file() + + +@pytest.mark.integration_test +def test_annotate_adata_should_raise_no_cells_count_exception( + edgelist: pd.DataFrame, tmp_path: Path, panel: AntibodyPanel +): + with pytest.raises(NoCellsFoundException) as expected_exception: + output_prefix = "test_filtered" + metrics_file = tmp_path / "metrics.json" + assert not metrics_file.is_file() + tmp_edgelist_file = tmp_path / "tmp_edgelist.parquet" + edgelist.to_parquet(tmp_edgelist_file, index=False) + + annotate_components( + input=str(tmp_edgelist_file), + panel=panel, + output=str(tmp_path), + output_prefix=output_prefix, + metrics_file=str(metrics_file), + min_size=100_000, # Nothing should pass this + max_size=None, + dynamic_filter=None, + verbose=True, + aggregate_calling=True, + ) diff --git a/tests/annotate/test_cell_calling.py b/tests/annotate/test_cell_calling.py index eab54fb4..c995b5bd 100644 --- a/tests/annotate/test_cell_calling.py +++ b/tests/annotate/test_cell_calling.py @@ -9,6 +9,7 @@ from pixelator.annotate.cell_calling import ( find_component_size_limits, ) +from pixelator.annotate.constants import 
MINIMUM_N_EDGES_CELL_SIZE def test_find_component_min_size_limits_signal_and_noise(): @@ -31,7 +32,7 @@ def test_find_component_min_size_limits_only_signal(): min_bound = find_component_size_limits(np.absolute(test_data), direction="lower") assert min_bound > 100 - assert min_bound < 200 + assert min_bound <= MINIMUM_N_EDGES_CELL_SIZE def test_find_component_min_size_limits_only_noise(): @@ -40,7 +41,7 @@ def test_find_component_min_size_limits_only_noise(): test_data = random_state.poisson(100, 500) min_bound = find_component_size_limits(np.absolute(test_data), direction="lower") - assert min_bound < 100 + assert min_bound >= MINIMUM_N_EDGES_CELL_SIZE def test_find_component_min_size_limits_signal_and_many_doublets():