Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/exe 2065 set hard cutoff on min size #201

Merged
merged 2 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- If demultiplexing has a success rate lower than 50% the command will exit with a status of 1. This prevents further pipeline stages from being run on
what is probably bad data.
- Clarify that `--min-size` and `--max-size` in the `annotate` stage should not be used at the same time as `--dynamic-filter`.
- Set a lower threshold of 300 edges when `--dynamic-filter` is used in the `annotate` stage; components smaller than that will always
be filtered. Note that this can still be overridden by setting `--min-size` explicitly.
- Clarify error message when all components are filtered out in the `annotate` stage.

### Added

Expand Down
16 changes: 14 additions & 2 deletions src/pixelator/annotate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
MINIMUM_NBR_OF_CELLS_FOR_ANNOTATION,
)
from pixelator.config import AntibodyPanel
from pixelator.exceptions import PixelatorBaseException
from pixelator.graph.utils import components_metrics, edgelist_metrics
from pixelator.pixeldataset import SIZE_DEFINITION, PixelDataset
from pixelator.pixeldataset.utils import edgelist_to_anndata
Expand All @@ -37,6 +38,12 @@
logger = logging.getLogger(__name__)


class NoCellsFoundException(PixelatorBaseException):
    """Signal that no cells are left to work with.

    Raised by the component size filtering when none of the components
    pass the size filters.
    """


def filter_components_sizes(
component_sizes: np.ndarray,
min_size: Optional[int],
Expand Down Expand Up @@ -81,7 +88,9 @@ def filter_components_sizes(
# check if none of the components pass the filters
n_components = filter_arr.sum()
if n_components == 0:
raise RuntimeError("None of the components pass the filters")
raise NoCellsFoundException(
"All cells were filtered by the size filters. Consider either setting different size filters or disabling them."
)

logger.debug(
"Filtering resulted in %i components that pass the filters",
Expand Down Expand Up @@ -199,8 +208,11 @@ def annotate_components(
].sum()

# save the components metrics (raw)
component_info_file_path = (
Path(output) / f"{output_prefix}.raw_components_metrics.csv.gz"
)
component_metrics.to_csv(
Path(output) / f"{output_prefix}.raw_components_metrics.csv.gz",
component_info_file_path,
header=True,
index=True,
sep=",",
Expand Down
11 changes: 11 additions & 0 deletions src/pixelator/annotate/cell_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
CELL_MAX_SIZE_SMOOTHING_FACTOR,
CELL_MIN_SIZE_SMOOTHING_FACTOR,
DISTANCE_DEVIATION_FACTOR,
MINIMUM_N_EDGES_CELL_SIZE,
MINIMUM_NBR_OF_CELLS_FOR_SIZE_LIMIT,
PRE_FILTER_LIMIT,
)
Expand Down Expand Up @@ -210,4 +211,14 @@ def minimum_der2(df: pd.DataFrame) -> pd.Series:
bound = potential_bounds[0] if potential_bounds else None

logger.debug("Size limit of %i found", bound)

if direction == "lower" and bound is not None:
if bound < MINIMUM_N_EDGES_CELL_SIZE:
logger.warning(
"Dynamic minimum component size found is below the minimum size threshold of %s edges. "
"Using that as the minimum size to avoid downstream problems.",
MINIMUM_N_EDGES_CELL_SIZE,
)
bound = MINIMUM_N_EDGES_CELL_SIZE

return bound
1 change: 1 addition & 0 deletions src/pixelator/annotate/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
MINIMUM_NBR_OF_CELLS_FOR_ANNOTATION = 20

# size limit constants
MINIMUM_N_EDGES_CELL_SIZE = 300
MINIMUM_NBR_OF_CELLS_FOR_SIZE_LIMIT = 20
CELL_MIN_SIZE_SMOOTHING_FACTOR = 0.8
CELL_MAX_SIZE_SMOOTHING_FACTOR = 0
Expand Down
6 changes: 4 additions & 2 deletions src/pixelator/cli/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import click

from pixelator.annotate import annotate_components
from pixelator.annotate.constants import MINIMUM_N_EDGES_CELL_SIZE
from pixelator.cli.common import logger, output_option
from pixelator.config import config, load_antibody_panel
from pixelator.utils import (
Expand Down Expand Up @@ -62,8 +63,9 @@
type=click.Choice(["both", "min", "max"]),
help=(
"Enable the dynamic component size filters. The following modes are available: "
"both/max/min. both: estimates both minimum and maximum component size, min: estimates the minimum component "
f"size (or uses {MINIMUM_N_EDGES_CELL_SIZE} edges, whichever is largest), "
"max: estimates the maximum component size. Note that this cannot be set at the same time as --min-size or --max-size."
),
)
)
@click.option(
Expand Down
31 changes: 30 additions & 1 deletion tests/annotate/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
import pytest
from anndata import AnnData

from pixelator.annotate import cluster_components, filter_components_sizes
from pixelator.annotate import (
NoCellsFoundException,
cluster_components,
filter_components_sizes,
)
from pixelator.cli.annotate import annotate_components
from pixelator.config import AntibodyPanel
from pixelator.pixeldataset.utils import read_anndata
Expand Down Expand Up @@ -135,3 +139,28 @@ def test_annotate_adata(edgelist: pd.DataFrame, tmp_path: Path, panel: AntibodyP
assert (tmp_path / f"{output_prefix}.raw_components_metrics.csv.gz").is_file()
assert (tmp_path / f"{output_prefix}.annotate.dataset.pxl").is_file()
assert metrics_file.is_file()


@pytest.mark.integration_test
def test_annotate_adata_should_raise_no_cells_count_exception(
    edgelist: pd.DataFrame, tmp_path: Path, panel: AntibodyPanel
):
    """Check that annotation raises when the size filter removes every component.

    The edge list fixture is written to a temporary parquet file and
    annotated with a ``min_size`` that no component can reach, which must
    raise ``NoCellsFoundException``.
    """
    output_prefix = "test_filtered"
    metrics_file = tmp_path / "metrics.json"
    assert not metrics_file.is_file()
    tmp_edgelist_file = tmp_path / "tmp_edgelist.parquet"
    edgelist.to_parquet(tmp_edgelist_file, index=False)

    # Keep only the call under test inside the context manager so that an
    # exception raised by the setup above cannot satisfy the expectation.
    with pytest.raises(NoCellsFoundException):
        annotate_components(
            input=str(tmp_edgelist_file),
            panel=panel,
            output=str(tmp_path),
            output_prefix=output_prefix,
            metrics_file=str(metrics_file),
            min_size=100_000,  # Nothing should pass this
            max_size=None,
            dynamic_filter=None,
            verbose=True,
            aggregate_calling=True,
        )
5 changes: 3 additions & 2 deletions tests/annotate/test_cell_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pixelator.annotate.cell_calling import (
find_component_size_limits,
)
from pixelator.annotate.constants import MINIMUM_N_EDGES_CELL_SIZE


def test_find_component_min_size_limits_signal_and_noise():
Expand All @@ -31,7 +32,7 @@ def test_find_component_min_size_limits_only_signal():

min_bound = find_component_size_limits(np.absolute(test_data), direction="lower")
assert min_bound > 100
assert min_bound < 200
assert min_bound <= MINIMUM_N_EDGES_CELL_SIZE


def test_find_component_min_size_limits_only_noise():
Expand All @@ -40,7 +41,7 @@ def test_find_component_min_size_limits_only_noise():
test_data = random_state.poisson(100, 500)

min_bound = find_component_size_limits(np.absolute(test_data), direction="lower")
assert min_bound < 100
assert min_bound >= MINIMUM_N_EDGES_CELL_SIZE
ambarrio marked this conversation as resolved.
Show resolved Hide resolved


def test_find_component_min_size_limits_signal_and_many_doublets():
Expand Down
Loading