Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SCDataFrame class to enable file-based or DataFrame-based data handling (input and output) #32

Merged
merged 27 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ profile = "black"
exclude_dirs = ["tests"]

[tool.ruff]
target-version = "py311"
target-version = "py38"
line-length = 88
fix = true

Expand Down Expand Up @@ -62,7 +62,7 @@ select = [
# Ignore `E402` and `F401` (unused imports) in all `__init__.py` files
"__init__.py" = ["E402", "F401"]
# ignore typing rules for tests
"tests/*" = ["ANN201"]
"tests/*" = ["ANN201", "PLR0913"]
d33bs marked this conversation as resolved.
Show resolved Hide resolved

# set dynamic versioning capabilities for project
[tool.poetry-dynamic-versioning]
Expand Down
3 changes: 2 additions & 1 deletion src/cosmicqc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"""

from .analyze import find_outliers
from .scdataframe import SCDataFrame

# note: version placeholder is updated during build
# by poetry-dynamic-versioning.
__version__ = "0.0.1"
__version__ = "0.0.0"
d33bs marked this conversation as resolved.
Show resolved Hide resolved
32 changes: 23 additions & 9 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
import yaml
from scipy.stats import zscore as scipy_zscore

from .scdataframe import SCDataFrame

DEFAULT_QC_THRESHOLD_FILE = (
f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml"
)


def identify_outliers(
df: pd.DataFrame,
df: Union[SCDataFrame, pd.DataFrame, str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
Expand All @@ -30,8 +32,9 @@ def identify_outliers(
threshold of 0 as that would represent the whole dataset.

Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or file string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
Expand All @@ -52,6 +55,9 @@ def identify_outliers(
or not for use within other functions.
"""

# interpret the df as SCDataFrame
d33bs marked this conversation as resolved.
Show resolved Hide resolved
df = SCDataFrame(data=df)

# create a copy of the dataframe to ensure
# we don't modify the supplied dataframe inplace.
outlier_df = df.copy()
d33bs marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -107,7 +113,7 @@ def identify_outliers(


def find_outliers(
df: pd.DataFrame,
df: Union[SCDataFrame, pd.DataFrame, str],
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
Expand All @@ -117,8 +123,9 @@ def find_outliers(
with only the outliers and provided metadata columns.

Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or file string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
Expand All @@ -138,6 +145,9 @@ def find_outliers(
Outlier data frame for the given conditions.
"""

# interpret the df as SCDataFrame
d33bs marked this conversation as resolved.
Show resolved Hide resolved
df = SCDataFrame(data=df)

if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
Expand Down Expand Up @@ -169,7 +179,7 @@ def find_outliers(


def label_outliers(
df: pd.DataFrame,
df: Union[SCDataFrame, pd.DataFrame, str],
feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
Expand All @@ -179,8 +189,9 @@ def label_outliers(
where a cell passed or failed the quality control condition(s).

Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or file string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
feature_thresholds: Dict[str, float]
One of two options:
A dictionary with the feature name(s) as the key(s) and their assigned
Expand All @@ -201,6 +212,9 @@ def label_outliers(
Full dataframe with optional scores and outlier boolean column.
"""

# interpret the df as SCDataFrame
d33bs marked this conversation as resolved.
Show resolved Hide resolved
df = SCDataFrame(data=df)

# for single outlier processing
if isinstance(feature_thresholds, (str, dict)):
# return the outlier dataframe for one threshold rule
Expand Down
173 changes: 173 additions & 0 deletions src/cosmicqc/scdataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""
Defines a SCDataFrame class for use in coSMicQC.
"""

import pathlib
from typing import Any, Dict, TypeVar, Union

import pandas as pd

# provide backwards compatibility for Self type in earlier Python versions.
# see: https://peps.python.org/pep-0484/#annotating-instance-and-class-methods
SCDataFrame_type = TypeVar("SCDataFrame_type", bound="SCDataFrame")


class SCDataFrame:
    """
    A class designed to enhance single-cell data handling by wrapping
    pandas DataFrame capabilities, providing advanced methods for quality control,
    comprehensive analysis, and image-based data processing.

    This class can initialize with another SCDataFrame, a pandas DataFrame or
    Series, or a file path (CSV, TSV, TXT, or Parquet; CSV/TSV may be gzipped).
    When initialized with a file path, it reads the data into a pandas DataFrame.
    It also includes capabilities to export data.

    Attributes:
        data_source (str):
            A string indicating the data source: 'pandas.DataFrame',
            'pandas.Series', or the originating file path.
        data (pd.DataFrame):
            The loaded data in a pandas DataFrame.

    Methods:
        __call__():
            Returns the underlying pandas DataFrame.
        __repr__():
            Returns a representational string of the underlying pandas DataFrame.
        __getattr__():
            Returns the underlying attributes of the pandas DataFrame.
        __getitem__():
            Returns slice of data from pandas DataFrame.
    """

    def __init__(
        self: SCDataFrame_type,
        data: Union[SCDataFrame_type, pd.DataFrame, pd.Series, str, pathlib.Path],
        **kwargs: Dict[str, Any],
    ) -> None:
        """
        Initializes the SCDataFrame with a DataFrame, Series, or a file path.

        Args:
            data (Union[SCDataFrame_type, pd.DataFrame, pd.Series, str, pathlib.Path]):
                The data source: an existing SCDataFrame, a pandas DataFrame or
                Series, or a path to a CSV, TSV, TXT, or Parquet file.
            **kwargs:
                Additional keyword arguments to pass to the pandas read_* methods.

        Raises:
            ValueError:
                If the file format or the data type is unsupported.
        """

        if isinstance(data, SCDataFrame):
            # if data is an instance of SCDataFrame, use its data_source and data
            self.data_source = data.data_source
            self.data = data.data

        elif isinstance(data, pd.Series):
            # if data is a pd.Series, remember this within the data_source attr
            self.data_source = "pandas.Series"
            # also cast the series to a dataframe
            self.data = pd.DataFrame(data)

        elif isinstance(data, pd.DataFrame):
            # if data is a pd.DataFrame, remember this within the data_source attr
            self.data_source = "pandas.DataFrame"
            self.data = data

        elif isinstance(data, (pathlib.Path, str)):
            # if the data is a string or a pathlib path, remember the original source
            # through a data_source attr
            self.data_source = data

            # interpret the data through pathlib
            data_path = pathlib.Path(data)

            # read the data from the file based on its extension
            if data_path.suffix in (".tsv", ".txt") or data_path.suffixes == [
                ".tsv",
                ".gz",
            ]:
                # tab-separated files need an explicit separator so they
                # round-trip with export(), which writes them with sep="\t";
                # a caller-provided sep still takes precedence.
                kwargs.setdefault("sep", "\t")
                self.data = pd.read_csv(data, **kwargs)
            elif data_path.suffix == ".csv" or data_path.suffixes == [".csv", ".gz"]:
                # read as a CSV or CSV.GZ file
                # (pandas infers gzip compression from the extension)
                self.data = pd.read_csv(data, **kwargs)
            elif data_path.suffix == ".parquet":
                # read as a Parquet file
                self.data = pd.read_parquet(data, **kwargs)
            else:
                raise ValueError("Unsupported file format for SCDataFrame.")
        else:
            raise ValueError("Unsupported data type for SCDataFrame.")

    def export(
        self: SCDataFrame_type, file_path: str, **kwargs: Dict[str, Any]
    ) -> None:
        """
        Exports the underlying pandas DataFrame to a file.

        Args:
            file_path (str):
                The path where the DataFrame should be saved.
            **kwargs:
                Additional keyword arguments to pass to the pandas to_* methods.

        Raises:
            ValueError:
                If the file format is unsupported.
        """

        data_path = pathlib.Path(file_path)

        # export to csv (covers .csv and .csv.gz; pandas infers compression)
        if ".csv" in data_path.suffixes:
            self.data.to_csv(file_path, **kwargs)
        # export to tsv or txt, using a tab separator
        elif any(elem in data_path.suffixes for elem in (".tsv", ".txt")):
            self.data.to_csv(file_path, sep="\t", **kwargs)
        # export to parquet
        elif data_path.suffix == ".parquet":
            self.data.to_parquet(file_path, **kwargs)
        else:
            raise ValueError("Unsupported file format for export.")

    def __call__(self: SCDataFrame_type) -> pd.DataFrame:
        """
        Returns the underlying pandas DataFrame.

        Returns:
            pd.DataFrame: The data in a pandas DataFrame.
        """
        return self.data

    def __repr__(self: SCDataFrame_type) -> str:
        """
        Returns the representation of the underlying pandas DataFrame.

        Returns:
            str: The string representation of the data.
        """
        return repr(self.data)

    def __getattr__(self: SCDataFrame_type, attr: str) -> Any:  # noqa: ANN401
        """
        Intercept attribute accesses and delegate them to the underlying
        pandas DataFrame.

        Note: Python invokes __getattr__ only after normal attribute lookup
        fails, so instance attributes such as `data` and `data_source` are
        served directly and never reach this method.

        Args:
            attr (str):
                The name of the attribute being accessed.

        Returns:
            Any:
                The value of the attribute from the pandas DataFrame.

        Raises:
            AttributeError:
                If `data` itself has not been set on the instance (guards
                against infinite recursion during, e.g., unpickling).
        """
        if attr == "data":
            # `data` missing from the instance means delegation is impossible;
            # raising here avoids infinite recursion via self.data below.
            raise AttributeError(attr)
        return getattr(self.data, attr)

    def __getitem__(self: SCDataFrame_type, key: Union[int, str]) -> Any:  # noqa: ANN401
        """
        Returns an element or a slice of the underlying pandas DataFrame.

        Args:
            key:
                The key or slice to access the data.

        Returns:
            pd.DataFrame or any:
                The selected element or slice of data.
        """
        return self.data[key]
62 changes: 62 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
https://docs.pytest.org/en/7.1.x/explanation/fixtures.html
"""

import pathlib

import pandas as pd
import pytest

Expand All @@ -24,3 +26,63 @@ def fixture_basic_outlier_dataframe():
Creates basic example data for use in tests
"""
return pd.DataFrame({"example_feature": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})


@pytest.fixture(name="basic_outlier_csv")
def fixture_basic_outlier_csv(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a CSV file
    and returns its path for use in tests.
    """

    csv_path = tmp_path / "basic_example.csv"
    basic_outlier_dataframe.to_csv(csv_path, index=False)

    return csv_path


@pytest.fixture(name="basic_outlier_csv_gz")
def fixture_basic_outlier_csv_gz(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a gzip-compressed CSV file
    and returns its path for use in tests.
    """

    csv_gz_path = tmp_path / "example.csv.gz"
    basic_outlier_dataframe.to_csv(csv_gz_path, index=False, compression="gzip")

    return csv_gz_path


@pytest.fixture(name="basic_outlier_tsv")
def fixture_basic_outlier_tsv(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a tab-separated file
    and returns its path for use in tests.
    """

    tsv_path = tmp_path / "example.tsv"
    basic_outlier_dataframe.to_csv(tsv_path, sep="\t", index=False)

    return tsv_path


@pytest.fixture(name="basic_outlier_parquet")
def fixture_basic_outlier_parquet(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a Parquet file
    and returns its path for use in tests.
    """

    parquet_path = tmp_path / "example.parquet"
    basic_outlier_dataframe.to_parquet(parquet_path, index=False)

    return parquet_path
Loading
Loading