Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SCDataFrame class to enable file-based or DataFrame-based data handling (input and output) #32

Merged
merged 27 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ profile = "black"
exclude_dirs = ["tests"]

[tool.ruff]
target-version = "py311"
target-version = "py38"
line-length = 88
fix = true

Expand Down Expand Up @@ -62,7 +62,7 @@ select = [
# Ignore `E402` and `F401` (unused imports) in all `__init__.py` files
"__init__.py" = ["E402", "F401"]
# ignore typing rules for tests
"tests/*" = ["ANN201"]
"tests/*" = ["ANN201", "PLR0913"]
d33bs marked this conversation as resolved.
Show resolved Hide resolved

# set dynamic versioning capabilities for project
[tool.poetry-dynamic-versioning]
Expand Down
3 changes: 2 additions & 1 deletion src/cosmicqc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"""

from .analyze import find_outliers
from .scdataframe import SCDataFrame

# note: version placeholder is updated during build
# by poetry-dynamic-versioning.
__version__ = "0.0.1"
__version__ = "0.0.0"
d33bs marked this conversation as resolved.
Show resolved Hide resolved
32 changes: 23 additions & 9 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
import yaml
from scipy.stats import zscore as scipy_zscore

from .scdataframe import SCDataFrame

DEFAULT_QC_THRESHOLD_FILE = (
f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml"
)


def identify_outliers(
df: pd.DataFrame,
df: Union[SCDataFrame, pd.DataFrame, str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
Expand All @@ -30,8 +32,9 @@ def identify_outliers(
threshold of 0 as that would represent the whole dataset.

Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or file string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
Expand All @@ -52,6 +55,9 @@ def identify_outliers(
or not for use within other functions.
"""

# interpret the df as SCDataFrame
d33bs marked this conversation as resolved.
Show resolved Hide resolved
df = SCDataFrame(data=df)

# create a copy of the dataframe to ensure
# we don't modify the supplied dataframe inplace.
outlier_df = df.copy()
d33bs marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -107,7 +113,7 @@ def identify_outliers(


def find_outliers(
df: pd.DataFrame,
df: Union[SCDataFrame, pd.DataFrame, str],
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
Expand All @@ -117,8 +123,9 @@ def find_outliers(
with only the outliers and provided metadata columns.

Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or file string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
Expand All @@ -138,6 +145,9 @@ def find_outliers(
Outlier data frame for the given conditions.
"""

# interpret the df as SCDataFrame
d33bs marked this conversation as resolved.
Show resolved Hide resolved
df = SCDataFrame(data=df)

if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
Expand Down Expand Up @@ -169,7 +179,7 @@ def find_outliers(


def label_outliers(
df: pd.DataFrame,
df: Union[SCDataFrame, pd.DataFrame, str],
feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
Expand All @@ -179,8 +189,9 @@ def label_outliers(
where a cell passed or failed the quality control condition(s).

Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or file string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
feature_thresholds: Dict[str, float]
One of two options:
A dictionary with the feature name(s) as the key(s) and their assigned
Expand All @@ -201,6 +212,9 @@ def label_outliers(
Full dataframe with optional scores and outlier boolean column.
"""

# interpret the df as SCDataFrame
d33bs marked this conversation as resolved.
Show resolved Hide resolved
df = SCDataFrame(data=df)

# for single outlier processing
if isinstance(feature_thresholds, (str, dict)):
# return the outlier dataframe for one threshold rule
Expand Down
173 changes: 173 additions & 0 deletions src/cosmicqc/scdataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""
Defines a SCDataFrame class for use in coSMicQC.
"""

import pathlib
from typing import Any, Dict, TypeVar, Union

import pandas as pd

# provide backwards compatibility for Self type in earlier Python versions.
# see: https://peps.python.org/pep-0484/#annotating-instance-and-class-methods
SCDataFrame_type = TypeVar("SCDataFrame_type", bound="SCDataFrame")


class SCDataFrame:
    """
    A class designed to enhance single-cell data handling by wrapping
    pandas DataFrame capabilities, providing advanced methods for quality control,
    comprehensive analysis, and image-based data processing.

    This class can initialize with another SCDataFrame, a pandas DataFrame or
    Series, or a file path (CSV, TSV, TXT, or Parquet; CSV/TSV may be gzipped).
    When initialized with a file path, it reads the data into a pandas DataFrame.
    It also includes capabilities to export data.

    Attributes:
        data_source (str):
            A string indicating the data source: 'pandas.DataFrame',
            'pandas.Series', or the originating file path.
        data (pd.DataFrame):
            The loaded data in a pandas DataFrame.

    Methods:
        __call__():
            Returns the underlying pandas DataFrame.
        __repr__():
            Returns a representational string of the underlying pandas DataFrame.
        __getattr__():
            Returns the underlying attributes of the pandas DataFrame.
        __getitem__():
            Returns slice of data from pandas DataFrame.
    """

    def __init__(
        self: SCDataFrame_type,
        data: Union[SCDataFrame_type, pd.DataFrame, pd.Series, str, pathlib.Path],
        **kwargs: Dict[str, Any],
    ) -> None:
        """
        Initializes the SCDataFrame with a DataFrame, Series, or a file path.

        Args:
            data (Union[SCDataFrame_type, pd.DataFrame, pd.Series, str, pathlib.Path]):
                The data source: an existing SCDataFrame, a pandas DataFrame or
                Series, or a path to a CSV, TSV, TXT, or Parquet file.
            **kwargs:
                Additional keyword arguments to pass to the pandas read_* methods.

        Raises:
            ValueError:
                If the file format or the data type is unsupported.
        """

        if isinstance(data, SCDataFrame):
            # if data is an instance of SCDataFrame, use its data_source and data
            self.data_source = data.data_source
            self.data = data.data

        elif isinstance(data, pd.Series):
            # if data is a pd.Series, remember this within the data_source attr
            self.data_source = "pandas.Series"
            # also cast the series to a dataframe
            self.data = pd.DataFrame(data)

        elif isinstance(data, pd.DataFrame):
            # if data is a pd.DataFrame, remember this within the data_source attr
            self.data_source = "pandas.DataFrame"
            self.data = data

        elif isinstance(data, (pathlib.Path, str)):
            # if the data is a string or a pathlib path, remember the original source
            # through a data_source attr
            self.data_source = data

            # interpret the data through pathlib
            data_path = pathlib.Path(data)

            # read the data from the file based on its extension
            if data_path.suffix in (".tsv", ".txt") or data_path.suffixes == [
                ".tsv",
                ".gz",
            ]:
                # tab-separated files need an explicit separator so they
                # round-trip with export(), which writes them with sep="\t";
                # a caller-provided sep still takes precedence.
                kwargs.setdefault("sep", "\t")
                self.data = pd.read_csv(data, **kwargs)
            elif data_path.suffix == ".csv" or data_path.suffixes == [".csv", ".gz"]:
                # read as a CSV or CSV.GZ file
                # (pandas infers gzip compression from the extension)
                self.data = pd.read_csv(data, **kwargs)
            elif data_path.suffix == ".parquet":
                # read as a Parquet file
                self.data = pd.read_parquet(data, **kwargs)
            else:
                raise ValueError("Unsupported file format for SCDataFrame.")
        else:
            raise ValueError("Unsupported data type for SCDataFrame.")

    def export(
        self: SCDataFrame_type, file_path: str, **kwargs: Dict[str, Any]
    ) -> None:
        """
        Exports the underlying pandas DataFrame to a file.

        Args:
            file_path (str):
                The path where the DataFrame should be saved.
            **kwargs:
                Additional keyword arguments to pass to the pandas to_* methods.

        Raises:
            ValueError:
                If the file format is unsupported.
        """

        data_path = pathlib.Path(file_path)

        # export to csv (covers .csv and .csv.gz; pandas infers compression)
        if ".csv" in data_path.suffixes:
            self.data.to_csv(file_path, **kwargs)
        # export to tsv or txt, using a tab separator
        elif any(elem in data_path.suffixes for elem in (".tsv", ".txt")):
            self.data.to_csv(file_path, sep="\t", **kwargs)
        # export to parquet
        elif data_path.suffix == ".parquet":
            self.data.to_parquet(file_path, **kwargs)
        else:
            raise ValueError("Unsupported file format for export.")

    def __call__(self: SCDataFrame_type) -> pd.DataFrame:
        """
        Returns the underlying pandas DataFrame.

        Returns:
            pd.DataFrame: The data in a pandas DataFrame.
        """
        return self.data

    def __repr__(self: SCDataFrame_type) -> str:
        """
        Returns the representation of the underlying pandas DataFrame.

        Returns:
            str: The string representation of the data.
        """
        return repr(self.data)

    def __getattr__(self: SCDataFrame_type, attr: str) -> Any:  # noqa: ANN401
        """
        Intercept attribute accesses and delegate them to the underlying
        pandas DataFrame.

        Note: Python invokes __getattr__ only after normal attribute lookup
        fails, so instance attributes such as `data` and `data_source` are
        served directly and never reach this method.

        Args:
            attr (str):
                The name of the attribute being accessed.

        Returns:
            Any:
                The value of the attribute from the pandas DataFrame.

        Raises:
            AttributeError:
                If `data` itself has not been set on the instance (guards
                against infinite recursion during, e.g., unpickling).
        """
        if attr == "data":
            # `data` missing from the instance means delegation is impossible;
            # raising here avoids infinite recursion via self.data below.
            raise AttributeError(attr)
        return getattr(self.data, attr)

    def __getitem__(self: SCDataFrame_type, key: Union[int, str]) -> Any:  # noqa: ANN401
        """
        Returns an element or a slice of the underlying pandas DataFrame.

        Args:
            key:
                The key or slice to access the data.

        Returns:
            pd.DataFrame or any:
                The selected element or slice of data.
        """
        return self.data[key]
62 changes: 62 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
https://docs.pytest.org/en/7.1.x/explanation/fixtures.html
"""

import pathlib

import pandas as pd
import pytest

Expand All @@ -24,3 +26,63 @@ def fixture_basic_outlier_dataframe():
Creates basic example data for use in tests
"""
return pd.DataFrame({"example_feature": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})


@pytest.fixture(name="basic_outlier_csv")
def fixture_basic_outlier_csv(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a CSV file
    and returns its path for use in tests.
    """

    csv_path = tmp_path / "basic_example.csv"
    basic_outlier_dataframe.to_csv(csv_path, index=False)

    return csv_path


@pytest.fixture(name="basic_outlier_csv_gz")
def fixture_basic_outlier_csv_gz(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a gzip-compressed CSV file
    and returns its path for use in tests.
    """

    csv_gz_path = tmp_path / "example.csv.gz"
    basic_outlier_dataframe.to_csv(csv_gz_path, index=False, compression="gzip")

    return csv_gz_path


@pytest.fixture(name="basic_outlier_tsv")
def fixture_basic_outlier_tsv(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a tab-separated file
    and returns its path for use in tests.
    """

    tsv_path = tmp_path / "example.tsv"
    basic_outlier_dataframe.to_csv(tsv_path, sep="\t", index=False)

    return tsv_path


@pytest.fixture(name="basic_outlier_parquet")
def fixture_basic_outlier_parquet(
    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
):
    """
    Writes the basic example DataFrame to a Parquet file
    and returns its path for use in tests.
    """

    parquet_path = tmp_path / "example.parquet"
    basic_outlier_dataframe.to_parquet(parquet_path, index=False)

    return parquet_path
Loading
Loading