From 1b7844cf8d6fdd2b631e0b731edf75fb756390ea Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 10 Jun 2024 12:21:05 -0600 Subject: [PATCH 01/27] Create qcdataframe.py --- src/cosmicqc/qcdataframe.py | 63 +++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/cosmicqc/qcdataframe.py diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py new file mode 100644 index 0000000..ba25c6f --- /dev/null +++ b/src/cosmicqc/qcdataframe.py @@ -0,0 +1,63 @@ +""" +Defines a QCDataFrame class for use in coSMicQC. +""" + +import pandas as pd +from typing import Union + +class QCDataFrame: + """ + A class to handle and load different types of data files into a pandas DataFrame. + + This class can initialize with either a pandas DataFrame or a file path (CSV, TSV, + TXT, or Parquet). When initialized with a file path, it reads the data into a + pandas DataFrame. + + Attributes: + reference (str): + A string indicating the type of data source, either 'pd.DataFrame' + or the file path. + data (pd.DataFrame): + The loaded data in a pandas DataFrame. + + Methods: + __call__(): + Returns the underlying pandas DataFrame. + """ + + def __init__(self, data: Union[pd.DataFrame, str], **kwargs) -> None: + """ + Initializes the QCDataFrame with either a DataFrame or a file path. + + Args: + data (Union[pd.DataFrame, str]): + The data source, either a pandas DataFrame or a file path. + **kwargs: + Additional keyword arguments to pass to the pandas read functions. + """ + if isinstance(data, pd.DataFrame): + # if data is a pd.DataFrame, remember this within the reference attr + self.reference = "pd.DataFrame" + self.data = data + elif isinstance(data, str): + # if the data is a string, remember the original source + # through a reference attr + self.reference = data + + # Read the data from the file based on its extension + if data.endswith(".csv"): + self.data = pd.read_csv(data, **kwargs) + elif data.endswith(".tsv") or data.endswith(".txt"): + self.data = pd.read_csv(data, delimiter="\t", **kwargs) + elif data.endswith(".parquet"): + self.data = pd.read_parquet(data, **kwargs) + + def __call__(self) -> pd.DataFrame: + """ + Returns the underlying pandas DataFrame. + + Returns: + pd.DataFrame: The data in a pandas DataFrame. + """ + return self.data + From 978484127fd4bf00a96547d0b8b08bcd68495755 Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 10 Jun 2024 12:33:33 -0600 Subject: [PATCH 02/27] linting --- src/cosmicqc/qcdataframe.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py index ba25c6f..7a229fb 100644 --- a/src/cosmicqc/qcdataframe.py +++ b/src/cosmicqc/qcdataframe.py @@ -2,8 +2,10 @@ Defines a QCDataFrame class for use in coSMicQC. """ +from typing import Any, Dict, Self, Union + import pandas as pd -from typing import Union + class QCDataFrame: """ @@ -25,7 +27,9 @@ class QCDataFrame: Returns the underlying pandas DataFrame. """ - def __init__(self, data: Union[pd.DataFrame, str], **kwargs) -> None: + def __init__( + self: Self, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any] + ) -> None: """ Initializes the QCDataFrame with either a DataFrame or a file path. @@ -52,7 +56,7 @@ def __init__(self, data: Union[pd.DataFrame, str], **kwargs) -> None: elif data.endswith(".parquet"): self.data = pd.read_parquet(data, **kwargs) - def __call__(self) -> pd.DataFrame: + def __call__(self: Self) -> pd.DataFrame: """ Returns the underlying pandas DataFrame. @@ -60,4 +64,3 @@ def __call__(self) -> pd.DataFrame: pd.DataFrame: The data in a pandas DataFrame. """ return self.data - From c39e2b0c4f000289ffcb87b107ce964bb1ef80fe Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 10 Jun 2024 14:04:42 -0600 Subject: [PATCH 03/27] add qcdataframe --- src/cosmicqc/__init__.py | 1 + src/cosmicqc/analyze.py | 32 ++++++++++++++------ src/cosmicqc/qcdataframe.py | 58 ++++++++++++++++++++++++++++++++++--- tests/conftest.py | 41 ++++++++++++++++++++++++++ tests/test_analyze.py | 18 +++++++++++- tests/test_qcdataframe.py | 42 +++++++++++++++++++++++++++ 6 files changed, 178 insertions(+), 14 deletions(-) create mode 100644 tests/test_qcdataframe.py diff --git a/src/cosmicqc/__init__.py b/src/cosmicqc/__init__.py index aca6bd8..82f4e10 100644 --- a/src/cosmicqc/__init__.py +++ b/src/cosmicqc/__init__.py @@ -3,6 +3,7 @@ """ from .analyze import find_outliers +from .qcdataframe import QCDataFrame # note: version placeholder is updated during build # by poetry-dynamic-versioning. diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index 9f33c4b..70833a6 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -11,13 +11,15 @@ import yaml from scipy.stats import zscore as scipy_zscore +from .qcdataframe import QCDataFrame + DEFAULT_QC_THRESHOLD_FILE = ( f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml" ) def identify_outliers( - df: pd.DataFrame, + df: Union[QCDataFrame, pd.DataFrame, str], feature_thresholds: Union[Dict[str, float], str], feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, include_threshold_scores: bool = False, @@ -30,8 +32,8 @@ def identify_outliers( threshold of 0 as that would represent the whole dataset. Args: - df: pd.DataFrame - Data frame with converted output from CytoTable. + df: Union[QCDataFrame, pd.DataFrame, str] + DataFrame or file with converted output from CytoTable. metadata_columns: List[str] List of metadata columns that should be outputted with the outlier data. feature_thresholds: Dict[str, float] @@ -52,6 +54,10 @@ def identify_outliers( or not for use within other functions. """ + # interpret the df as QCDataFrame + if not isinstance(df, QCDataFrame): + df = QCDataFrame(data=df) + # create a copy of the dataframe to ensure # we don't modify the supplied dataframe inplace. outlier_df = df.copy() @@ -107,7 +113,7 @@ def identify_outliers( def find_outliers( - df: pd.DataFrame, + df: Union[QCDataFrame, pd.DataFrame, str], metadata_columns: List[str], feature_thresholds: Union[Dict[str, float], str], feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, @@ -117,8 +123,8 @@ def find_outliers( with only the outliers and provided metadata columns. Args: - df: pd.DataFrame - Data frame with converted output from CytoTable. + df: Union[QCDataFrame, pd.DataFrame, str] + DataFrame or file with converted output from CytoTable. metadata_columns: List[str] List of metadata columns that should be outputted with the outlier data. feature_thresholds: Dict[str, float] @@ -138,6 +144,10 @@ def find_outliers( Outlier data frame for the given conditions. """ + # interpret the df as QCDataFrame + if not isinstance(df, QCDataFrame): + df = QCDataFrame(data=df) + if isinstance(feature_thresholds, str): feature_thresholds = read_thresholds_set_from_file( feature_thresholds=feature_thresholds, @@ -169,7 +179,7 @@ def find_outliers( def label_outliers( - df: pd.DataFrame, + df: Union[QCDataFrame, pd.DataFrame, str], feature_thresholds: Optional[Union[Dict[str, float], str]] = None, feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, include_threshold_scores: bool = False, @@ -179,8 +189,8 @@ def label_outliers( where a cell passed or failed the quality control condition(s). Args: - df: pd.DataFrame - Data frame with converted output from CytoTable. + df: Union[QCDataFrame, pd.DataFrame, str] + DataFrame or file with converted output from CytoTable. feature_thresholds: Dict[str, float] One of two options: A dictionary with the feature name(s) as the key(s) and their assigned @@ -201,6 +211,10 @@ def label_outliers( Full dataframe with optional scores and outlier boolean column. """ + # interpret the df as QCDataFrame + if not isinstance(df, QCDataFrame): + df = QCDataFrame(data=df) + # for single outlier processing if isinstance(feature_thresholds, (str, dict)): # return the outlier dataframe for one threshold rule diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py index 7a229fb..3ed24fa 100644 --- a/src/cosmicqc/qcdataframe.py +++ b/src/cosmicqc/qcdataframe.py @@ -2,6 +2,7 @@ Defines a QCDataFrame class for use in coSMicQC. """ +import pathlib from typing import Any, Dict, Self, Union import pandas as pd @@ -39,23 +40,38 @@ def __init__( **kwargs: Additional keyword arguments to pass to the pandas read functions. """ + + # print(data) + # print(type(data)) + # print(isinstance(data, QCDataFrame)) + if isinstance(data, pd.DataFrame): # if data is a pd.DataFrame, remember this within the reference attr self.reference = "pd.DataFrame" self.data = data - elif isinstance(data, str): + + elif isinstance(data, pathlib.Path | str): # if the data is a string, remember the original source # through a reference attr self.reference = data + # interpret the data through pathlib + data_path = pathlib.Path(data) + # Read the data from the file based on its extension - if data.endswith(".csv"): + if data_path.suffix == ".csv": + # read as a CSV self.data = pd.read_csv(data, **kwargs) - elif data.endswith(".tsv") or data.endswith(".txt"): + elif data_path.suffix in (".tsv", ".txt"): + # read as a TSV self.data = pd.read_csv(data, delimiter="\t", **kwargs) - elif data.endswith(".parquet"): + elif data_path.suffix == ".parquet": + # read as a Parquet file self.data = pd.read_parquet(data, **kwargs) + else: + raise ValueError("Unsupported file format for QCDataFrame.") + def __call__(self: Self) -> pd.DataFrame: """ Returns the underlying pandas DataFrame. @@ -64,3 +80,37 @@ def __call__(self: Self) -> pd.DataFrame: pd.DataFrame: The data in a pandas DataFrame. """ return self.data + + def __repr__(self: Self) -> pd.DataFrame: + """ + Returns the underlying pandas DataFrame. + + Returns: + pd.DataFrame: The data in a pandas DataFrame. + """ + return self.data + + def __getattr__(self: Self, attr: str) -> Any: # noqa: ANN401 + """ + Intercept attribute accesses and delegate them to the underlying + pandas DataFrame. + + Args: + attr (str): The name of the attribute being accessed. + + Returns: + Any: The value of the attribute from the pandas DataFrame. + """ + return getattr(self.data, attr) + + def __getitem__(self: Self, key: Union[int, str]) -> Any: # noqa: ANN401 + """ + Returns an element or a slice of the underlying pandas DataFrame. + + Args: + key: The key or slice to access the data. + + Returns: + pd.DataFrame or any: The selected element or slice of data. + """ + return self.data[key] diff --git a/tests/conftest.py b/tests/conftest.py index a8f796a..967ad2a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,8 @@ https://docs.pytest.org/en/7.1.x/explanation/fixtures.html """ +import pathlib + import pandas as pd import pytest @@ -24,3 +26,42 @@ def fixture_basic_outlier_dataframe(): Creates basic example data for use in tests """ return pd.DataFrame({"example_feature": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + + +@pytest.fixture(name="basic_outlier_csv") +def fixture_basic_outlier_csv( + tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame +): + """ + Creates basic example data csv for use in tests + """ + + basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv") + + return csv_path + + +@pytest.fixture(name="basic_outlier_tsv") +def fixture_basic_outlier_tsv( + tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame +): + """ + Creates basic example data tsv for use in tests + """ + + basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t") + + return tsv_path + + +@pytest.fixture(name="basic_outlier_parquet") +def fixture_basic_outlier_parquet( + tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame +): + """ + Creates basic example data parquet for use in tests + """ + + basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet") + + return parquet_path diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 6c7a879..a55cf6b 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -7,7 +7,7 @@ from cosmicqc import analyze -def test_find_outliers_basic(basic_outlier_dataframe: pd.DataFrame): +def test_find_outliers_basic_dataframe(basic_outlier_dataframe: pd.DataFrame): """ Testing find_outliers with basic/simulated data. """ @@ -27,6 +27,21 @@ def test_find_outliers_basic(basic_outlier_dataframe: pd.DataFrame): } +def test_find_outliers_basic_csv(basic_outlier_csv: str): + """ + Testing find_outliers with csv data. + """ + + # assert that we have the output we expect + assert analyze.find_outliers( + df=basic_outlier_csv, + feature_thresholds={"example_feature": 1}, + metadata_columns=[], + ).to_dict(orient="dict") == { + "example_feature": {8: 9, 9: 10}, + } + + def test_find_outliers_cfret(cytotable_CFReT_data_df: pd.DataFrame): """ Testing find_outliers with CytoTable CFReT data. @@ -315,6 +330,7 @@ def test_find_outliers_dict_and_default_config_cfret( def test_label_outliers( basic_outlier_dataframe: pd.DataFrame, + basic_outlier_csv: str, cytotable_CFReT_data_df: pd.DataFrame, ): """ diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py new file mode 100644 index 0000000..1124305 --- /dev/null +++ b/tests/test_qcdataframe.py @@ -0,0 +1,42 @@ +""" +Tests cosmicqc qcdataframe module +""" + +import pandas as pd +from cosmicqc.qcdataframe import QCDataFrame + +def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame): + """ + Tests QCDataFrame with pd.DataFrame input. + """ + qc_df = QCDataFrame(data=basic_outlier_dataframe) + assert qc_df.reference == "pd.DataFrame" + assert qc_df.equals(basic_outlier_dataframe) + +def test_qcdataframe_init_with_csv(basic_outlier_csv:str): + """ + Tests QCDataFrame with CSV input. + """ + qc_df = QCDataFrame(data=basic_outlier_csv) + expected_df = pd.read_csv(basic_outlier_csv) + assert qc_df.reference == basic_outlier_csv + assert qc_df.equals(expected_df) + +def test_qcdataframe_init_with_tsv(basic_outlier_tsv:str): + """ + Tests QCDataFrame with TSV input. + """ + qc_df = QCDataFrame(data=basic_outlier_tsv) + expected_df = pd.read_csv(basic_outlier_tsv, delimiter='\t') + assert qc_df.reference == basic_outlier_tsv + assert qc_df.equals(expected_df) + +def test_qcdataframe_init_with_parquet(basic_outlier_parquet:str): + """ + Tests QCDataFrame with TSV input. + """ + qc_df = QCDataFrame(data=basic_outlier_parquet) + expected_df = pd.read_parquet(basic_outlier_parquet) + assert qc_df.reference == basic_outlier_parquet + assert qc_df.equals(expected_df) + From f3003e1de72a83a281308149acefc29fe4e63683 Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 10 Jun 2024 14:04:54 -0600 Subject: [PATCH 04/27] linting --- tests/test_qcdataframe.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py index 1124305..1b1fad4 100644 --- a/tests/test_qcdataframe.py +++ b/tests/test_qcdataframe.py @@ -5,6 +5,7 @@ import pandas as pd from cosmicqc.qcdataframe import QCDataFrame + def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame): """ Tests QCDataFrame with pd.DataFrame input. @@ -13,7 +14,8 @@ def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame): assert qc_df.reference == "pd.DataFrame" assert qc_df.equals(basic_outlier_dataframe) -def test_qcdataframe_init_with_csv(basic_outlier_csv:str): + +def test_qcdataframe_init_with_csv(basic_outlier_csv: str): """ Tests QCDataFrame with CSV input. """ @@ -22,16 +24,18 @@ def test_qcdataframe_init_with_csv(basic_outlier_csv:str): assert qc_df.reference == basic_outlier_csv assert qc_df.equals(expected_df) -def test_qcdataframe_init_with_tsv(basic_outlier_tsv:str): + +def test_qcdataframe_init_with_tsv(basic_outlier_tsv: str): """ Tests QCDataFrame with TSV input. """ qc_df = QCDataFrame(data=basic_outlier_tsv) - expected_df = pd.read_csv(basic_outlier_tsv, delimiter='\t') + expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t") assert qc_df.reference == basic_outlier_tsv assert qc_df.equals(expected_df) -def test_qcdataframe_init_with_parquet(basic_outlier_parquet:str): + +def test_qcdataframe_init_with_parquet(basic_outlier_parquet: str): """ Tests QCDataFrame with TSV input. """ @@ -39,4 +43,3 @@ def test_qcdataframe_init_with_parquet(basic_outlier_parquet:str): expected_df = pd.read_parquet(basic_outlier_parquet) assert qc_df.reference == basic_outlier_parquet assert qc_df.equals(expected_df) - From b97f3a57aaa273b19fd61276db5bfe391b21de6d Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 10 Jun 2024 16:14:27 -0600 Subject: [PATCH 05/27] adding tests --- src/cosmicqc/qcdataframe.py | 4 ---- tests/conftest.py | 6 +++--- tests/test_analyze.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py index 3ed24fa..8498311 100644 --- a/src/cosmicqc/qcdataframe.py +++ b/src/cosmicqc/qcdataframe.py @@ -41,10 +41,6 @@ def __init__( Additional keyword arguments to pass to the pandas read functions. """ - # print(data) - # print(type(data)) - # print(isinstance(data, QCDataFrame)) - if isinstance(data, pd.DataFrame): # if data is a pd.DataFrame, remember this within the reference attr self.reference = "pd.DataFrame" diff --git a/tests/conftest.py b/tests/conftest.py index 967ad2a..ad0c17e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,7 +36,7 @@ def fixture_basic_outlier_csv( Creates basic example data csv for use in tests """ - basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv") + basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv", index=False) return csv_path @@ -49,7 +49,7 @@ def fixture_basic_outlier_tsv( Creates basic example data tsv for use in tests """ - basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t") + basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t", index=False) return tsv_path @@ -62,6 +62,6 @@ def fixture_basic_outlier_parquet( Creates basic example data parquet for use in tests """ - basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet") + basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet", index=False) return parquet_path diff --git a/tests/test_analyze.py b/tests/test_analyze.py index a55cf6b..26ea6db 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -337,6 +337,21 @@ def test_label_outliers( Tests label_outliers """ + pd.testing.assert_frame_equal( + analyze.label_outliers( + df=basic_outlier_dataframe, + feature_thresholds={"example_feature": 1}, + include_threshold_scores=True, + ), + analyze.label_outliers( + df=basic_outlier_csv, + feature_thresholds={"example_feature": 1}, + include_threshold_scores=True, + ), + ) + + + # test basic single-column result with zscores assert analyze.label_outliers( df=basic_outlier_dataframe, From 1069a4af958de2ff12041d4282ca55c457ce7143 Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 10 Jun 2024 16:14:38 -0600 Subject: [PATCH 06/27] linting --- tests/conftest.py | 8 ++++++-- tests/test_analyze.py | 2 -- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ad0c17e..e3fb24c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -49,7 +49,9 @@ def fixture_basic_outlier_tsv( Creates basic example data tsv for use in tests """ - basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t", index=False) + basic_outlier_dataframe.to_csv( + tsv_path := tmp_path / "example.tsv", sep="\t", index=False + ) return tsv_path @@ -62,6 +64,8 @@ def fixture_basic_outlier_parquet( Creates basic example data parquet for use in tests """ - basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet", index=False) + basic_outlier_dataframe.to_parquet( + parquet_path := tmp_path / "example.parquet", index=False + ) return parquet_path diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 26ea6db..21244b9 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -350,8 +350,6 @@ def test_label_outliers( ), ) - - # test basic single-column result with zscores assert analyze.label_outliers( df=basic_outlier_dataframe, From d52a89f639fe9395f0a3384677333996a39b8233 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 11 Jun 2024 08:48:27 -0600 Subject: [PATCH 07/27] update name, tests --- src/cosmicqc/__init__.py | 4 +- src/cosmicqc/analyze.py | 32 +++++++------- .../{qcdataframe.py => scdataframe.py} | 33 ++++++++------ tests/test_analyze.py | 16 +++++++ tests/test_qcdataframe.py | 44 +++++++++---------- 5 files changed, 76 insertions(+), 53 deletions(-) rename src/cosmicqc/{qcdataframe.py => scdataframe.py} (77%) diff --git a/src/cosmicqc/__init__.py b/src/cosmicqc/__init__.py index 82f4e10..05b2bbe 100644 --- a/src/cosmicqc/__init__.py +++ b/src/cosmicqc/__init__.py @@ -3,8 +3,8 @@ """ from .analyze import find_outliers -from .qcdataframe import QCDataFrame +from .scdataframe import SCDataFrame # note: version placeholder is updated during build # by poetry-dynamic-versioning. -__version__ = "0.0.1" +__version__ = "0.0.0" diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index 70833a6..d616f78 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -11,7 +11,7 @@ import yaml from scipy.stats import zscore as scipy_zscore -from .qcdataframe import QCDataFrame +from .scdataframe import SCDataFrame DEFAULT_QC_THRESHOLD_FILE = ( f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml" @@ -19,7 +19,7 @@ def identify_outliers( - df: Union[QCDataFrame, pd.DataFrame, str], + df: Union[SCDataFrame, pd.DataFrame, str], feature_thresholds: Union[Dict[str, float], str], feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, include_threshold_scores: bool = False, @@ -32,7 +32,7 @@ def identify_outliers( threshold of 0 as that would represent the whole dataset. Args: - df: Union[QCDataFrame, pd.DataFrame, str] + df: Union[SCDataFrame, pd.DataFrame, str] DataFrame or file with converted output from CytoTable. metadata_columns: List[str] List of metadata columns that should be outputted with the outlier data. @@ -54,9 +54,9 @@ def identify_outliers( or not for use within other functions. """ - # interpret the df as QCDataFrame - if not isinstance(df, QCDataFrame): - df = QCDataFrame(data=df) + # interpret the df as SCDataFrame + if not isinstance(df, SCDataFrame): + df = SCDataFrame(data=df) # create a copy of the dataframe to ensure # we don't modify the supplied dataframe inplace. @@ -113,7 +113,7 @@ def identify_outliers( def find_outliers( - df: Union[QCDataFrame, pd.DataFrame, str], + df: Union[SCDataFrame, pd.DataFrame, str], metadata_columns: List[str], feature_thresholds: Union[Dict[str, float], str], feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, @@ -123,7 +123,7 @@ def find_outliers( with only the outliers and provided metadata columns. Args: - df: Union[QCDataFrame, pd.DataFrame, str] + df: Union[SCDataFrame, pd.DataFrame, str] DataFrame or file with converted output from CytoTable. metadata_columns: List[str] List of metadata columns that should be outputted with the outlier data. @@ -144,9 +144,9 @@ def find_outliers( Outlier data frame for the given conditions. """ - # interpret the df as QCDataFrame - if not isinstance(df, QCDataFrame): - df = QCDataFrame(data=df) + # interpret the df as SCDataFrame + if not isinstance(df, SCDataFrame): + df = SCDataFrame(data=df) if isinstance(feature_thresholds, str): feature_thresholds = read_thresholds_set_from_file( @@ -179,7 +179,7 @@ def find_outliers( def label_outliers( - df: Union[QCDataFrame, pd.DataFrame, str], + df: Union[SCDataFrame, pd.DataFrame, str], feature_thresholds: Optional[Union[Dict[str, float], str]] = None, feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, include_threshold_scores: bool = False, @@ -189,7 +189,7 @@ def label_outliers( where a cell passed or failed the quality control condition(s). Args: - df: Union[QCDataFrame, pd.DataFrame, str] + df: Union[SCDataFrame, pd.DataFrame, str] DataFrame or file with converted output from CytoTable. feature_thresholds: Dict[str, float] One of two options: @@ -211,9 +211,9 @@ def label_outliers( Full dataframe with optional scores and outlier boolean column. """ - # interpret the df as QCDataFrame - if not isinstance(df, QCDataFrame): - df = QCDataFrame(data=df) + # interpret the df as SCDataFrame + if not isinstance(df, SCDataFrame): + df = SCDataFrame(data=df) # for single outlier processing if isinstance(feature_thresholds, (str, dict)): diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/scdataframe.py similarity index 77% rename from src/cosmicqc/qcdataframe.py rename to src/cosmicqc/scdataframe.py index 8498311..da0e1c1 100644 --- a/src/cosmicqc/qcdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -1,5 +1,5 @@ """ -Defines a QCDataFrame class for use in coSMicQC. +Defines a SCDataFrame class for use in coSMicQC. """ import pathlib @@ -8,7 +8,7 @@ import pandas as pd -class QCDataFrame: +class SCDataFrame: """ A class to handle and load different types of data files into a pandas DataFrame. @@ -17,8 +17,8 @@ class QCDataFrame: pandas DataFrame. Attributes: - reference (str): - A string indicating the type of data source, either 'pd.DataFrame' + data_source (str): + A string indicating the data source, either 'pd.DataFrame' or the file path. data (pd.DataFrame): The loaded data in a pandas DataFrame. @@ -26,13 +26,19 @@ class QCDataFrame: Methods: __call__(): Returns the underlying pandas DataFrame. + __repr__(): + Returns representation of underlying pandas DataFrame. + __getattr__(): + Returns underlying attributes of pandas DataFrame. + __getitem__(): + Returns slice of data from pandas DataFrame. """ def __init__( self: Self, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any] ) -> None: """ - Initializes the QCDataFrame with either a DataFrame or a file path. + Initializes the SCDataFrame with either a DataFrame or a file path. Args: data (Union[pd.DataFrame, str]): @@ -42,14 +48,14 @@ def __init__( """ if isinstance(data, pd.DataFrame): - # if data is a pd.DataFrame, remember this within the reference attr - self.reference = "pd.DataFrame" + # if data is a pd.DataFrame, remember this within the data_source attr + self.data_source = "pd.DataFrame" self.data = data elif isinstance(data, pathlib.Path | str): # if the data is a string, remember the original source - # through a reference attr - self.reference = data + # through a data_source attr + self.data_source = data # interpret the data through pathlib data_path = pathlib.Path(data) @@ -64,9 +70,10 @@ def __init__( elif data_path.suffix == ".parquet": # read as a Parquet file self.data = pd.read_parquet(data, **kwargs) - + else: + raise ValueError("Unsupported file format for SCDataFrame.") else: - raise ValueError("Unsupported file format for QCDataFrame.") + raise ValueError("Unsupported input type for SCDataFrame.") def __call__(self: Self) -> pd.DataFrame: """ @@ -79,12 +86,12 @@ def __call__(self: Self) -> pd.DataFrame: def __repr__(self: Self) -> pd.DataFrame: """ - Returns the underlying pandas DataFrame. + Returns the representation of underlying pandas DataFrame. Returns: pd.DataFrame: The data in a pandas DataFrame. """ - return self.data + return repr(self.data) def __getattr__(self: Self, attr: str) -> Any: # noqa: ANN401 """ diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 21244b9..6ade9a2 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -337,6 +337,7 @@ def test_label_outliers( Tests label_outliers """ + # compare the dataframe vs csv output to make sure they are equivalent pd.testing.assert_frame_equal( analyze.label_outliers( df=basic_outlier_dataframe, @@ -451,12 +452,27 @@ def test_label_outliers( def test_identify_outliers( basic_outlier_dataframe: pd.DataFrame, + basic_outlier_csv: str, cytotable_CFReT_data_df: pd.DataFrame, ): """ Tests identify_outliers """ + # show that dataframe and csv output are the same + pd.testing.assert_frame_equal( + analyze.identify_outliers( + df=basic_outlier_dataframe, + feature_thresholds={"example_feature": 1}, + include_threshold_scores=True, + ), + analyze.identify_outliers( + df=basic_outlier_csv, + feature_thresholds={"example_feature": 1}, + include_threshold_scores=True, + ), + ) + assert analyze.identify_outliers( df=basic_outlier_dataframe, feature_thresholds={"example_feature": 1}, diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py index 1b1fad4..d2ec019 100644 --- a/tests/test_qcdataframe.py +++ b/tests/test_qcdataframe.py @@ -1,45 +1,45 @@ """ -Tests cosmicqc qcdataframe module +Tests cosmicqc SCDataFrame module """ import pandas as pd -from cosmicqc.qcdataframe import QCDataFrame +from cosmicqc.scdataframe import SCDataFrame -def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame): +def test_SCDataFrame_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame): """ - Tests QCDataFrame with pd.DataFrame input. + Tests SCDataFrame with pd.DataFrame input. """ - qc_df = QCDataFrame(data=basic_outlier_dataframe) - assert qc_df.reference == "pd.DataFrame" - assert qc_df.equals(basic_outlier_dataframe) + sc_df = SCDataFrame(data=basic_outlier_dataframe) + assert sc_df.data_source == "pd.DataFrame" + assert sc_df.equals(basic_outlier_dataframe) -def test_qcdataframe_init_with_csv(basic_outlier_csv: str): +def test_SCDataFrame_init_with_csv(basic_outlier_csv: str): """ - Tests QCDataFrame with CSV input. + Tests SCDataFrame with CSV input. """ - qc_df = QCDataFrame(data=basic_outlier_csv) + sc_df = SCDataFrame(data=basic_outlier_csv) expected_df = pd.read_csv(basic_outlier_csv) - assert qc_df.reference == basic_outlier_csv - assert qc_df.equals(expected_df) + assert sc_df.data_source == basic_outlier_csv + assert sc_df.equals(expected_df) -def test_qcdataframe_init_with_tsv(basic_outlier_tsv: str): +def test_SCDataFrame_init_with_tsv(basic_outlier_tsv: str): """ - Tests QCDataFrame with TSV input. + Tests SCDataFrame with TSV input. """ - qc_df = QCDataFrame(data=basic_outlier_tsv) + sc_df = SCDataFrame(data=basic_outlier_tsv) expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t") - assert qc_df.reference == basic_outlier_tsv - assert qc_df.equals(expected_df) + assert sc_df.data_source == basic_outlier_tsv + assert sc_df.equals(expected_df) -def test_qcdataframe_init_with_parquet(basic_outlier_parquet: str): +def test_SCDataFrame_init_with_parquet(basic_outlier_parquet: str): """ - Tests QCDataFrame with TSV input. + Tests SCDataFrame with TSV input. """ - qc_df = QCDataFrame(data=basic_outlier_parquet) + sc_df = SCDataFrame(data=basic_outlier_parquet) expected_df = pd.read_parquet(basic_outlier_parquet) - assert qc_df.reference == basic_outlier_parquet - assert qc_df.equals(expected_df) + assert sc_df.data_source == basic_outlier_parquet + assert sc_df.equals(expected_df) From 1bf8262dcdf160592ffef24cdb0c62b9c0c277a4 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 11 Jun 2024 08:58:09 -0600 Subject: [PATCH 08/27] add back compat for self type --- src/cosmicqc/scdataframe.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index da0e1c1..37e52a7 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -3,10 +3,14 @@ """ import pathlib -from typing import Any, Dict, Self, Union +from typing import Any, Dict, TypeVar, Union import pandas as pd +# provide backwards compatibility for Self type in earlier Python versions. +# see: https://peps.python.org/pep-0484/#annotating-instance-and-class-methods +Self_SCDataFrame = TypeVar("Self_SCDataFrame", bound="SCDataFrame") + class SCDataFrame: """ @@ -35,7 +39,7 @@ class SCDataFrame: """ def __init__( - self: Self, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any] + self: Self_SCDataFrame, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any] ) -> None: """ Initializes the SCDataFrame with either a DataFrame or a file path. @@ -75,7 +79,7 @@ def __init__( else: raise ValueError("Unsupported input type for SCDataFrame.") - def __call__(self: Self) -> pd.DataFrame: + def __call__(self: Self_SCDataFrame) -> pd.DataFrame: """ Returns the underlying pandas DataFrame. @@ -84,7 +88,7 @@ def __call__(self: Self) -> pd.DataFrame: """ return self.data - def __repr__(self: Self) -> pd.DataFrame: + def __repr__(self: Self_SCDataFrame) -> pd.DataFrame: """ Returns the representation of underlying pandas DataFrame. @@ -93,7 +97,7 @@ def __repr__(self: Self) -> pd.DataFrame: """ return repr(self.data) - def __getattr__(self: Self, attr: str) -> Any: # noqa: ANN401 + def __getattr__(self: Self_SCDataFrame, attr: str) -> Any: # noqa: ANN401 """ Intercept attribute accesses and delegate them to the underlying pandas DataFrame. @@ -106,7 +110,7 @@ def __getattr__(self: Self, attr: str) -> Any: # noqa: ANN401 """ return getattr(self.data, attr) - def __getitem__(self: Self, key: Union[int, str]) -> Any: # noqa: ANN401 + def __getitem__(self: Self_SCDataFrame, key: Union[int, str]) -> Any: # noqa: ANN401 """ Returns an element or a slice of the underlying pandas DataFrame. From 772f89560e7fac449cd4bf6396779ef5182b46d9 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 11 Jun 2024 09:07:13 -0600 Subject: [PATCH 09/27] back compat for isinstance --- src/cosmicqc/scdataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 37e52a7..3fc349c 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -56,7 +56,7 @@ def __init__( self.data_source = "pd.DataFrame" self.data = data - elif isinstance(data, pathlib.Path | str): + elif isinstance(data, pathlib.Path) or isinstance(data, str): # noqa: PLR1701, SIM101 # if the data is a string, remember the original source # through a data_source attr self.data_source = data From d0ea33c83e114a1454ed7448e957da97b0b18e8d Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 11 Jun 2024 09:08:18 -0600 Subject: [PATCH 10/27] linting --- src/cosmicqc/scdataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 3fc349c..f94bdd2 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -56,7 +56,7 @@ def __init__( self.data_source = "pd.DataFrame" self.data = data - elif isinstance(data, pathlib.Path) or isinstance(data, str): # noqa: PLR1701, SIM101 + elif isinstance(data, pathlib.Path) or isinstance(data, str): # noqa: PLR1701, SIM101 # if the data is a string, remember the original source # through a data_source attr self.data_source = data From f8773b26de54a6ac306ae9ac1c1a220aa21ae427 Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 14 Jun 2024 15:42:39 -0600 Subject: [PATCH 11/27] add csv.gz compatibility --- src/cosmicqc/scdataframe.py | 3 +++ tests/conftest.py | 15 +++++++++++++++ tests/test_qcdataframe.py | 10 ++++++++++ 3 files changed, 28 insertions(+) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index f94bdd2..a06b643 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -68,6 +68,9 @@ def __init__( if data_path.suffix == ".csv": # read as a CSV self.data = pd.read_csv(data, **kwargs) + elif data_path.suffixes == [".csv", ".gz"]: + # read as a CSV.GZ file + self.data = pd.read_csv(data, compression="gzip", **kwargs) elif data_path.suffix in (".tsv", ".txt"): # read as a TSV self.data = pd.read_csv(data, delimiter="\t", **kwargs) diff --git a/tests/conftest.py b/tests/conftest.py index e3fb24c..ed8591e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,6 +41,21 @@ def fixture_basic_outlier_csv( return csv_path +@pytest.fixture(name="basic_outlier_csv_gz") +def fixture_basic_outlier_csv_gz( + tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame +): + """ + Creates basic example data csv for use in tests + """ + + basic_outlier_dataframe.to_csv( + csv_gz_path := tmp_path / "example.csv.gz", index=False, compression="gzip" + ) + + return csv_gz_path + + @pytest.fixture(name="basic_outlier_tsv") def fixture_basic_outlier_tsv( tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py index d2ec019..75e7448 100644 --- a/tests/test_qcdataframe.py +++ b/tests/test_qcdataframe.py @@ -25,6 +25,16 @@ def test_SCDataFrame_init_with_csv(basic_outlier_csv: str): assert sc_df.equals(expected_df) +def test_SCDataFrame_init_with_csv_gz(basic_outlier_csv_gz: str): + """ + Tests SCDataFrame with CSV input. + """ + sc_df = SCDataFrame(data=basic_outlier_csv_gz) + expected_df = pd.read_csv(basic_outlier_csv_gz) + assert sc_df.data_source == basic_outlier_csv_gz + assert sc_df.equals(expected_df) + + def test_SCDataFrame_init_with_tsv(basic_outlier_tsv: str): """ Tests SCDataFrame with TSV input. From 0ad522f32b8008b2176a9f41a47f6754bde6f637 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 18 Jun 2024 09:17:31 -0600 Subject: [PATCH 12/27] add export capabilities --- src/cosmicqc/scdataframe.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index a06b643..730718a 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -81,6 +81,32 @@ def __init__( raise ValueError("Unsupported file format for SCDataFrame.") else: raise ValueError("Unsupported input type for SCDataFrame.") + + def export(self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]) -> None: + """ + Exports the underlying pandas DataFrame to a file. + + Args: + file_path (str): The path where the DataFrame should be saved. + **kwargs: Additional keyword arguments to pass to the pandas to_* methods. + """ + + data_path = pathlib.Path(file_path) + + # export to csv + if data_path.suffix == ".csv": + self.data.to_csv(file_path, **kwargs) + + # export to tsv + elif data_path.suffix in (".tsv", ".txt"): + self.data.to_csv(file_path, sep='\t', **kwargs) + + # export to parquet + elif data_path.suffix == ".parquet": + self.data.to_parquet(file_path, **kwargs) + + else: + raise ValueError("Unsupported file format for export.") def __call__(self: Self_SCDataFrame) -> pd.DataFrame: """ @@ -100,10 +126,10 @@ def __repr__(self: Self_SCDataFrame) -> pd.DataFrame: """ return repr(self.data) - def __getattr__(self: Self_SCDataFrame, attr: str) -> Any: # noqa: ANN401 + def __getattr__(self, attr: str) -> Any: # noqa: ANN401 """ Intercept attribute accesses and delegate them to the underlying - pandas DataFrame. + pandas DataFrame, except for custom methods. Args: attr (str): The name of the attribute being accessed. @@ -111,6 +137,8 @@ def __getattr__(self: Self_SCDataFrame, attr: str) -> Any: # noqa: ANN401 Returns: Any: The value of the attribute from the pandas DataFrame. """ + if attr in self.__dict__: + return self.__dict__[attr] return getattr(self.data, attr) def __getitem__(self: Self_SCDataFrame, key: Union[int, str]) -> Any: # noqa: ANN401 From 6d33de4e8571bfb780fa64ce277814b56c84adbf Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 18 Jun 2024 09:18:00 -0600 Subject: [PATCH 13/27] rename file to correct module name --- tests/{test_qcdataframe.py => test_scdataframe.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_qcdataframe.py => test_scdataframe.py} (100%) diff --git a/tests/test_qcdataframe.py b/tests/test_scdataframe.py similarity index 100% rename from tests/test_qcdataframe.py rename to tests/test_scdataframe.py From 34cf9bfc796f084a205a0e74075c41f2236e8c81 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 18 Jun 2024 09:58:03 -0600 Subject: [PATCH 14/27] add export capabilities --- src/cosmicqc/scdataframe.py | 17 +++++----- tests/test_scdataframe.py | 62 ++++++++++++++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 14 deletions(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 730718a..4a4acae 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -81,8 +81,10 @@ def __init__( raise ValueError("Unsupported file format for SCDataFrame.") else: raise ValueError("Unsupported input type for SCDataFrame.") - - def export(self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]) -> None: + + def export( + self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any] + ) -> None: """ Exports the underlying pandas DataFrame to a file. @@ -94,17 +96,14 @@ def export(self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]) -> data_path = pathlib.Path(file_path) # export to csv - if data_path.suffix == ".csv": + if ".csv" in data_path.suffixes: self.data.to_csv(file_path, **kwargs) - # export to tsv - elif data_path.suffix in (".tsv", ".txt"): - self.data.to_csv(file_path, sep='\t', **kwargs) - + elif any(elem in data_path.suffixes for elem in (".tsv", ".txt")): + self.data.to_csv(file_path, sep="\t", **kwargs) # export to parquet elif data_path.suffix == ".parquet": self.data.to_parquet(file_path, **kwargs) - else: raise ValueError("Unsupported file format for export.") @@ -126,7 +125,7 @@ def __repr__(self: Self_SCDataFrame) -> pd.DataFrame: """ return repr(self.data) - def __getattr__(self, attr: str) -> Any: # noqa: ANN401 + def __getattr__(self: Self_SCDataFrame, attr: str) -> Any: # noqa: ANN401 """ Intercept attribute accesses and delegate them to the underlying pandas DataFrame, except for custom methods. diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py index 75e7448..78e5dd9 100644 --- a/tests/test_scdataframe.py +++ b/tests/test_scdataframe.py @@ -2,54 +2,106 @@ Tests cosmicqc SCDataFrame module """ +import pathlib + import pandas as pd from cosmicqc.scdataframe import SCDataFrame +from pyarrow import parquet -def test_SCDataFrame_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame): +def test_SCDataFrame_with_dataframe( + tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame +): """ Tests SCDataFrame with pd.DataFrame input. """ + sc_df = SCDataFrame(data=basic_outlier_dataframe) + + # test that we ingested the data properly assert sc_df.data_source == "pd.DataFrame" assert sc_df.equals(basic_outlier_dataframe) + # test export + basic_outlier_dataframe.to_parquet( + control_path := f"{tmp_path}/df_input_example.parquet" + ) + sc_df.export(test_path := f"{tmp_path}/df_input_example1.parquet") -def test_SCDataFrame_init_with_csv(basic_outlier_csv: str): + assert parquet.read_table(control_path).equals(parquet.read_table(test_path)) + + +def test_SCDataFrame_with_csv(tmp_path: pathlib.Path, basic_outlier_csv: str): """ Tests SCDataFrame with CSV input. """ + sc_df = SCDataFrame(data=basic_outlier_csv) expected_df = pd.read_csv(basic_outlier_csv) + + # test that we ingested the data properly assert sc_df.data_source == basic_outlier_csv assert sc_df.equals(expected_df) + # test export + sc_df.export(test_path := f"{tmp_path}/df_input_example.csv", index=False) + + pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path)) + -def test_SCDataFrame_init_with_csv_gz(basic_outlier_csv_gz: str): +def test_SCDataFrame_with_csv_gz(tmp_path: pathlib.Path, basic_outlier_csv_gz: str): """ Tests SCDataFrame with CSV input. """ + sc_df = SCDataFrame(data=basic_outlier_csv_gz) expected_df = pd.read_csv(basic_outlier_csv_gz) + + # test that we ingested the data properly assert sc_df.data_source == basic_outlier_csv_gz assert sc_df.equals(expected_df) + # test export + sc_df.export(test_path := f"{tmp_path}/df_input_example.csv.gz", index=False) -def test_SCDataFrame_init_with_tsv(basic_outlier_tsv: str): + pd.testing.assert_frame_equal( + expected_df, pd.read_csv(test_path, compression="gzip") + ) + + +def test_SCDataFrame_with_tsv(tmp_path: pathlib.Path, basic_outlier_tsv: str): """ Tests SCDataFrame with TSV input. """ + sc_df = SCDataFrame(data=basic_outlier_tsv) expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t") + + # test that we ingested the data properly assert sc_df.data_source == basic_outlier_tsv assert sc_df.equals(expected_df) + # test export + sc_df.export(test_path := f"{tmp_path}/df_input_example.tsv", index=False) + + pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path, sep="\t")) -def test_SCDataFrame_init_with_parquet(basic_outlier_parquet: str): + +def test_SCDataFrame_with_parquet(tmp_path: pathlib.Path, basic_outlier_parquet: str): """ Tests SCDataFrame with TSV input. """ + sc_df = SCDataFrame(data=basic_outlier_parquet) expected_df = pd.read_parquet(basic_outlier_parquet) + + # test that we ingested the data properly assert sc_df.data_source == basic_outlier_parquet assert sc_df.equals(expected_df) + + # test export + sc_df.export(test_path := f"{tmp_path}/df_input_example2.parquet") + + assert parquet.read_table(basic_outlier_parquet).equals( + parquet.read_table(test_path) + ) From fbedb7dfa5302d1ae5dd436cc052ebc6ede116b8 Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Tue, 18 Jun 2024 16:53:09 -0600 Subject: [PATCH 15/27] Apply suggestions from code review Co-authored-by: Gregory Way --- src/cosmicqc/scdataframe.py | 6 +++--- tests/conftest.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 4a4acae..d364764 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -31,9 +31,9 @@ class SCDataFrame: __call__(): Returns the underlying pandas DataFrame. __repr__(): - Returns representation of underlying pandas DataFrame. + Returns a representation of the underlying pandas DataFrame. __getattr__(): - Returns underlying attributes of pandas DataFrame. + Returns the underlying attributes of the pandas DataFrame. __getitem__(): Returns slice of data from pandas DataFrame. """ @@ -118,7 +118,7 @@ def __call__(self: Self_SCDataFrame) -> pd.DataFrame: def __repr__(self: Self_SCDataFrame) -> pd.DataFrame: """ - Returns the representation of underlying pandas DataFrame. + Returns the representation of the underlying pandas DataFrame. Returns: pd.DataFrame: The data in a pandas DataFrame. diff --git a/tests/conftest.py b/tests/conftest.py index ed8591e..bc2883d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,7 +36,7 @@ def fixture_basic_outlier_csv( Creates basic example data csv for use in tests """ - basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv", index=False) + basic_outlier_dataframe.to_csv(csv_path := tmp_path / "basic_example.csv", index=False) return csv_path From 025701497204e9bae941a5ccdf1b4123741c3290 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 18 Jun 2024 17:00:49 -0600 Subject: [PATCH 16/27] update tests and docs --- pyproject.toml | 2 +- src/cosmicqc/scdataframe.py | 20 +++++++++----------- tests/conftest.py | 4 +++- tests/test_scdataframe.py | 15 ++++++--------- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3231ed..67af76e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ select = [ # Ignore `E402` and `F401` (unused imports) in all `__init__.py` files "__init__.py" = ["E402", "F401"] # ignore typing rules for tests -"tests/*" = ["ANN201"] +"tests/*" = ["ANN201", "PLR0913"] # set dynamic versioning capabilities for project [tool.poetry-dynamic-versioning] diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index d364764..17dd0ce 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -31,7 +31,7 @@ class SCDataFrame: __call__(): Returns the underlying pandas DataFrame. __repr__(): - Returns a representation of the underlying pandas DataFrame. + Returns a representational string of the underlying pandas DataFrame. __getattr__(): Returns the underlying attributes of the pandas DataFrame. __getitem__(): @@ -53,10 +53,10 @@ def __init__( if isinstance(data, pd.DataFrame): # if data is a pd.DataFrame, remember this within the data_source attr - self.data_source = "pd.DataFrame" + self.data_source = "pandas.DataFrame" self.data = data - elif isinstance(data, pathlib.Path) or isinstance(data, str): # noqa: PLR1701, SIM101 + elif isinstance(data, pathlib.Path | str): # if the data is a string, remember the original source # through a data_source attr self.data_source = data @@ -65,15 +65,13 @@ def __init__( data_path = pathlib.Path(data) # Read the data from the file based on its extension - if data_path.suffix == ".csv": - # read as a CSV + if ( + data_path.suffix == ".csv" + or data_path.suffix in (".tsv", ".txt") + or data_path.suffixes == [".csv", ".gz"] + ): + # read as a CSV, CSV.GZ, .TSV, or .TXT file self.data = pd.read_csv(data, **kwargs) - elif data_path.suffixes == [".csv", ".gz"]: - # read as a CSV.GZ file - self.data = pd.read_csv(data, compression="gzip", **kwargs) - elif data_path.suffix in (".tsv", ".txt"): - # read as a TSV - self.data = pd.read_csv(data, delimiter="\t", **kwargs) elif data_path.suffix == ".parquet": # read as a Parquet file self.data = pd.read_parquet(data, **kwargs) diff --git a/tests/conftest.py b/tests/conftest.py index bc2883d..8f97176 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,7 +36,9 @@ def fixture_basic_outlier_csv( Creates basic example data csv for use in tests """ - basic_outlier_dataframe.to_csv(csv_path := tmp_path / "basic_example.csv", index=False) + basic_outlier_dataframe.to_csv( + csv_path := tmp_path / "basic_example.csv", index=False + ) return csv_path diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py index 78e5dd9..713fc97 100644 --- a/tests/test_scdataframe.py +++ b/tests/test_scdataframe.py @@ -10,7 +10,12 @@ def test_SCDataFrame_with_dataframe( - tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame + tmp_path: pathlib.Path, + basic_outlier_dataframe: pd.DataFrame, + basic_outlier_csv: str, + basic_outlier_csv_gz: str, + basic_outlier_tsv: str, + basic_outlier_parquet: str, ): """ Tests SCDataFrame with pd.DataFrame input. @@ -30,8 +35,6 @@ def test_SCDataFrame_with_dataframe( assert parquet.read_table(control_path).equals(parquet.read_table(test_path)) - -def test_SCDataFrame_with_csv(tmp_path: pathlib.Path, basic_outlier_csv: str): """ Tests SCDataFrame with CSV input. """ @@ -48,8 +51,6 @@ def test_SCDataFrame_with_csv(tmp_path: pathlib.Path, basic_outlier_csv: str): pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path)) - -def test_SCDataFrame_with_csv_gz(tmp_path: pathlib.Path, basic_outlier_csv_gz: str): """ Tests SCDataFrame with CSV input. """ @@ -68,8 +69,6 @@ def test_SCDataFrame_with_csv_gz(tmp_path: pathlib.Path, basic_outlier_csv_gz: s expected_df, pd.read_csv(test_path, compression="gzip") ) - -def test_SCDataFrame_with_tsv(tmp_path: pathlib.Path, basic_outlier_tsv: str): """ Tests SCDataFrame with TSV input. """ @@ -86,8 +85,6 @@ def test_SCDataFrame_with_tsv(tmp_path: pathlib.Path, basic_outlier_tsv: str): pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path, sep="\t")) - -def test_SCDataFrame_with_parquet(tmp_path: pathlib.Path, basic_outlier_parquet: str): """ Tests SCDataFrame with TSV input. """ From 985a6dda77d5d57fe894f262ba1aa6e3538eb6a6 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 19 Jun 2024 08:31:14 -0600 Subject: [PATCH 17/27] fix tests --- pyproject.toml | 2 +- src/cosmicqc/scdataframe.py | 6 ++++-- tests/test_scdataframe.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 67af76e..5adde66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ profile = "black" exclude_dirs = ["tests"] [tool.ruff] -target-version = "py311" +target-version = "py38" line-length = 88 fix = true diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 17dd0ce..d96cbee 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -39,7 +39,9 @@ class SCDataFrame: """ def __init__( - self: Self_SCDataFrame, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any] + self: Self_SCDataFrame, + data: Union[pd.DataFrame, str, pathlib.Path], + **kwargs: Dict[str, Any], ) -> None: """ Initializes the SCDataFrame with either a DataFrame or a file path. @@ -56,7 +58,7 @@ def __init__( self.data_source = "pandas.DataFrame" self.data = data - elif isinstance(data, pathlib.Path | str): + elif isinstance(data, (pathlib.Path, str)): # if the data is a string, remember the original source # through a data_source attr self.data_source = data diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py index 713fc97..bcb520c 100644 --- a/tests/test_scdataframe.py +++ b/tests/test_scdataframe.py @@ -24,7 +24,7 @@ def test_SCDataFrame_with_dataframe( sc_df = SCDataFrame(data=basic_outlier_dataframe) # test that we ingested the data properly - assert sc_df.data_source == "pd.DataFrame" + assert sc_df.data_source == "pandas.DataFrame" assert sc_df.equals(basic_outlier_dataframe) # test export From 5034a0727473fd382944761e4ccce8754d628795 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 19 Jun 2024 11:01:36 -0600 Subject: [PATCH 18/27] update tests; add constructor path for scdataframe --- src/cosmicqc/analyze.py | 9 +++------ src/cosmicqc/scdataframe.py | 25 +++++++++++++++---------- tests/test_scdataframe.py | 31 +++++++++++-------------------- 3 files changed, 29 insertions(+), 36 deletions(-) diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index d616f78..abe43d6 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -55,8 +55,7 @@ def identify_outliers( """ # interpret the df as SCDataFrame - if not isinstance(df, SCDataFrame): - df = SCDataFrame(data=df) + df = SCDataFrame(data=df) # create a copy of the dataframe to ensure # we don't modify the supplied dataframe inplace. @@ -145,8 +144,7 @@ def find_outliers( """ # interpret the df as SCDataFrame - if not isinstance(df, SCDataFrame): - df = SCDataFrame(data=df) + df = SCDataFrame(data=df) if isinstance(feature_thresholds, str): feature_thresholds = read_thresholds_set_from_file( @@ -212,8 +210,7 @@ def label_outliers( """ # interpret the df as SCDataFrame - if not isinstance(df, SCDataFrame): - df = SCDataFrame(data=df) + df = SCDataFrame(data=df) # for single outlier processing if isinstance(feature_thresholds, (str, dict)): diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index d96cbee..bbe29db 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -9,7 +9,7 @@ # provide backwards compatibility for Self type in earlier Python versions. # see: https://peps.python.org/pep-0484/#annotating-instance-and-class-methods -Self_SCDataFrame = TypeVar("Self_SCDataFrame", bound="SCDataFrame") +SCDataFrame_type = TypeVar("SCDataFrame_type", bound="SCDataFrame") class SCDataFrame: @@ -39,8 +39,8 @@ class SCDataFrame: """ def __init__( - self: Self_SCDataFrame, - data: Union[pd.DataFrame, str, pathlib.Path], + self: SCDataFrame_type, + data: Union[SCDataFrame_type, pd.DataFrame, str, pathlib.Path], **kwargs: Dict[str, Any], ) -> None: """ @@ -53,7 +53,12 @@ def __init__( Additional keyword arguments to pass to the pandas read functions. """ - if isinstance(data, pd.DataFrame): + if isinstance(data, SCDataFrame): + # if data is an instance of SCDataFrame, use its data_source and data + self.data_source = data.data_source + self.data = data.data + + elif isinstance(data, pd.DataFrame): # if data is a pd.DataFrame, remember this within the data_source attr self.data_source = "pandas.DataFrame" self.data = data @@ -80,10 +85,10 @@ def __init__( else: raise ValueError("Unsupported file format for SCDataFrame.") else: - raise ValueError("Unsupported input type for SCDataFrame.") + raise ValueError("Unsupported data type for SCDataFrame.") def export( - self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any] + self: SCDataFrame_type, file_path: str, **kwargs: Dict[str, Any] ) -> None: """ Exports the underlying pandas DataFrame to a file. @@ -107,7 +112,7 @@ def export( else: raise ValueError("Unsupported file format for export.") - def __call__(self: Self_SCDataFrame) -> pd.DataFrame: + def __call__(self: SCDataFrame_type) -> pd.DataFrame: """ Returns the underlying pandas DataFrame. @@ -116,7 +121,7 @@ def __call__(self: Self_SCDataFrame) -> pd.DataFrame: """ return self.data - def __repr__(self: Self_SCDataFrame) -> pd.DataFrame: + def __repr__(self: SCDataFrame_type) -> pd.DataFrame: """ Returns the representation of the underlying pandas DataFrame. @@ -125,7 +130,7 @@ def __repr__(self: Self_SCDataFrame) -> pd.DataFrame: """ return repr(self.data) - def __getattr__(self: Self_SCDataFrame, attr: str) -> Any: # noqa: ANN401 + def __getattr__(self: SCDataFrame_type, attr: str) -> Any: # noqa: ANN401 """ Intercept attribute accesses and delegate them to the underlying pandas DataFrame, except for custom methods. @@ -140,7 +145,7 @@ def __getattr__(self: Self_SCDataFrame, attr: str) -> Any: # noqa: ANN401 return self.__dict__[attr] return getattr(self.data, attr) - def __getitem__(self: Self_SCDataFrame, key: Union[int, str]) -> Any: # noqa: ANN401 + def __getitem__(self: SCDataFrame_type, key: Union[int, str]) -> Any: # noqa: ANN401 """ Returns an element or a slice of the underlying pandas DataFrame. diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py index bcb520c..2627b88 100644 --- a/tests/test_scdataframe.py +++ b/tests/test_scdataframe.py @@ -17,10 +17,8 @@ def test_SCDataFrame_with_dataframe( basic_outlier_tsv: str, basic_outlier_parquet: str, ): - """ - Tests SCDataFrame with pd.DataFrame input. - """ - + + # Tests SCDataFrame with pd.DataFrame input. sc_df = SCDataFrame(data=basic_outlier_dataframe) # test that we ingested the data properly @@ -35,10 +33,7 @@ def test_SCDataFrame_with_dataframe( assert parquet.read_table(control_path).equals(parquet.read_table(test_path)) - """ - Tests SCDataFrame with CSV input. - """ - + # Tests SCDataFrame with CSV input. sc_df = SCDataFrame(data=basic_outlier_csv) expected_df = pd.read_csv(basic_outlier_csv) @@ -51,10 +46,7 @@ def test_SCDataFrame_with_dataframe( pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path)) - """ - Tests SCDataFrame with CSV input. - """ - + # Tests SCDataFrame with CSV input. sc_df = SCDataFrame(data=basic_outlier_csv_gz) expected_df = pd.read_csv(basic_outlier_csv_gz) @@ -69,10 +61,7 @@ def test_SCDataFrame_with_dataframe( expected_df, pd.read_csv(test_path, compression="gzip") ) - """ - Tests SCDataFrame with TSV input. - """ - + # Tests SCDataFrame with TSV input. sc_df = SCDataFrame(data=basic_outlier_tsv) expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t") @@ -85,10 +74,7 @@ def test_SCDataFrame_with_dataframe( pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path, sep="\t")) - """ - Tests SCDataFrame with TSV input. - """ - + # Tests SCDataFrame with parquet input. sc_df = SCDataFrame(data=basic_outlier_parquet) expected_df = pd.read_parquet(basic_outlier_parquet) @@ -102,3 +88,8 @@ def test_SCDataFrame_with_dataframe( assert parquet.read_table(basic_outlier_parquet).equals( parquet.read_table(test_path) ) + + # test SCDataFrame with SCDataFrame input + copy_sc_df = SCDataFrame(data=sc_df) + + pd.testing.assert_frame_equal(copy_sc_df.data, sc_df.data) From fd818685c9b3d4ae5dea88d7ca3a40a5e1c3615f Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 19 Jun 2024 11:01:48 -0600 Subject: [PATCH 19/27] linting --- tests/test_scdataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py index 2627b88..2781c8c 100644 --- a/tests/test_scdataframe.py +++ b/tests/test_scdataframe.py @@ -17,7 +17,6 @@ def test_SCDataFrame_with_dataframe( basic_outlier_tsv: str, basic_outlier_parquet: str, ): - # Tests SCDataFrame with pd.DataFrame input. sc_df = SCDataFrame(data=basic_outlier_dataframe) From 6d61bf3d0cec43780dea125c14bae4a9f27e8724 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 19 Jun 2024 11:39:05 -0600 Subject: [PATCH 20/27] enable pd.series compatibility --- src/cosmicqc/scdataframe.py | 6 ++++++ tests/test_scdataframe.py | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index bbe29db..50eb0f8 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -58,6 +58,12 @@ def __init__( self.data_source = data.data_source self.data = data.data + elif isinstance(data, pd.Series): + # if data is a pd.Series, remember this within the data_source attr + self.data_source = "pandas.Series" + # also cast the series to a dataframe + self.data = pd.DataFrame(data) + elif isinstance(data, pd.DataFrame): # if data is a pd.DataFrame, remember this within the data_source attr self.data_source = "pandas.DataFrame" diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py index 2781c8c..2ddcb03 100644 --- a/tests/test_scdataframe.py +++ b/tests/test_scdataframe.py @@ -32,6 +32,13 @@ def test_SCDataFrame_with_dataframe( assert parquet.read_table(control_path).equals(parquet.read_table(test_path)) + # Tests SCDataFrame with pd.Series input. + sc_df = SCDataFrame(data=basic_outlier_dataframe.loc[0]) + + # test that we ingested the data properly + assert sc_df.data_source == "pandas.Series" + assert sc_df.equals(pd.DataFrame(basic_outlier_dataframe.loc[0])) + # Tests SCDataFrame with CSV input. sc_df = SCDataFrame(data=basic_outlier_csv) expected_df = pd.read_csv(basic_outlier_csv) From f696b27499e503df64ce40c53d7a80b952454732 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 25 Jun 2024 11:56:35 -0600 Subject: [PATCH 21/27] update comments about input --- src/cosmicqc/analyze.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index abe43d6..bf926e3 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -33,7 +33,8 @@ def identify_outliers( Args: df: Union[SCDataFrame, pd.DataFrame, str] - DataFrame or file with converted output from CytoTable. + DataFrame or file string-based filepath of a + Parquet, CSV, or TSV file with CytoTable output or similar data. metadata_columns: List[str] List of metadata columns that should be outputted with the outlier data. feature_thresholds: Dict[str, float] @@ -123,7 +124,8 @@ def find_outliers( Args: df: Union[SCDataFrame, pd.DataFrame, str] - DataFrame or file with converted output from CytoTable. + DataFrame or file string-based filepath of a + Parquet, CSV, or TSV file with CytoTable output or similar data. metadata_columns: List[str] List of metadata columns that should be outputted with the outlier data. feature_thresholds: Dict[str, float] @@ -188,7 +190,8 @@ def label_outliers( Args: df: Union[SCDataFrame, pd.DataFrame, str] - DataFrame or file with converted output from CytoTable. + DataFrame or file string-based filepath of a + Parquet, CSV, or TSV file with CytoTable output or similar data. feature_thresholds: Dict[str, float] One of two options: A dictionary with the feature name(s) as the key(s) and their assigned From 07e18c8198e85c5765866b572aace8dd9a97a3ca Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 25 Jun 2024 11:58:07 -0600 Subject: [PATCH 22/27] tsv.gz compression option addition --- src/cosmicqc/scdataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 50eb0f8..78245f2 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -82,6 +82,7 @@ def __init__( data_path.suffix == ".csv" or data_path.suffix in (".tsv", ".txt") or data_path.suffixes == [".csv", ".gz"] + or data_path.suffixes == [".tsv", ".gz"] ): # read as a CSV, CSV.GZ, .TSV, or .TXT file self.data = pd.read_csv(data, **kwargs) From 457da2dfd5eb1f73e256046165683ac34b82d75b Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 25 Jun 2024 11:59:54 -0600 Subject: [PATCH 23/27] general docs consistency --- src/cosmicqc/scdataframe.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 78245f2..b683b90 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -50,7 +50,7 @@ def __init__( data (Union[pd.DataFrame, str]): The data source, either a pandas DataFrame or a file path. **kwargs: - Additional keyword arguments to pass to the pandas read functions. + Additional keyword arguments to pass to the pandas read_* methods. """ if isinstance(data, SCDataFrame): @@ -101,8 +101,10 @@ def export( Exports the underlying pandas DataFrame to a file. Args: - file_path (str): The path where the DataFrame should be saved. - **kwargs: Additional keyword arguments to pass to the pandas to_* methods. + file_path (str): + The path where the DataFrame should be saved. + **kwargs: + Additional keyword arguments to pass to the pandas to_* methods. """ data_path = pathlib.Path(file_path) @@ -143,10 +145,12 @@ def __getattr__(self: SCDataFrame_type, attr: str) -> Any: # noqa: ANN401 pandas DataFrame, except for custom methods. Args: - attr (str): The name of the attribute being accessed. + attr (str): + The name of the attribute being accessed. Returns: - Any: The value of the attribute from the pandas DataFrame. + Any: + The value of the attribute from the pandas DataFrame. """ if attr in self.__dict__: return self.__dict__[attr] @@ -157,9 +161,11 @@ def __getitem__(self: SCDataFrame_type, key: Union[int, str]) -> Any: # noqa: A Returns an element or a slice of the underlying pandas DataFrame. Args: - key: The key or slice to access the data. + key: + The key or slice to access the data. Returns: - pd.DataFrame or any: The selected element or slice of data. + pd.DataFrame or any: + The selected element or slice of data. """ return self.data[key] From ccb7cce044081a03c4a1b5abce5ebb8ae0db6a28 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 25 Jun 2024 12:07:19 -0600 Subject: [PATCH 24/27] class docs --- src/cosmicqc/scdataframe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index b683b90..27ca5c9 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -14,11 +14,13 @@ class SCDataFrame: """ - A class to handle and load different types of data files into a pandas DataFrame. + A class designed to enhance single-cell data handling by wrapping + pandas DataFrame capabilities, providing advanced methods for quality control, + comprehensive analysis, and image-based data processing. This class can initialize with either a pandas DataFrame or a file path (CSV, TSV, TXT, or Parquet). When initialized with a file path, it reads the data into a - pandas DataFrame. + pandas DataFrame. It also includes capabilities to export data. Attributes: data_source (str): From 341514d40ebcd07e12dee4685bdf8a4e6ade1133 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 25 Jun 2024 12:07:53 -0600 Subject: [PATCH 25/27] string repr --- src/cosmicqc/scdataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index 27ca5c9..eb0aa15 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -132,7 +132,7 @@ def __call__(self: SCDataFrame_type) -> pd.DataFrame: """ return self.data - def __repr__(self: SCDataFrame_type) -> pd.DataFrame: + def __repr__(self: SCDataFrame_type) -> str: """ Returns the representation of the underlying pandas DataFrame. From 28c7931be361a0db6bee10e336bc814eff05700c Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 25 Jun 2024 12:21:27 -0600 Subject: [PATCH 26/27] add str-based checks for scdataframe output --- tests/test_scdataframe.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py index 2ddcb03..3ca01c3 100644 --- a/tests/test_scdataframe.py +++ b/tests/test_scdataframe.py @@ -23,6 +23,7 @@ def test_SCDataFrame_with_dataframe( # test that we ingested the data properly assert sc_df.data_source == "pandas.DataFrame" assert sc_df.equals(basic_outlier_dataframe) + assert str(sc_df) == str(basic_outlier_dataframe) # test export basic_outlier_dataframe.to_parquet( @@ -38,6 +39,7 @@ def test_SCDataFrame_with_dataframe( # test that we ingested the data properly assert sc_df.data_source == "pandas.Series" assert sc_df.equals(pd.DataFrame(basic_outlier_dataframe.loc[0])) + assert str(sc_df) == str(pd.DataFrame(basic_outlier_dataframe.loc[0])) # Tests SCDataFrame with CSV input. sc_df = SCDataFrame(data=basic_outlier_csv) @@ -46,6 +48,7 @@ def test_SCDataFrame_with_dataframe( # test that we ingested the data properly assert sc_df.data_source == basic_outlier_csv assert sc_df.equals(expected_df) + assert str(sc_df) == str(expected_df) # test export sc_df.export(test_path := f"{tmp_path}/df_input_example.csv", index=False) @@ -59,6 +62,7 @@ def test_SCDataFrame_with_dataframe( # test that we ingested the data properly assert sc_df.data_source == basic_outlier_csv_gz assert sc_df.equals(expected_df) + assert str(sc_df) == str(expected_df) # test export sc_df.export(test_path := f"{tmp_path}/df_input_example.csv.gz", index=False) @@ -74,6 +78,7 @@ def test_SCDataFrame_with_dataframe( # test that we ingested the data properly assert sc_df.data_source == basic_outlier_tsv assert sc_df.equals(expected_df) + assert str(sc_df) == str(expected_df) # test export sc_df.export(test_path := f"{tmp_path}/df_input_example.tsv", index=False) @@ -87,6 +92,7 @@ def test_SCDataFrame_with_dataframe( # test that we ingested the data properly assert sc_df.data_source == basic_outlier_parquet assert sc_df.equals(expected_df) + assert str(sc_df) == str(expected_df) # test export sc_df.export(test_path := f"{tmp_path}/df_input_example2.parquet") From fe73f4038d45ddee9b32c1a3f493977378adaa52 Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Tue, 25 Jun 2024 12:22:58 -0600 Subject: [PATCH 27/27] Update src/cosmicqc/scdataframe.py Co-authored-by: Jenna Tomkinson <107513215+jenna-tomkinson@users.noreply.github.com> --- src/cosmicqc/scdataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index eb0aa15..b722a50 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -72,7 +72,7 @@ def __init__( self.data = data elif isinstance(data, (pathlib.Path, str)): - # if the data is a string, remember the original source + # if the data is a string or a pathlib path, remember the original source # through a data_source attr self.data_source = data