From 1b7844cf8d6fdd2b631e0b731edf75fb756390ea Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 10 Jun 2024 12:21:05 -0600
Subject: [PATCH 01/27] Create qcdataframe.py

---
 src/cosmicqc/qcdataframe.py | 63 +++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 src/cosmicqc/qcdataframe.py

diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py
new file mode 100644
index 0000000..ba25c6f
--- /dev/null
+++ b/src/cosmicqc/qcdataframe.py
@@ -0,0 +1,63 @@
+"""
+Defines a QCDataFrame class for use in coSMicQC.
+"""
+
+import pandas as pd
+from typing import Union
+
+class QCDataFrame:
+    """
+    A class to handle and load different types of data files into a pandas DataFrame.
+
+    This class can initialize with either a pandas DataFrame or a file path (CSV, TSV,
+    TXT, or Parquet). When initialized with a file path, it reads the data into a
+    pandas DataFrame.
+
+    Attributes:
+        reference (str):
+            A string indicating the type of data source, either 'pd.DataFrame'
+            or the file path.
+        data (pd.DataFrame):
+            The loaded data in a pandas DataFrame.
+
+    Methods:
+        __call__():
+            Returns the underlying pandas DataFrame.
+    """
+
+    def __init__(self, data: Union[pd.DataFrame, str], **kwargs) -> None:
+        """
+        Initializes the QCDataFrame with either a DataFrame or a file path.
+
+        Args:
+            data (Union[pd.DataFrame, str]):
+                The data source, either a pandas DataFrame or a file path.
+            **kwargs:
+                Additional keyword arguments to pass to the pandas read functions.
+        """
+        if isinstance(data, pd.DataFrame):
+            # if data is a pd.DataFrame, remember this within the reference attr
+            self.reference = "pd.DataFrame"
+            self.data = data
+        elif isinstance(data, str):
+            # if the data is a string, remember the original source
+            # through a reference attr
+            self.reference = data
+
+            # Read the data from the file based on its extension
+            if data.endswith(".csv"):
+                self.data = pd.read_csv(data, **kwargs)
+            elif data.endswith(".tsv") or data.endswith(".txt"):
+                self.data = pd.read_csv(data, delimiter="\t", **kwargs)
+            elif data.endswith(".parquet"):
+                self.data = pd.read_parquet(data, **kwargs)
+
+    def __call__(self) -> pd.DataFrame:
+        """
+        Returns the underlying pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: The data in a pandas DataFrame.
+        """
+        return self.data
+

From 978484127fd4bf00a96547d0b8b08bcd68495755 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 10 Jun 2024 12:33:33 -0600
Subject: [PATCH 02/27] linting

---
 src/cosmicqc/qcdataframe.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py
index ba25c6f..7a229fb 100644
--- a/src/cosmicqc/qcdataframe.py
+++ b/src/cosmicqc/qcdataframe.py
@@ -2,8 +2,10 @@
 Defines a QCDataFrame class for use in coSMicQC.
 """
 
+from typing import Any, Dict, Self, Union
+
 import pandas as pd
-from typing import Union
+
 
 class QCDataFrame:
     """
@@ -25,7 +27,9 @@ class QCDataFrame:
             Returns the underlying pandas DataFrame.
     """
 
-    def __init__(self, data: Union[pd.DataFrame, str], **kwargs) -> None:
+    def __init__(
+        self: Self, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any]
+    ) -> None:
         """
         Initializes the QCDataFrame with either a DataFrame or a file path.
 
@@ -52,7 +56,7 @@ def __init__(self, data: Union[pd.DataFrame, str], **kwargs) -> None:
             elif data.endswith(".parquet"):
                 self.data = pd.read_parquet(data, **kwargs)
 
-    def __call__(self) -> pd.DataFrame:
+    def __call__(self: Self) -> pd.DataFrame:
         """
         Returns the underlying pandas DataFrame.
 
@@ -60,4 +64,3 @@ def __call__(self) -> pd.DataFrame:
             pd.DataFrame: The data in a pandas DataFrame.
         """
         return self.data
-

From c39e2b0c4f000289ffcb87b107ce964bb1ef80fe Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 10 Jun 2024 14:04:42 -0600
Subject: [PATCH 03/27] add qcdataframe

---
 src/cosmicqc/__init__.py    |  1 +
 src/cosmicqc/analyze.py     | 32 ++++++++++++++------
 src/cosmicqc/qcdataframe.py | 58 ++++++++++++++++++++++++++++++++++---
 tests/conftest.py           | 41 ++++++++++++++++++++++++++
 tests/test_analyze.py       | 18 +++++++++++-
 tests/test_qcdataframe.py   | 42 +++++++++++++++++++++++++++
 6 files changed, 178 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_qcdataframe.py

diff --git a/src/cosmicqc/__init__.py b/src/cosmicqc/__init__.py
index aca6bd8..82f4e10 100644
--- a/src/cosmicqc/__init__.py
+++ b/src/cosmicqc/__init__.py
@@ -3,6 +3,7 @@
 """
 
 from .analyze import find_outliers
+from .qcdataframe import QCDataFrame
 
 # note: version placeholder is updated during build
 # by poetry-dynamic-versioning.
diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
index 9f33c4b..70833a6 100644
--- a/src/cosmicqc/analyze.py
+++ b/src/cosmicqc/analyze.py
@@ -11,13 +11,15 @@
 import yaml
 from scipy.stats import zscore as scipy_zscore
 
+from .qcdataframe import QCDataFrame
+
 DEFAULT_QC_THRESHOLD_FILE = (
     f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml"
 )
 
 
 def identify_outliers(
-    df: pd.DataFrame,
+    df: Union[QCDataFrame, pd.DataFrame, str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
@@ -30,8 +32,8 @@ def identify_outliers(
     threshold of 0 as that would represent the whole dataset.
 
     Args:
-        df: pd.DataFrame
-            Data frame with converted output from CytoTable.
+        df: Union[QCDataFrame, pd.DataFrame, str]
+            DataFrame or file with converted output from CytoTable.
         metadata_columns: List[str]
             List of metadata columns that should be outputted with the outlier data.
         feature_thresholds: Dict[str, float]
@@ -52,6 +54,10 @@ def identify_outliers(
             or not for use within other functions.
     """
 
+    # interpret the df as QCDataFrame
+    if not isinstance(df, QCDataFrame):
+        df = QCDataFrame(data=df)
+
     # create a copy of the dataframe to ensure
     # we don't modify the supplied dataframe inplace.
     outlier_df = df.copy()
@@ -107,7 +113,7 @@ def identify_outliers(
 
 
 def find_outliers(
-    df: pd.DataFrame,
+    df: Union[QCDataFrame, pd.DataFrame, str],
     metadata_columns: List[str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
@@ -117,8 +123,8 @@ def find_outliers(
     with only the outliers and provided metadata columns.
 
     Args:
-        df: pd.DataFrame
-            Data frame with converted output from CytoTable.
+        df: Union[QCDataFrame, pd.DataFrame, str]
+            DataFrame or file with converted output from CytoTable.
         metadata_columns: List[str]
             List of metadata columns that should be outputted with the outlier data.
         feature_thresholds: Dict[str, float]
@@ -138,6 +144,10 @@ def find_outliers(
             Outlier data frame for the given conditions.
     """
 
+    # interpret the df as QCDataFrame
+    if not isinstance(df, QCDataFrame):
+        df = QCDataFrame(data=df)
+
     if isinstance(feature_thresholds, str):
         feature_thresholds = read_thresholds_set_from_file(
             feature_thresholds=feature_thresholds,
@@ -169,7 +179,7 @@ def find_outliers(
 
 
 def label_outliers(
-    df: pd.DataFrame,
+    df: Union[QCDataFrame, pd.DataFrame, str],
     feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
@@ -179,8 +189,8 @@ def label_outliers(
     where a cell passed or failed the quality control condition(s).
 
         Args:
-            df: pd.DataFrame
-                Data frame with converted output from CytoTable.
+            df: Union[QCDataFrame, pd.DataFrame, str]
+                DataFrame or file with converted output from CytoTable.
             feature_thresholds: Dict[str, float]
                 One of two options:
                 A dictionary with the feature name(s) as the key(s) and their assigned
@@ -201,6 +211,10 @@ def label_outliers(
                 Full dataframe with optional scores and outlier boolean column.
     """
 
+    # interpret the df as QCDataFrame
+    if not isinstance(df, QCDataFrame):
+        df = QCDataFrame(data=df)
+
     # for single outlier processing
     if isinstance(feature_thresholds, (str, dict)):
         # return the outlier dataframe for one threshold rule
diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py
index 7a229fb..3ed24fa 100644
--- a/src/cosmicqc/qcdataframe.py
+++ b/src/cosmicqc/qcdataframe.py
@@ -2,6 +2,7 @@
 Defines a QCDataFrame class for use in coSMicQC.
 """
 
+import pathlib
 from typing import Any, Dict, Self, Union
 
 import pandas as pd
@@ -39,23 +40,38 @@ def __init__(
             **kwargs:
                 Additional keyword arguments to pass to the pandas read functions.
         """
+
+        # print(data)
+        # print(type(data))
+        # print(isinstance(data, QCDataFrame))
+
         if isinstance(data, pd.DataFrame):
             # if data is a pd.DataFrame, remember this within the reference attr
             self.reference = "pd.DataFrame"
             self.data = data
-        elif isinstance(data, str):
+
+        elif isinstance(data, pathlib.Path | str):
             # if the data is a string, remember the original source
             # through a reference attr
             self.reference = data
 
+            # interpret the data through pathlib
+            data_path = pathlib.Path(data)
+
             # Read the data from the file based on its extension
-            if data.endswith(".csv"):
+            if data_path.suffix == ".csv":
+                # read as a CSV
                 self.data = pd.read_csv(data, **kwargs)
-            elif data.endswith(".tsv") or data.endswith(".txt"):
+            elif data_path.suffix in (".tsv", ".txt"):
+                # read as a TSV
                 self.data = pd.read_csv(data, delimiter="\t", **kwargs)
-            elif data.endswith(".parquet"):
+            elif data_path.suffix == ".parquet":
+                # read as a Parquet file
                 self.data = pd.read_parquet(data, **kwargs)
 
+        else:
+            raise ValueError("Unsupported file format for QCDataFrame.")
+
     def __call__(self: Self) -> pd.DataFrame:
         """
         Returns the underlying pandas DataFrame.
@@ -64,3 +80,37 @@ def __call__(self: Self) -> pd.DataFrame:
             pd.DataFrame: The data in a pandas DataFrame.
         """
         return self.data
+
+    def __repr__(self: Self) -> pd.DataFrame:
+        """
+        Returns the underlying pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: The data in a pandas DataFrame.
+        """
+        return self.data
+
+    def __getattr__(self: Self, attr: str) -> Any:  # noqa: ANN401
+        """
+        Intercept attribute accesses and delegate them to the underlying
+        pandas DataFrame.
+
+        Args:
+            attr (str): The name of the attribute being accessed.
+
+        Returns:
+            Any: The value of the attribute from the pandas DataFrame.
+        """
+        return getattr(self.data, attr)
+
+    def __getitem__(self: Self, key: Union[int, str]) -> Any:  # noqa: ANN401
+        """
+        Returns an element or a slice of the underlying pandas DataFrame.
+
+        Args:
+            key: The key or slice to access the data.
+
+        Returns:
+            pd.DataFrame or any: The selected element or slice of data.
+        """
+        return self.data[key]
diff --git a/tests/conftest.py b/tests/conftest.py
index a8f796a..967ad2a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,8 @@
 https://docs.pytest.org/en/7.1.x/explanation/fixtures.html
 """
 
+import pathlib
+
 import pandas as pd
 import pytest
 
@@ -24,3 +26,42 @@ def fixture_basic_outlier_dataframe():
     Creates basic example data for use in tests
     """
     return pd.DataFrame({"example_feature": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
+
+
+@pytest.fixture(name="basic_outlier_csv")
+def fixture_basic_outlier_csv(
+    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
+):
+    """
+    Creates basic example data csv for use in tests
+    """
+
+    basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv")
+
+    return csv_path
+
+
+@pytest.fixture(name="basic_outlier_tsv")
+def fixture_basic_outlier_tsv(
+    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
+):
+    """
+    Creates basic example data tsv for use in tests
+    """
+
+    basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t")
+
+    return tsv_path
+
+
+@pytest.fixture(name="basic_outlier_parquet")
+def fixture_basic_outlier_parquet(
+    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
+):
+    """
+    Creates basic example data parquet for use in tests
+    """
+
+    basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet")
+
+    return parquet_path
diff --git a/tests/test_analyze.py b/tests/test_analyze.py
index 6c7a879..a55cf6b 100644
--- a/tests/test_analyze.py
+++ b/tests/test_analyze.py
@@ -7,7 +7,7 @@
 from cosmicqc import analyze
 
 
-def test_find_outliers_basic(basic_outlier_dataframe: pd.DataFrame):
+def test_find_outliers_basic_dataframe(basic_outlier_dataframe: pd.DataFrame):
     """
     Testing find_outliers with basic/simulated data.
     """
@@ -27,6 +27,21 @@ def test_find_outliers_basic(basic_outlier_dataframe: pd.DataFrame):
     }
 
 
+def test_find_outliers_basic_csv(basic_outlier_csv: str):
+    """
+    Testing find_outliers with csv data.
+    """
+
+    # assert that we have the output we expect
+    assert analyze.find_outliers(
+        df=basic_outlier_csv,
+        feature_thresholds={"example_feature": 1},
+        metadata_columns=[],
+    ).to_dict(orient="dict") == {
+        "example_feature": {8: 9, 9: 10},
+    }
+
+
 def test_find_outliers_cfret(cytotable_CFReT_data_df: pd.DataFrame):
     """
     Testing find_outliers with CytoTable CFReT data.
@@ -315,6 +330,7 @@ def test_find_outliers_dict_and_default_config_cfret(
 
 def test_label_outliers(
     basic_outlier_dataframe: pd.DataFrame,
+    basic_outlier_csv: str,
     cytotable_CFReT_data_df: pd.DataFrame,
 ):
     """
diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py
new file mode 100644
index 0000000..1124305
--- /dev/null
+++ b/tests/test_qcdataframe.py
@@ -0,0 +1,42 @@
+"""
+Tests cosmicqc qcdataframe module
+"""
+
+import pandas as pd
+from cosmicqc.qcdataframe import QCDataFrame
+
+def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame):
+    """
+    Tests QCDataFrame with pd.DataFrame input.
+    """
+    qc_df = QCDataFrame(data=basic_outlier_dataframe)
+    assert qc_df.reference == "pd.DataFrame"
+    assert qc_df.equals(basic_outlier_dataframe)
+
+def test_qcdataframe_init_with_csv(basic_outlier_csv:str):
+    """
+    Tests QCDataFrame with CSV input.
+    """
+    qc_df = QCDataFrame(data=basic_outlier_csv)
+    expected_df = pd.read_csv(basic_outlier_csv)
+    assert qc_df.reference == basic_outlier_csv
+    assert qc_df.equals(expected_df)
+
+def test_qcdataframe_init_with_tsv(basic_outlier_tsv:str):
+    """
+    Tests QCDataFrame with TSV input.
+    """
+    qc_df = QCDataFrame(data=basic_outlier_tsv)
+    expected_df = pd.read_csv(basic_outlier_tsv, delimiter='\t')
+    assert qc_df.reference == basic_outlier_tsv
+    assert qc_df.equals(expected_df)
+
+def test_qcdataframe_init_with_parquet(basic_outlier_parquet:str):
+    """
+    Tests QCDataFrame with TSV input.
+    """
+    qc_df = QCDataFrame(data=basic_outlier_parquet)
+    expected_df = pd.read_parquet(basic_outlier_parquet)
+    assert qc_df.reference == basic_outlier_parquet
+    assert qc_df.equals(expected_df)
+

From f3003e1de72a83a281308149acefc29fe4e63683 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 10 Jun 2024 14:04:54 -0600
Subject: [PATCH 04/27] linting

---
 tests/test_qcdataframe.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py
index 1124305..1b1fad4 100644
--- a/tests/test_qcdataframe.py
+++ b/tests/test_qcdataframe.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from cosmicqc.qcdataframe import QCDataFrame
 
+
 def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame):
     """
     Tests QCDataFrame with pd.DataFrame input.
@@ -13,7 +14,8 @@ def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame):
     assert qc_df.reference == "pd.DataFrame"
     assert qc_df.equals(basic_outlier_dataframe)
 
-def test_qcdataframe_init_with_csv(basic_outlier_csv:str):
+
+def test_qcdataframe_init_with_csv(basic_outlier_csv: str):
     """
     Tests QCDataFrame with CSV input.
     """
@@ -22,16 +24,18 @@ def test_qcdataframe_init_with_csv(basic_outlier_csv:str):
     assert qc_df.reference == basic_outlier_csv
     assert qc_df.equals(expected_df)
 
-def test_qcdataframe_init_with_tsv(basic_outlier_tsv:str):
+
+def test_qcdataframe_init_with_tsv(basic_outlier_tsv: str):
     """
     Tests QCDataFrame with TSV input.
     """
     qc_df = QCDataFrame(data=basic_outlier_tsv)
-    expected_df = pd.read_csv(basic_outlier_tsv, delimiter='\t')
+    expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t")
     assert qc_df.reference == basic_outlier_tsv
     assert qc_df.equals(expected_df)
 
-def test_qcdataframe_init_with_parquet(basic_outlier_parquet:str):
+
+def test_qcdataframe_init_with_parquet(basic_outlier_parquet: str):
     """
     Tests QCDataFrame with TSV input.
     """
@@ -39,4 +43,3 @@ def test_qcdataframe_init_with_parquet(basic_outlier_parquet:str):
     expected_df = pd.read_parquet(basic_outlier_parquet)
     assert qc_df.reference == basic_outlier_parquet
     assert qc_df.equals(expected_df)
-

From b97f3a57aaa273b19fd61276db5bfe391b21de6d Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 10 Jun 2024 16:14:27 -0600
Subject: [PATCH 05/27] adding tests

---
 src/cosmicqc/qcdataframe.py |  4 ----
 tests/conftest.py           |  6 +++---
 tests/test_analyze.py       | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/qcdataframe.py
index 3ed24fa..8498311 100644
--- a/src/cosmicqc/qcdataframe.py
+++ b/src/cosmicqc/qcdataframe.py
@@ -41,10 +41,6 @@ def __init__(
                 Additional keyword arguments to pass to the pandas read functions.
         """
 
-        # print(data)
-        # print(type(data))
-        # print(isinstance(data, QCDataFrame))
-
         if isinstance(data, pd.DataFrame):
             # if data is a pd.DataFrame, remember this within the reference attr
             self.reference = "pd.DataFrame"
diff --git a/tests/conftest.py b/tests/conftest.py
index 967ad2a..ad0c17e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -36,7 +36,7 @@ def fixture_basic_outlier_csv(
     Creates basic example data csv for use in tests
     """
 
-    basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv")
+    basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv", index=False)
 
     return csv_path
 
@@ -49,7 +49,7 @@ def fixture_basic_outlier_tsv(
     Creates basic example data tsv for use in tests
     """
 
-    basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t")
+    basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t", index=False)
 
     return tsv_path
 
@@ -62,6 +62,6 @@ def fixture_basic_outlier_parquet(
     Creates basic example data parquet for use in tests
     """
 
-    basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet")
+    basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet", index=False)
 
     return parquet_path
diff --git a/tests/test_analyze.py b/tests/test_analyze.py
index a55cf6b..26ea6db 100644
--- a/tests/test_analyze.py
+++ b/tests/test_analyze.py
@@ -337,6 +337,21 @@ def test_label_outliers(
     Tests label_outliers
     """
 
+    pd.testing.assert_frame_equal(
+        analyze.label_outliers(
+            df=basic_outlier_dataframe,
+            feature_thresholds={"example_feature": 1},
+            include_threshold_scores=True,
+        ),
+        analyze.label_outliers(
+            df=basic_outlier_csv,
+            feature_thresholds={"example_feature": 1},
+            include_threshold_scores=True,
+        ),
+    )
+
+    
+
     # test basic single-column result with zscores
     assert analyze.label_outliers(
         df=basic_outlier_dataframe,

From 1069a4af958de2ff12041d4282ca55c457ce7143 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 10 Jun 2024 16:14:38 -0600
Subject: [PATCH 06/27] linting

---
 tests/conftest.py     | 8 ++++++--
 tests/test_analyze.py | 2 --
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index ad0c17e..e3fb24c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -49,7 +49,9 @@ def fixture_basic_outlier_tsv(
     Creates basic example data tsv for use in tests
     """
 
-    basic_outlier_dataframe.to_csv(tsv_path := tmp_path / "example.tsv", sep="\t", index=False)
+    basic_outlier_dataframe.to_csv(
+        tsv_path := tmp_path / "example.tsv", sep="\t", index=False
+    )
 
     return tsv_path
 
@@ -62,6 +64,8 @@ def fixture_basic_outlier_parquet(
     Creates basic example data parquet for use in tests
     """
 
-    basic_outlier_dataframe.to_parquet(parquet_path := tmp_path / "example.parquet", index=False)
+    basic_outlier_dataframe.to_parquet(
+        parquet_path := tmp_path / "example.parquet", index=False
+    )
 
     return parquet_path
diff --git a/tests/test_analyze.py b/tests/test_analyze.py
index 26ea6db..21244b9 100644
--- a/tests/test_analyze.py
+++ b/tests/test_analyze.py
@@ -350,8 +350,6 @@ def test_label_outliers(
         ),
     )
 
-    
-
     # test basic single-column result with zscores
     assert analyze.label_outliers(
         df=basic_outlier_dataframe,

From d52a89f639fe9395f0a3384677333996a39b8233 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 11 Jun 2024 08:48:27 -0600
Subject: [PATCH 07/27] update name, tests

---
 src/cosmicqc/__init__.py                      |  4 +-
 src/cosmicqc/analyze.py                       | 32 +++++++-------
 .../{qcdataframe.py => scdataframe.py}        | 33 ++++++++------
 tests/test_analyze.py                         | 16 +++++++
 tests/test_qcdataframe.py                     | 44 +++++++++----------
 5 files changed, 76 insertions(+), 53 deletions(-)
 rename src/cosmicqc/{qcdataframe.py => scdataframe.py} (77%)

diff --git a/src/cosmicqc/__init__.py b/src/cosmicqc/__init__.py
index 82f4e10..05b2bbe 100644
--- a/src/cosmicqc/__init__.py
+++ b/src/cosmicqc/__init__.py
@@ -3,8 +3,8 @@
 """
 
 from .analyze import find_outliers
-from .qcdataframe import QCDataFrame
+from .scdataframe import SCDataFrame
 
 # note: version placeholder is updated during build
 # by poetry-dynamic-versioning.
-__version__ = "0.0.1"
+__version__ = "0.0.0"
diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
index 70833a6..d616f78 100644
--- a/src/cosmicqc/analyze.py
+++ b/src/cosmicqc/analyze.py
@@ -11,7 +11,7 @@
 import yaml
 from scipy.stats import zscore as scipy_zscore
 
-from .qcdataframe import QCDataFrame
+from .scdataframe import SCDataFrame
 
 DEFAULT_QC_THRESHOLD_FILE = (
     f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml"
@@ -19,7 +19,7 @@
 
 
 def identify_outliers(
-    df: Union[QCDataFrame, pd.DataFrame, str],
+    df: Union[SCDataFrame, pd.DataFrame, str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
@@ -32,7 +32,7 @@ def identify_outliers(
     threshold of 0 as that would represent the whole dataset.
 
     Args:
-        df: Union[QCDataFrame, pd.DataFrame, str]
+        df: Union[SCDataFrame, pd.DataFrame, str]
             DataFrame or file with converted output from CytoTable.
         metadata_columns: List[str]
             List of metadata columns that should be outputted with the outlier data.
@@ -54,9 +54,9 @@ def identify_outliers(
             or not for use within other functions.
     """
 
-    # interpret the df as QCDataFrame
-    if not isinstance(df, QCDataFrame):
-        df = QCDataFrame(data=df)
+    # interpret the df as SCDataFrame
+    if not isinstance(df, SCDataFrame):
+        df = SCDataFrame(data=df)
 
     # create a copy of the dataframe to ensure
     # we don't modify the supplied dataframe inplace.
@@ -113,7 +113,7 @@ def identify_outliers(
 
 
 def find_outliers(
-    df: Union[QCDataFrame, pd.DataFrame, str],
+    df: Union[SCDataFrame, pd.DataFrame, str],
     metadata_columns: List[str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
@@ -123,7 +123,7 @@ def find_outliers(
     with only the outliers and provided metadata columns.
 
     Args:
-        df: Union[QCDataFrame, pd.DataFrame, str]
+        df: Union[SCDataFrame, pd.DataFrame, str]
             DataFrame or file with converted output from CytoTable.
         metadata_columns: List[str]
             List of metadata columns that should be outputted with the outlier data.
@@ -144,9 +144,9 @@ def find_outliers(
             Outlier data frame for the given conditions.
     """
 
-    # interpret the df as QCDataFrame
-    if not isinstance(df, QCDataFrame):
-        df = QCDataFrame(data=df)
+    # interpret the df as SCDataFrame
+    if not isinstance(df, SCDataFrame):
+        df = SCDataFrame(data=df)
 
     if isinstance(feature_thresholds, str):
         feature_thresholds = read_thresholds_set_from_file(
@@ -179,7 +179,7 @@ def find_outliers(
 
 
 def label_outliers(
-    df: Union[QCDataFrame, pd.DataFrame, str],
+    df: Union[SCDataFrame, pd.DataFrame, str],
     feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
@@ -189,7 +189,7 @@ def label_outliers(
     where a cell passed or failed the quality control condition(s).
 
         Args:
-            df: Union[QCDataFrame, pd.DataFrame, str]
+            df: Union[SCDataFrame, pd.DataFrame, str]
                 DataFrame or file with converted output from CytoTable.
             feature_thresholds: Dict[str, float]
                 One of two options:
@@ -211,9 +211,9 @@ def label_outliers(
                 Full dataframe with optional scores and outlier boolean column.
     """
 
-    # interpret the df as QCDataFrame
-    if not isinstance(df, QCDataFrame):
-        df = QCDataFrame(data=df)
+    # interpret the df as SCDataFrame
+    if not isinstance(df, SCDataFrame):
+        df = SCDataFrame(data=df)
 
     # for single outlier processing
     if isinstance(feature_thresholds, (str, dict)):
diff --git a/src/cosmicqc/qcdataframe.py b/src/cosmicqc/scdataframe.py
similarity index 77%
rename from src/cosmicqc/qcdataframe.py
rename to src/cosmicqc/scdataframe.py
index 8498311..da0e1c1 100644
--- a/src/cosmicqc/qcdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -1,5 +1,5 @@
 """
-Defines a QCDataFrame class for use in coSMicQC.
+Defines a SCDataFrame class for use in coSMicQC.
 """
 
 import pathlib
@@ -8,7 +8,7 @@
 import pandas as pd
 
 
-class QCDataFrame:
+class SCDataFrame:
     """
     A class to handle and load different types of data files into a pandas DataFrame.
 
@@ -17,8 +17,8 @@ class QCDataFrame:
     pandas DataFrame.
 
     Attributes:
-        reference (str):
-            A string indicating the type of data source, either 'pd.DataFrame'
+        data_source (str):
+            A string indicating the data source, either 'pd.DataFrame'
             or the file path.
         data (pd.DataFrame):
             The loaded data in a pandas DataFrame.
@@ -26,13 +26,19 @@ class QCDataFrame:
     Methods:
         __call__():
             Returns the underlying pandas DataFrame.
+        __repr__():
+            Returns representation of underlying pandas DataFrame.
+        __getattr__():
+            Returns underlying attributes of pandas DataFrame.
+        __getitem__():
+            Returns slice of data from pandas DataFrame.
     """
 
     def __init__(
         self: Self, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any]
     ) -> None:
         """
-        Initializes the QCDataFrame with either a DataFrame or a file path.
+        Initializes the SCDataFrame with either a DataFrame or a file path.
 
         Args:
             data (Union[pd.DataFrame, str]):
@@ -42,14 +48,14 @@ def __init__(
         """
 
         if isinstance(data, pd.DataFrame):
-            # if data is a pd.DataFrame, remember this within the reference attr
-            self.reference = "pd.DataFrame"
+            # if data is a pd.DataFrame, remember this within the data_source attr
+            self.data_source = "pd.DataFrame"
             self.data = data
 
         elif isinstance(data, pathlib.Path | str):
             # if the data is a string, remember the original source
-            # through a reference attr
-            self.reference = data
+            # through a data_source attr
+            self.data_source = data
 
             # interpret the data through pathlib
             data_path = pathlib.Path(data)
@@ -64,9 +70,10 @@ def __init__(
             elif data_path.suffix == ".parquet":
                 # read as a Parquet file
                 self.data = pd.read_parquet(data, **kwargs)
-
+            else:
+                raise ValueError("Unsupported file format for SCDataFrame.")
         else:
-            raise ValueError("Unsupported file format for QCDataFrame.")
+            raise ValueError("Unsupported input type for SCDataFrame.")
 
     def __call__(self: Self) -> pd.DataFrame:
         """
@@ -79,12 +86,12 @@ def __call__(self: Self) -> pd.DataFrame:
 
     def __repr__(self: Self) -> pd.DataFrame:
         """
-        Returns the underlying pandas DataFrame.
+        Returns the representation of underlying pandas DataFrame.
 
         Returns:
             pd.DataFrame: The data in a pandas DataFrame.
         """
-        return self.data
+        return repr(self.data)
 
     def __getattr__(self: Self, attr: str) -> Any:  # noqa: ANN401
         """
diff --git a/tests/test_analyze.py b/tests/test_analyze.py
index 21244b9..6ade9a2 100644
--- a/tests/test_analyze.py
+++ b/tests/test_analyze.py
@@ -337,6 +337,7 @@ def test_label_outliers(
     Tests label_outliers
     """
 
+    # compare the dataframe vs csv output to make sure they are equivalent
     pd.testing.assert_frame_equal(
         analyze.label_outliers(
             df=basic_outlier_dataframe,
@@ -451,12 +452,27 @@ def test_label_outliers(
 
 def test_identify_outliers(
     basic_outlier_dataframe: pd.DataFrame,
+    basic_outlier_csv: str,
     cytotable_CFReT_data_df: pd.DataFrame,
 ):
     """
     Tests identify_outliers
     """
 
+    # show that dataframe and csv output are the same
+    pd.testing.assert_frame_equal(
+        analyze.identify_outliers(
+            df=basic_outlier_dataframe,
+            feature_thresholds={"example_feature": 1},
+            include_threshold_scores=True,
+        ),
+        analyze.identify_outliers(
+            df=basic_outlier_csv,
+            feature_thresholds={"example_feature": 1},
+            include_threshold_scores=True,
+        ),
+    )
+
     assert analyze.identify_outliers(
         df=basic_outlier_dataframe,
         feature_thresholds={"example_feature": 1},
diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py
index 1b1fad4..d2ec019 100644
--- a/tests/test_qcdataframe.py
+++ b/tests/test_qcdataframe.py
@@ -1,45 +1,45 @@
 """
-Tests cosmicqc qcdataframe module
+Tests cosmicqc SCDataFrame module
 """
 
 import pandas as pd
-from cosmicqc.qcdataframe import QCDataFrame
+from cosmicqc.scdataframe import SCDataFrame
 
 
-def test_qcdataframe_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame):
+def test_SCDataFrame_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame):
     """
-    Tests QCDataFrame with pd.DataFrame input.
+    Tests SCDataFrame with pd.DataFrame input.
     """
-    qc_df = QCDataFrame(data=basic_outlier_dataframe)
-    assert qc_df.reference == "pd.DataFrame"
-    assert qc_df.equals(basic_outlier_dataframe)
+    sc_df = SCDataFrame(data=basic_outlier_dataframe)
+    assert sc_df.data_source == "pd.DataFrame"
+    assert sc_df.equals(basic_outlier_dataframe)
 
 
-def test_qcdataframe_init_with_csv(basic_outlier_csv: str):
+def test_SCDataFrame_init_with_csv(basic_outlier_csv: str):
     """
-    Tests QCDataFrame with CSV input.
+    Tests SCDataFrame with CSV input.
     """
-    qc_df = QCDataFrame(data=basic_outlier_csv)
+    sc_df = SCDataFrame(data=basic_outlier_csv)
     expected_df = pd.read_csv(basic_outlier_csv)
-    assert qc_df.reference == basic_outlier_csv
-    assert qc_df.equals(expected_df)
+    assert sc_df.data_source == basic_outlier_csv
+    assert sc_df.equals(expected_df)
 
 
-def test_qcdataframe_init_with_tsv(basic_outlier_tsv: str):
+def test_SCDataFrame_init_with_tsv(basic_outlier_tsv: str):
     """
-    Tests QCDataFrame with TSV input.
+    Tests SCDataFrame with TSV input.
     """
-    qc_df = QCDataFrame(data=basic_outlier_tsv)
+    sc_df = SCDataFrame(data=basic_outlier_tsv)
     expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t")
-    assert qc_df.reference == basic_outlier_tsv
-    assert qc_df.equals(expected_df)
+    assert sc_df.data_source == basic_outlier_tsv
+    assert sc_df.equals(expected_df)
 
 
-def test_qcdataframe_init_with_parquet(basic_outlier_parquet: str):
+def test_SCDataFrame_init_with_parquet(basic_outlier_parquet: str):
     """
-    Tests QCDataFrame with TSV input.
+    Tests SCDataFrame with TSV input.
     """
-    qc_df = QCDataFrame(data=basic_outlier_parquet)
+    sc_df = SCDataFrame(data=basic_outlier_parquet)
     expected_df = pd.read_parquet(basic_outlier_parquet)
-    assert qc_df.reference == basic_outlier_parquet
-    assert qc_df.equals(expected_df)
+    assert sc_df.data_source == basic_outlier_parquet
+    assert sc_df.equals(expected_df)

From 1bf8262dcdf160592ffef24cdb0c62b9c0c277a4 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 11 Jun 2024 08:58:09 -0600
Subject: [PATCH 08/27] add back compat for self type

---
 src/cosmicqc/scdataframe.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index da0e1c1..37e52a7 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -3,10 +3,14 @@
 """
 
 import pathlib
-from typing import Any, Dict, Self, Union
+from typing import Any, Dict, TypeVar, Union
 
 import pandas as pd
 
+# provide backwards compatibility for Self type in earlier Python versions.
+# see: https://peps.python.org/pep-0484/#annotating-instance-and-class-methods
+Self_SCDataFrame = TypeVar("Self_SCDataFrame", bound="SCDataFrame")
+
 
 class SCDataFrame:
     """
@@ -35,7 +39,7 @@ class SCDataFrame:
     """
 
     def __init__(
-        self: Self, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any]
+        self: Self_SCDataFrame, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any]
     ) -> None:
         """
         Initializes the SCDataFrame with either a DataFrame or a file path.
@@ -75,7 +79,7 @@ def __init__(
         else:
             raise ValueError("Unsupported input type for SCDataFrame.")
 
-    def __call__(self: Self) -> pd.DataFrame:
+    def __call__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
         Returns the underlying pandas DataFrame.
 
@@ -84,7 +88,7 @@ def __call__(self: Self) -> pd.DataFrame:
         """
         return self.data
 
-    def __repr__(self: Self) -> pd.DataFrame:
+    def __repr__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
         Returns the representation of underlying pandas DataFrame.
 
@@ -93,7 +97,7 @@ def __repr__(self: Self) -> pd.DataFrame:
         """
         return repr(self.data)
 
-    def __getattr__(self: Self, attr: str) -> Any:  # noqa: ANN401
+    def __getattr__(self: Self_SCDataFrame, attr: str) -> Any:  # noqa: ANN401
         """
         Intercept attribute accesses and delegate them to the underlying
         pandas DataFrame.
@@ -106,7 +110,7 @@ def __getattr__(self: Self, attr: str) -> Any:  # noqa: ANN401
         """
         return getattr(self.data, attr)
 
-    def __getitem__(self: Self, key: Union[int, str]) -> Any:  # noqa: ANN401
+    def __getitem__(self: Self_SCDataFrame, key: Union[int, str]) -> Any:  # noqa: ANN401
         """
         Returns an element or a slice of the underlying pandas DataFrame.
 

From 772f89560e7fac449cd4bf6396779ef5182b46d9 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 11 Jun 2024 09:07:13 -0600
Subject: [PATCH 09/27] back compat for isinstance

---
 src/cosmicqc/scdataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 37e52a7..3fc349c 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -56,7 +56,7 @@ def __init__(
             self.data_source = "pd.DataFrame"
             self.data = data
 
-        elif isinstance(data, pathlib.Path | str):
+        elif isinstance(data, pathlib.Path) or isinstance(data, str): # noqa: PLR1701, SIM101
             # if the data is a string, remember the original source
             # through a data_source attr
             self.data_source = data

From d0ea33c83e114a1454ed7448e957da97b0b18e8d Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 11 Jun 2024 09:08:18 -0600
Subject: [PATCH 10/27] linting

---
 src/cosmicqc/scdataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 3fc349c..f94bdd2 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -56,7 +56,7 @@ def __init__(
             self.data_source = "pd.DataFrame"
             self.data = data
 
-        elif isinstance(data, pathlib.Path) or isinstance(data, str): # noqa: PLR1701, SIM101
+        elif isinstance(data, pathlib.Path) or isinstance(data, str):  # noqa: PLR1701, SIM101
             # if the data is a string, remember the original source
             # through a data_source attr
             self.data_source = data

From f8773b26de54a6ac306ae9ac1c1a220aa21ae427 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Fri, 14 Jun 2024 15:42:39 -0600
Subject: [PATCH 11/27] add csv.gz compatibility

---
 src/cosmicqc/scdataframe.py |  3 +++
 tests/conftest.py           | 15 +++++++++++++++
 tests/test_qcdataframe.py   | 10 ++++++++++
 3 files changed, 28 insertions(+)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index f94bdd2..a06b643 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -68,6 +68,9 @@ def __init__(
             if data_path.suffix == ".csv":
                 # read as a CSV
                 self.data = pd.read_csv(data, **kwargs)
+            elif data_path.suffixes == [".csv", ".gz"]:
+                # read as a CSV.GZ file
+                self.data = pd.read_csv(data, compression="gzip", **kwargs)
             elif data_path.suffix in (".tsv", ".txt"):
                 # read as a TSV
                 self.data = pd.read_csv(data, delimiter="\t", **kwargs)
diff --git a/tests/conftest.py b/tests/conftest.py
index e3fb24c..ed8591e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -41,6 +41,21 @@ def fixture_basic_outlier_csv(
     return csv_path
 
 
+@pytest.fixture(name="basic_outlier_csv_gz")
+def fixture_basic_outlier_csv_gz(
+    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
+):
+    """
+    Creates basic example data csv for use in tests
+    """
+
+    basic_outlier_dataframe.to_csv(
+        csv_gz_path := tmp_path / "example.csv.gz", index=False, compression="gzip"
+    )
+
+    return csv_gz_path
+
+
 @pytest.fixture(name="basic_outlier_tsv")
 def fixture_basic_outlier_tsv(
     tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
diff --git a/tests/test_qcdataframe.py b/tests/test_qcdataframe.py
index d2ec019..75e7448 100644
--- a/tests/test_qcdataframe.py
+++ b/tests/test_qcdataframe.py
@@ -25,6 +25,16 @@ def test_SCDataFrame_init_with_csv(basic_outlier_csv: str):
     assert sc_df.equals(expected_df)
 
 
+def test_SCDataFrame_init_with_csv_gz(basic_outlier_csv_gz: str):
+    """
+    Tests SCDataFrame with CSV input.
+    """
+    sc_df = SCDataFrame(data=basic_outlier_csv_gz)
+    expected_df = pd.read_csv(basic_outlier_csv_gz)
+    assert sc_df.data_source == basic_outlier_csv_gz
+    assert sc_df.equals(expected_df)
+
+
 def test_SCDataFrame_init_with_tsv(basic_outlier_tsv: str):
     """
     Tests SCDataFrame with TSV input.

From 0ad522f32b8008b2176a9f41a47f6754bde6f637 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 18 Jun 2024 09:17:31 -0600
Subject: [PATCH 12/27] add export capabilities

---
 src/cosmicqc/scdataframe.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index a06b643..730718a 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -81,6 +81,32 @@ def __init__(
                 raise ValueError("Unsupported file format for SCDataFrame.")
         else:
             raise ValueError("Unsupported input type for SCDataFrame.")
+        
+    def export(self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]) -> None:
+        """
+        Exports the underlying pandas DataFrame to a file.
+
+        Args:
+            file_path (str): The path where the DataFrame should be saved.
+            **kwargs: Additional keyword arguments to pass to the pandas to_* methods.
+        """
+
+        data_path = pathlib.Path(file_path)
+
+        # export to csv
+        if data_path.suffix == ".csv":
+            self.data.to_csv(file_path, **kwargs)
+
+        # export to tsv
+        elif data_path.suffix in (".tsv", ".txt"):
+            self.data.to_csv(file_path, sep='\t', **kwargs)
+
+        # export to parquet
+        elif data_path.suffix == ".parquet":
+            self.data.to_parquet(file_path, **kwargs)
+
+        else:
+            raise ValueError("Unsupported file format for export.")
 
     def __call__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
@@ -100,10 +126,10 @@ def __repr__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
         return repr(self.data)
 
-    def __getattr__(self: Self_SCDataFrame, attr: str) -> Any:  # noqa: ANN401
+    def __getattr__(self, attr: str) -> Any:  # noqa: ANN401
         """
         Intercept attribute accesses and delegate them to the underlying
-        pandas DataFrame.
+        pandas DataFrame, except for custom methods.
 
         Args:
             attr (str): The name of the attribute being accessed.
@@ -111,6 +137,8 @@ def __getattr__(self: Self_SCDataFrame, attr: str) -> Any:  # noqa: ANN401
         Returns:
             Any: The value of the attribute from the pandas DataFrame.
         """
+        if attr in self.__dict__:
+            return self.__dict__[attr]
         return getattr(self.data, attr)
 
     def __getitem__(self: Self_SCDataFrame, key: Union[int, str]) -> Any:  # noqa: ANN401

From 6d33de4e8571bfb780fa64ce277814b56c84adbf Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 18 Jun 2024 09:18:00 -0600
Subject: [PATCH 13/27] rename file to correct module name

---
 tests/{test_qcdataframe.py => test_scdataframe.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{test_qcdataframe.py => test_scdataframe.py} (100%)

diff --git a/tests/test_qcdataframe.py b/tests/test_scdataframe.py
similarity index 100%
rename from tests/test_qcdataframe.py
rename to tests/test_scdataframe.py

From 34cf9bfc796f084a205a0e74075c41f2236e8c81 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 18 Jun 2024 09:58:03 -0600
Subject: [PATCH 14/27] add export capabilities

---
 src/cosmicqc/scdataframe.py | 17 +++++-----
 tests/test_scdataframe.py   | 62 ++++++++++++++++++++++++++++++++++---
 2 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 730718a..4a4acae 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -81,8 +81,10 @@ def __init__(
                 raise ValueError("Unsupported file format for SCDataFrame.")
         else:
             raise ValueError("Unsupported input type for SCDataFrame.")
-        
-    def export(self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]) -> None:
+
+    def export(
+        self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]
+    ) -> None:
         """
         Exports the underlying pandas DataFrame to a file.
 
@@ -94,17 +96,14 @@ def export(self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]) ->
         data_path = pathlib.Path(file_path)
 
         # export to csv
-        if data_path.suffix == ".csv":
+        if ".csv" in data_path.suffixes:
             self.data.to_csv(file_path, **kwargs)
-
         # export to tsv
-        elif data_path.suffix in (".tsv", ".txt"):
-            self.data.to_csv(file_path, sep='\t', **kwargs)
-
+        elif any(elem in data_path.suffixes for elem in (".tsv", ".txt")):
+            self.data.to_csv(file_path, sep="\t", **kwargs)
         # export to parquet
         elif data_path.suffix == ".parquet":
             self.data.to_parquet(file_path, **kwargs)
-
         else:
             raise ValueError("Unsupported file format for export.")
 
@@ -126,7 +125,7 @@ def __repr__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
         return repr(self.data)
 
-    def __getattr__(self, attr: str) -> Any:  # noqa: ANN401
+    def __getattr__(self: Self_SCDataFrame, attr: str) -> Any:  # noqa: ANN401
         """
         Intercept attribute accesses and delegate them to the underlying
         pandas DataFrame, except for custom methods.
diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py
index 75e7448..78e5dd9 100644
--- a/tests/test_scdataframe.py
+++ b/tests/test_scdataframe.py
@@ -2,54 +2,106 @@
 Tests cosmicqc SCDataFrame module
 """
 
+import pathlib
+
 import pandas as pd
 from cosmicqc.scdataframe import SCDataFrame
+from pyarrow import parquet
 
 
-def test_SCDataFrame_init_with_dataframe(basic_outlier_dataframe: pd.DataFrame):
+def test_SCDataFrame_with_dataframe(
+    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
+):
     """
     Tests SCDataFrame with pd.DataFrame input.
     """
+
     sc_df = SCDataFrame(data=basic_outlier_dataframe)
+
+    # test that we ingested the data properly
     assert sc_df.data_source == "pd.DataFrame"
     assert sc_df.equals(basic_outlier_dataframe)
 
+    # test export
+    basic_outlier_dataframe.to_parquet(
+        control_path := f"{tmp_path}/df_input_example.parquet"
+    )
+    sc_df.export(test_path := f"{tmp_path}/df_input_example1.parquet")
 
-def test_SCDataFrame_init_with_csv(basic_outlier_csv: str):
+    assert parquet.read_table(control_path).equals(parquet.read_table(test_path))
+
+
+def test_SCDataFrame_with_csv(tmp_path: pathlib.Path, basic_outlier_csv: str):
     """
     Tests SCDataFrame with CSV input.
     """
+
     sc_df = SCDataFrame(data=basic_outlier_csv)
     expected_df = pd.read_csv(basic_outlier_csv)
+
+    # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_csv
     assert sc_df.equals(expected_df)
 
+    # test export
+    sc_df.export(test_path := f"{tmp_path}/df_input_example.csv", index=False)
+
+    pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path))
+
 
-def test_SCDataFrame_init_with_csv_gz(basic_outlier_csv_gz: str):
+def test_SCDataFrame_with_csv_gz(tmp_path: pathlib.Path, basic_outlier_csv_gz: str):
     """
     Tests SCDataFrame with CSV input.
     """
+
     sc_df = SCDataFrame(data=basic_outlier_csv_gz)
     expected_df = pd.read_csv(basic_outlier_csv_gz)
+
+    # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_csv_gz
     assert sc_df.equals(expected_df)
 
+    # test export
+    sc_df.export(test_path := f"{tmp_path}/df_input_example.csv.gz", index=False)
 
-def test_SCDataFrame_init_with_tsv(basic_outlier_tsv: str):
+    pd.testing.assert_frame_equal(
+        expected_df, pd.read_csv(test_path, compression="gzip")
+    )
+
+
+def test_SCDataFrame_with_tsv(tmp_path: pathlib.Path, basic_outlier_tsv: str):
     """
     Tests SCDataFrame with TSV input.
     """
+
     sc_df = SCDataFrame(data=basic_outlier_tsv)
     expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t")
+
+    # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_tsv
     assert sc_df.equals(expected_df)
 
+    # test export
+    sc_df.export(test_path := f"{tmp_path}/df_input_example.tsv", index=False)
+
+    pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path, sep="\t"))
 
-def test_SCDataFrame_init_with_parquet(basic_outlier_parquet: str):
+
+def test_SCDataFrame_with_parquet(tmp_path: pathlib.Path, basic_outlier_parquet: str):
     """
     Tests SCDataFrame with TSV input.
     """
+
     sc_df = SCDataFrame(data=basic_outlier_parquet)
     expected_df = pd.read_parquet(basic_outlier_parquet)
+
+    # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_parquet
     assert sc_df.equals(expected_df)
+
+    # test export
+    sc_df.export(test_path := f"{tmp_path}/df_input_example2.parquet")
+
+    assert parquet.read_table(basic_outlier_parquet).equals(
+        parquet.read_table(test_path)
+    )

From fbedb7dfa5302d1ae5dd436cc052ebc6ede116b8 Mon Sep 17 00:00:00 2001
From: Dave Bunten <ekgto445@gmail.com>
Date: Tue, 18 Jun 2024 16:53:09 -0600
Subject: [PATCH 15/27] Apply suggestions from code review

Co-authored-by: Gregory Way <gregory.way@gmail.com>
---
 src/cosmicqc/scdataframe.py | 6 +++---
 tests/conftest.py           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 4a4acae..d364764 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -31,9 +31,9 @@ class SCDataFrame:
         __call__():
             Returns the underlying pandas DataFrame.
         __repr__():
-            Returns representation of underlying pandas DataFrame.
+            Returns a representation of the underlying pandas DataFrame.
         __getattr__():
-            Returns underlying attributes of pandas DataFrame.
+            Returns the underlying attributes of the pandas DataFrame.
         __getitem__():
             Returns slice of data from pandas DataFrame.
     """
@@ -118,7 +118,7 @@ def __call__(self: Self_SCDataFrame) -> pd.DataFrame:
 
     def __repr__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
-        Returns the representation of underlying pandas DataFrame.
+        Returns the representation of the underlying pandas DataFrame.
 
         Returns:
             pd.DataFrame: The data in a pandas DataFrame.
diff --git a/tests/conftest.py b/tests/conftest.py
index ed8591e..bc2883d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -36,7 +36,7 @@ def fixture_basic_outlier_csv(
     Creates basic example data csv for use in tests
     """
 
-    basic_outlier_dataframe.to_csv(csv_path := tmp_path / "example.csv", index=False)
+    basic_outlier_dataframe.to_csv(csv_path := tmp_path / "basic_example.csv", index=False)
 
     return csv_path
 

From 025701497204e9bae941a5ccdf1b4123741c3290 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 18 Jun 2024 17:00:49 -0600
Subject: [PATCH 16/27] update tests and docs

---
 pyproject.toml              |  2 +-
 src/cosmicqc/scdataframe.py | 20 +++++++++-----------
 tests/conftest.py           |  4 +++-
 tests/test_scdataframe.py   | 15 ++++++---------
 4 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f3231ed..67af76e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,7 @@ select = [
 # Ignore `E402` and `F401` (unused imports) in all `__init__.py` files
 "__init__.py" = ["E402", "F401"]
 # ignore typing rules for tests
-"tests/*" = ["ANN201"]
+"tests/*" = ["ANN201", "PLR0913"]
 
 # set dynamic versioning capabilities for project
 [tool.poetry-dynamic-versioning]
diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index d364764..17dd0ce 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -31,7 +31,7 @@ class SCDataFrame:
         __call__():
             Returns the underlying pandas DataFrame.
         __repr__():
-            Returns a representation of the underlying pandas DataFrame.
+            Returns a representational string of the underlying pandas DataFrame.
         __getattr__():
             Returns the underlying attributes of the pandas DataFrame.
         __getitem__():
@@ -53,10 +53,10 @@ def __init__(
 
         if isinstance(data, pd.DataFrame):
             # if data is a pd.DataFrame, remember this within the data_source attr
-            self.data_source = "pd.DataFrame"
+            self.data_source = "pandas.DataFrame"
             self.data = data
 
-        elif isinstance(data, pathlib.Path) or isinstance(data, str):  # noqa: PLR1701, SIM101
+        elif isinstance(data, pathlib.Path | str):
             # if the data is a string, remember the original source
             # through a data_source attr
             self.data_source = data
@@ -65,15 +65,13 @@ def __init__(
             data_path = pathlib.Path(data)
 
             # Read the data from the file based on its extension
-            if data_path.suffix == ".csv":
-                # read as a CSV
+            if (
+                data_path.suffix == ".csv"
+                or data_path.suffix in (".tsv", ".txt")
+                or data_path.suffixes == [".csv", ".gz"]
+            ):
+                # read as a CSV, CSV.GZ, .TSV, or .TXT file
                 self.data = pd.read_csv(data, **kwargs)
-            elif data_path.suffixes == [".csv", ".gz"]:
-                # read as a CSV.GZ file
-                self.data = pd.read_csv(data, compression="gzip", **kwargs)
-            elif data_path.suffix in (".tsv", ".txt"):
-                # read as a TSV
-                self.data = pd.read_csv(data, delimiter="\t", **kwargs)
             elif data_path.suffix == ".parquet":
                 # read as a Parquet file
                 self.data = pd.read_parquet(data, **kwargs)
diff --git a/tests/conftest.py b/tests/conftest.py
index bc2883d..8f97176 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -36,7 +36,9 @@ def fixture_basic_outlier_csv(
     Creates basic example data csv for use in tests
     """
 
-    basic_outlier_dataframe.to_csv(csv_path := tmp_path / "basic_example.csv", index=False)
+    basic_outlier_dataframe.to_csv(
+        csv_path := tmp_path / "basic_example.csv", index=False
+    )
 
     return csv_path
 
diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py
index 78e5dd9..713fc97 100644
--- a/tests/test_scdataframe.py
+++ b/tests/test_scdataframe.py
@@ -10,7 +10,12 @@
 
 
 def test_SCDataFrame_with_dataframe(
-    tmp_path: pathlib.Path, basic_outlier_dataframe: pd.DataFrame
+    tmp_path: pathlib.Path,
+    basic_outlier_dataframe: pd.DataFrame,
+    basic_outlier_csv: str,
+    basic_outlier_csv_gz: str,
+    basic_outlier_tsv: str,
+    basic_outlier_parquet: str,
 ):
     """
     Tests SCDataFrame with pd.DataFrame input.
@@ -30,8 +35,6 @@ def test_SCDataFrame_with_dataframe(
 
     assert parquet.read_table(control_path).equals(parquet.read_table(test_path))
 
-
-def test_SCDataFrame_with_csv(tmp_path: pathlib.Path, basic_outlier_csv: str):
     """
     Tests SCDataFrame with CSV input.
     """
@@ -48,8 +51,6 @@ def test_SCDataFrame_with_csv(tmp_path: pathlib.Path, basic_outlier_csv: str):
 
     pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path))
 
-
-def test_SCDataFrame_with_csv_gz(tmp_path: pathlib.Path, basic_outlier_csv_gz: str):
     """
     Tests SCDataFrame with CSV input.
     """
@@ -68,8 +69,6 @@ def test_SCDataFrame_with_csv_gz(tmp_path: pathlib.Path, basic_outlier_csv_gz: s
         expected_df, pd.read_csv(test_path, compression="gzip")
     )
 
-
-def test_SCDataFrame_with_tsv(tmp_path: pathlib.Path, basic_outlier_tsv: str):
     """
     Tests SCDataFrame with TSV input.
     """
@@ -86,8 +85,6 @@ def test_SCDataFrame_with_tsv(tmp_path: pathlib.Path, basic_outlier_tsv: str):
 
     pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path, sep="\t"))
 
-
-def test_SCDataFrame_with_parquet(tmp_path: pathlib.Path, basic_outlier_parquet: str):
     """
     Tests SCDataFrame with TSV input.
     """

From 985a6dda77d5d57fe894f262ba1aa6e3538eb6a6 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Wed, 19 Jun 2024 08:31:14 -0600
Subject: [PATCH 17/27] fix tests

---
 pyproject.toml              | 2 +-
 src/cosmicqc/scdataframe.py | 6 ++++--
 tests/test_scdataframe.py   | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 67af76e..5adde66 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ profile = "black"
 exclude_dirs = ["tests"]
 
 [tool.ruff]
-target-version = "py311"
+target-version = "py38"
 line-length = 88
 fix = true
 
diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 17dd0ce..d96cbee 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -39,7 +39,9 @@ class SCDataFrame:
     """
 
     def __init__(
-        self: Self_SCDataFrame, data: Union[pd.DataFrame, str], **kwargs: Dict[str, Any]
+        self: Self_SCDataFrame,
+        data: Union[pd.DataFrame, str, pathlib.Path],
+        **kwargs: Dict[str, Any],
     ) -> None:
         """
         Initializes the SCDataFrame with either a DataFrame or a file path.
@@ -56,7 +58,7 @@ def __init__(
             self.data_source = "pandas.DataFrame"
             self.data = data
 
-        elif isinstance(data, pathlib.Path | str):
+        elif isinstance(data, (pathlib.Path, str)):
             # if the data is a string, remember the original source
             # through a data_source attr
             self.data_source = data
diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py
index 713fc97..bcb520c 100644
--- a/tests/test_scdataframe.py
+++ b/tests/test_scdataframe.py
@@ -24,7 +24,7 @@ def test_SCDataFrame_with_dataframe(
     sc_df = SCDataFrame(data=basic_outlier_dataframe)
 
     # test that we ingested the data properly
-    assert sc_df.data_source == "pd.DataFrame"
+    assert sc_df.data_source == "pandas.DataFrame"
     assert sc_df.equals(basic_outlier_dataframe)
 
     # test export

From 5034a0727473fd382944761e4ccce8754d628795 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Wed, 19 Jun 2024 11:01:36 -0600
Subject: [PATCH 18/27] update tests; add constructor path for scdataframe

---
 src/cosmicqc/analyze.py     |  9 +++------
 src/cosmicqc/scdataframe.py | 25 +++++++++++++++----------
 tests/test_scdataframe.py   | 31 +++++++++++--------------------
 3 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
index d616f78..abe43d6 100644
--- a/src/cosmicqc/analyze.py
+++ b/src/cosmicqc/analyze.py
@@ -55,8 +55,7 @@ def identify_outliers(
     """
 
     # interpret the df as SCDataFrame
-    if not isinstance(df, SCDataFrame):
-        df = SCDataFrame(data=df)
+    df = SCDataFrame(data=df)
 
     # create a copy of the dataframe to ensure
     # we don't modify the supplied dataframe inplace.
@@ -145,8 +144,7 @@ def find_outliers(
     """
 
     # interpret the df as SCDataFrame
-    if not isinstance(df, SCDataFrame):
-        df = SCDataFrame(data=df)
+    df = SCDataFrame(data=df)
 
     if isinstance(feature_thresholds, str):
         feature_thresholds = read_thresholds_set_from_file(
@@ -212,8 +210,7 @@ def label_outliers(
     """
 
     # interpret the df as SCDataFrame
-    if not isinstance(df, SCDataFrame):
-        df = SCDataFrame(data=df)
+    df = SCDataFrame(data=df)
 
     # for single outlier processing
     if isinstance(feature_thresholds, (str, dict)):
diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index d96cbee..bbe29db 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -9,7 +9,7 @@
 
 # provide backwards compatibility for Self type in earlier Python versions.
 # see: https://peps.python.org/pep-0484/#annotating-instance-and-class-methods
-Self_SCDataFrame = TypeVar("Self_SCDataFrame", bound="SCDataFrame")
+SCDataFrame_type = TypeVar("SCDataFrame_type", bound="SCDataFrame")
 
 
 class SCDataFrame:
@@ -39,8 +39,8 @@ class SCDataFrame:
     """
 
     def __init__(
-        self: Self_SCDataFrame,
-        data: Union[pd.DataFrame, str, pathlib.Path],
+        self: SCDataFrame_type,
+        data: Union[SCDataFrame_type, pd.DataFrame, str, pathlib.Path],
         **kwargs: Dict[str, Any],
     ) -> None:
         """
@@ -53,7 +53,12 @@ def __init__(
                 Additional keyword arguments to pass to the pandas read functions.
         """
 
-        if isinstance(data, pd.DataFrame):
+        if isinstance(data, SCDataFrame):
+            # if data is an instance of SCDataFrame, use its data_source and data
+            self.data_source = data.data_source
+            self.data = data.data
+
+        elif isinstance(data, pd.DataFrame):
             # if data is a pd.DataFrame, remember this within the data_source attr
             self.data_source = "pandas.DataFrame"
             self.data = data
@@ -80,10 +85,10 @@ def __init__(
             else:
                 raise ValueError("Unsupported file format for SCDataFrame.")
         else:
-            raise ValueError("Unsupported input type for SCDataFrame.")
+            raise ValueError("Unsupported data type for SCDataFrame.")
 
     def export(
-        self: Self_SCDataFrame, file_path: str, **kwargs: Dict[str, Any]
+        self: SCDataFrame_type, file_path: str, **kwargs: Dict[str, Any]
     ) -> None:
         """
         Exports the underlying pandas DataFrame to a file.
@@ -107,7 +112,7 @@ def export(
         else:
             raise ValueError("Unsupported file format for export.")
 
-    def __call__(self: Self_SCDataFrame) -> pd.DataFrame:
+    def __call__(self: SCDataFrame_type) -> pd.DataFrame:
         """
         Returns the underlying pandas DataFrame.
 
@@ -116,7 +121,7 @@ def __call__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
         return self.data
 
-    def __repr__(self: Self_SCDataFrame) -> pd.DataFrame:
+    def __repr__(self: SCDataFrame_type) -> pd.DataFrame:
         """
         Returns the representation of the underlying pandas DataFrame.
 
@@ -125,7 +130,7 @@ def __repr__(self: Self_SCDataFrame) -> pd.DataFrame:
         """
         return repr(self.data)
 
-    def __getattr__(self: Self_SCDataFrame, attr: str) -> Any:  # noqa: ANN401
+    def __getattr__(self: SCDataFrame_type, attr: str) -> Any:  # noqa: ANN401
         """
         Intercept attribute accesses and delegate them to the underlying
         pandas DataFrame, except for custom methods.
@@ -140,7 +145,7 @@ def __getattr__(self: Self_SCDataFrame, attr: str) -> Any:  # noqa: ANN401
             return self.__dict__[attr]
         return getattr(self.data, attr)
 
-    def __getitem__(self: Self_SCDataFrame, key: Union[int, str]) -> Any:  # noqa: ANN401
+    def __getitem__(self: SCDataFrame_type, key: Union[int, str]) -> Any:  # noqa: ANN401
         """
         Returns an element or a slice of the underlying pandas DataFrame.
 
diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py
index bcb520c..2627b88 100644
--- a/tests/test_scdataframe.py
+++ b/tests/test_scdataframe.py
@@ -17,10 +17,8 @@ def test_SCDataFrame_with_dataframe(
     basic_outlier_tsv: str,
     basic_outlier_parquet: str,
 ):
-    """
-    Tests SCDataFrame with pd.DataFrame input.
-    """
-
+    
+    # Tests SCDataFrame with pd.DataFrame input.
     sc_df = SCDataFrame(data=basic_outlier_dataframe)
 
     # test that we ingested the data properly
@@ -35,10 +33,7 @@ def test_SCDataFrame_with_dataframe(
 
     assert parquet.read_table(control_path).equals(parquet.read_table(test_path))
 
-    """
-    Tests SCDataFrame with CSV input.
-    """
-
+    # Tests SCDataFrame with CSV input.
     sc_df = SCDataFrame(data=basic_outlier_csv)
     expected_df = pd.read_csv(basic_outlier_csv)
 
@@ -51,10 +46,7 @@ def test_SCDataFrame_with_dataframe(
 
     pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path))
 
-    """
-    Tests SCDataFrame with CSV input.
-    """
-
+    # Tests SCDataFrame with CSV.GZ input.
     sc_df = SCDataFrame(data=basic_outlier_csv_gz)
     expected_df = pd.read_csv(basic_outlier_csv_gz)
 
@@ -69,10 +61,7 @@ def test_SCDataFrame_with_dataframe(
         expected_df, pd.read_csv(test_path, compression="gzip")
     )
 
-    """
-    Tests SCDataFrame with TSV input.
-    """
-
+    # Tests SCDataFrame with TSV input.
     sc_df = SCDataFrame(data=basic_outlier_tsv)
     expected_df = pd.read_csv(basic_outlier_tsv, delimiter="\t")
 
@@ -85,10 +74,7 @@ def test_SCDataFrame_with_dataframe(
 
     pd.testing.assert_frame_equal(expected_df, pd.read_csv(test_path, sep="\t"))
 
-    """
-    Tests SCDataFrame with TSV input.
-    """
-
+    # Tests SCDataFrame with parquet input.
     sc_df = SCDataFrame(data=basic_outlier_parquet)
     expected_df = pd.read_parquet(basic_outlier_parquet)
 
@@ -102,3 +88,8 @@ def test_SCDataFrame_with_dataframe(
     assert parquet.read_table(basic_outlier_parquet).equals(
         parquet.read_table(test_path)
     )
+
+    # test SCDataFrame with SCDataFrame input
+    copy_sc_df = SCDataFrame(data=sc_df)
+
+    pd.testing.assert_frame_equal(copy_sc_df.data, sc_df.data)

From fd818685c9b3d4ae5dea88d7ca3a40a5e1c3615f Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Wed, 19 Jun 2024 11:01:48 -0600
Subject: [PATCH 19/27] linting

---
 tests/test_scdataframe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py
index 2627b88..2781c8c 100644
--- a/tests/test_scdataframe.py
+++ b/tests/test_scdataframe.py
@@ -17,7 +17,6 @@ def test_SCDataFrame_with_dataframe(
     basic_outlier_tsv: str,
     basic_outlier_parquet: str,
 ):
-    
     # Tests SCDataFrame with pd.DataFrame input.
     sc_df = SCDataFrame(data=basic_outlier_dataframe)
 

From 6d61bf3d0cec43780dea125c14bae4a9f27e8724 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Wed, 19 Jun 2024 11:39:05 -0600
Subject: [PATCH 20/27] enable pd.series compatibility

---
 src/cosmicqc/scdataframe.py | 6 ++++++
 tests/test_scdataframe.py   | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index bbe29db..50eb0f8 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -58,6 +58,12 @@ def __init__(
             self.data_source = data.data_source
             self.data = data.data
 
+        elif isinstance(data, pd.Series):
+            # if data is a pd.Series, remember this within the data_source attr
+            self.data_source = "pandas.Series"
+            # also cast the series to a dataframe
+            self.data = pd.DataFrame(data)
+
         elif isinstance(data, pd.DataFrame):
             # if data is a pd.DataFrame, remember this within the data_source attr
             self.data_source = "pandas.DataFrame"
diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py
index 2781c8c..2ddcb03 100644
--- a/tests/test_scdataframe.py
+++ b/tests/test_scdataframe.py
@@ -32,6 +32,13 @@ def test_SCDataFrame_with_dataframe(
 
     assert parquet.read_table(control_path).equals(parquet.read_table(test_path))
 
+    # Tests SCDataFrame with pd.Series input.
+    sc_df = SCDataFrame(data=basic_outlier_dataframe.loc[0])
+
+    # test that we ingested the data properly
+    assert sc_df.data_source == "pandas.Series"
+    assert sc_df.equals(pd.DataFrame(basic_outlier_dataframe.loc[0]))
+
     # Tests SCDataFrame with CSV input.
     sc_df = SCDataFrame(data=basic_outlier_csv)
     expected_df = pd.read_csv(basic_outlier_csv)

From f696b27499e503df64ce40c53d7a80b952454732 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 25 Jun 2024 11:56:35 -0600
Subject: [PATCH 21/27] update comments about input

---
 src/cosmicqc/analyze.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
index abe43d6..bf926e3 100644
--- a/src/cosmicqc/analyze.py
+++ b/src/cosmicqc/analyze.py
@@ -33,7 +33,8 @@ def identify_outliers(
 
     Args:
         df: Union[SCDataFrame, pd.DataFrame, str]
-            DataFrame or file with converted output from CytoTable.
+            DataFrame or string-based filepath of a
+            Parquet, CSV, or TSV file with CytoTable output or similar data.
         metadata_columns: List[str]
             List of metadata columns that should be outputted with the outlier data.
         feature_thresholds: Dict[str, float]
@@ -123,7 +124,8 @@ def find_outliers(
 
     Args:
         df: Union[SCDataFrame, pd.DataFrame, str]
-            DataFrame or file with converted output from CytoTable.
+            DataFrame or string-based filepath of a
+            Parquet, CSV, or TSV file with CytoTable output or similar data.
         metadata_columns: List[str]
             List of metadata columns that should be outputted with the outlier data.
         feature_thresholds: Dict[str, float]
@@ -188,7 +190,8 @@ def label_outliers(
 
         Args:
             df: Union[SCDataFrame, pd.DataFrame, str]
-                DataFrame or file with converted output from CytoTable.
+                DataFrame or string-based filepath of a
+                Parquet, CSV, or TSV file with CytoTable output or similar data.
             feature_thresholds: Dict[str, float]
                 One of two options:
                 A dictionary with the feature name(s) as the key(s) and their assigned

From 07e18c8198e85c5765866b572aace8dd9a97a3ca Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 25 Jun 2024 11:58:07 -0600
Subject: [PATCH 22/27] tsv.gz compression option addition

---
 src/cosmicqc/scdataframe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 50eb0f8..78245f2 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -82,6 +82,7 @@ def __init__(
                 data_path.suffix == ".csv"
                 or data_path.suffix in (".tsv", ".txt")
                 or data_path.suffixes == [".csv", ".gz"]
+                or data_path.suffixes == [".tsv", ".gz"]
             ):
                 # read as a CSV, CSV.GZ, .TSV, or .TXT file
                 self.data = pd.read_csv(data, **kwargs)

From 457da2dfd5eb1f73e256046165683ac34b82d75b Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 25 Jun 2024 11:59:54 -0600
Subject: [PATCH 23/27] general docs consistency

---
 src/cosmicqc/scdataframe.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 78245f2..b683b90 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -50,7 +50,7 @@ def __init__(
             data (Union[pd.DataFrame, str]):
                 The data source, either a pandas DataFrame or a file path.
             **kwargs:
-                Additional keyword arguments to pass to the pandas read functions.
+                Additional keyword arguments to pass to the pandas read_* methods.
         """
 
         if isinstance(data, SCDataFrame):
@@ -101,8 +101,10 @@ def export(
         Exports the underlying pandas DataFrame to a file.
 
         Args:
-            file_path (str): The path where the DataFrame should be saved.
-            **kwargs: Additional keyword arguments to pass to the pandas to_* methods.
+            file_path (str):
+                The path where the DataFrame should be saved.
+            **kwargs:
+                Additional keyword arguments to pass to the pandas to_* methods.
         """
 
         data_path = pathlib.Path(file_path)
@@ -143,10 +145,12 @@ def __getattr__(self: SCDataFrame_type, attr: str) -> Any:  # noqa: ANN401
         pandas DataFrame, except for custom methods.
 
         Args:
-            attr (str): The name of the attribute being accessed.
+            attr (str):
+                The name of the attribute being accessed.
 
         Returns:
-            Any: The value of the attribute from the pandas DataFrame.
+            Any:
+                The value of the attribute from the pandas DataFrame.
         """
         if attr in self.__dict__:
             return self.__dict__[attr]
@@ -157,9 +161,11 @@ def __getitem__(self: SCDataFrame_type, key: Union[int, str]) -> Any:  # noqa: A
         Returns an element or a slice of the underlying pandas DataFrame.
 
         Args:
-            key: The key or slice to access the data.
+            key:
+                The key or slice to access the data.
 
         Returns:
-            pd.DataFrame or any: The selected element or slice of data.
+            pd.DataFrame or any:
+                The selected element or slice of data.
         """
         return self.data[key]

From ccb7cce044081a03c4a1b5abce5ebb8ae0db6a28 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 25 Jun 2024 12:07:19 -0600
Subject: [PATCH 24/27] class docs

---
 src/cosmicqc/scdataframe.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index b683b90..27ca5c9 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -14,11 +14,13 @@
 
 class SCDataFrame:
     """
-    A class to handle and load different types of data files into a pandas DataFrame.
+    A class designed to enhance single-cell data handling by wrapping
+    pandas DataFrame capabilities, providing advanced methods for quality control,
+    comprehensive analysis, and image-based data processing.
 
     This class can initialize with either a pandas DataFrame or a file path (CSV, TSV,
     TXT, or Parquet). When initialized with a file path, it reads the data into a
-    pandas DataFrame.
+    pandas DataFrame. It also includes capabilities to export data.
 
     Attributes:
         data_source (str):

From 341514d40ebcd07e12dee4685bdf8a4e6ade1133 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 25 Jun 2024 12:07:53 -0600
Subject: [PATCH 25/27] string repr

---
 src/cosmicqc/scdataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index 27ca5c9..eb0aa15 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -132,7 +132,7 @@ def __call__(self: SCDataFrame_type) -> pd.DataFrame:
         """
         return self.data
 
-    def __repr__(self: SCDataFrame_type) -> pd.DataFrame:
+    def __repr__(self: SCDataFrame_type) -> str:
         """
         Returns the representation of the underlying pandas DataFrame.
 

From 28c7931be361a0db6bee10e336bc814eff05700c Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 25 Jun 2024 12:21:27 -0600
Subject: [PATCH 26/27] add str-based checks for scdataframe output

---
 tests/test_scdataframe.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_scdataframe.py b/tests/test_scdataframe.py
index 2ddcb03..3ca01c3 100644
--- a/tests/test_scdataframe.py
+++ b/tests/test_scdataframe.py
@@ -23,6 +23,7 @@ def test_SCDataFrame_with_dataframe(
     # test that we ingested the data properly
     assert sc_df.data_source == "pandas.DataFrame"
     assert sc_df.equals(basic_outlier_dataframe)
+    assert str(sc_df) == str(basic_outlier_dataframe)
 
     # test export
     basic_outlier_dataframe.to_parquet(
@@ -38,6 +39,7 @@ def test_SCDataFrame_with_dataframe(
     # test that we ingested the data properly
     assert sc_df.data_source == "pandas.Series"
     assert sc_df.equals(pd.DataFrame(basic_outlier_dataframe.loc[0]))
+    assert str(sc_df) == str(pd.DataFrame(basic_outlier_dataframe.loc[0]))
 
     # Tests SCDataFrame with CSV input.
     sc_df = SCDataFrame(data=basic_outlier_csv)
@@ -46,6 +48,7 @@ def test_SCDataFrame_with_dataframe(
     # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_csv
     assert sc_df.equals(expected_df)
+    assert str(sc_df) == str(expected_df)
 
     # test export
     sc_df.export(test_path := f"{tmp_path}/df_input_example.csv", index=False)
@@ -59,6 +62,7 @@ def test_SCDataFrame_with_dataframe(
     # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_csv_gz
     assert sc_df.equals(expected_df)
+    assert str(sc_df) == str(expected_df)
 
     # test export
     sc_df.export(test_path := f"{tmp_path}/df_input_example.csv.gz", index=False)
@@ -74,6 +78,7 @@ def test_SCDataFrame_with_dataframe(
     # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_tsv
     assert sc_df.equals(expected_df)
+    assert str(sc_df) == str(expected_df)
 
     # test export
     sc_df.export(test_path := f"{tmp_path}/df_input_example.tsv", index=False)
@@ -87,6 +92,7 @@ def test_SCDataFrame_with_dataframe(
     # test that we ingested the data properly
     assert sc_df.data_source == basic_outlier_parquet
     assert sc_df.equals(expected_df)
+    assert str(sc_df) == str(expected_df)
 
     # test export
     sc_df.export(test_path := f"{tmp_path}/df_input_example2.parquet")

From fe73f4038d45ddee9b32c1a3f493977378adaa52 Mon Sep 17 00:00:00 2001
From: Dave Bunten <ekgto445@gmail.com>
Date: Tue, 25 Jun 2024 12:22:58 -0600
Subject: [PATCH 27/27] Update src/cosmicqc/scdataframe.py

Co-authored-by: Jenna Tomkinson <107513215+jenna-tomkinson@users.noreply.github.com>
---
 src/cosmicqc/scdataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py
index eb0aa15..b722a50 100644
--- a/src/cosmicqc/scdataframe.py
+++ b/src/cosmicqc/scdataframe.py
@@ -72,7 +72,7 @@ def __init__(
             self.data = data
 
         elif isinstance(data, (pathlib.Path, str)):
-            # if the data is a string, remember the original source
+            # if the data is a string or a pathlib path, remember the original source
             # through a data_source attr
             self.data_source = data