add yaml threshold input for find_outliers

WayScience · May 20, 2024 · b63f2db · b63f2db
1 parent 3c8e34c
commit b63f2db
Show file tree

Hide file tree

Showing 5 changed files with 239 additions and 6 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ python = ">=3.9,<3.13"
 pandas = "^2.2.2"
 scipy = "^1.13.0"
 pyarrow = "^16.0.0"
+pyyaml = "^6.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.2.0"

diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
@@ -3,15 +3,24 @@
 """
 
 import operator
+import pathlib
 from functools import reduce
-from typing import Dict, List
+from typing import Dict, List, Optional, Union
 
 import pandas as pd
+import yaml
 from scipy.stats import zscore as scipy_zscore
 
+DEFAULT_QC_THRESHOLD_FILE = (
+    f"{pathlib.Path(__file__).parent!s}/data/qc_thresholds_default.yml"
+)
+
 
 def find_outliers(
-    df: pd.DataFrame, feature_thresholds: Dict[str, float], metadata_columns: List[str]
+    df: pd.DataFrame,
+    metadata_columns: List[str],
+    feature_thresholds: Union[Dict[str, float], str],
+    feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
 ) -> pd.DataFrame:
     """
     This function uses z-scoring to format the data for detecting outlier
@@ -23,18 +32,31 @@ def find_outliers(
     Args:
         df: pd.DataFrame
             Data frame with converted output from CytoTable.
+        metadata_columns: List[str]
+            List of metadata columns that should be outputted with the outlier data.
         feature_thresholds: Dict[str, float]
-            Dictionary with the feature name(s) as the key(s) and their assigned
+            One of two options:
+            A dictionary with the feature name(s) as the key(s) and their assigned
             threshold for identifying outliers. Positive int for the threshold
             will detect outliers "above" than the mean, negative int will detect
             outliers "below" the mean.
-        metadata_columns: List[str]
-            List of metadata columns that should be outputted with the outlier data.
+            Or a string naming a threshold set defined under the
+            `thresholds` key of the feature_thresholds_file yaml file.
+        feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE
+            Path to a yaml file of named threshold sets. Only read when
+            feature_thresholds is given as a string.
 
     Returns:
         pd.DataFrame:
             Outlier data frame for the given conditions.
     """
+
+    if isinstance(feature_thresholds, str):
+        feature_thresholds = read_thresholds_set_from_file(
+            feature_thresholds=feature_thresholds,
+            feature_thresholds_file=feature_thresholds_file,
+        )
+
     # Create z-score columns for each feature to reference during outlier detection
     zscore_columns = {}
     for feature in feature_thresholds:
@@ -71,3 +93,40 @@ def find_outliers(
 
     # Return outliers DataFrame with specified columns
     return outliers_df[columns_to_include]
+
+
+def read_thresholds_set_from_file(
+    feature_thresholds: str, feature_thresholds_file: str
+):
+    """
+    Reads a set of feature thresholds from a specified file.
+
+    This function opens the yaml file at the given path, looks up the
+    named threshold set under its top-level `thresholds` key, and
+    returns that set.
+
+    Args:
+        feature_thresholds (str):
+            Name of the threshold set to look up in the file.
+        feature_thresholds_file (str):
+            The path to the yaml file containing named threshold sets.
+
+    Returns:
+        dict: The threshold set stored in the file under the given name.
+
+    Raises:
+        LookupError: If the file does not contain the specified feature_thresholds key.
+    """
+
+    with open(feature_thresholds_file, "r") as file:
+        thresholds = yaml.safe_load(file)
+
+    if feature_thresholds not in thresholds["thresholds"]:
+        raise LookupError(
+            (
+                f"Unable to find threshold set by name {feature_thresholds}"
+                f" within {feature_thresholds_file}"
+            )
+        )
+
+    return thresholds["thresholds"][feature_thresholds]
diff --git a/src/cosmicqc/data/qc_thresholds_default.yml b/src/cosmicqc/data/qc_thresholds_default.yml
@@ -0,0 +1,16 @@
+# defines threshold sets for running qc procedures as part of this project.
+versions:
+  cellprofiler: ">=4.2.4"
+thresholds:
+  # Set a negative threshold to identify both outlier small nuclei
+  # and low formfactor representing non-circular segmentations.
+  small_and_low_formfactor_nuclei:
+    Nuclei_AreaShape_Area: -1
+    Nuclei_AreaShape_FormFactor: -1
+  # find very elongated nuclei segmentations (above mean)
+  elongated_nuclei:
+    Nuclei_AreaShape_Eccentricity: 2
+  # find large nuclei segmentations (above mean) with low formfactor (below mean)
+  large_nuclei:
+    Nuclei_AreaShape_Area: 2
+    Nuclei_AreaShape_FormFactor: -2
diff --git a/tests/test_analyze.py b/tests/test_analyze.py
@@ -3,6 +3,7 @@
 """
 
 import pandas as pd
+import pytest
 from cosmicqc import analyze
 
 
@@ -183,3 +184,99 @@ def test_find_outliers_cfret(cytotable_CFReT_data_df: pd.DataFrame):
             14811: "f01",
         },
     }
+
+
+def test_read_thresholds_set_from_file():
+    """
+    Tests read_thresholds_set_from_file
+    """
+
+    # test that an exception is raised on receiving a bad
+    # lookup value from the thresholds file.
+    with pytest.raises(LookupError):
+        analyze.read_thresholds_set_from_file(
+            feature_thresholds="bad_lookup_value",
+            feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE,
+        )
+
+    # test default threshold sets
+    assert analyze.read_thresholds_set_from_file(
+        feature_thresholds="small_and_low_formfactor_nuclei",
+        feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE,
+    ) == {"Nuclei_AreaShape_Area": -1, "Nuclei_AreaShape_FormFactor": -1}
+
+    assert analyze.read_thresholds_set_from_file(
+        feature_thresholds="elongated_nuclei",
+        feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE,
+    ) == {"Nuclei_AreaShape_Eccentricity": 2}
+
+    assert analyze.read_thresholds_set_from_file(
+        feature_thresholds="large_nuclei",
+        feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE,
+    ) == {"Nuclei_AreaShape_Area": 2, "Nuclei_AreaShape_FormFactor": -2}
+
+
+def test_find_outliers_dict_and_default_config_cfret(
+    cytotable_CFReT_data_df: pd.DataFrame,
+):
+    """
+    Testing find_outliers with dictionary vs yaml threshold sets
+    using CytoTable CFReT data.
+    """
+
+    # metadata columns to include in output data frame
+    metadata_columns = [
+        "Image_Metadata_Plate",
+        "Image_Metadata_Well",
+        "Image_Metadata_Site",
+    ]
+
+    # test that the output is the same from dict vs yaml
+    pd.testing.assert_frame_equal(
+        analyze.find_outliers(
+            df=cytotable_CFReT_data_df,
+            feature_thresholds={
+                "Nuclei_AreaShape_Area": -1,
+                "Nuclei_AreaShape_FormFactor": -1,
+            },
+            metadata_columns=metadata_columns,
+        ),
+        analyze.find_outliers(
+            df=cytotable_CFReT_data_df,
+            feature_thresholds="small_and_low_formfactor_nuclei",
+            metadata_columns=metadata_columns,
+        ),
+    )
+
+    # test that the output is the same from dict vs yaml
+    pd.testing.assert_frame_equal(
+        analyze.find_outliers(
+            df=cytotable_CFReT_data_df,
+            feature_thresholds={
+                "Nuclei_AreaShape_Eccentricity": 2,
+            },
+            metadata_columns=metadata_columns,
+        ),
+        analyze.find_outliers(
+            df=cytotable_CFReT_data_df,
+            feature_thresholds="elongated_nuclei",
+            metadata_columns=metadata_columns,
+        ),
+    )
+
+    # test that the output is the same from dict vs yaml
+    pd.testing.assert_frame_equal(
+        analyze.find_outliers(
+            df=cytotable_CFReT_data_df,
+            feature_thresholds={
+                "Nuclei_AreaShape_Area": 2,
+                "Nuclei_AreaShape_FormFactor": -2,
+            },
+            metadata_columns=metadata_columns,
+        ),
+        analyze.find_outliers(
+            df=cytotable_CFReT_data_df,
+            feature_thresholds="large_nuclei",
+            metadata_columns=metadata_columns,
+        ),
+    )