Skip to content

Commit

Permalink
add yaml threshold input for find_outliers
Browse files Browse the repository at this point in the history
  • Loading branch information
d33bs committed May 20, 2024
1 parent 3c8e34c commit b63f2db
Show file tree
Hide file tree
Showing 5 changed files with 239 additions and 6 deletions.
62 changes: 61 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ python = ">=3.9,<3.13"
pandas = "^2.2.2"
scipy = "^1.13.0"
pyarrow = "^16.0.0"
pyyaml = "^6.0.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.0"
Expand Down
69 changes: 64 additions & 5 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,24 @@
"""

import operator
import pathlib
from functools import reduce
from typing import Dict, List
from typing import Dict, List, Optional, Union

import pandas as pd
import yaml
from scipy.stats import zscore as scipy_zscore

# Location of the bundled default thresholds file, stored as a plain string:
# the directory containing this module, followed by data/qc_thresholds_default.yml.
DEFAULT_QC_THRESHOLD_FILE = "/".join(
    (str(pathlib.Path(__file__).parent), "data", "qc_thresholds_default.yml")
)


def find_outliers(
df: pd.DataFrame, feature_thresholds: Dict[str, float], metadata_columns: List[str]
df: pd.DataFrame,
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
) -> pd.DataFrame:
"""
This function uses z-scoring to format the data for detecting outlier
Expand All @@ -23,18 +32,31 @@ def find_outliers(
Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
Dictionary with the feature name(s) as the key(s) and their assigned
One of two options:
A dictionary with the feature name(s) as the key(s) and their assigned
threshold for identifying outliers. Positive int for the threshold
will detect outliers "above" than the mean, negative int will detect
outliers "below" the mean.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
Or a string which is a named key reference found within
the feature_thresholds_file yaml file.
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.
Returns:
pd.DataFrame:
Outlier data frame for the given conditions.
"""

if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)

# Create z-score columns for each feature to reference during outlier detection
zscore_columns = {}
for feature in feature_thresholds:
Expand Down Expand Up @@ -71,3 +93,40 @@ def find_outliers(

# Return outliers DataFrame with specified columns
return outliers_df[columns_to_include]


def read_thresholds_set_from_file(
    feature_thresholds: str, feature_thresholds_file: str
) -> Dict[str, float]:
    """
    Reads a named set of feature thresholds from a YAML file.

    The file is parsed with yaml.safe_load and the set is looked up by
    name beneath the file's top-level "thresholds" key.

    Args:
        feature_thresholds (str):
            Name of the threshold set to look up under the "thresholds"
            key of the file.
        feature_thresholds_file (str):
            The path to the YAML file containing feature threshold sets.

    Returns:
        dict: The value stored under the requested name; expected to map
            feature names to threshold values (not validated here).

    Raises:
        LookupError: If the file is empty, has no "thresholds" mapping,
            or does not contain the specified feature_thresholds key.
    """

    # explicit encoding so parsing does not depend on the platform locale
    with open(feature_thresholds_file, "r", encoding="utf-8") as file:
        loaded = yaml.safe_load(file)

    # An empty document loads as None, and the top-level "thresholds" entry
    # may be absent or not a mapping. Treat all of these as a failed lookup
    # so callers always receive the documented LookupError rather than a
    # TypeError or a bare KeyError.
    threshold_sets = loaded.get("thresholds") if isinstance(loaded, dict) else None

    if not isinstance(threshold_sets, dict) or feature_thresholds not in threshold_sets:
        raise LookupError(
            (
                f"Unable to find threshold set by name {feature_thresholds}"
                f" within {feature_thresholds_file}"
            )
        )

    return threshold_sets[feature_thresholds]
16 changes: 16 additions & 0 deletions src/cosmicqc/data/qc_thresholds_default.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# defines threshold sets for running qc procedures as part of this project.
versions:
cellprofiler: ">=4.2.4"
thresholds:
# Set a negative threshold to identify both outlier small nuclei
# and low formfactor representing non-circular segmentations.
small_and_low_formfactor_nuclei:
Nuclei_AreaShape_Area: -1
Nuclei_AreaShape_FormFactor: -1
# find very elongated nuclei segmentations (above mean)
elongated_nuclei:
Nuclei_AreaShape_Eccentricity: 2
# find large nuclei segmentations (above mean) with low formfactor (below mean)
large_nuclei:
Nuclei_AreaShape_Area: 2
Nuclei_AreaShape_FormFactor: -2
97 changes: 97 additions & 0 deletions tests/test_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import pandas as pd
import pytest
from cosmicqc import analyze


Expand Down Expand Up @@ -183,3 +184,99 @@ def test_find_outliers_cfret(cytotable_CFReT_data_df: pd.DataFrame):
14811: "f01",
},
}


def test_read_thresholds_set_from_file():
    """
    Checks read_thresholds_set_from_file against the default thresholds file:
    an unknown name must raise, and each default set must load as expected.
    """

    default_file = analyze.DEFAULT_QC_THRESHOLD_FILE

    # an unknown threshold set name must surface as a LookupError
    with pytest.raises(LookupError):
        analyze.read_thresholds_set_from_file(
            feature_thresholds="bad_lookup_value",
            feature_thresholds_file=default_file,
        )

    # expected contents of every default threshold set, keyed by set name
    expected_sets = {
        "small_and_low_formfactor_nuclei": {
            "Nuclei_AreaShape_Area": -1,
            "Nuclei_AreaShape_FormFactor": -1,
        },
        "elongated_nuclei": {"Nuclei_AreaShape_Eccentricity": 2},
        "large_nuclei": {
            "Nuclei_AreaShape_Area": 2,
            "Nuclei_AreaShape_FormFactor": -2,
        },
    }

    for set_name, expected in expected_sets.items():
        loaded = analyze.read_thresholds_set_from_file(
            feature_thresholds=set_name,
            feature_thresholds_file=default_file,
        )
        assert loaded == expected


def test_find_outliers_dict_and_default_config_cfret(
    cytotable_CFReT_data_df: pd.DataFrame,
):
    """
    Verifies that find_outliers yields the same data frame whether the
    thresholds are passed as a dictionary or referenced by the name of a
    set in the default yaml file, using CytoTable CFReT data.
    """

    # metadata columns to include in output data frame
    metadata_columns = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Image_Metadata_Site",
    ]

    # pairs of (explicit threshold dictionary, equivalent named yaml set)
    equivalent_inputs = [
        (
            {
                "Nuclei_AreaShape_Area": -1,
                "Nuclei_AreaShape_FormFactor": -1,
            },
            "small_and_low_formfactor_nuclei",
        ),
        (
            {
                "Nuclei_AreaShape_Eccentricity": 2,
            },
            "elongated_nuclei",
        ),
        (
            {
                "Nuclei_AreaShape_Area": 2,
                "Nuclei_AreaShape_FormFactor": -2,
            },
            "large_nuclei",
        ),
    ]

    for thresholds_dict, thresholds_name in equivalent_inputs:
        outliers_from_dict = analyze.find_outliers(
            df=cytotable_CFReT_data_df,
            feature_thresholds=thresholds_dict,
            metadata_columns=metadata_columns,
        )
        outliers_from_yaml = analyze.find_outliers(
            df=cytotable_CFReT_data_df,
            feature_thresholds=thresholds_name,
            metadata_columns=metadata_columns,
        )

        # the output must be the same from dict vs yaml
        pd.testing.assert_frame_equal(outliers_from_dict, outliers_from_yaml)

0 comments on commit b63f2db

Please sign in to comment.