
Commit

add documentation and typing
d33bs committed May 22, 2024
1 parent 677578a commit 6797700
Showing 2 changed files with 32 additions and 14 deletions.
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -47,12 +47,16 @@ select = [
# flake8-comprehensions
"C4",
# flake8-simplify
"SIM"
"SIM",
# flake8-annotations
"ANN"
]

[tool.ruff.lint.per-file-ignores]
# Ignore `E402` (module import not at top of file) and `F401` (unused imports) in all `__init__.py` files
"__init__.py" = ["E402", "F401"]
# ignore typing rules for tests
"tests/*" = ["ANN201"]

# set dynamic versioning capabilities for project
[tool.poetry-dynamic-versioning]
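
As a quick illustration of what the newly selected flake8-annotations rules check, here is a minimal, hypothetical sketch (the function names are made up): with "ANN" in `select`, Ruff flags un-annotated signatures, while the per-file ignore added above keeps ANN201 from firing in `tests/*`.

```python
# Hypothetical module illustrating the flake8-annotations ("ANN") rules that
# this change enables in Ruff; the function names are made up for illustration.
import pandas as pd


def count_rows(df):
    # Un-annotated signature: Ruff would report ANN001 (missing type annotation
    # for the `df` argument) and ANN201 (missing return type annotation on a
    # public function).
    return len(df)


def count_rows_annotated(df: pd.DataFrame) -> int:
    # Fully annotated signature: no ANN findings.
    return len(df)
```
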
40 changes: 27 additions & 13 deletions src/cosmicqc/analyze.py
@@ -21,7 +21,7 @@ def identify_outliers(
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
) -> pd.Series:
) -> Union[pd.Series, pd.DataFrame]:
"""
This function uses z-scoring to format the data for detecting outlier
nuclei or cells using specific CellProfiler features. Thresholds are
@@ -47,12 +47,14 @@ def identify_outliers(
defined within a file.
Returns:
pd.Series, df:
Union[pd.Series, pd.DataFrame]:
Outlier series with booleans based on whether outliers were detected
or not for use within other functions.
"""

outlier_df = df
# create a copy of the dataframe to ensure
# we don't modify the supplied dataframe in place.
outlier_df = df.copy()

thresholds_name = (
f"outlier_{feature_thresholds}"
@@ -87,14 +89,15 @@ def identify_outliers(
condition = outlier_df[zscore_columns[feature]] < threshold
conditions.append(condition)

# create a boolean pd.series identifier for dataframe
# based on all conditions for use within other functions.

return (
# create a boolean pd.series identifier for dataframe
# based on all conditions for use within other functions.
reduce(operator.and_, conditions)
if not include_threshold_scores
# otherwise, provide the threshold zscore col and the above column
else pd.concat(
[
# grab only the outlier zscore columns from the outlier_df
outlier_df[zscore_columns.values()],
pd.DataFrame({thresholds_name: reduce(operator.and_, conditions)}),
],
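
With the widened return annotation above, identify_outliers yields either a boolean pd.Series (all threshold conditions combined via reduce(operator.and_, ...)) or, when include_threshold_scores=True, a pd.DataFrame holding the z-scored feature columns alongside that combined boolean column. A minimal usage sketch, assuming the cosmicqc.analyze import path and made-up feature names, thresholds, and data:

```python
# Hypothetical usage sketch for identify_outliers after this change; the
# feature names, threshold values, and example data are assumptions.
import pandas as pd
from cosmicqc.analyze import identify_outliers

scdf = pd.DataFrame(
    {
        "Nuclei_AreaShape_Area": [100, 103, 99, 401],
        "Nuclei_AreaShape_FormFactor": [0.90, 0.88, 0.91, 0.20],
    }
)

# default: a boolean pd.Series combining every threshold condition
outliers = identify_outliers(
    df=scdf,
    feature_thresholds={"Nuclei_AreaShape_Area": 1.0},
)

# include_threshold_scores=True: a pd.DataFrame with the z-scored feature
# columns plus the combined boolean threshold column
outlier_scores = identify_outliers(
    df=scdf,
    feature_thresholds={"Nuclei_AreaShape_Area": 1.0},
    include_threshold_scores=True,
)
```
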
Expand Down Expand Up @@ -169,7 +172,8 @@ def label_outliers(
df: pd.DataFrame,
feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
) -> pd.Series:
include_threshold_scores: bool = True,
) -> pd.DataFrame:
"""
This function uses z-scoring to format the data for detecting outlier
nuclei or cells using specific CellProfiler features. Thresholds are
@@ -191,50 +195,58 @@ def label_outliers(
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.
include_threshold_scores: bool = True
Whether to include the scores in addition to whether an outlier
was detected or not.
Returns:
pd.Series:
Outlier series with booleans based on whether outliers were detected
or not for use within other functions.
pd.DataFrame:
Full dataframe with optional scores and outlier boolean column.
"""

# for single outlier processing
if isinstance(feature_thresholds, (str, dict)):
# return the outlier dataframe for one threshold rule
return pd.concat(
[
df,
identify_outliers(
df=df,
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
include_threshold_scores=True,
include_threshold_scores=include_threshold_scores,
),
],
axis=1,
)

# for multiple outlier processing
elif feature_thresholds is None:
# return the outlier dataframe for all threshold rules
labeled_df = pd.concat(
[df]
+ [
# identify outliers for each threshold rule
identify_outliers(
df=df,
feature_thresholds=thresholds,
feature_thresholds_file=feature_thresholds_file,
include_threshold_scores=True,
include_threshold_scores=include_threshold_scores,
)
# loop through each threshold rule
for thresholds in read_thresholds_set_from_file(
feature_thresholds_file=feature_thresholds_file,
)
],
axis=1,
)
# return a dataframe with columns deduplicated by name
return labeled_df.loc[:, ~labeled_df.columns.duplicated()]
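
label_outliers now always returns a pd.DataFrame: the original columns concatenated with the output of identify_outliers, with the new include_threshold_scores flag simply passed through. A sketch of both call styles, again under an assumed import path, data, and thresholds:

```python
# Hypothetical usage sketch for label_outliers after this change; data,
# feature names, and thresholds are assumptions for illustration.
import pandas as pd
from cosmicqc.analyze import label_outliers

scdf = pd.DataFrame({"Nuclei_AreaShape_Area": [100, 103, 99, 401]})

# single threshold rule: original columns plus the outlier column(s)
labeled = label_outliers(
    df=scdf,
    feature_thresholds={"Nuclei_AreaShape_Area": 1.0},
)

# feature_thresholds=None (the default): every threshold set found in the
# thresholds file is applied and duplicate columns are dropped by name
labeled_all = label_outliers(df=scdf, include_threshold_scores=False)
```
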


def read_thresholds_set_from_file(
feature_thresholds_file: str, feature_thresholds: Optional[str] = None
):
) -> Union[Dict[str, int], Dict[str, Dict[str, int]]]:
"""
Reads a set of feature thresholds from a specified file.
@@ -256,9 +268,11 @@ def read_thresholds_set_from_file
LookupError: If the file does not contain the specified feature_thresholds key.
"""

# open the yaml file
with open(feature_thresholds_file, "r") as file:
thresholds = yaml.safe_load(file)

# if no feature thresholds name is specified, return all thresholds
if feature_thresholds is None:
return thresholds["thresholds"]
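
The new return annotation reflects the two shapes read_thresholds_set_from_file can produce: the full mapping of named threshold sets, or a single set when feature_thresholds names one. A hypothetical sketch of the YAML layout this implies, parsed with the same yaml.safe_load call (set and feature names are made up):

```python
# Hypothetical sketch of the thresholds YAML layout implied by
# read_thresholds_set_from_file; set and feature names are made up.
import yaml

example_yaml = """
thresholds:
  small_and_low_formfactor_nuclei:
    Nuclei_AreaShape_Area: -1
    Nuclei_AreaShape_FormFactor: -1
"""

data = yaml.safe_load(example_yaml)

# feature_thresholds is None: the whole Dict[str, Dict[str, int]] comes back
all_sets = data["thresholds"]

# feature_thresholds names a set: a single Dict[str, int] comes back
one_set = data["thresholds"]["small_and_low_formfactor_nuclei"]
```
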

