Refactor the consensus.py file to keep consistent API #101

Merged: 10 commits, Sep 25, 2020
1 change: 1 addition & 0 deletions pycytominer/__init__.py
@@ -2,3 +2,4 @@
 from .annotate import annotate
 from .feature_select import feature_select
 from .normalize import normalize
+from .consensus import consensus
149 changes: 39 additions & 110 deletions pycytominer/consensus.py
@@ -2,123 +2,52 @@
 Acquire consensus signatures for input samples
 """

-import os
 import numpy as np
 import pandas as pd
-from pycytominer.cyto_utils.util import (
-    get_pairwise_correlation,
-    check_correlation_method,
-    infer_cp_features,
-)

+from pycytominer import aggregate
+from pycytominer.cyto_utils import output, modz, check_consensus_operation

-def modz_base(population_df, method="spearman", min_weight=0.01, precision=4):
-    """
-    Perform a modified z score transformation. This code is modified from cmapPy.
-    (see https://github.com/cytomining/pycytominer/issues/52). Note that this will
-    apply the transformation to the FULL population_df.
-    See modz() for replicate level procedures.
-
-    Arguments:
-    population_df - pandas DataFrame that includes metadata and observation features.
-        rows are samples and columns are features
-    method - string indicating which correlation metric to use [default: "spearman"]
-    min_weight - the minimum correlation to clip all non-negative values lower to
-    precision - how many significant digits to round weights to
-
-    Return:
-    modz transformed dataframe - a consensus signature of the input population_df
-    weighted by replicate correlation
-    """
-    assert population_df.shape[0] > 0, "population_df must include at least one sample"
-
-    method = check_correlation_method(method=method)
-
-    # Step 1: Extract pairwise correlations of samples
-    # Transpose so samples are columns
-    population_df = population_df.transpose()
-    cor_df, pair_df = get_pairwise_correlation(population_df, method=method)
-
-    # Round correlation results
-    pair_df = pair_df.round(precision)
-
-    # Step 2: Identify sample weights
-    # Fill diagonal of correlation_matrix with np.nan
-    np.fill_diagonal(cor_df.values, np.nan)
-
-    # Remove negative values
-    cor_df = cor_df.clip(lower=0)
-
-    # Get average correlation for each profile (will ignore NaN)
-    raw_weights = cor_df.mean(axis=1)
-
-    # Threshold weights (any value < min_weight will become min_weight)
-    raw_weights = raw_weights.clip(lower=min_weight)
-
-    # normalize raw_weights so that they add to 1
-    weights = raw_weights / sum(raw_weights)
-    weights = weights.round(precision)
-
-    # Step 3: Normalize
-    if population_df.shape[1] == 1:
-        # There is only one sample (note that columns are now samples)
-        modz_df = population_df
-    else:
-        modz_df = population_df * weights
-        modz_df = modz_df.sum(axis=1)
-
-    return modz_df


-def modz(
-    population_df,
-    replicate_columns,
+def consensus(
+    profiles,
+    replicate_columns=["Metadata_Plate", "Metadata_Well"],
+    operation="median",
     features="infer",
-    method="spearman",
-    min_weight=0.01,
-    precision=4,
+    output_file="none",
+    modz_method="spearman",
+    modz_min_weight=0.01,
+    modz_precision=4,
+    compression=None,
+    float_format=None,
 ):
-    """
-    Collapse replicates into a consensus signature using a weighted transformation
-
-    Arguments:
-    population_df - pandas DataFrame that includes metadata and observation features.
-        rows are samples and columns are features
-    replicate_columns - a string or list of column(s) in the population dataframe that
-        indicate replicate level information
-    features - a list of features present in the population dataframe [default: "infer"]
-        if "infer", then assume cell painting features are those that start with
-        "Cells_", "Nuclei_", or "Cytoplasm_"
-    method - string indicating which correlation metric to use [default: "spearman"]
-    min_weight - the minimum correlation to clip all non-negative values lower to
-    precision - how many significant digits to round weights to
-
-    Return:
-    Consensus signatures for all replicates in the given DataFrame
-    """
-    population_features = population_df.columns.tolist()
-    assert_error = "{} not in input dataframe".format(replicate_columns)
-    if isinstance(replicate_columns, list):
-        assert all([x in population_features for x in replicate_columns]), assert_error
-    elif isinstance(replicate_columns, str):
-        assert replicate_columns in population_features, assert_error
-        replicate_columns = replicate_columns.split()
+    # Confirm that the operation is supported
+    check_consensus_operation(operation)

+    if operation == "modz":
+        consensus_df = modz(
+            population_df=profiles,
+            replicate_columns=replicate_columns,
+            features=features,
+            method=modz_method,
+            min_weight=modz_min_weight,
+            precision=modz_precision,
+        )
     else:
-        return ValueError("replicate_columns must be a list or string")
-
-    if features == "infer":
-        features = infer_cp_features(population_df)
-
-    subset_features = list(set(replicate_columns + features))
-    population_df = population_df.loc[:, subset_features]
-
-    modz_df = population_df.groupby(replicate_columns).apply(
-        lambda x: modz_base(
-            x.loc[:, features],
-            method=method,
-            min_weight=min_weight,
-            precision=precision,
+        consensus_df = aggregate(
+            population_df=profiles,
+            strata=replicate_columns,
+            features=features,
+            operation=operation,
+            subset_data_df="none",
         )
-    )
-
-    return modz_df
+
+    if output_file != "none":
+        output(
+            df=consensus_df,
+            output_filename=output_file,
+            compression=compression,
+            float_format=float_format,
+        )
+    else:
+        return consensus_df
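
For context, here is a minimal usage sketch of the refactored API. It is illustrative only: it assumes profiles can be passed as an in-memory DataFrame, and the column names and values are toy data invented for this example.

import pandas as pd
from pycytominer import consensus

# Toy replicate-level profiles: one plate, two wells, two replicates per well.
# Feature names start with "Cells_"/"Nuclei_"/"Cytoplasm_" so features="infer" can find them.
profiles = pd.DataFrame({
    "Metadata_Plate": ["plate1"] * 4,
    "Metadata_Well": ["A01", "A01", "A02", "A02"],
    "Cells_AreaShape_Area": [1.0, 1.2, 3.0, 2.8],
    "Nuclei_Intensity_MeanIntensity_DNA": [0.5, 0.4, 0.9, 1.1],
    "Cytoplasm_Texture_Contrast": [2.0, 2.1, 0.7, 0.6],
})

# Default: collapse the replicates of each (plate, well) group by the median of each feature
median_consensus = consensus(profiles, operation="median")

# "modz" instead weights replicates by their pairwise Spearman correlation
modz_consensus = consensus(profiles, operation="modz")

When output_file is not "none", the result is written through cyto_utils.output (using the given compression and float_format) instead of being returned.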
2 changes: 2 additions & 0 deletions pycytominer/cyto_utils/__init__.py
@@ -4,6 +4,7 @@
     load_known_metadata_dictionary,
     check_correlation_method,
     check_aggregate_operation,
+    check_consensus_operation,
     get_pairwise_correlation,
 )
 from .load import (
@@ -18,3 +19,4 @@
     drop_outlier_features,
 )
 from .write_gct import write_gct
+from .modz import modz
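
With these re-exports, the pieces that moved out of consensus.py are importable directly from cyto_utils, for example (a sketch, assuming the package is installed at the state of this branch):

from pycytominer.cyto_utils import modz, check_consensus_operation

# modz() is the same correlation-weighted collapse that previously lived in
# pycytominer/consensus.py; check_consensus_operation() validates operation strings.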
123 changes: 123 additions & 0 deletions pycytominer/cyto_utils/modz.py
@@ -0,0 +1,123 @@
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.util import (
    get_pairwise_correlation,
    check_correlation_method,
    infer_cp_features,
)


def modz_base(population_df, method="spearman", min_weight=0.01, precision=4):
    """
    Perform a modified z score transformation. This code is modified from cmapPy.
    (see https://github.com/cytomining/pycytominer/issues/52). Note that this will
    apply the transformation to the FULL population_df.
    See modz() for replicate level procedures.

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features.
        rows are samples and columns are features
    method - string indicating which correlation metric to use [default: "spearman"]
    min_weight - the minimum correlation to clip all non-negative values lower to
    precision - how many significant digits to round weights to

    Return:
    modz transformed dataframe - a consensus signature of the input population_df
    weighted by replicate correlation
    """
    assert population_df.shape[0] > 0, "population_df must include at least one sample"

    method = check_correlation_method(method=method)

    # Step 1: Extract pairwise correlations of samples
    # Transpose so samples are columns
    population_df = population_df.transpose()
    cor_df, pair_df = get_pairwise_correlation(population_df, method=method)

    # Round correlation results
    pair_df = pair_df.round(precision)

    # Step 2: Identify sample weights
    # Fill diagonal of correlation_matrix with np.nan
    np.fill_diagonal(cor_df.values, np.nan)

    # Remove negative values
    cor_df = cor_df.clip(lower=0)

    # Get average correlation for each profile (will ignore NaN)
    raw_weights = cor_df.mean(axis=1)

    # Threshold weights (any value < min_weight will become min_weight)
    raw_weights = raw_weights.clip(lower=min_weight)

    # normalize raw_weights so that they add to 1
    weights = raw_weights / sum(raw_weights)
    weights = weights.round(precision)

    # Step 3: Normalize
    if population_df.shape[1] == 1:
        # There is only one sample (note that columns are now samples)
        modz_df = population_df
    else:
        modz_df = population_df * weights
        modz_df = modz_df.sum(axis=1)

    return modz_df


def modz(
    population_df,
    replicate_columns,
    features="infer",
    method="spearman",
    min_weight=0.01,
    precision=4,
):
    """
    Collapse replicates into a consensus signature using a weighted transformation

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features.
        rows are samples and columns are features
    replicate_columns - a string or list of column(s) in the population dataframe that
        indicate replicate level information
    features - a list of features present in the population dataframe [default: "infer"]
        if "infer", then assume cell painting features are those that start with
        "Cells_", "Nuclei_", or "Cytoplasm_"
    method - string indicating which correlation metric to use [default: "spearman"]
    min_weight - the minimum correlation to clip all non-negative values lower to
    precision - how many significant digits to round weights to

    Return:
    Consensus signatures for all replicates in the given DataFrame
    """
    population_features = population_df.columns.tolist()
    assert_error = "{} not in input dataframe".format(replicate_columns)
    if isinstance(replicate_columns, list):
        assert all([x in population_features for x in replicate_columns]), assert_error
    elif isinstance(replicate_columns, str):
        assert replicate_columns in population_features, assert_error
        replicate_columns = replicate_columns.split()
    else:
        return ValueError("replicate_columns must be a list or string")

    if features == "infer":
        features = infer_cp_features(population_df)

    subset_features = list(set(replicate_columns + features))
    population_df = population_df.loc[:, subset_features]

    modz_df = (
        population_df.groupby(replicate_columns)
        .apply(
            lambda x: modz_base(
                x.loc[:, features],
                method=method,
                min_weight=min_weight,
                precision=precision,
            )
        )
        .reset_index()
    )

    return modz_df
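
The weighting in modz_base can be hard to follow inside a diff, so here is a small standalone sketch of the same idea using only pandas and NumPy (toy values, not a call into pycytominer):

import numpy as np
import pandas as pd

# Three replicate profiles (rows) measured over four features (columns); toy values.
replicates = pd.DataFrame(
    [[1.0, 2.0, 3.0, 4.0], [1.1, 2.1, 2.9, 4.2], [4.0, 1.0, 3.5, 0.5]],
    index=["rep_a", "rep_b", "rep_c"],
)

# Step 1: pairwise correlations between replicates (transpose so samples are columns)
cor_df = replicates.transpose().corr(method="spearman")

# Step 2: a replicate's weight is its average correlation to the other replicates,
# with negative correlations removed and a floor of min_weight=0.01
np.fill_diagonal(cor_df.values, np.nan)
raw_weights = cor_df.clip(lower=0).mean(axis=1).clip(lower=0.01)
weights = raw_weights / raw_weights.sum()

# Step 3: the consensus signature is the weighted sum of the replicate profiles
consensus_signature = (replicates.transpose() * weights).sum(axis=1)

print(weights.round(4))
print(consensus_signature)

Replicates that correlate well with the others receive larger weights, so a poorly correlated replicate contributes little to the consensus signature.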
24 changes: 24 additions & 0 deletions pycytominer/cyto_utils/util.py
@@ -86,6 +86,30 @@ def check_aggregate_operation(operation):
     return operation


+def check_consensus_operation(operation):
+    """
+    Confirm that the input operation for consensus is currently supported
+
+    Arguments:
+    operation - string indicating the consensus operation to provide
+
+    Return:
+    Correctly formatted operation method
+    """
+    operation = operation.lower()
+    avail_ops = ["modz"]  # All aggregation operations are also supported
+    try:
+        operation = check_aggregate_operation(operation)
+    except AssertionError:
+        assert (
+            operation in avail_ops
+        ), "operation {} not supported, select one of {} or see aggregate.py".format(
+            operation, avail_ops
+        )
+
+    return operation


 def get_pairwise_correlation(population_df, method="pearson"):
     """
     Given a population dataframe, calculate all pairwise correlations
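
A quick illustration of the new validator (a sketch; it assumes check_aggregate_operation accepts the standard aggregation strings such as "median"):

from pycytominer.cyto_utils import check_consensus_operation

# Aggregation operations are lowercased and passed through check_aggregate_operation
print(check_consensus_operation("Median"))  # "median"

# "modz" is the one consensus-specific operation added on top
print(check_consensus_operation("modz"))  # "modz"

# Anything else fails the assertion
try:
    check_consensus_operation("max")
except AssertionError as error:
    print(error)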
2 changes: 1 addition & 1 deletion pycytominer/tests/test_annotate.py
@@ -9,7 +9,7 @@
 # Get temporary directory
 tmpdir = tempfile.gettempdir()

-# Lauch a sqlite connection
+# Setup a testing file
 output_file = os.path.join(tmpdir, "test.csv")

 # Build data to use in tests