Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Robust Normalization by MAD #72

Merged
merged 3 commits into from
Mar 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions pycytominer/cyto_utils/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np
import pandas as pd
from scipy.linalg import eigh
from scipy.stats import median_absolute_deviation
from sklearn.base import BaseEstimator, TransformerMixin


Expand Down Expand Up @@ -52,3 +53,35 @@ def transform(self, X, y=None):
Whiten an input matrix a given population dataframe
"""
return np.dot(X - self.mu, self.W)


class RobustMAD(BaseEstimator, TransformerMixin):
    """
    Class to perform a "Robust" normalization with respect to median and mad

        scaled = (x - median) / (mad + epsilon)

    The mad is the per-feature median absolute deviation multiplied by 1.4826,
    which is the default scale factor of the (since removed)
    scipy.stats.median_absolute_deviation function this class originally used.
    Both statistics are computed with pandas, so missing values are skipped
    for the median and the mad alike.

    Argument:
    epsilon - constant added to the mad before dividing; a small positive value
              avoids inf/nan output for features whose mad is zero
              [default: 0.0, i.e. plain division by the mad]
    """

    # Default scale factor of scipy.stats.median_absolute_deviation
    _MAD_SCALE = 1.4826

    def __init__(self, epsilon=0.0):
        self.epsilon = epsilon

    def fit(self, X, y=None):
        """
        Compute the median and mad to be used for later scaling.

        Argument:
        X - pandas dataframe to fit RobustMAD transform
        y - ignored; accepted for compatibility with the sklearn fit signature

        Return:
        The fitted RobustMAD instance (self)
        """
        # Get the median of the features (columns)
        self.median = X.median()
        # Median of the absolute deviations from the median, per feature
        self.mad = (X - self.median).abs().median() * self._MAD_SCALE
        return self

    def transform(self, X, copy=None):
        """
        Apply the RobustMAD calculation

        Argument:
        X - pandas dataframe to apply RobustMAD transform
        copy - ignored; kept so existing callers passing it do not break

        Return:
        pandas dataframe of X centered by the fitted median and divided by
        the fitted mad (plus epsilon)
        """
        return (X - self.median) / (self.mad + self.epsilon)
6 changes: 4 additions & 2 deletions pycytominer/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.preprocessing import StandardScaler, RobustScaler

from pycytominer.cyto_utils import output, infer_cp_features
from pycytominer.cyto_utils.transform import Whiten
from pycytominer.cyto_utils.transform import Whiten, RobustMAD


def normalize(
Expand Down Expand Up @@ -59,13 +59,15 @@ def normalize(
# Define which scaler to use
method = method.lower()

avail_methods = ["standardize", "robustize", "whiten"]
avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
assert method in avail_methods, "operation must be one {}".format(avail_methods)

if method == "standardize":
scaler = StandardScaler()
elif method == "robustize":
scaler = RobustScaler()
elif method == "mad_robustize":
scaler = RobustMAD()
elif method == "whiten":
scaler = Whiten(center=whiten_center)

Expand Down
24 changes: 23 additions & 1 deletion pycytominer/tests/test_cyto_utils/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import random
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.transform import Whiten
from scipy.stats import median_absolute_deviation
from pycytominer.cyto_utils.transform import Whiten, RobustMAD

random.seed(123)

Expand Down Expand Up @@ -45,3 +46,24 @@ def test_whiten_no_center():
expected_result = data_df.shape[1]

assert int(result) == expected_result


def test_robust_mad():
    """
    Testing the RobustMAD class

    Fits RobustMAD on data_df, transforms the same data and checks that every
    transformed feature has a median of zero and a mad of one.
    """
    scaler = RobustMAD()
    scaler = scaler.fit(data_df)
    transform_df = scaler.transform(data_df)

    # The transformed data is expected to have a median equal to zero.
    # Compare per feature with a tolerance: truncating a summed result with
    # int() would accept any total in (-1, 1) and lets errors cancel out.
    result = transform_df.median()
    np.testing.assert_allclose(result, 0, atol=1e-8)

    # Check a median absolute deviation equal to one for every column.
    # A tolerance also avoids a spurious failure when floating point error
    # leaves a value marginally below one.
    result = median_absolute_deviation(transform_df)
    np.testing.assert_allclose(result, 1, rtol=1e-8)
76 changes: 76 additions & 0 deletions pycytominer/tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,82 @@ def test_normalize_robustize_ctrlsamples():
pd.testing.assert_frame_equal(normalize_result, expected_result)


def test_normalize_robustize_mad_allsamples():
    """
    Testing normalize pycytominer function
    method = "mad_robustize"
    meta_features = "infer"
    samples="all"

    The normalized profiles are rounded to one decimal and compared against
    hard-coded expected values.
    """
    feature_names = ["x", "y", "z", "zz"]

    observed = normalize(
        profiles=data_df.copy(),
        features=feature_names,
        meta_features="infer",
        samples="all",
        method="mad_robustize",
    )
    observed = observed.round(1)

    # Metadata columns are expected to pass through unchanged
    expected_columns = {
        "Metadata_plate": ["a"] * 4 + ["b"] * 4,
        "Metadata_treatment": ["drug", "drug", "control", "control"] * 2,
        "x": [-1.1, -0.7, 2, -0.7, 0.7, 0.7, 0.7, -1.1],
        "y": [-0.5, -1.2, 0.8, -0.2, 0.2, 1.5, 0.5, -1.2],
        "z": [-0.8, 1.5, -0.5, 0.5, 0.8, 6.2, -0.5, -0.5],
        "zz": [0.3, 2.9, -0.7, -0.3, 1.6, 7.1, -0.6, -0.6],
    }
    expected = pd.DataFrame(expected_columns).reset_index(drop=True)

    pd.testing.assert_frame_equal(observed, expected)


def test_normalize_robustize_mad_ctrlsamples():
    """
    Testing normalize pycytominer function
    method = "mad_robustize"
    meta_features = "infer"
    samples="Metadata_treatment == 'control'"

    The normalized profiles are rounded to one decimal and compared against
    hard-coded expected values.
    """
    feature_names = ["x", "y", "z", "zz"]
    control_query = "Metadata_treatment == 'control'"

    observed = normalize(
        profiles=data_df.copy(),
        features=feature_names,
        meta_features="infer",
        samples=control_query,
        method="mad_robustize",
    )
    observed = observed.round(1)

    # NOTE(review): the non-finite "z" values presumably come from a zero mad
    # among the control samples - confirm against data_df
    expected_columns = {
        "Metadata_plate": ["a"] * 4 + ["b"] * 4,
        "Metadata_treatment": ["drug", "drug", "control", "control"] * 2,
        "x": [-0.8, -0.5, 1.5, -0.5, 0.5, 0.5, 0.5, -0.8],
        "y": [-0.9, -1.8, 0.9, -0.4, 0.0, 1.8, 0.4, -1.8],
        "z": [-np.inf, np.inf, np.nan, np.inf, np.inf, np.inf, np.nan, np.nan],
        "zz": [16.2, 59.4, -1.3, 5.4, 37.8, 132.2, 0.0, 0.0],
    }
    expected = pd.DataFrame(expected_columns).reset_index(drop=True)

    pd.testing.assert_frame_equal(observed, expected)


def test_normalize_standardize_allsamples_fromfile():
"""
Testing normalize pycytominer function
Expand Down