-
Notifications
You must be signed in to change notification settings - Fork 35
/
transform.py
192 lines (153 loc) · 6.05 KB
/
transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""Transform observation variables by specified groups.
References
----------
.. [1] Kessy et al. 2016 "Optimal Whitening and Decorrelation" arXiv: https://arxiv.org/abs/1512.00809
"""
import os
import numpy as np
import pandas as pd
from scipy.linalg import eigh
from scipy.stats import median_abs_deviation
from sklearn.base import BaseEstimator, TransformerMixin
class Spherize(BaseEstimator, TransformerMixin):
"""Class to apply a sphering transform (aka whitening) data in the base sklearn
transform API. Note, this implementation is modified/inspired from the following
sources:
1) A custom function written by Juan C. Caicedo
2) A custom ZCA function at https://github.com/mwv/zca
3) Notes from Niranj Chandrasekaran (https://github.com/cytomining/pycytominer/issues/90)
4) The R package "whitening" written by Strimmer et al (http://strimmerlab.org/software/whitening/)
5) Kessy et al. 2016 "Optimal Whitening and Decorrelation" [1]_
Attributes
----------
epsilon : float
fudge factor parameter
center : bool
option to center the input X matrix
method : str
a string indicating which class of sphering to perform
"""
def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
"""
Parameters
----------
epsilon : float, default 1e-6
fudge factor parameter
center : bool, default True
option to center the input X matrix
method : str, default "ZCA"
a string indicating which class of sphering to perform
"""
avail_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
self.epsilon = epsilon
self.center = center
assert (
method in avail_methods
), f"Error {method} not supported. Select one of {avail_methods}"
self.method = method
def fit(self, X, y=None):
"""Identify the sphering transform given self.X
Parameters
----------
X : pandas.core.frame.DataFrame
dataframe to fit sphering transform
Returns
-------
self
With computed weights attribute
"""
# Get the mean of the features (columns) and center if specified
self.mu = X.mean()
if self.center:
X = X - self.mu
# Get the covariance matrix
C = (1 / X.shape[0]) * np.dot(X.transpose(), X)
if self.method in ["PCA", "ZCA"]:
# Get the eigenvalues and eigenvectors of the covariance matrix
s, U = eigh(C)
# Fix sign ambiguity of eigenvectors
U = pd.DataFrame(U * np.sign(np.diag(U)))
# Process the eigenvalues into a diagonal matrix and fix rounding errors
D = np.diag(1.0 / np.sqrt(s.clip(self.epsilon)))
# Calculate the sphering matrix
self.W = np.dot(D, U.transpose())
# If ZCA, perform additional rotation
if self.method == "ZCA":
self.W = np.dot(U, self.W)
if self.method in ["PCA-cor", "ZCA-cor"]:
# Get the correlation matrix
R = np.corrcoef(X.transpose())
# Get the eigenvalues and eigenvectors of the correlation matrix
try:
t, G = eigh(R)
except ValueError:
raise ValueError(
"Divide by zero error, make sure low variance columns are removed"
)
# Fix sign ambiguity of eigenvectors
G = pd.DataFrame(G * np.sign(np.diag(G)))
# Process the eigenvalues into a diagonal matrix and fix rounding errors
D = np.diag(1.0 / np.sqrt(t.clip(self.epsilon)))
# process the covariance diagonal matrix and fix rounding errors
v = np.diag(1.0 / np.sqrt(np.diag(C).clip(self.epsilon)))
# Calculate the sphering matrix
self.W = np.dot(np.dot(D, G.transpose()), v)
# If ZCA-cor, perform additional rotation
if self.method == "ZCA-cor":
self.W = np.dot(G, self.W)
return self
def transform(self, X, y=None):
"""Perform the sphering transform
Parameters
----------
X : pd.core.frame.DataFrame
Profile dataframe to be transformed using the precompiled weights
y : None
Has no effect; only used for consistency in sklearn transform API
Returns
-------
pandas.core.frame.DataFrame
Spherized dataframe
"""
return np.dot(X - self.mu, self.W.transpose())
class RobustMAD(BaseEstimator, TransformerMixin):
"""Class to perform a "Robust" normalization with respect to median and mad
scaled = (x - median) / mad
Attributes
----------
epsilon : float
fudge factor parameter
"""
def __init__(self, epsilon=1e-18):
self.epsilon = epsilon
def fit(self, X, y=None):
"""Compute the median and mad to be used for later scaling.
Parameters
----------
X : pandas.core.frame.DataFrame
dataframe to fit RobustMAD transform
Returns
-------
self
With computed median and mad attributes
"""
# Get the mean of the features (columns) and center if specified
self.median = X.median()
# The scale param is required to preserve previous behavior. More info at:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.median_absolute_deviation.html#scipy.stats.median_absolute_deviation
self.mad = pd.Series(
median_abs_deviation(X, nan_policy="omit", scale=1/1.4826), index=self.median.index
)
return self
def transform(self, X, copy=None):
"""Apply the RobustMAD calculation
Parameters
----------
X : pandas.core.frame.DataFrame
dataframe to fit RobustMAD transform
Returns
-------
pandas.core.frame.DataFrame
RobustMAD transformed dataframe
"""
return (X - self.median) / (self.mad + self.epsilon)