From 9143599305a701f027424bf93c0487b4631ca13d Mon Sep 17 00:00:00 2001 From: Shadi Akiki Date: Wed, 26 Sep 2018 13:20:20 +0300 Subject: [PATCH] ENH: correlation function accepts method being a callable (#22684) --- doc/source/computation.rst | 15 +++++++++++++ doc/source/whatsnew/v0.24.0.txt | 2 ++ pandas/core/frame.py | 20 +++++++++++++++-- pandas/core/nanops.py | 2 ++ pandas/core/series.py | 18 +++++++++++++-- pandas/tests/series/test_analytics.py | 32 +++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 4 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 5e7b8be5f8af04..0d2021de8f88e0 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -153,6 +153,21 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: frame.corr(min_periods=12) +.. versionadded:: 0.24.0 + +The ``method`` argument can also be a callable for a generic correlation +calculation. In this case, it should be a single function +that produces a single value from two ndarray inputs. Suppose we wanted to +compute the correlation based on histogram intersection: + +.. ipython:: python + + # histogram intersection + histogram_intersection = lambda a, b: np.minimum( + np.true_divide(a, a.sum()), np.true_divide(b, b.sum()) + ).sum() + frame.corr(method=histogram_intersection) + A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to compute the correlation between like-labeled Series contained in different DataFrame objects. diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 3b61fde77cb9f0..7eb3ea303cc9d1 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -20,6 +20,8 @@ New features - :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) + .. _whatsnew_0240.enhancements.extension_array_operators: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 721c31c57bc06c..e16f61d7f5f023 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6672,10 +6672,14 @@ def corr(self, method='pearson', min_periods=1): Parameters ---------- - method : {'pearson', 'kendall', 'spearman'} + method : {'pearson', 'kendall', 'spearman'} or callable * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float + .. versionadded:: 0.24.0 + min_periods : int, optional Minimum number of observations required per pair of columns to have a valid result. Currently only available for pearson @@ -6684,6 +6688,18 @@ def corr(self, method='pearson', min_periods=1): Returns ------- y : DataFrame + + Examples + -------- + >>> import numpy as np + >>> histogram_intersection = lambda a, b: np.minimum(a, b + ... ).sum().round(decimals=1) + >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], + ... columns=['dogs', 'cats']) + >>> df.corr(method=histogram_intersection) + dogs cats + dogs 1.0 0.3 + cats 0.3 1.0 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns @@ -6695,7 +6711,7 @@ def corr(self, method='pearson', min_periods=1): elif method == 'spearman': correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods) - elif method == 'kendall': + elif method == 'kendall' or callable(method): if min_periods is None: min_periods = 1 mat = ensure_float64(mat).T diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f44fb4f6e9e144..7619d47cbc8f9b 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -766,6 +766,8 @@ def nancorr(a, b, method='pearson', min_periods=None): def get_corr_func(method): if method in ['kendall', 'spearman']: from scipy.stats import kendalltau, spearmanr + elif callable(method): + return method def _pearson(a, b): return np.corrcoef(a, b)[0, 1] diff --git a/pandas/core/series.py b/pandas/core/series.py index fdb9ef59c1d3e3..544a08981c83b8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1910,10 +1910,14 @@ def corr(self, other, method='pearson', min_periods=None): Parameters ---------- other : Series - method : {'pearson', 'kendall', 'spearman'} + method : {'pearson', 'kendall', 'spearman'} or callable * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarray + and returning a float + .. versionadded:: 0.24.0 + min_periods : int, optional Minimum number of observations needed to have a valid result @@ -1921,12 +1925,22 @@ def corr(self, other, method='pearson', min_periods=None): Returns ------- correlation : float + + Examples + -------- + >>> import numpy as np + >>> histogram_intersection = lambda a, b: np.minimum(a, b + ... ).sum().round(decimals=1) + >>> s1 = pd.Series([.2, .0, .6, .2]) + >>> s2 = pd.Series([.3, .6, .0, .1]) + >>> s1.corr(s2, method=histogram_intersection) + 0.3 """ this, other = self.align(other, join='inner', copy=False) if len(this) == 0: return np.nan - if method in ['pearson', 'spearman', 'kendall']: + if method in ['pearson', 'spearman', 'kendall'] or callable(method): return nanops.nancorr(this.values, other.values, method=method, min_periods=min_periods) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 9acd6501c38251..58a160d17cbe8d 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -789,6 +789,38 @@ def test_corr_invalid_method(self): with tm.assert_raises_regex(ValueError, msg): s1.corr(s2, method="____") + def test_corr_callable_method(self): + # simple correlation example + # returns 1 if exact equality, 0 otherwise + my_corr = lambda a, b: 1. if (a == b).all() else 0. + + # simple example + s1 = Series([1, 2, 3, 4, 5]) + s2 = Series([5, 4, 3, 2, 1]) + expected = 0 + tm.assert_almost_equal( + s1.corr(s2, method=my_corr), + expected) + + # full overlap + tm.assert_almost_equal( + self.ts.corr(self.ts, method=my_corr), 1.) + + # partial overlap + tm.assert_almost_equal( + self.ts[:15].corr(self.ts[5:], method=my_corr), 1.) + + # No overlap + assert np.isnan( + self.ts[::2].corr(self.ts[1::2], method=my_corr)) + + # dataframe example + df = pd.DataFrame([s1, s2]) + expected = pd.DataFrame([ + {0: 1., 1: 0}, {0: 0, 1: 1.}]) + tm.assert_almost_equal( + df.transpose().corr(method=my_corr), expected) + def test_cov(self): # full overlap tm.assert_almost_equal(self.ts.cov(self.ts), self.ts.std() ** 2)