Adding more to pandas interface

scienxlab · Oct 2, 2023 · c12f088 · c12f088
1 parent a088506
commit c12f088
Show file tree

Hide file tree

Showing 5 changed files with 151 additions and 33 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## 0.4.1, 3 October 2023
+
+- This is a minor release intended to preview new `pandas`-related features for version 0.5.0.
+- Added another `pandas` Series accessor, `is_imbalanced()`.
+- Added two `pandas` DataFrame accessors, `feature_importances()` and `correlation_detector()`. These are experimental features.
+
+
 ## 0.4.0, 28 September 2023
 
 - `redflag` can now be installed by the `conda` package and environment manager. To do so, use `conda install -c conda-forge redflag`.

diff --git a/docs/notebooks/Using_redflag_with_Pandas.ipynb b/docs/notebooks/Using_redflag_with_Pandas.ipynb
@@ -23,7 +23,7 @@
     {
      "data": {
       "text/plain": [
-       "'0.4.0rc1'"
+       "'0.4.1.dev4+ga088506.d20231002'"
       ]
      },
      "execution_count": 1,
@@ -303,8 +303,8 @@
     {
      "data": {
       "text/plain": [
-       "{'f1': 0.23675633692130837,\n",
-       " 'roc_auc': 0.5016874730299105,\n",
+       "{'f1': 0.24829855151480457,\n",
+       " 'roc_auc': 0.50211405767246,\n",
        " 'strategy': 'stratified',\n",
        " 'task': 'classification'}"
       ]
@@ -369,9 +369,9 @@
      "output_type": "stream",
      "text": [
       "Continuous data suitable for regression\n",
-      "Outliers:    [  34   35  140  141  142  143  175  532  581  583  633  662  757  768\n",
-      "  769  801 1316 1547 1744 1754 1756 1778 1779 1780 1784 1785 1788 1808\n",
-      " 1812 2884 2932 2973 2974 3004 3087 3094 3109]\n",
+      "Outliers:    [  34   35  136  140  141  142  143  145  147  180  181  182  532  583\n",
+      "  633  662  757  768  769  801 1316 1547 1731 1732 1744 1754 1756 1779\n",
+      " 1780 1788 2884 2932 2973 2974 3004 3079 3080 3087 3094 3109]\n",
       "Correlated:  True\n",
       "Dummy scores:{'mean': {'mean_squared_error': 47528.78263092096, 'r2': 0.0}}\n",
       "\n"
@@ -397,7 +397,58 @@
    "source": [
     "## DataFrame accessor\n",
     "\n",
-    "Coming soon!"
+    "Experimental feature: so far only `feature_importances` and `correlation_detector` are implemented."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "274cc24d-69ad-49ef-8606-cc9b77b154dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0.23209524, 0.21793961, 0.34387713, 0.20608802])"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "features = ['GR', 'RHOB', 'PE', 'ILD_log10']\n",
+    "df.redflag.feature_importances(features, target='Lithology')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "12e3e4ee-e8df-47ba-810d-3bff492d5389",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Feature 0 appears to be autocorrelated.\n",
+      "Feature 1 appears to be autocorrelated.\n",
+      "Feature 2 appears to be autocorrelated.\n",
+      "Feature 3 appears to be autocorrelated.\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.redflag.correlation_detector(features, target=None)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a3185f63-64b1-47fd-875d-2c646b84aa65",
+   "metadata": {},
+   "source": [
+    "Indeed, all of these features are correlated."
    ]
   }
  ],

diff --git a/src/redflag/pandas.py b/src/redflag/pandas.py
@@ -19,11 +19,14 @@
 limitations under the License.
 """
 import warnings
+from typing import Optional
 
 from .imbalance import imbalance_degree, minority_classes, is_imbalanced
+from .importance import feature_importances as feature_importances
 from .outliers import get_outliers
 from .target import *
 from .independence import is_correlated
+from .utils import docstring_from
 
 
 def null_decorator(arg):
@@ -62,34 +65,20 @@ class SeriesAccessor:
     def __init__(self, pandas_obj):
         self._obj = pandas_obj
 
+    @docstring_from(minority_classes)
     def minority_classes(self):
         if is_continuous(self._obj):
             warnings.warn('The Series does not seem categorical.')
         return minority_classes(self._obj)
 
+    @docstring_from(imbalance_degree)
     def imbalance_degree(self):
         if is_continuous(self._obj):
             warnings.warn('The Series does not seem categorical.')
         return imbalance_degree(self._obj)
 
+    @docstring_from(is_imbalanced)
     def is_imbalanced(self, threshold=0.4, method='tv', classes=None):
-        """
-        Check if a dataset is imbalanced by first checking that there are minority
-        classes, then inspecting the fractional part of the imbalance degree metric.
-        The metric is compared to the threshold you provide (default 0.4, same as
-        the sklearn detector ImbalanceDetector).
-
-        Args:
-            a (array): A list of class labels.
-            threshold (float): The threshold to use. Default: 0.5.
-            method (str or function): The method to use.
-            classes (array): A list of classes, in the event that `a` does not
-                contain all of the classes, or if you want to ignore some classes
-                in `a` (not recommended) you can omit them from this list.
-
-        Returns:
-            bool: True if the dataset is imbalanced.
-        """
         if is_continuous(self._obj):
             warnings.warn('The Series does not seem categorical.')
         return is_imbalanced(self._obj,
@@ -98,11 +87,13 @@ def is_imbalanced(self, threshold=0.4, method='tv', classes=None):
                              classes=classes
                              )
 
+    @docstring_from(is_ordered)
     def is_ordered(self, q=0.95):
         if is_continuous(self._obj):
             warnings.warn('The Series does not seem categorical.')
         return is_ordered(self._obj, q=q)
 
+    @docstring_from(dummy_scores)
     def dummy_scores(self, task='auto', random_state=None):
         return dummy_scores(self._obj, task=task, random_state=random_state)
 
@@ -123,7 +114,54 @@ def report(self, random_state=None):
         return template.format(**results)
 
 
-# @register_dataframe_accessor("redflag")
-# class DataFrameAccessor:
-#     def __init__(self, pandas_obj):
-#         self._obj = pandas_obj
+@register_dataframe_accessor("redflag")
+class DataFrameAccessor:
+    def __init__(self, pandas_obj):
+        self._obj = pandas_obj
+
+    @docstring_from(feature_importances)
+    def feature_importances(self, features=None, target=None,
+                            n: int=3, task: Optional[str]=None,
+                            random_state: Optional[int]=None,
+                            standardize: bool=True):
+        if target is None:
+            raise ValueError('You must provide a target column.')
+        else:
+            y_ = self._obj[target]
+            if is_continuous(y_):
+                task = 'regression'
+            else:
+                task = 'classification'
+        if len(y_.shape) > 1:
+            raise NotImplementedError('Multilabel targets are not supported.')
+        if features is None and target is not None:
+            X_ = self._obj.drop(columns=target)
+        else:
+            X_ = self._obj[features]
+        return feature_importances(X_, y_, n=n, task=task,
+                                   random_state=random_state,
+                                   standardize=standardize)
+
+
+    def correlation_detector(self, features=None, target=None, n=20, s=20, threshold=0.1):
+        """
+        This is an experimental feature.
+        """
+        if target is not None:
+            y_ = self._obj[target]
+            if len(y_.shape) > 1:
+                raise NotImplementedError('Multilabel targets are not supported.')
+            if is_correlated(y_):
+                warnings.warn('The target appears to be autocorrelated.',stacklevel=2)
+
+        if features is None and target is not None:
+            X_ = self._obj.drop(target, axis=1).values
+        else:
+            X_ = self._obj[features].values
+
+        for i, x in enumerate(X_.T):
+            if is_correlated(x, n=n, s=s, threshold=threshold):
+                warnings.warn(f'Feature {i} appears to be autocorrelated.',stacklevel=2)
+
+        # There is probably something more useful to return.
+        return
diff --git a/src/redflag/utils.py b/src/redflag/utils.py
@@ -34,14 +34,30 @@
 from scipy.spatial.distance import pdist
 
 
+def docstring_from(source_func):
+    """
+    Decorator copying the docstring one function to another.
+    """
+    def decorator(func):
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            func.__doc__ = source_func.__doc__
+            return func(*args, **kwargs)
+        return wrapper
+
+    return decorator
+
+
 def deprecated(instructions):
     """
     Flags a method as deprecated. This decorator can be used to mark functions
     as deprecated. It will result in a warning being emitted when the function
     is used.
+
     Args:
-        instructions (str): A human-friendly string of instructions, such
-        as: 'Please migrate to add_proxy() ASAP.'
+        instructions (str): A human-friendly string of instructions.
+
     Returns:
         The decorated function.
     """

diff --git a/tests/test_pandas.py b/tests/test_pandas.py
@@ -24,8 +24,7 @@ def test_dummy_scores():
 
 
 def test_imbalance():
-    is_imbalanced = c.redflag.is_imbalanced()
-    assert is_imbalanced
+    assert c.redflag.is_imbalanced(threshold=0.24, method='tv')
 
     minorities = c.redflag.minority_classes()
     assert 2 in minorities and 3 in minorities
@@ -54,4 +53,11 @@ def test_series_categorical_report():
 
 def test_series_continuous_report():
     report_r = r.redflag.report()
-    assert 'Continuous' in report_r
+    assert 'Continuous' in report_r
+
+
+def test_feature_importances_docstring():
+    s = pd.DataFrame([c, r]).redflag.feature_importances.__doc__
+
+    # There is more to this than I thought. See issue.
+    # assert s.startswith("Measure feature importances on a task, given X and y.")