Skip to content

Commit

Permalink
Adding more to pandas interface
Browse files Browse the repository at this point in the history
  • Loading branch information
kwinkunks committed Oct 2, 2023
1 parent a088506 commit c12f088
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 33 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## 0.4.1, 3 October 2023

- This is a minor release intended to preview new `pandas`-related features for version 0.5.0.
- Added another `pandas` Series accessor, `is_imbalanced()`.
- Added two `pandas` DataFrame accessors, `feature_importances()` and `correlation_detector()`. These are experimental features.


## 0.4.0, 28 September 2023

- `redflag` can now be installed by the `conda` package and environment manager. To do so, use `conda install -c conda-forge redflag`.
Expand Down
65 changes: 58 additions & 7 deletions docs/notebooks/Using_redflag_with_Pandas.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
{
"data": {
"text/plain": [
"'0.4.0rc1'"
"'0.4.1.dev4+ga088506.d20231002'"
]
},
"execution_count": 1,
Expand Down Expand Up @@ -303,8 +303,8 @@
{
"data": {
"text/plain": [
"{'f1': 0.23675633692130837,\n",
" 'roc_auc': 0.5016874730299105,\n",
"{'f1': 0.24829855151480457,\n",
" 'roc_auc': 0.50211405767246,\n",
" 'strategy': 'stratified',\n",
" 'task': 'classification'}"
]
Expand Down Expand Up @@ -369,9 +369,9 @@
"output_type": "stream",
"text": [
"Continuous data suitable for regression\n",
"Outliers: [ 34 35 140 141 142 143 175 532 581 583 633 662 757 768\n",
" 769 801 1316 1547 1744 1754 1756 1778 1779 1780 1784 1785 1788 1808\n",
" 1812 2884 2932 2973 2974 3004 3087 3094 3109]\n",
"Outliers: [ 34 35 136 140 141 142 143 145 147 180 181 182 532 583\n",
" 633 662 757 768 769 801 1316 1547 1731 1732 1744 1754 1756 1779\n",
" 1780 1788 2884 2932 2973 2974 3004 3079 3080 3087 3094 3109]\n",
"Correlated: True\n",
"Dummy scores:{'mean': {'mean_squared_error': 47528.78263092096, 'r2': 0.0}}\n",
"\n"
Expand All @@ -397,7 +397,58 @@
"source": [
"## DataFrame accessor\n",
"\n",
"Coming soon!"
"Experimental feature: so far only `feature_importances` and `correlation_detector` are implemented."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "274cc24d-69ad-49ef-8606-cc9b77b154dc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.23209524, 0.21793961, 0.34387713, 0.20608802])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features = ['GR', 'RHOB', 'PE', 'ILD_log10']\n",
"df.redflag.feature_importances(features, target='Lithology')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "12e3e4ee-e8df-47ba-810d-3bff492d5389",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Feature 0 appears to be autocorrelated.\n",
"Feature 1 appears to be autocorrelated.\n",
"Feature 2 appears to be autocorrelated.\n",
"Feature 3 appears to be autocorrelated.\n"
]
}
],
"source": [
"df.redflag.correlation_detector(features, target=None)"
]
},
{
"cell_type": "markdown",
"id": "a3185f63-64b1-47fd-875d-2c646b84aa65",
"metadata": {},
"source": [
"Indeed, all of these features appear to be autocorrelated."
]
}
],
Expand Down
80 changes: 59 additions & 21 deletions src/redflag/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@
limitations under the License.
"""
import warnings
from typing import Optional

from .imbalance import imbalance_degree, minority_classes, is_imbalanced
from .importance import feature_importances as feature_importances
from .outliers import get_outliers
from .target import *
from .independence import is_correlated
from .utils import docstring_from


def null_decorator(arg):
Expand Down Expand Up @@ -62,34 +65,20 @@ class SeriesAccessor:
def __init__(self, pandas_obj):
    # Keep a reference to the pandas object this accessor was created for;
    # every other accessor method reads it back through `self._obj`.
    self._obj = pandas_obj

@docstring_from(minority_classes)
def minority_classes(self):
    # No docstring here on purpose: @docstring_from is meant to supply the
    # one belonging to the module-level `minority_classes` function.
    series = self._obj

    # Warn, but still go ahead, when the data looks continuous.
    if is_continuous(series):
        warnings.warn('The Series does not seem categorical.')

    # Inside the method body this name resolves to the module-level function,
    # not to this method.
    return minority_classes(series)

@docstring_from(imbalance_degree)
def imbalance_degree(self):
    # No docstring here on purpose: @docstring_from is meant to supply the
    # one belonging to the module-level `imbalance_degree` function.
    data = self._obj

    looks_continuous = is_continuous(data)
    if looks_continuous:
        # Advisory only; the metric is still computed and returned.
        warnings.warn('The Series does not seem categorical.')

    result = imbalance_degree(data)
    return result

@docstring_from(is_imbalanced)
def is_imbalanced(self, threshold=0.4, method='tv', classes=None):
"""
Check if a dataset is imbalanced by first checking that there are minority
classes, then inspecting the fractional part of the imbalance degree metric.
The metric is compared to the threshold you provide (default 0.4, same as
the sklearn detector ImbalanceDetector).
Args:
a (array): A list of class labels.
threshold (float): The threshold to use. Default: 0.5.
method (str or function): The method to use.
classes (array): A list of classes, in the event that `a` does not
contain all of the classes, or if you want to ignore some classes
in `a` (not recommended) you can omit them from this list.
Returns:
bool: True if the dataset is imbalanced.
"""
if is_continuous(self._obj):
warnings.warn('The Series does not seem categorical.')
return is_imbalanced(self._obj,
Expand All @@ -98,11 +87,13 @@ def is_imbalanced(self, threshold=0.4, method='tv', classes=None):
classes=classes
)

@docstring_from(is_ordered)
def is_ordered(self, q=0.95):
    # No docstring here on purpose: @docstring_from is meant to supply the
    # one belonging to the module-level `is_ordered` function.
    series = self._obj

    # Warn, but still run the check, when the data looks continuous.
    if is_continuous(series):
        warnings.warn('The Series does not seem categorical.')

    # `q` is forwarded unchanged to the module-level function.
    return is_ordered(series, q=q)

@docstring_from(dummy_scores)
def dummy_scores(self, task='auto', random_state=None):
    # Thin pass-through to the module-level `dummy_scores`; both keyword
    # arguments are forwarded unchanged. The docstring is meant to come from
    # @docstring_from, so none is written here.
    scores = dummy_scores(
        self._obj,
        task=task,
        random_state=random_state,
    )
    return scores

Expand All @@ -123,7 +114,54 @@ def report(self, random_state=None):
return template.format(**results)


# @register_dataframe_accessor("redflag")
# class DataFrameAccessor:
# def __init__(self, pandas_obj):
# self._obj = pandas_obj
@register_dataframe_accessor("redflag")
class DataFrameAccessor:
    """
    Experimental `redflag` accessor for pandas DataFrames.

    Registered under the name "redflag", so the methods below are reached as
    `df.redflag.<method>(...)`.
    """

    def __init__(self, pandas_obj):
        # The DataFrame this accessor instance is attached to.
        self._obj = pandas_obj

    @docstring_from(feature_importances)
    def feature_importances(self, features=None, target=None,
                            n: int=3, task: Optional[str]=None,
                            random_state: Optional[int]=None,
                            standardize: bool=True):
        # No docstring here on purpose: @docstring_from is meant to supply the
        # one belonging to the module-level `feature_importances` function.
        #
        # features: column label(s) to use as X; if None, every column except
        #     `target` is used.
        # target: column label of y; required.
        # task: 'classification' or 'regression'; inferred from the target
        #     with `is_continuous` only when left as None.
        # n, random_state, standardize: forwarded unchanged.
        if target is None:
            raise ValueError('You must provide a target column.')

        y_ = self._obj[target]

        # Reject 2-D targets before doing anything else with them.
        if len(y_.shape) > 1:
            raise NotImplementedError('Multilabel targets are not supported.')

        # Respect an explicit task from the caller; only guess when it is
        # not given.
        if task is None:
            task = 'regression' if is_continuous(y_) else 'classification'

        if features is None:
            X_ = self._obj.drop(columns=target)
        else:
            X_ = self._obj[features]

        return feature_importances(X_, y_, n=n, task=task,
                                   random_state=random_state,
                                   standardize=standardize)

    def correlation_detector(self, features=None, target=None, n=20, s=20, threshold=0.1):
        """
        This is an experimental feature.

        Emit a warning for the target, and for each feature column, that
        `is_correlated` flags as autocorrelated.

        Args:
            features: Column label(s) to check. If None, all columns are
                checked (minus `target`, when one is given).
            target: Optional column label of the target. If given, it is
                checked as well and must select a 1-D column.
            n: Passed through to `is_correlated`.
            s: Passed through to `is_correlated`.
            threshold: Passed through to `is_correlated`.

        Returns:
            None. Results are reported only through warnings.

        Raises:
            NotImplementedError: If `target` selects more than one column.
        """
        if target is not None:
            y_ = self._obj[target]
            if len(y_.shape) > 1:
                raise NotImplementedError('Multilabel targets are not supported.')
            # Use the same settings for the target as for the features.
            if is_correlated(y_, n=n, s=s, threshold=threshold):
                warnings.warn('The target appears to be autocorrelated.', stacklevel=2)

        if features is not None:
            X_ = self._obj[features].values
        elif target is not None:
            X_ = self._obj.drop(target, axis=1).values
        else:
            # Neither given: check every column instead of indexing with None.
            X_ = self._obj.values

        # Transpose so the loop visits one column at a time; `i` is the
        # column's position within the selection, not its label.
        for i, x in enumerate(X_.T):
            if is_correlated(x, n=n, s=s, threshold=threshold):
                warnings.warn(f'Feature {i} appears to be autocorrelated.', stacklevel=2)

        # There is probably something more useful to return.
        return None
20 changes: 18 additions & 2 deletions src/redflag/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,30 @@
from scipy.spatial.distance import pdist


def docstring_from(source_func):
    """
    Decorator factory that copies the docstring of one function onto another.

    The copy happens once, when the decorator is applied, so the docstring is
    visible to `help()` and documentation tools without the decorated function
    ever being called.

    Args:
        source_func (callable): The object whose `__doc__` should be reused.

    Returns:
        callable: A decorator that sets `__doc__` on the function it receives
            and returns that same function; its name, signature and behaviour
            are untouched.
    """
    def decorator(func):
        # Assign at decoration time. Doing this inside a call-time wrapper
        # built with functools.wraps does not work: wraps() has already
        # copied the (empty) docstring onto the wrapper by then.
        func.__doc__ = source_func.__doc__
        return func

    return decorator


def deprecated(instructions):
"""
Flags a method as deprecated. This decorator can be used to mark functions
as deprecated. It will result in a warning being emitted when the function
is used.
Args:
instructions (str): A human-friendly string of instructions, such
as: 'Please migrate to add_proxy() ASAP.'
instructions (str): A human-friendly string of instructions.
Returns:
The decorated function.
"""
Expand Down
12 changes: 9 additions & 3 deletions tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ def test_dummy_scores():


def test_imbalance():
is_imbalanced = c.redflag.is_imbalanced()
assert is_imbalanced
assert c.redflag.is_imbalanced(threshold=0.24, method='tv')

minorities = c.redflag.minority_classes()
assert 2 in minorities and 3 in minorities
Expand Down Expand Up @@ -54,4 +53,11 @@ def test_series_categorical_report():

def test_series_continuous_report():
    """The report for the module-level Series `r` should mention 'Continuous'."""
    report_text = r.redflag.report()
    assert 'Continuous' in report_text


def test_feature_importances_docstring():
    """Smoke test: the DataFrame accessor's `feature_importances.__doc__` is reachable."""
    frame = pd.DataFrame([c, r])
    doc = frame.redflag.feature_importances.__doc__

    # There is more to this than I thought. See issue.
    # assert doc.startswith("Measure feature importances on a task, given X and y.")

0 comments on commit c12f088

Please sign in to comment.