Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SLEP006 on Sample Properties #16

Merged
merged 16 commits into from
Jun 29, 2020
Merged
5 changes: 5 additions & 0 deletions conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
'sphinx.ext.intersphinx',
'sphinx.ext.mathjax',
'sphinx.ext.viewcode',
'sphinx_issues',
]

# Add any paths that contain templates here, relative to this directory.
Expand Down Expand Up @@ -165,3 +166,7 @@
# -- Options for intersphinx extension ---------------------------------------

intersphinx_mapping = {'sklearn': ('http://scikit-learn.org/stable', None)}

# -- Sphinx-Issues configuration --

issues_github_path = "scikit-learn/scikit-learn"
1 change: 1 addition & 0 deletions index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
slep002/proposal
slep003/proposal
slep004/proposal
slep006/proposal

.. toctree::
:maxdepth: 1
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
sphinx
sphinx-rtd-theme
sphinx-issues
68 changes: 68 additions & 0 deletions slep006/cases_opt1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from defs import (accuracy, group_cv, make_scorer, SelectKBest,
LogisticRegressionCV, cross_validate, make_pipeline, X, y,
my_groups, my_weights, my_other_weights)

# %%
# Case A: weighted scoring and fitting

lr = LogisticRegressionCV(cv=group_cv, scoring='accuracy')
# Plain dict of props: cross_validate passes everything through to each
# consumer, which picks out the keys it understands.
fit_props = {'sample_weight': my_weights, 'groups': my_groups}
cross_validate(lr, X, y, cv=group_cv, props=fit_props, scoring='accuracy')

# Error handling: if props={'sample_eight': my_weights, ...} was passed
# instead, the estimator would fit and score without weight, silently failing.

# %%
# Case B: weighted scoring and unweighted fitting


class MyLogisticRegressionCV(LogisticRegressionCV):
    """LogisticRegressionCV variant whose fit ignores ``sample_weight``.

    Used for Case B: scoring stays weighted (handled by the surrounding
    cross-validation), while the fit itself is unweighted.
    """

    def fit(self, X, y, props=None):
        """Fit without sample weights, forwarding any other props.

        Same signature as the parent ``fit``; the ``sample_weight`` entry
        of ``props`` (if present) is dropped before delegating, so e.g.
        ``groups`` is still routed.
        """
        # Bug fix: the original called props.copy() unconditionally, which
        # raises AttributeError when fit() is called with the default
        # props=None.
        props = {} if props is None else props.copy()
        props.pop('sample_weight', None)
        # Bug fix: fit() must return self (scikit-learn convention); the
        # original discarded the parent's return value.
        return super().fit(X, y, props=props)


# %%
# Case C: unweighted feature selection

# Currently feature selection does not handle sample_weight, and as long as
# that remains the case, it will simply ignore the prop passed to it. Hence:

lr = LogisticRegressionCV(cv=group_cv, scoring='accuracy')
pipe = make_pipeline(SelectKBest(), lr)
cross_validate(pipe, X, y, cv=group_cv,
               scoring='accuracy',
               props={'sample_weight': my_weights, 'groups': my_groups})

# %%
# Case D: different scoring and fitting weights

weighted_acc = make_scorer(accuracy)


def specially_weighted_acc(est, X, y, props):
    """Score ``est`` with accuracy weighted by the 'scoring_weight' prop.

    Copies ``props`` so the caller's dict is untouched, then presents the
    'scoring_weight' entry to the underlying scorer under the standard
    'sample_weight' key.
    """
    props = props.copy()
    # Bug fix: the original assigned the literal string 'scoring_weight'
    # as the sample weights. The intent is to alias the *value* of the
    # 'scoring_weight' prop to 'sample_weight' for the scorer.
    props['sample_weight'] = props.pop('scoring_weight')
    return weighted_acc(est, X, y, props)


lr = LogisticRegressionCV(
    cv=group_cv,
    scoring=specially_weighted_acc,
)
cross_validate(lr, X, y, cv=group_cv,
               props={
                   'scoring_weight': my_weights,
                   'sample_weight': my_other_weights,
                   'groups': my_groups,
               },
               scoring=specially_weighted_acc)
70 changes: 70 additions & 0 deletions slep006/cases_opt2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from defs import (group_cv, SelectKBest, LogisticRegressionCV,
cross_validate, make_pipeline, X, y, my_groups,
my_weights, my_other_weights)

# %%
# Case A: weighted scoring and fitting

lr = LogisticRegressionCV(cv=group_cv, scoring='accuracy')
# Keys are routed by double-underscore path: each prop is addressed to the
# specific consumer (cv / estimator / scoring) that should receive it.
props = {
    'cv__groups': my_groups,
    'estimator__cv__groups': my_groups,
    'estimator__sample_weight': my_weights,
    'scoring__sample_weight': my_weights,
    'estimator__scoring__sample_weight': my_weights,
}
cross_validate(lr, X, y, cv=group_cv, props=props, scoring='accuracy')

# error handling: if props={'estimator__sample_eight': my_weights, ...} was
# passed instead, the estimator would raise an error.

# %%
# Case B: weighted scoring and unweighted fitting

lr = LogisticRegressionCV(cv=group_cv, scoring='accuracy')
# No 'estimator__sample_weight' key here, so the fit is unweighted while
# both scoring paths still receive the weights.
props = {
    'cv__groups': my_groups,
    'estimator__cv__groups': my_groups,
    'scoring__sample_weight': my_weights,
    'estimator__scoring__sample_weight': my_weights,
}
cross_validate(lr, X, y, cv=group_cv, props=props, scoring='accuracy')

# %%
# Case C: unweighted feature selection

lr = LogisticRegressionCV(cv=group_cv, scoring='accuracy')
pipe = make_pipeline(SelectKBest(), lr)
# Routing through the pipeline requires naming the target step
# ('logisticregressioncv') in the key path; SelectKBest gets nothing.
props = {
    'cv__groups': my_groups,
    'estimator__logisticregressioncv__cv__groups': my_groups,
    'estimator__logisticregressioncv__sample_weight': my_weights,
    'scoring__sample_weight': my_weights,
    'estimator__scoring__sample_weight': my_weights,
}
cross_validate(pipe, X, y, cv=group_cv, props=props, scoring='accuracy')

# %%
# Case D: different scoring and fitting weights

lr = LogisticRegressionCV(cv=group_cv, scoring='accuracy')
# Fitting and scoring receive different weight vectors simply by
# addressing different keys to each consumer.
props = {
    'cv__groups': my_groups,
    'estimator__cv__groups': my_groups,
    'estimator__sample_weight': my_other_weights,
    'scoring__sample_weight': my_weights,
    'estimator__scoring__sample_weight': my_weights,
}
cross_validate(lr, X, y, cv=group_cv, props=props, scoring='accuracy')
99 changes: 99 additions & 0 deletions slep006/cases_opt3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from defs import (accuracy, make_scorer, SelectKBest, LogisticRegressionCV,
group_cv, cross_validate, make_pipeline, X, y, my_groups,
my_weights, my_other_weights)

# %%
# Case A: weighted scoring and fitting

lr = LogisticRegressionCV(
    cv=group_cv,
    scoring='accuracy',
    # one question here is whether we need to explicitly route sample_weight
    # to LogisticRegressionCV's fitting...
    prop_routing={
        'cv': ['groups'],
        'scoring': ['sample_weight'],
    },
)

# Alternative syntax, which assumes cv receives 'groups' by default, and that a
# method-based API is provided on meta-estimators:
# lr = LogisticRegressionCV(
#     cv=group_cv,
#     scoring='accuracy',
# ).add_prop_route(scoring='sample_weight')

cross_validate(lr, X, y, cv=group_cv,
               scoring='accuracy',
               props={'sample_weight': my_weights, 'groups': my_groups},
               prop_routing={'estimator': '*',  # pass all props
                             'cv': ['groups'],
                             'scoring': ['sample_weight']})

# Error handling: if props={'sample_eight': my_weights, ...} was passed
# instead, LogisticRegressionCV would have to identify that a key was passed
# that could not be routed nor used, in order to raise an error.

# %%
# Case B: weighted scoring and unweighted fitting

# Here we rename the sample_weight prop so that we can specify that it only
# applies to scoring.
lr = LogisticRegressionCV(
    cv=group_cv,
    scoring='accuracy',
    prop_routing={
        'cv': ['groups'],
        # read the following as "scoring should consume
        # 'scoring_weight' as if it were 'sample_weight'."
        'scoring': {'sample_weight': 'scoring_weight'},
    },
)
cross_validate(lr, X, y, cv=group_cv,
               scoring='accuracy',
               props={'scoring_weight': my_weights, 'groups': my_groups},
               prop_routing={'estimator': '*',
                             'cv': ['groups'],
                             'scoring': {'sample_weight': 'scoring_weight'}})

# %%
# Case C: unweighted feature selection

lr = LogisticRegressionCV(
    cv=group_cv,
    scoring='accuracy',
    prop_routing={'cv': ['groups'],
                  'scoring': ['sample_weight'],
                  })
pipe = make_pipeline(SelectKBest(), lr,
                     prop_routing={'logisticregressioncv': ['sample_weight',
                                                            'groups']})
# Bug fix: the original cross-validated `lr` here, leaving `pipe` — and
# hence the unweighted feature-selection step this case is about — unused.
# The pipeline is what must be evaluated (cf. Case C in the other options).
cross_validate(pipe, X, y, cv=group_cv,
               props={'sample_weight': my_weights, 'groups': my_groups},
               scoring='accuracy',
               prop_routing={'estimator': '*',
                             'cv': ['groups'],
                             'scoring': ['sample_weight'],
                             })

# %%
# Case D: different scoring and fitting weights
lr = LogisticRegressionCV(
    cv=group_cv,
    scoring='accuracy',
    prop_routing={
        'cv': ['groups'],
        # read the following as "scoring should consume
        # 'scoring_weight' as if it were 'sample_weight'."
        'scoring': {'sample_weight': 'scoring_weight'},
    },
)
cross_validate(lr, X, y, cv=group_cv,
               scoring='accuracy',
               props={'scoring_weight': my_weights,
                      'groups': my_groups,
                      'fitting_weight': my_other_weights},
               prop_routing={'estimator': {'sample_weight': 'fitting_weight',
                                           'scoring_weight': 'scoring_weight',
                                           'groups': 'groups'},
                             'cv': ['groups'],
                             'scoring': {'sample_weight': 'scoring_weight'}})
78 changes: 78 additions & 0 deletions slep006/cases_opt4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from defs import (accuracy, group_cv, make_scorer, SelectKBest,
LogisticRegressionCV, cross_validate,
make_pipeline, X, y, my_groups, my_weights,
my_other_weights)

# %%
# Case A: weighted scoring and fitting

# Here we presume that GroupKFold requests `groups` by default.
# We need to explicitly request weights in make_scorer and for
# LogisticRegressionCV. Both of these consumers understand the meaning
# of the key "sample_weight".

weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
lr = LogisticRegressionCV(cv=group_cv, scoring=weighted_acc)
lr = lr.set_props_request(['sample_weight'])
cross_validate(lr, X, y, cv=group_cv,
               scoring=weighted_acc,
               props={'sample_weight': my_weights, 'groups': my_groups})

# Error handling: if props={'sample_eight': my_weights, ...} was passed,
# cross_validate would raise an error, since 'sample_eight' was not requested
# by any of its children.

# %%
# Case B: weighted scoring and unweighted fitting

# Since LogisticRegressionCV requires that weights explicitly be requested,
# removing that request means the fitting is unweighted.

weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
lr = LogisticRegressionCV(cv=group_cv, scoring=weighted_acc)
cross_validate(lr, X, y, cv=group_cv,
               scoring=weighted_acc,
               props={'sample_weight': my_weights, 'groups': my_groups})

# %%
# Case C: unweighted feature selection

# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly.
# Here it does not request them.

weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
lr = LogisticRegressionCV(
    cv=group_cv,
    scoring=weighted_acc,
).set_props_request(['sample_weight'])
pipe = make_pipeline(SelectKBest(), lr)
cross_validate(pipe, X, y, cv=group_cv,
               scoring=weighted_acc,
               props={'sample_weight': my_weights, 'groups': my_groups})

# %%
# Case D: different scoring and fitting weights

# Despite make_scorer and LogisticRegressionCV both expecting a key
# sample_weight, we can use aliases to pass different weights to different
# consumers.

weighted_acc = make_scorer(accuracy,
                           request_props={'scoring_weight': 'sample_weight'})
lr = LogisticRegressionCV(
    cv=group_cv,
    scoring=weighted_acc,
).set_props_request({'fitting_weight': "sample_weight"})
fold_props = {
    'scoring_weight': my_weights,
    'fitting_weight': my_other_weights,
    'groups': my_groups,
}
cross_validate(lr, X, y, cv=group_cv, props=fold_props, scoring=weighted_acc)
14 changes: 14 additions & 0 deletions slep006/defs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Shared definitions for the SLEP006 case studies.

Provides a random binary classification problem (X, y), grouping and
weight vectors, and the scikit-learn names the case files exercise.
"""
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegressionCV
# Bug fix: sklearn.metrics has no `accuracy`; the metric function is
# `accuracy_score`. Alias it so the case files can keep importing `accuracy`.
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import make_scorer
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.pipeline import make_pipeline

# Bug fix: every cases_opt*.py does `from defs import group_cv`, but the
# original module only imported GroupKFold without defining group_cv.
group_cv = GroupKFold()

N, M = 100, 4  # number of samples, number of features
X = np.random.rand(N, M)
# Bug fix: randint's upper bound is exclusive, so randint(0, 1) produced an
# all-zero, single-class target that a classifier cannot be fit on. Use
# randint(0, 2) for a two-class target.
y = np.random.randint(0, 2, size=N)
my_groups = np.random.randint(0, 10, size=N)
my_weights = np.random.rand(N)
my_other_weights = np.random.rand(N)
Loading