Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bug: make sampler work when NaN values are predicted by the models #18

Merged
merged 2 commits into from
Sep 17, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 45 additions & 12 deletions src/autora/experimentalist/model_disagreement/__init__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,54 @@
import itertools
from typing import Iterable, List, Union, Optional
from typing import Iterable, List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from autora.utils.deprecation import deprecated_alias
from sklearn.preprocessing import StandardScaler


def score_sample(conditions: Union[pd.DataFrame, np.ndarray],
models: List,
num_samples: Optional[int] = None):
def score_sample(
conditions: Union[pd.DataFrame, np.ndarray],
models: List,
num_samples: Optional[int] = None,
):
"""
An experimentalist that returns selected samples for independent variables
for which the models disagree the most in terms of their predictions.

Args:
X: pool of IV conditions to evaluate in terms of model disagreement
conditions: pool of IV conditions to evaluate in terms of model disagreement
models: List of Scikit-learn (regression or classification) models to compare
num_samples: number of samples to select

Returns: Sampled pool

Examples:
If a model is undefined at a certain condition, the disagreement on that point is set to 0:
>>> class ModelUndefined:
... def predict(self, X):
... return np.log(X)
>>> class ModelDefined:
... def predict(self, X):
... return X
>>> modelUndefined = ModelUndefined()
>>> modelDefined = ModelDefined()
>>> conditions_defined = np.array([1, 2, 3])
>>> score_sample(conditions_defined, [modelUndefined, modelDefined], 3)
0 score
2 3 1.364948
1 2 -0.362023
0 1 -1.002924

>>> conditions_undefined = np.array([-1, 0, 1, 2, 3])
>>> score_sample(conditions_undefined, [modelUndefined, modelDefined], 5)
0 score
4 3 1.752985
3 2 0.330542
2 1 -0.197345
0 -1 -0.943091
1 0 -0.943091
"""

if isinstance(conditions, Iterable) and not isinstance(conditions, pd.DataFrame):
Expand Down Expand Up @@ -61,6 +90,10 @@ def score_sample(conditions: Union[pd.DataFrame, np.ndarray],
else:
disagreement = np.mean((y_a - y_b) ** 2, axis=1)

disagreement[np.isinf(disagreement)] = 0

disagreement = np.nan_to_num(disagreement)

model_disagreement.append(disagreement)

assert len(model_disagreement) >= 1, "No disagreements to compare."
Expand All @@ -87,16 +120,15 @@ def score_sample(conditions: Union[pd.DataFrame, np.ndarray],
return conditions.head(num_samples)



def sample(conditions: Union[pd.DataFrame, np.ndarray],
models: List,
num_samples: int = 1):
def sample(
conditions: Union[pd.DataFrame, np.ndarray], models: List, num_samples: int = 1
):
"""
An experimentalist that returns selected samples for independent variables
for which the models disagree the most in terms of their predictions.

Args:
X: pool of IV conditions to evaluate in terms of model disagreement
conditions: pool of IV conditions to evaluate in terms of model disagreement
models: List of Scikit-learn (regression or classification) models to compare
num_samples: number of samples to select

Expand All @@ -112,4 +144,5 @@ def sample(conditions: Union[pd.DataFrame, np.ndarray],
model_disagreement_sample = sample
model_disagreement_score_sample = score_sample
model_disagreement_sampler = deprecated_alias(
model_disagreement_sample, "model_disagreement_sampler")
model_disagreement_sample, "model_disagreement_sampler"
)
38 changes: 25 additions & 13 deletions tests/test_model_disagreement_sampler.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,43 @@
from src.autora.experimentalist.model_disagreement import model_disagreement_sample, model_disagreement_score_sample
from autora.theorist.bms import BMSRegressor; BMSRegressor()
from autora.theorist.darts import DARTSRegressor; DARTSRegressor()
import numpy as np
import pandas as pd

from autora.experimentalist.model_disagreement import (
model_disagreement_sample,
model_disagreement_score_sample,
)
from autora.theorist.bms import BMSRegressor
from autora.theorist.darts import DARTSRegressor

BMSRegressor()


DARTSRegressor()


def test_output_dimensions():
#Meta-Setup
# Meta-Setup
X = np.linspace(start=-3, stop=6, num=10).reshape(-1, 1)
y = (X**2).reshape(-1, 1)
n = 5
#Theorists

# Theorists
bms_theorist = BMSRegressor(epochs=10)
darts_theorist = DARTSRegressor(max_epochs=10)

bms_theorist.fit(X,y)
darts_theorist.fit(X,y)

#Sampler
bms_theorist.fit(X, y)
darts_theorist.fit(X, y)

# Sampler
X_new = model_disagreement_sample(X, [bms_theorist, darts_theorist], n)

# Check that the sampler returns n experiment conditions
assert X_new.shape == (n, X.shape[1])


def test_pandas():
# Meta-Setup
X = np.linspace(start=-3, stop=6, num=10).reshape(-1, 1)
y = (X ** 2).reshape(-1, 1)
y = (X**2).reshape(-1, 1)
n = 5

X = pd.DataFrame(X)
Expand All @@ -45,10 +56,11 @@ def test_pandas():
assert isinstance(X_new, pd.DataFrame)
assert X_new.shape == (n, X.shape[1])


def test_scoring():
# Meta-Setup
X = np.linspace(start=-3, stop=6, num=10).reshape(-1, 1)
y = (X ** 2).reshape(-1, 1)
y = (X**2).reshape(-1, 1)
n = 5

X = pd.DataFrame(X)
Expand All @@ -66,4 +78,4 @@ def test_scoring():
# Check that the sampler returns n experiment conditions
assert isinstance(X_new, pd.DataFrame)
assert "score" in X_new.columns
assert X_new.shape == (n, X.shape[1] + 1)
assert X_new.shape == (n, X.shape[1] + 1)