Implement DRScorer #757

Draft · wants to merge 13 commits into main
2 changes: 2 additions & 0 deletions econml/score/__init__.py
@@ -7,7 +7,9 @@
"""

from .rscorer import RScorer
from .drscorer import DRScorer
from .ensemble_cate import EnsembleCateEstimator

__all__ = ['RScorer',
'DRScorer',
'EnsembleCateEstimator']
277 changes: 277 additions & 0 deletions econml/score/drscorer.py
@@ -0,0 +1,277 @@
# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.

from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
from ..dr import DRLearner
from sklearn.base import clone
import numpy as np
from scipy.special import softmax
from .ensemble_cate import EnsembleCateEstimator


class DRScorer:
""" Scorer based on the DRLearner loss. Fits regression model g (using T-Learner) and propensity model p at fit
time and calculates the regression and propensity of the evaluation data::

g (model_regression) = E[Y | X, W, T]
p (model_propensity) = Pr[T | X, W]

Ydr(g,p) = g(X,W,T) + (Y - g(X,W,T)) / p_T(X,W)

Then for any given cate model calculates the loss::

loss(cate) = E_n[(Ydr(g, p) - cate(X))^2]

Also calculates a baseline loss based on a constant treatment effect model, i.e.::

base_loss = min_{theta} E_n[(Ydr(g, p) - theta)^2]

Returns an analogue of the R-square score for regression::

score = 1 - loss(cate) / base_loss

This corresponds to the extra variance of the outcome explained by introducing heterogeneity
in the effect as captured by the cate model, as opposed to always predicting a constant effect.
A negative score means that the cate model performs even worse than a constant effect model
and hints at overfitting during training of the cate model.

This method was also advocated in the recent work of [Schuleretal2018]_, where it was compared among
several alternatives for causal model selection; the analogous R-score was introduced in the work of
[NieWager2017]_.

Parameters
----------
model_propensity : scikit-learn classifier or 'auto', default 'auto'
Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T,
where T is a shape (n, ) array.
If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen.

model_regression : scikit-learn regressor or 'auto', default 'auto'
Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and
`predict` methods. If different models per treatment arm are desired, see the
:class:`.MultiModelWrapper` helper class.
If 'auto', :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen.

model_final : estimator, default :class:`.StatsModelsLinearRegression`
    Estimator for the final cate model. Trained by regressing the doubly robust potential outcomes
    on (features X).

- If X is None, then the fit method of model_final should be able to handle X=None.
- If featurizer is not None and X is not None, then it is trained on the outcome of
featurizer.fit_transform(X).
- If multitask_model_final is True, then this model must support multitasking
and it is trained by regressing all doubly robust target outcomes on (featurized) features simultaneously.
- The output of the predict(X) of the trained model will contain the CATEs for each treatment compared to
baseline treatment (lexicographically smallest). If multitask_model_final is False, it is assumed to be a
mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th
clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline.

multitask_model_final : bool, default False
Whether the model_final should be treated as a multi-task model. See description of model_final.

featurizer : :term:`transformer`, optional
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

min_propensity : float, default ``1e-6``
The minimum propensity at which to clip propensity estimates to avoid dividing by zero.

categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.

cv: int, cross-validation generator or an iterable, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:

- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds,
- :term:`CV splitter`,
- An iterable yielding (train, test) splits as arrays of indices.

For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).

Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.

mc_iters: int, optional
The number of times to rerun the first stage models to reduce the variance of the nuisances.

mc_agg: {'mean', 'median'}, default 'mean'
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
cross-fitting.

random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.

References
----------
.. [NieWager2017] X. Nie and S. Wager.
Quasi-Oracle Estimation of Heterogeneous Treatment Effects.
arXiv preprint arXiv:1712.04912, 2017.
`<https://arxiv.org/pdf/1712.04912.pdf>`_

.. [Schuleretal2018] A. Schuler, M. Baiocchi, R. Tibshirani, and N. Shah.
    A comparison of methods for model selection when estimating individual treatment effects.
    arXiv preprint arXiv:1804.05146, 2018.
    `<https://arxiv.org/pdf/1804.05146.pdf>`_
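
Examples
--------
A minimal usage sketch (assumes pre-existing arrays ``y``, ``T``, ``X``; the
:class:`~econml.dml.LinearDML` below is just one illustrative choice of CATE estimator)::

    from econml.dml import LinearDML
    from econml.score import DRScorer

    est = LinearDML(discrete_treatment=True).fit(y, T, X=X)   # any fitted CATE estimator
    scorer = DRScorer(cv=3)
    scorer.fit(y, T, X=X)          # fits nuisance models g, p and the baseline loss
    dr_score = scorer.score(est)   # R-square-style score of the CATE model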

"""

def __init__(self, *,
model_propensity='auto',
model_regression='auto',
model_final=StatsModelsLinearRegression(),
multitask_model_final=False,
featurizer=None,
min_propensity=1e-6,
categories='auto',
cv=2,
mc_iters=None,
mc_agg='mean',
random_state=None):
self.model_propensity = clone(model_propensity, safe=False)
self.model_regression = clone(model_regression, safe=False)
self.model_final = clone(model_final, safe=False)
self.multitask_model_final = multitask_model_final
self.featurizer = clone(featurizer, safe=False)
self.min_propensity = min_propensity
self.categories = categories
self.cv = cv
self.mc_iters = mc_iters
self.mc_agg = mc_agg
self.random_state = random_state

def fit(self, y, T, X=None, W=None, sample_weight=None, groups=None):
"""

Parameters
----------
Y: (n × d_y) matrix or vector of length n
Outcomes for each sample
T: (n × dₜ) matrix or vector of length n
Treatments for each sample
X: (n × dₓ) matrix, optional
Features for each sample
W: (n × d_w) matrix, optional
Controls for each sample
sample_weight: (n,) vector, optional
Weights for each row
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the `cv` argument passed to this class's initializer
must support a 'groups' argument to its split method.

Returns
-------
self
"""
if X is None:
raise ValueError("X cannot be None for the DRScorer!")

self.drlearner_ = DRLearner(model_propensity=self.model_propensity,
model_regression=self.model_regression,
model_final=self.model_final,
multitask_model_final=self.multitask_model_final,
featurizer=self.featurizer,
min_propensity=self.min_propensity,
categories=self.categories,
cv=self.cv,
mc_iters=self.mc_iters,
mc_agg=self.mc_agg,
random_state=self.random_state)
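# The internal DRLearner is fit with X=None and X folded into W, so its final model is a
# constant effect per treatment; its score_ then serves as the constant-effect baseline loss.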
self.drlearner_.fit(y, T, X=None, W=np.hstack([v for v in [X, W] if v is not None]),
sample_weight=sample_weight, groups=groups, cache_values=True)
self.base_score_ = self.drlearner_.score_
self.dx_ = X.shape[1]
return self

def score(self, cate_model):
"""
Parameters
----------
cate_model : instance of fitted BaseCateEstimator

Returns
-------
score : double
An analogue of the DR-square loss for the causal setting.
"""
Y_pred, _ = self.drlearner_._cached_values.nuisances
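# Y_pred holds the DR pseudo-outcome for each treatment level; differencing against the
# baseline (first) column yields the per-arm effect targets.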
Ydr = Y_pred[..., 1:] - Y_pred[..., [0]]
X = self.drlearner_._cached_values.W[:, :self.dx_]
sample_weight = self.drlearner_._cached_values.sample_weight
if Ydr.ndim == 1:
Ydr = Ydr.reshape((-1, 1))

effects = cate_model.const_marginal_effect(X).reshape((-1, Ydr.shape[1]))
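# score = 1 - E_n[(Ydr - cate(X))^2] / base_loss, mirroring an R-square against the
# constant-effect baseline loss computed at fit time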
if sample_weight is not None:
return 1 - np.mean(np.average((Ydr - effects)**2, weights=sample_weight, axis=0)) / self.base_score_
else:
return 1 - np.mean((Ydr - effects) ** 2) / self.base_score_

def best_model(self, cate_models, return_scores=False):
""" Chooses the best among a list of models

Parameters
----------
cate_models : list of instance of fitted BaseCateEstimator
return_scores : bool, default False
    Whether to return the list of scores for each model

Returns
-------
best_model : instance of fitted BaseCateEstimator
The model that achieves the best score
best_score : double
The score of the best model
scores : list of double
The list of scores for each of the input models. Returned only if `return_scores=True`.
"""
drscores = [self.score(mdl) for mdl in cate_models]
best = np.nanargmax(drscores)
if return_scores:
    return cate_models[best], drscores[best], drscores
else:
    return cate_models[best], drscores[best]

def ensemble(self, cate_models, eta=1000.0, return_scores=False):
""" Ensembles a list of models based on their performance

Parameters
----------
cate_models : list of instance of fitted BaseCateEstimator
eta : double, default 1000
The soft-max parameter for the ensemble
return_scores : bool, default False
    Whether to return the list of scores for each model

Returns
-------
ensemble_model : instance of fitted EnsembleCateEstimator
A fitted ensemble cate model that calculates effects based on a weighted
version of the input cate models, weighted by a softmax of their score
performance
ensemble_score : double
The score of the ensemble model
scores : list of double
The list of scores for each of the input models. Returned only if `return_scores=True`.
"""
drscores = np.array([self.score(mdl) for mdl in cate_models])
goodinds = np.isfinite(drscores)
weights = softmax(eta * drscores[goodinds])
goodmodels = [mdl for mdl, good in zip(cate_models, goodinds) if good]
ensemble = EnsembleCateEstimator(cate_models=goodmodels, weights=weights)
ensemble_score = self.score(ensemble)
if return_scores:
return ensemble, ensemble_score, drscores
else:
return ensemble, ensemble_score
97 changes: 97 additions & 0 deletions econml/tests/test_drscorer.py
@@ -0,0 +1,97 @@
# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.

import unittest
import numpy as np
import scipy.special
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from econml.score import DRScorer
from econml.dr import DRLearner


class TestDRScorer(unittest.TestCase):
def test_default_models(self):
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
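# Treatment takes values {0, 1, 2}; its propensity depends on X[:, 0] (confounded)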
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.001
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner()
est.fit(y, T, X=X, W=None)
assert est.const_marginal_effect(X[:2]).shape == (2, 2)
assert est.effect(X[:2], T0=0, T1=1).shape == (2,)
assert isinstance(est.score_, float)
assert isinstance(est.score(y, T, X=X), float)
assert len(est.model_cate(T=1).coef_.shape) == 1
assert len(est.model_cate(T=2).coef_.shape) == 1
assert isinstance(est.cate_feature_names(), list)
assert isinstance(est.models_regression[0][0].coef_, np.ndarray)
assert isinstance(est.models_propensity[0][0].coef_, np.ndarray)

def test_custom_models(self):
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.01
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner(
model_propensity=RandomForestClassifier(n_estimators=100, min_samples_leaf=10),
model_regression=RandomForestRegressor(n_estimators=100, min_samples_leaf=10),
model_final=LassoCV(cv=3),
featurizer=None
)
est.fit(y, T, X=X, W=None)
assert isinstance(est.score_, float)
assert est.const_marginal_effect(X[:3]).shape == (3, 2)
assert len(est.model_cate(T=2).coef_.shape) == 1
assert isinstance(est.model_cate(T=2).intercept_, float)
assert len(est.model_cate(T=1).coef_.shape) == 1
assert isinstance(est.model_cate(T=1).intercept_, float)

def test_cv_splitting_strategy(self):
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.001
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner(cv=2)
est.fit(y, T, X=X, W=None)
assert est.const_marginal_effect(X[:2]).shape == (2, 2)

def test_mc_iters(self):
    np.random.seed(123)
    X = np.random.normal(size=(1000, 3))
    T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
    sigma = 0.001
    y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
    # Rerun the first-stage models across two monte carlo iterations and average the nuisances
    est = DRLearner(mc_iters=2, mc_agg='mean')
    est.fit(y, T, X=X, W=None)
    assert est.const_marginal_effect(X[:2]).shape == (2, 2)
    assert est.effect(X[:2], T0=0, T1=1).shape == (2,)

def test_score(self):
    np.random.seed(123)
    X = np.random.normal(size=(1000, 3))
    T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
    y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, 0.001, size=(1000,))
    # Fit a candidate CATE model, then score it against the DR pseudo-outcomes
    cate_est = DRLearner()
    cate_est.fit(y, T, X=X, W=None)
    scorer = DRScorer()
    scorer.fit(y, T, X=X, W=None)
    score = scorer.score(cate_est)
    assert np.isfinite(score)
    # A well-specified CATE model should beat the constant-effect baseline
    assert score > 0