Skip to content

Commit

Permalink
Vasilis/casestudy (#441)
Browse files Browse the repository at this point in the history
* changed how aggregation happens in policy ensembles. Added policy learning to multi-investment attribution. Fixed some mistakes in that case study

* added clipping to the denominator in the dr correction to avoid division by zero, in the dr estimate of ate and att in cfdml.
  • Loading branch information
vsyrgkanis authored May 3, 2021
1 parent 9630421 commit 5bf448a
Show file tree
Hide file tree
Showing 9 changed files with 437 additions and 55 deletions.
2 changes: 1 addition & 1 deletion azure-pipelines-steps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
condition: and(succeeded(), eq(variables['Agent.OS'], 'Linux'))

# Install the package
- script: 'python -m pip install --upgrade pip && pip install --upgrade setuptools wheel Cython && pip install ${{ parameters.package }}'
- script: 'pip install --upgrade setuptools wheel Cython && pip install ${{ parameters.package }}'
displayName: 'Install dependencies'

- ${{ parameters.job.steps }}
2 changes: 1 addition & 1 deletion econml/dml/causal_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def fit(self, X, T, T_res, Y_res, sample_weight=None, freq_weight=None, sample_v
"where available.")
residuals = Y_res - np.einsum('ijk,ik->ij', oob_preds, T_res)
propensities = T - T_res
VarT = propensities * (1 - propensities)
VarT = np.clip(propensities * (1 - propensities), 1e-10, np.inf)
drpreds = oob_preds
drpreds += cross_product(residuals, T_res / VarT).reshape((-1, Y_res.shape[1], T_res.shape[1]))
drpreds[np.isnan(oob_preds)] = np.nan
Expand Down
26 changes: 23 additions & 3 deletions econml/policy/_drlearner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from warnings import warn
import numpy as np
from sklearn.base import clone
from ..utilities import check_inputs, filter_none_kwargs
from ..utilities import check_inputs, filter_none_kwargs, check_input_arrays
from ..dr import DRLearner
from ..dr._drlearner import _ModelFinal
from .._tree_exporter import _SingleTreeExporterMixin
Expand Down Expand Up @@ -98,6 +98,24 @@ def predict_value(self, X):
"""
return self.drlearner_.const_marginal_effect(X)

def predict_proba(self, X):
    """ Predict the probability of recommending each treatment

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    treatment_proba : array-like of shape (n_samples, n_treatments)
        The probability of each treatment recommendation, as returned by
        ``self.policy_model_.predict_proba`` on the (optionally featurized) input.
    """
    X, = check_input_arrays(X)
    featurizer = self.drlearner_.featurizer_
    if featurizer is not None:
        # Prediction must only *apply* the featurizer. Calling fit_transform here would
        # re-fit it on the prediction data, so a stateful featurizer could map X to
        # different features than the ones the policy model was trained on.
        # NOTE(review): assumes `featurizer_` is the already-fitted featurizer held by
        # the underlying DR learner -- confirm against DRLearner.
        X = featurizer.transform(X)
    return self.policy_model_.predict_proba(X)

def predict(self, X):
""" Get recommended treatment for each sample.
Expand All @@ -111,9 +129,11 @@ def predict(self, X):
treatment : array-like of shape (n_samples,)
The index of the recommended treatment in the same order as in categories, or in
lexicographic order if `categories='auto'`. 0 corresponds to the baseline/control treatment.
For ensemble policy models, recommended treatments are aggregated from each model in the ensemble
and the treatment that receives the most votes is returned. Use `predict_proba` to get the fraction
of models in the ensemble that recommend each treatment for each sample.
"""
values = self.predict_value(X)
return np.argmax(np.hstack([np.zeros((values.shape[0], 1)), values]), axis=1)
return np.argmax(self.predict_proba(X), axis=1)

def policy_feature_names(self, *, feature_names=None):
"""
Expand Down
44 changes: 42 additions & 2 deletions econml/policy/_forest/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,43 @@ def predict_value(self, X):

return y_hat

def predict_proba(self, X):
    """ Predict the probability of recommending each treatment

    The ``predict_proba`` output of every estimator in ``self.estimators_`` is
    accumulated into a single buffer, which is then divided by the number of
    estimators.

    Parameters
    ----------
    X : {array-like} of shape (n_samples, n_features)
        The input samples. They are passed through ``self._validate_X_predict``
        before being handed to the estimators.

    Returns
    -------
    treatment_proba : array-like of shape (n_samples, n_treatments)
        The probability of each treatment recommendation
    """
    check_is_fitted(self)
    # Validate the input once, up front
    X = self._validate_X_predict(X)

    # Only the job count is needed from the partitioning helper
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # One shared accumulator, so per-estimator outputs are never stored separately
    proba_sum = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)

    # The lock is handed to every task together with the shared accumulator
    # (presumably to serialize the updates -- see `_accumulate_prediction`)
    lock = threading.Lock()
    runner = Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")
    runner(
        delayed(_accumulate_prediction)(estimator.predict_proba, X, [proba_sum], lock)
        for estimator in self.estimators_
    )

    # Turn the accumulated total into an average over the ensemble
    proba_sum /= len(self.estimators_)
    return proba_sum

def predict(self, X):
""" Predict the best treatment for each sample
Expand All @@ -467,6 +504,9 @@ def predict(self, X):
Returns
-------
treatment : array-like of shape (n_samples)
The recommended treatment, i.e. the treatment index with the largest reward for each sample
The recommended treatment, i.e. the treatment index most often predicted to have the highest reward
for each sample. Recommended treatments are aggregated from each tree in the ensemble and the treatment
that receives the most votes is returned. Use `predict_proba` to get the fraction of trees in the ensemble
that recommend each treatment for each sample.
"""
return np.argmax(self.predict_value(X), axis=1)
return np.argmax(self.predict_proba(X), axis=1)
24 changes: 24 additions & 0 deletions econml/policy/_forest/_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,30 @@ def predict(self, X, check_input=True):
pred = self.tree_.predict(X)
return np.argmax(pred, axis=1)

def predict_proba(self, X, check_input=True):
    """ Predict the probability of recommending each treatment

    For a single tree the result is a one-hot matrix: in every row, the column
    holding the arg-max of ``self.tree_.predict(X)`` is set to 1 and all other
    columns are 0.

    Parameters
    ----------
    X : {array-like} of shape (n_samples, n_features)
        The input samples.
    check_input : bool, default=True
        Forwarded to ``self._validate_X_predict``.

    Returns
    -------
    treatment_proba : array-like of shape (n_samples, n_treatments)
        The probability of each treatment recommendation
    """
    check_is_fitted(self)
    X = self._validate_X_predict(X, check_input)
    tree_pred = self.tree_.predict(X)
    # Column index of the largest predicted entry in each row
    best_treatment = np.argmax(tree_pred, axis=1)
    sample_idx = np.arange(X.shape[0])
    one_hot = np.zeros(tree_pred.shape)
    one_hot[sample_idx, best_treatment] = 1
    return one_hot

def predict_value(self, X, check_input=True):
""" Predict the expected value of each treatment for each sample
Expand Down
14 changes: 14 additions & 0 deletions econml/tests/test_policy_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ def test_non_standard_input(self,):
forest = PolicyForest(n_estimators=20, n_jobs=1, random_state=123).fit(X, y)
pred = forest.predict(X)
pred_val = forest.predict_value(X)
pred_prob = forest.predict_proba(X)
assert pred_prob.shape == (X.shape[0], 2)
feat_imp = forest.feature_importances()
forest = PolicyForest(n_estimators=20, n_jobs=1, random_state=123).fit(X.astype(np.float32),
np.asfortranarray(y))
Expand All @@ -298,12 +300,15 @@ def test_non_standard_input(self,):
forest = PolicyForest(n_estimators=20, n_jobs=1, random_state=123).fit(tuple(X), tuple(y))
np.testing.assert_allclose(pred, forest.predict(tuple(X)))
np.testing.assert_allclose(pred_val, forest.predict_value(tuple(X)))
np.testing.assert_allclose(pred_prob, forest.predict_proba(tuple(X)))
forest = PolicyForest(n_estimators=20, n_jobs=1, random_state=123).fit(list(X), list(y))
np.testing.assert_allclose(pred, forest.predict(list(X)))
np.testing.assert_allclose(pred_val, forest.predict_value(list(X)))
np.testing.assert_allclose(pred_prob, forest.predict_proba(list(X)))
forest = PolicyForest(n_estimators=20, n_jobs=1, random_state=123).fit(pd.DataFrame(X), pd.DataFrame(y))
np.testing.assert_allclose(pred, forest.predict(pd.DataFrame(X)))
np.testing.assert_allclose(pred_val, forest.predict_value(pd.DataFrame(X)))
np.testing.assert_allclose(pred_prob, forest.predict_proba(pd.DataFrame(X)))

groups = np.repeat(np.arange(X.shape[0]), 2)
Xraw = X.copy()
Expand All @@ -324,6 +329,15 @@ def test_non_standard_input(self,):
forest.predict_value(Xraw[mask]).flatten(), atol=.08)
np.testing.assert_allclose(feat_imp, forest.feature_importances(), atol=1e-4)
np.testing.assert_allclose(feat_imp, forest.feature_importances_, atol=1e-4)
pred = forest.predict(X)
pred_val = forest.predict_value(X)
pred_prob = forest.predict_proba(X)
np.testing.assert_allclose(pred, forest.predict(tuple(X)))
np.testing.assert_allclose(pred_val, forest.predict_value(tuple(X)))
np.testing.assert_allclose(pred, forest.predict(pd.DataFrame(X)))
np.testing.assert_allclose(pred_val, forest.predict_value(pd.DataFrame(X)))
np.testing.assert_allclose(pred_prob, forest.predict_proba(pd.DataFrame(X)))

return

def test_raise_exceptions(self,):
Expand Down

Large diffs are not rendered by default.

35 changes: 17 additions & 18 deletions notebooks/Policy Learning with Trees and Forests.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ tests_require =
seaborn
lightgbm
xgboost
jupyter-client <= 6.1.12

[options.extras_require]
automl =
Expand Down

0 comments on commit 5bf448a

Please sign in to comment.