Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabled feature_importances_ for our ForestDML and ForestDRLearner estimators #306

Merged
merged 17 commits into from
Nov 9, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion econml/cate_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from .inference import BootstrapInference
from .utilities import tensordot, ndim, reshape, shape, parse_final_model_params, inverse_onehot, Summary
from .inference import StatsModelsInference, StatsModelsInferenceDiscrete, LinearModelFinalInference,\
LinearModelFinalInferenceDiscrete, NormalInferenceResults
LinearModelFinalInferenceDiscrete, NormalInferenceResults, GenericSingleTreatmentModelFinalInference,\
GenericModelFinalInferenceDiscrete


class BaseCateEstimator(metaclass=abc.ABCMeta):
Expand Down Expand Up @@ -664,6 +665,19 @@ def _get_inference_options(self):
return options


class ForestModelFinalCateEstimatorMixin(BaseCateEstimator):

def _get_inference_options(self):
# add blb to parent's options
options = super()._get_inference_options()
options.update(blb=GenericSingleTreatmentModelFinalInference)
return options

@property
def feature_importances_(self):
return self.model_final.feature_importances_


class LinearModelFinalCateEstimatorDiscreteMixin(BaseCateEstimator):
# TODO Share some logic with non-discrete version
"""
Expand Down Expand Up @@ -861,3 +875,18 @@ def _get_inference_options(self):
options = super()._get_inference_options()
options.update(debiasedlasso=LinearModelFinalInferenceDiscrete)
return options


class ForestModelFinalCateEstimatorDiscreteMixin(BaseCateEstimator):

def _get_inference_options(self):
# add blb to parent's options
options = super()._get_inference_options()
options.update(blb=GenericModelFinalInferenceDiscrete)
return options

def feature_importances_(self, T):
_, T = self._expand_treatments(None, T)
ind = inverse_onehot(T).item() - 1
assert ind >= 0, "No model was fitted for the control"
return self.fitted_models_final[ind].feature_importances_
11 changes: 3 additions & 8 deletions econml/dml.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
from sklearn.utils import check_random_state
from .cate_estimator import (BaseCateEstimator, LinearCateEstimator,
TreatmentExpansionMixin, StatsModelsCateEstimatorMixin,
LinearModelFinalCateEstimatorMixin, DebiasedLassoCateEstimatorMixin)
LinearModelFinalCateEstimatorMixin, DebiasedLassoCateEstimatorMixin,
ForestModelFinalCateEstimatorMixin)
from .inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference
from ._rlearner import _RLearner
from .sklearn_extensions.model_selection import WeightedStratifiedKFold
Expand Down Expand Up @@ -866,7 +867,7 @@ def __init__(self,
random_state=random_state)


class ForestDMLCateEstimator(NonParamDMLCateEstimator):
class ForestDMLCateEstimator(NonParamDMLCateEstimator, ForestModelFinalCateEstimatorMixin):
vsyrgkanis marked this conversation as resolved.
Show resolved Hide resolved
""" Instance of NonParamDMLCateEstimator with a
:class:`~econml.sklearn_extensions.ensemble.SubsampledHonestForest`
as a final model, so as to enable non-parametric inference.
Expand Down Expand Up @@ -1058,12 +1059,6 @@ def __init__(self,
categories=categories,
n_splits=n_crossfit_splits, random_state=random_state)

def _get_inference_options(self):
# add statsmodels to parent's options
options = super()._get_inference_options()
options.update(blb=GenericSingleTreatmentModelFinalInference)
return options

def fit(self, Y, T, X=None, W=None, sample_weight=None, sample_var=None, inference=None):
"""
Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).
Expand Down
11 changes: 3 additions & 8 deletions econml/drlearner.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper, DebiasedLasso
from econml.sklearn_extensions.ensemble import SubsampledHonestForest
from econml._ortho_learner import _OrthoLearner
from econml.cate_estimator import StatsModelsCateEstimatorDiscreteMixin, DebiasedLassoCateEstimatorDiscreteMixin
from econml.cate_estimator import StatsModelsCateEstimatorDiscreteMixin, DebiasedLassoCateEstimatorDiscreteMixin,\
ForestModelFinalCateEstimatorDiscreteMixin
from econml.inference import GenericModelFinalInferenceDiscrete


Expand Down Expand Up @@ -949,7 +950,7 @@ def fitted_models_final(self):
return super().model_final.models_cate


class ForestDRLearner(DRLearner):
class ForestDRLearner(DRLearner, ForestModelFinalCateEstimatorDiscreteMixin):
vsyrgkanis marked this conversation as resolved.
Show resolved Hide resolved
""" Instance of DRLearner with a :class:`~econml.sklearn_extensions.ensemble.SubsampledHonestForest`
as a final model, so as to enable non-parametric inference.

Expand Down Expand Up @@ -1144,12 +1145,6 @@ def __init__(self,
categories=categories,
n_splits=n_crossfit_splits, random_state=random_state)

def _get_inference_options(self):
# add statsmodels to parent's options
options = super()._get_inference_options()
options.update(blb=GenericModelFinalInferenceDiscrete)
return options

def fit(self, Y, T, X=None, W=None, sample_weight=None, sample_var=None, inference=None):
"""
Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).
Expand Down
60 changes: 39 additions & 21 deletions econml/sklearn_extensions/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def _parallel_add_trees(tree, forest, X, y, sample_weight, s_inds, tree_idx, n_t
children_left = tree.tree_.children_left
children_right = tree.tree_.children_right
stack = [(0, -1)] # seed is the root node id and its parent depth
numerator = np.empty_like(tree.tree_.value)
denominator = np.empty_like(tree.tree_.weighted_n_node_samples)
while len(stack) > 0:
node_id, parent_id = stack.pop()
# If minimum weight requirement or minimum leaf size requirement is not satisfied on estimation
Expand All @@ -81,17 +83,22 @@ def _parallel_add_trees(tree, forest, X, y, sample_weight, s_inds, tree_idx, n_t
tree.tree_.children_right[parent_id] = -1
else:
for i in range(tree.n_outputs_):
# Set the value of the node to: \sum_{i} sample_weight[i] 1{i \\in node} Y_i / |node|
tree.tree_.value[node_id, i] = value_est[i, node_id] / count_est[0, node_id]
# Set the weight of the node to: \sum_{i} sample_weight[i] 1{i \\in node} / |node|
tree.tree_.weighted_n_node_samples[node_id] = weight_est[0, node_id] / count_est[0, node_id]
# Set the numerator of the node to: \sum_{i} sample_weight[i] 1{i \\in node} Y_i / |node|
numerator[node_id, i] = value_est[i, node_id] / count_est[0, node_id]
# Set the value of the node to:
# \sum_{i} sample_weight[i] 1{i \\in node} Y_i / \sum_{i} sample_weight[i] 1{i \\in node}
tree.tree_.value[node_id, i] = value_est[i, node_id] / weight_est[0, node_id]
# Set the denominator of the node to: \sum_{i} sample_weight[i] 1{i \\in node} / |node|
denominator[node_id] = weight_est[0, node_id] / count_est[0, node_id]
# Set the weight of the node to: \sum_{i} sample_weight[i] 1{i \\in node}
tree.tree_.weighted_n_node_samples[node_id] = weight_est[0, node_id]
# Set the count to the estimation split count
tree.tree_.n_node_samples[node_id] = count_est[0, node_id]
if (children_left[node_id] != children_right[node_id]):
stack.append((children_left[node_id], node_id))
stack.append((children_right[node_id], node_id))

return tree
return tree, numerator, denominator


class SubsampledHonestForest(ForestRegressor, RegressorMixin):
Expand Down Expand Up @@ -314,7 +321,7 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
>>> regr.fit(X_train, y_train)
SubsampledHonestForest(n_estimators=1000, random_state=0)
>>> regr.feature_importances_
array([0.40..., 0.35..., 0.11..., 0.11...])
array([0.67..., 0.30..., 0.00... , 0.00...])
>>> regr.predict(np.ones((1, 4)))
array([112.9...])
>>> regr.predict_interval(np.ones((1, 4)), alpha=.05)
Expand Down Expand Up @@ -483,6 +490,8 @@ def fit(self, X, y, sample_weight=None, sample_var=None):
if not self.warm_start or not hasattr(self, "estimators_"):
# Free allocated memory, if any
self.estimators_ = []
self.numerators_ = []
self.denominators_ = []

n_more_estimators = self.n_estimators - len(self.estimators_)

Expand Down Expand Up @@ -523,21 +532,27 @@ def fit(self, X, y, sample_weight=None, sample_var=None):
int(np.ceil(self.subsample_fr_ *
(X.shape[0] // 2))),
replace=False)])
trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
**_joblib_parallel_args(prefer='threads'))(
res = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
**_joblib_parallel_args(prefer='threads'))(
delayed(_parallel_add_trees)(
t, self, X, y, sample_weight, s_inds[i], i, len(trees),
verbose=self.verbose)
for i, t in enumerate(trees))
trees = [t[0] for t in res]
numerators = [t[1] for t in res]
denominators = [t[2] for t in res]
vsyrgkanis marked this conversation as resolved.
Show resolved Hide resolved
# Collect newly grown trees
self.estimators_.extend(trees)
self.numerators_.extend(numerators)
self.denominators_.extend(denominators)

return self

def _mean_fn(self, X, fn, acc, slice=None):
# Helper class that accumulates an arbitrary function in parallel on the accumulator acc
# and calls the function fn on each tree e and returns the mean output. The function fn
# should take as input a tree e, and return another function g_e, which takes as input X, check_input
# should take as input a tree e and associated numerator n and denominator d structures and
# return another function g_e, which takes as input X, check_input
# If slice is not None, but rather a tuple (start, end), then a subset of the trees from
# index start to index end will be used. The returned result is essentially:
# (mean over e in slice)(g_e(X)).
Expand All @@ -546,18 +561,21 @@ def _mean_fn(self, X, fn, acc, slice=None):
X = self._validate_X_predict(X)

if slice is None:
estimator_slice = self.estimators_
estimator_slice = zip(self.estimators_, self.numerators_, self.denominators_)
n_estimators = len(self.estimators_)
else:
estimator_slice = self.estimators_[slice[0]:slice[1]]
estimator_slice = zip(self.estimators_[slice[0]:slice[1]], self.numerators_[slice[0]:slice[1]],
self.denominators_[slice[0]:slice[1]])
n_estimators = slice[1] - slice[0]

# Assign chunk of trees to jobs
n_jobs, _, _ = _partition_estimators(len(estimator_slice), self.n_jobs)
n_jobs, _, _ = _partition_estimators(n_estimators, self.n_jobs)
lock = threading.Lock()
Parallel(n_jobs=n_jobs, verbose=self.verbose,
**_joblib_parallel_args(require="sharedmem"))(
delayed(_accumulate_prediction)(fn(e), X, [acc], lock)
for e in estimator_slice)
acc /= len(estimator_slice)
delayed(_accumulate_prediction)(fn(e, n, d), X, [acc], lock)
for e, n, d in estimator_slice)
acc /= n_estimators
return acc

def _weight(self, X, slice=None):
Expand All @@ -582,7 +600,7 @@ def _weight(self, X, slice=None):
# Check data
X = self._validate_X_predict(X)
weight_hat = np.zeros((X.shape[0]), dtype=np.float64)
return self._mean_fn(X, lambda e: (lambda x, check_input: e.tree_.weighted_n_node_samples[e.apply(x)]),
return self._mean_fn(X, lambda e, n, d: (lambda x, check_input: d[e.apply(x)]),
weight_hat, slice=slice)

def _predict(self, X, slice=None):
Expand Down Expand Up @@ -610,12 +628,12 @@ def _predict(self, X, slice=None):
# Check data
X = self._validate_X_predict(X)
# avoid storing the output of every estimator by summing them here
if self.n_outputs_ > 1:
y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)
else:
y_hat = np.zeros((X.shape[0]), dtype=np.float64)
y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)
y_hat = self._mean_fn(X, lambda e, n, d: (lambda x, check_input: n[e.apply(x), :, 0]), y_hat, slice=slice)
if self.n_outputs_ == 1:
y_hat = y_hat.flatten()

return self._mean_fn(X, lambda e: e.predict, y_hat, slice=slice)
return y_hat

def _inference(self, X, stderr=False):
"""
Expand Down
4 changes: 4 additions & 0 deletions econml/tests/test_dml.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,10 @@ def make_random(is_discrete, d):
eff = est.effect(X, T0=T0, T1=T)
self.assertEqual(shape(eff), effect_shape)

if isinstance(est, ForestDMLCateEstimator):
np.testing.assert_array_equal(est.feature_importances_.shape,
[X.shape[1]])

if inf is not None:
const_marg_eff_int = est.const_marginal_effect_interval(X)
marg_eff_int = est.marginal_effect_interval(T, X)
Expand Down
5 changes: 5 additions & 0 deletions econml/tests/test_drlearner.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,11 @@ def test_drlearner_with_inference_all_attributes(self):
# test summary function works
est.summary(t)

if isinstance(est, ForestDRLearner):
for t in [1, 2]:
np.testing.assert_array_equal(est.feature_importances_(t).shape,
[X.shape[1]])

@staticmethod
def _check_with_interval(truth, point, lower, upper):
np.testing.assert_allclose(point, truth, rtol=0, atol=.2)
Expand Down
86 changes: 63 additions & 23 deletions notebooks/Double Machine Learning Examples.ipynb

Large diffs are not rendered by default.

Loading