Skip to content

Commit

Permalink
Fixed SubsampledHonestForest randomness
Browse files Browse the repository at this point in the history
  • Loading branch information
Miruna Oprescu committed Jun 9, 2020
1 parent 62f0be6 commit 11e06f5
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 6 deletions.
19 changes: 13 additions & 6 deletions econml/sklearn_extensions/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _parallel_add_trees(tree, forest, X, y, sample_weight, s_inds, tree_idx, n_t
if forest.honest:
X_split, X_est, y_split, y_est,\
sample_weight_split, sample_weight_est = train_test_split(
X, y, sample_weight, test_size=.5, shuffle=True)
X, y, sample_weight, test_size=.5, shuffle=True, random_state=tree.random_state)
else:
X_split, X_est, y_split, y_est, sample_weight_split, sample_weight_est =\
X, X, y, y, sample_weight, sample_weight
Expand Down Expand Up @@ -284,6 +284,12 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
estimators_ : list of DecisionTreeRegressor
The collection of fitted sub-estimators.
feature_importances_ : array of shape = [n_features]
The feature importances (the higher, the more important the feature).
Feature importances are calculated based on the reduction in the
splitting criterion among the split samples (not the estimation samples),
so they might contain some upward bias.
n_features_ : int
The number of features when ``fit`` is performed.
Expand Down Expand Up @@ -405,6 +411,7 @@ def __init__(self,
self.min_impurity_decrease = min_impurity_decrease
self.subsample_fr = subsample_fr
self.honest = honest
self.random_state = random_state
self.estimators_ = None
self.vars_ = None
self.subsample_fr_ = None
Expand Down Expand Up @@ -520,13 +527,13 @@ def fit(self, X, y, sample_weight=None, sample_var=None):
# TODO. This slicing should ultimately be done inside the parallel function
# so that we don't need to create a matrix of size roughly n_samples * n_estimators
for it in range(self.n_slices):
half_sample_inds = np.random.choice(
half_sample_inds = random_state.choice(
X.shape[0], X.shape[0] // 2, replace=False)
for _ in np.arange(it * self.slice_len, min((it + 1) * self.slice_len, self.n_estimators)):
s_inds.append(half_sample_inds[np.random.choice(X.shape[0] // 2,
int(np.ceil(self.subsample_fr_ *
(X.shape[0] // 2))),
replace=False)])
s_inds.append(half_sample_inds[random_state.choice(X.shape[0] // 2,
int(np.ceil(self.subsample_fr_ *
(X.shape[0] // 2))),
replace=False)])
trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
**_joblib_parallel_args(prefer='threads'))(
delayed(_parallel_add_trees)(
Expand Down
18 changes: 18 additions & 0 deletions econml/tests/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,21 @@ def test_dishonest_y2d(self):
np.testing.assert_allclose(point, 1. * (X_test[:, [0, 0]] > 0), rtol=0, atol=.2)
np.testing.assert_array_less(lb, 1. * (X_test[:, [0, 0]] > 0) + .05)
np.testing.assert_array_less(1. * (X_test[:, [0, 0]] > 0), ub + .05)

def test_random_state(self):
    """Two forests fit with the same ``random_state`` must predict identically."""
    np.random.seed(123)
    n_samples, n_features = 5000, 5
    grid = np.linspace(-1, 1, 10)
    # The draw order (X_test noise, then X, then y noise) is kept as-is so
    # the seeded global generator yields the same data set.
    X_test = np.hstack([grid.reshape(-1, 1),
                        np.random.normal(size=(10, n_features - 1))])
    X = np.random.normal(0, 1, size=(n_samples, n_features))
    y = X[:, 0] + np.random.normal(0, .1, size=(n_samples,))

    forest_kwargs = dict(n_estimators=100, max_depth=5, min_samples_leaf=10,
                         verbose=0, random_state=12345)
    predictions = []
    for _ in range(2):
        # A fresh estimator each round: only the shared seed links the fits.
        forest = SubsampledHonestForest(**forest_kwargs)
        forest.fit(X, y)
        predictions.append(forest.predict(X_test))
    first_fit, second_fit = predictions
    # Check that the point estimates are the same
    np.testing.assert_equal(first_fit, second_fit)

0 comments on commit 11e06f5

Please sign in to comment.