From 11fb0fc3dfe23c9db8b39e5ddd753e72ce460ef4 Mon Sep 17 00:00:00 2001 From: Miruna Oprescu Date: Tue, 9 Jun 2020 16:49:52 -0400 Subject: [PATCH] Fixed SubsampledHonestForest randomness (#258) * Fixed SubsampledHonestForest randomness * Added random seed to ForestDML tests * Changed docs to reflect correct randomness usage. --- econml/sklearn_extensions/ensemble.py | 19 ++++++++++--------- econml/tests/test_dml.py | 6 ++++-- econml/tests/test_ensemble.py | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/econml/sklearn_extensions/ensemble.py b/econml/sklearn_extensions/ensemble.py index 04bfdfd24..365fcb864 100644 --- a/econml/sklearn_extensions/ensemble.py +++ b/econml/sklearn_extensions/ensemble.py @@ -42,7 +42,7 @@ def _parallel_add_trees(tree, forest, X, y, sample_weight, s_inds, tree_idx, n_t if forest.honest: X_split, X_est, y_split, y_est,\ sample_weight_split, sample_weight_est = train_test_split( - X, y, sample_weight, test_size=.5, shuffle=True) + X, y, sample_weight, test_size=.5, shuffle=True, random_state=tree.random_state) else: X_split, X_est, y_split, y_est, sample_weight_split, sample_weight_est =\ X, X, y, y, sample_weight, sample_weight @@ -319,11 +319,11 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin): n_estimators=1000, n_jobs=None, random_state=0, subsample_fr='auto', verbose=0, warm_start=False) >>> regr.feature_importances_ - array([0.39..., 0.34..., 0.12..., 0.12...]) + array([0.40..., 0.35..., 0.11..., 0.11...]) >>> regr.predict(np.ones((1, 4))) - array([110.4...]) + array([112.9...]) >>> regr.predict_interval(np.ones((1, 4)), alpha=.05) - (array([93.0...]), array([127.7...])) + (array([94.9...]), array([130.9...])) >>> regr.score(X_test, y_test) 0.94... @@ -405,6 +405,7 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.subsample_fr = subsample_fr self.honest = honest + self.random_state = random_state self.estimators_ = None self.vars_ = None self.subsample_fr_ = None @@ -520,13 +521,13 @@ def fit(self, X, y, sample_weight=None, sample_var=None): # TODO. This slicing should ultimately be done inside the parallel function # so that we don't need to create a matrix of size roughly n_samples * n_estimators for it in range(self.n_slices): - half_sample_inds = np.random.choice( + half_sample_inds = random_state.choice( X.shape[0], X.shape[0] // 2, replace=False) for _ in np.arange(it * self.slice_len, min((it + 1) * self.slice_len, self.n_estimators)): - s_inds.append(half_sample_inds[np.random.choice(X.shape[0] // 2, - int(np.ceil(self.subsample_fr_ * - (X.shape[0] // 2))), - replace=False)]) + s_inds.append(half_sample_inds[random_state.choice(X.shape[0] // 2, + int(np.ceil(self.subsample_fr_ * + (X.shape[0] // 2))), + replace=False)]) trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))( delayed(_parallel_add_trees)( diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 4bc9ad53b..5fcadbe68 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -587,7 +587,8 @@ def true_fn(x): subsample_fr=.8, min_samples_leaf=min_samples_leaf, min_impurity_decrease=0.001, - verbose=0, min_weight_fraction_leaf=.03) + verbose=0, min_weight_fraction_leaf=.03, + random_state=12345) if summarized: if sample_var: est.fit(y_sum, T_sum, X_sum[:, :4], X_sum[:, 4:], @@ -613,7 +614,8 @@ def true_fn(x): subsample_fr=.8, min_samples_leaf=min_samples_leaf, min_impurity_decrease=0.001, - verbose=0, min_weight_fraction_leaf=.03) + verbose=0, min_weight_fraction_leaf=.03, + random_state=12345) if summarized: if sample_var: est.fit(y_sum, T_sum, X_sum[:, :4], X_sum[:, 4:], diff --git a/econml/tests/test_ensemble.py b/econml/tests/test_ensemble.py index 11a99d410..f87232da9 100644 --- a/econml/tests/test_ensemble.py +++ b/econml/tests/test_ensemble.py @@ -98,3 +98,21 @@ def test_dishonest_y2d(self): np.testing.assert_allclose(point, 1. * (X_test[:, [0, 0]] > 0), rtol=0, atol=.2) np.testing.assert_array_less(lb, 1. * (X_test[:, [0, 0]] > 0) + .05) np.testing.assert_array_less(1. * (X_test[:, [0, 0]] > 0), ub + .05) + + def test_random_state(self): + np.random.seed(123) + n = 5000 + d = 5 + x_grid = np.linspace(-1, 1, 10) + X_test = np.hstack([x_grid.reshape(-1, 1), np.random.normal(size=(10, d - 1))]) + X = np.random.normal(0, 1, size=(n, d)) + y = X[:, 0] + np.random.normal(0, .1, size=(n,)) + est = SubsampledHonestForest(n_estimators=100, max_depth=5, min_samples_leaf=10, verbose=0, random_state=12345) + est.fit(X, y) + point1 = est.predict(X_test) + est = SubsampledHonestForest(n_estimators=100, max_depth=5, + min_samples_leaf=10, verbose=0, random_state=12345) + est.fit(X, y) + point2 = est.predict(X_test) + # Check that the point estimates are the same + np.testing.assert_equal(point1, point2)