Fixed SubsampledHonestForest randomness

py-why · Jun 9, 2020 · 11e06f5 · 11e06f5
1 parent 62f0be6
commit 11e06f5
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 6 deletions.
diff --git a/econml/sklearn_extensions/ensemble.py b/econml/sklearn_extensions/ensemble.py
@@ -42,7 +42,7 @@ def _parallel_add_trees(tree, forest, X, y, sample_weight, s_inds, tree_idx, n_t
     if forest.honest:
         X_split, X_est, y_split, y_est,\
             sample_weight_split, sample_weight_est = train_test_split(
-                X, y, sample_weight, test_size=.5, shuffle=True)
+                X, y, sample_weight, test_size=.5, shuffle=True, random_state=tree.random_state)
     else:
         X_split, X_est, y_split, y_est, sample_weight_split, sample_weight_est =\
             X, X, y, y, sample_weight, sample_weight
@@ -284,6 +284,12 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
     estimators_ : list of DecisionTreeRegressor
         The collection of fitted sub-estimators.
 
+    feature_importances_ : array of shape = [n_features]
+        The feature importances (the higher, the more important the feature).
+        Feature importances are calculated based on the reduction in the
+        splitting criterion among the split samples (not the estimation samples),
+        so they might contain some upward bias.
+
     n_features_ : int
         The number of features when ``fit`` is performed.
 
@@ -405,6 +411,7 @@ def __init__(self,
         self.min_impurity_decrease = min_impurity_decrease
         self.subsample_fr = subsample_fr
         self.honest = honest
+        self.random_state = random_state
         self.estimators_ = None
         self.vars_ = None
         self.subsample_fr_ = None
@@ -520,13 +527,13 @@ def fit(self, X, y, sample_weight=None, sample_var=None):
             # TODO. This slicing should ultimately be done inside the parallel function
             # so that we don't need to create a matrix of size roughly n_samples * n_estimators
             for it in range(self.n_slices):
-                half_sample_inds = np.random.choice(
+                half_sample_inds = random_state.choice(
                     X.shape[0], X.shape[0] // 2, replace=False)
                 for _ in np.arange(it * self.slice_len, min((it + 1) * self.slice_len, self.n_estimators)):
-                    s_inds.append(half_sample_inds[np.random.choice(X.shape[0] // 2,
-                                                                    int(np.ceil(self.subsample_fr_ *
-                                                                                (X.shape[0] // 2))),
-                                                                    replace=False)])
+                    s_inds.append(half_sample_inds[random_state.choice(X.shape[0] // 2,
+                                                                       int(np.ceil(self.subsample_fr_ *
+                                                                                   (X.shape[0] // 2))),
+                                                                       replace=False)])
             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              **_joblib_parallel_args(prefer='threads'))(
                 delayed(_parallel_add_trees)(

diff --git a/econml/tests/test_ensemble.py b/econml/tests/test_ensemble.py
@@ -98,3 +98,21 @@ def test_dishonest_y2d(self):
             np.testing.assert_allclose(point, 1. * (X_test[:, [0, 0]] > 0), rtol=0, atol=.2)
             np.testing.assert_array_less(lb, 1. * (X_test[:, [0, 0]] > 0) + .05)
             np.testing.assert_array_less(1. * (X_test[:, [0, 0]] > 0), ub + .05)
+
+    def test_random_state(self):
+        np.random.seed(123)
+        n = 5000
+        d = 5
+        x_grid = np.linspace(-1, 1, 10)
+        X_test = np.hstack([x_grid.reshape(-1, 1), np.random.normal(size=(10, d - 1))])
+        X = np.random.normal(0, 1, size=(n, d))
+        y = X[:, 0] + np.random.normal(0, .1, size=(n,))
+        est = SubsampledHonestForest(n_estimators=100, max_depth=5, min_samples_leaf=10, verbose=0, random_state=12345)
+        est.fit(X, y)
+        point1 = est.predict(X_test)
+        est = SubsampledHonestForest(n_estimators=100, max_depth=5,
+                                     min_samples_leaf=10, verbose=0, random_state=12345)
+        est.fit(X, y)
+        point2 = est.predict(X_test)
+        # Check that the point estimates are the same
+        np.testing.assert_equal(point1, point2)