From 11fb0fc3dfe23c9db8b39e5ddd753e72ce460ef4 Mon Sep 17 00:00:00 2001
From: Miruna Oprescu <moprescu@microsoft.com>
Date: Tue, 9 Jun 2020 16:49:52 -0400
Subject: [PATCH] Fixed SubsampledHonestForest randomness (#258)

* Fixed SubsampledHonestForest randomness
* Added random seed to ForestDML tests
* Changed docs to reflect correct randomness usage.
---
 econml/sklearn_extensions/ensemble.py | 19 ++++++++++---------
 econml/tests/test_dml.py              |  6 ++++--
 econml/tests/test_ensemble.py         | 18 ++++++++++++++++++
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/econml/sklearn_extensions/ensemble.py b/econml/sklearn_extensions/ensemble.py
index 04bfdfd24..365fcb864 100644
--- a/econml/sklearn_extensions/ensemble.py
+++ b/econml/sklearn_extensions/ensemble.py
@@ -42,7 +42,7 @@ def _parallel_add_trees(tree, forest, X, y, sample_weight, s_inds, tree_idx, n_t
     if forest.honest:
         X_split, X_est, y_split, y_est,\
             sample_weight_split, sample_weight_est = train_test_split(
-                X, y, sample_weight, test_size=.5, shuffle=True)
+                X, y, sample_weight, test_size=.5, shuffle=True, random_state=tree.random_state)
     else:
         X_split, X_est, y_split, y_est, sample_weight_split, sample_weight_est =\
             X, X, y, y, sample_weight, sample_weight
@@ -319,11 +319,11 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
         n_estimators=1000, n_jobs=None, random_state=0,
         subsample_fr='auto', verbose=0, warm_start=False)
     >>> regr.feature_importances_
-    array([0.39..., 0.34..., 0.12..., 0.12...])
+    array([0.40..., 0.35..., 0.11..., 0.11...])
     >>> regr.predict(np.ones((1, 4)))
-    array([110.4...])
+    array([112.9...])
     >>> regr.predict_interval(np.ones((1, 4)), alpha=.05)
-    (array([93.0...]), array([127.7...]))
+    (array([94.9...]), array([130.9...]))
     >>> regr.score(X_test, y_test)
     0.94...
 
@@ -405,6 +405,7 @@ def __init__(self,
         self.min_impurity_decrease = min_impurity_decrease
         self.subsample_fr = subsample_fr
         self.honest = honest
+        self.random_state = random_state
         self.estimators_ = None
         self.vars_ = None
         self.subsample_fr_ = None
@@ -520,13 +521,13 @@ def fit(self, X, y, sample_weight=None, sample_var=None):
             # TODO. This slicing should ultimately be done inside the parallel function
             # so that we don't need to create a matrix of size roughly n_samples * n_estimators
             for it in range(self.n_slices):
-                half_sample_inds = np.random.choice(
+                half_sample_inds = random_state.choice(
                     X.shape[0], X.shape[0] // 2, replace=False)
                 for _ in np.arange(it * self.slice_len, min((it + 1) * self.slice_len, self.n_estimators)):
-                    s_inds.append(half_sample_inds[np.random.choice(X.shape[0] // 2,
-                                                                    int(np.ceil(self.subsample_fr_ *
-                                                                                (X.shape[0] // 2))),
-                                                                    replace=False)])
+                    s_inds.append(half_sample_inds[random_state.choice(X.shape[0] // 2,
+                                                                       int(np.ceil(self.subsample_fr_ *
+                                                                                   (X.shape[0] // 2))),
+                                                                       replace=False)])
             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              **_joblib_parallel_args(prefer='threads'))(
                 delayed(_parallel_add_trees)(
diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py
index 4bc9ad53b..5fcadbe68 100644
--- a/econml/tests/test_dml.py
+++ b/econml/tests/test_dml.py
@@ -587,7 +587,8 @@ def true_fn(x):
                                              subsample_fr=.8,
                                              min_samples_leaf=min_samples_leaf,
                                              min_impurity_decrease=0.001,
-                                             verbose=0, min_weight_fraction_leaf=.03)
+                                             verbose=0, min_weight_fraction_leaf=.03,
+                                             random_state=12345)
                 if summarized:
                     if sample_var:
                         est.fit(y_sum, T_sum, X_sum[:, :4], X_sum[:, 4:],
@@ -613,7 +614,8 @@ def true_fn(x):
                                              subsample_fr=.8,
                                              min_samples_leaf=min_samples_leaf,
                                              min_impurity_decrease=0.001,
-                                             verbose=0, min_weight_fraction_leaf=.03)
+                                             verbose=0, min_weight_fraction_leaf=.03,
+                                             random_state=12345)
                 if summarized:
                     if sample_var:
                         est.fit(y_sum, T_sum, X_sum[:, :4], X_sum[:, 4:],
diff --git a/econml/tests/test_ensemble.py b/econml/tests/test_ensemble.py
index 11a99d410..f87232da9 100644
--- a/econml/tests/test_ensemble.py
+++ b/econml/tests/test_ensemble.py
@@ -98,3 +98,21 @@ def test_dishonest_y2d(self):
             np.testing.assert_allclose(point, 1. * (X_test[:, [0, 0]] > 0), rtol=0, atol=.2)
             np.testing.assert_array_less(lb, 1. * (X_test[:, [0, 0]] > 0) + .05)
             np.testing.assert_array_less(1. * (X_test[:, [0, 0]] > 0), ub + .05)
+
+    def test_random_state(self):
+        """Two fits with the same random_state must give identical point predictions."""
+        np.random.seed(123)
+        n, d = 5000, 5
+        x_grid = np.linspace(-1, 1, 10)
+        X_test = np.hstack([x_grid.reshape(-1, 1), np.random.normal(size=(10, d - 1))])
+        X = np.random.normal(0, 1, size=(n, d))
+        y = X[:, 0] + np.random.normal(0, .1, size=(n,))
+        points = []
+        for _ in range(2):
+            # Fresh estimator each time so only the seed ties the two runs together
+            est = SubsampledHonestForest(n_estimators=100, max_depth=5,
+                                         min_samples_leaf=10, verbose=0, random_state=12345)
+            est.fit(X, y)
+            points.append(est.predict(X_test))
+        # Check that the point estimates are the same
+        np.testing.assert_equal(points[0], points[1])