From a6fc2e6306a054823a5a110c30e031f62c3cb374 Mon Sep 17 00:00:00 2001 From: Jacob Pieniazek <89324971+jakepenzak@users.noreply.github.com> Date: Tue, 26 Nov 2024 18:25:27 -0500 Subject: [PATCH 1/2] Enable Discrete Outcomes w/ RScorer (#927) * api(enable discrete outcome in RScorer) Signed-off-by: Jacob Pieniazek * test(update DGP for test) Signed-off-by: Jacob Pieniazek * Update econml/tests/test_rscorer.py Co-authored-by: fverac Signed-off-by: Jacob Pieniazek <89324971+jakepenzak@users.noreply.github.com> Signed-off-by: Jacob Pieniazek * test(rlearner): Update test data dimensions in _get_data to allow testing w/ SLearner Signed-off-by: Jacob Pieniazek --------- Signed-off-by: Jacob Pieniazek Signed-off-by: Jacob Pieniazek <89324971+jakepenzak@users.noreply.github.com> Co-authored-by: fverac --- econml/score/rscorer.py | 6 ++ econml/tests/test_rscorer.py | 134 +++++++++++++++++++++++++---------- 2 files changed, 101 insertions(+), 39 deletions(-) diff --git a/econml/score/rscorer.py b/econml/score/rscorer.py index f54b15406..cf04ceb1a 100644 --- a/econml/score/rscorer.py +++ b/econml/score/rscorer.py @@ -51,6 +51,9 @@ class RScorer: discrete_treatment: bool, default ``False`` Whether the treatment values should be treated as categorical, rather than continuous, quantities + discrete_outcome: bool, default ``False`` + Whether the outcome should be treated as binary + categories: 'auto' or list, default 'auto' The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. 
@@ -104,6 +107,7 @@ def __init__(self, *, model_y, model_t, discrete_treatment=False, + discrete_outcome=False, categories='auto', cv=2, mc_iters=None, @@ -112,6 +116,7 @@ def __init__(self, *, self.model_y = clone(model_y, safe=False) self.model_t = clone(model_t, safe=False) self.discrete_treatment = discrete_treatment + self.discrete_outcome = discrete_outcome self.cv = cv self.categories = categories self.random_state = random_state @@ -150,6 +155,7 @@ def fit(self, y, T, X=None, W=None, sample_weight=None, groups=None): model_t=self.model_t, cv=self.cv, discrete_treatment=self.discrete_treatment, + discrete_outcome=self.discrete_outcome, categories=self.categories, random_state=self.random_state, mc_iters=self.mc_iters, diff --git a/econml/tests/test_rscorer.py b/econml/tests/test_rscorer.py index 4f41b5016..8b39fbe26 100644 --- a/econml/tests/test_rscorer.py +++ b/econml/tests/test_rscorer.py @@ -20,52 +20,108 @@ def _fit_model(name, model, Y, T, X): class TestRScorer(unittest.TestCase): - def _get_data(self): + def _get_data(self, discrete_outcome=False): X = np.random.normal(0, 1, size=(100000, 2)) T = np.random.binomial(1, .5, size=(100000,)) - y = X[:, 0] * T + np.random.normal(size=(100000,)) - return y, T, X, X[:, 0] + if discrete_outcome: + eps = np.random.normal(size=(100000,)) + log_odds = X[:, 0]*T + eps + y_sigmoid = 1/(1 + np.exp(-log_odds)) + y = np.array([np.random.binomial(1, p) for p in y_sigmoid]) + # Difference in conditional probabilities P(y=1|X,T=1) - P(y=1|X,T=0) + true_eff = (1 / (1 + np.exp(-(X[:, 0]+eps)))) - (1 / (1 + np.exp(-eps))) + else: + y = X[:, 0] * T + np.random.normal(size=(100000,)) + true_eff = X[:, 0] + + y = y.reshape(-1, 1) + T = T.reshape(-1, 1) + return y, T, X, true_eff def test_comparison(self): + def reg(): return LinearRegression() def clf(): return LogisticRegression() - y, T, X, true_eff = self._get_data() - (X_train, X_val, T_train, T_val, - Y_train, Y_val, _, true_eff_val) = train_test_split(X, T, y, true_eff, 
test_size=.4) - - models = [('ldml', LinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True, cv=3)), - ('sldml', SparseLinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True, - featurizer=PolynomialFeatures(degree=2, include_bias=False), cv=3)), - ('xlearner', XLearner(models=reg(), cate_models=reg(), propensity_model=clf())), - ('dalearner', DomainAdaptationLearner(models=reg(), final_models=reg(), propensity_model=clf())), - ('slearner', SLearner(overall_model=reg())), - ('tlearner', TLearner(models=reg())), - ('drlearner', DRLearner(model_propensity=clf(), model_regression=reg(), - model_final=reg(), cv=3)), - ('rlearner', NonParamDML(model_y=reg(), model_t=clf(), model_final=reg(), - discrete_treatment=True, cv=3)), - ('dml3dlasso', DML(model_y=reg(), model_t=clf(), model_final=reg(), discrete_treatment=True, - featurizer=PolynomialFeatures(degree=3), cv=3)) - ] - - models = Parallel(n_jobs=1, verbose=1)(delayed(_fit_model)(name, mdl, - Y_train, T_train, X_train) - for name, mdl in models) - - scorer = RScorer(model_y=reg(), model_t=clf(), - discrete_treatment=True, cv=3, mc_iters=2, mc_agg='median') - scorer.fit(Y_val, T_val, X=X_val) - rscore = [scorer.score(mdl) for _, mdl in models] - rootpehe_score = [np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2)) - for _, mdl in models] - assert LinearRegression().fit(np.array(rscore).reshape(-1, 1), np.array(rootpehe_score)).coef_ < 0.5 - mdl, _ = scorer.best_model([mdl for _, mdl in models]) - rootpehe_best = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2)) - assert rootpehe_best < 1.5 * np.min(rootpehe_score) + 0.05 - mdl, _ = scorer.ensemble([mdl for _, mdl in models]) - rootpehe_ensemble = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2)) - assert rootpehe_ensemble < 1.5 * np.min(rootpehe_score) + 0.05 + test_cases = [ + {"name":"continuous_outcome", "discrete_outcome": False}, + {"name":"discrete_outcome", 
"discrete_outcome": True} + ] + + for case in test_cases: + with self.subTest(case["name"]): + discrete_outcome = case["discrete_outcome"] + + if discrete_outcome: + y, T, X, true_eff = self._get_data(discrete_outcome=True) + + models = [('ldml', LinearDML(model_y=clf(), model_t=clf(), discrete_treatment=True, + discrete_outcome=discrete_outcome, cv=3)), + ('sldml', SparseLinearDML(model_y=clf(), model_t=clf(), discrete_treatment=True, + discrete_outcome=discrete_outcome, + featurizer=PolynomialFeatures(degree=2, include_bias=False), + cv=3)), + ('drlearner', DRLearner(model_propensity=clf(), model_regression=clf(), model_final=reg(), + discrete_outcome=discrete_outcome, cv=3)), + ('rlearner', NonParamDML(model_y=clf(), model_t=clf(), model_final=reg(), + discrete_treatment=True, discrete_outcome=discrete_outcome, cv=3)), + ('dml3dlasso', DML(model_y=clf(), model_t=clf(), model_final=reg(), discrete_treatment=True, + discrete_outcome=discrete_outcome, + featurizer=PolynomialFeatures(degree=3), cv=3)), + # SLearner as baseline for rootpehe score - not enough variation in rscore w/ above models + ('slearner', SLearner(overall_model=reg())), + ] + + else: + y, T, X, true_eff = self._get_data() + + models = [('ldml', LinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True, cv=3)), + ('sldml', SparseLinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True, + featurizer=PolynomialFeatures(degree=2, include_bias=False), + cv=3)), + ('xlearner', XLearner(models=reg(), cate_models=reg(), propensity_model=clf())), + ('dalearner', DomainAdaptationLearner(models=reg(), final_models=reg(), + propensity_model=clf())), + ('slearner', SLearner(overall_model=reg())), + ('tlearner', TLearner(models=reg())), + ('drlearner', DRLearner(model_propensity=clf(), model_regression=reg(), + model_final=reg(), cv=3)), + ('rlearner', NonParamDML(model_y=reg(), model_t=clf(), model_final=reg(), + discrete_treatment=True, cv=3)), + ('dml3dlasso', DML(model_y=reg(), model_t=clf(), 
model_final=reg(), + discrete_treatment=True, featurizer=PolynomialFeatures(degree=3), cv=3)) + ] + + (X_train, X_val, T_train, T_val, + Y_train, Y_val, _, true_eff_val) = train_test_split(X, T, y, true_eff, test_size=.4) + + models = Parallel(n_jobs=1, verbose=1)(delayed(_fit_model)(name, mdl, + Y_train, T_train, X_train) + for name, mdl in models) + + if discrete_outcome: + scorer = RScorer(model_y=clf(), model_t=clf(), + discrete_treatment=True, discrete_outcome=discrete_outcome, + cv=3, mc_iters=2, mc_agg='median') + else: + scorer = RScorer(model_y=reg(), model_t=clf(), + discrete_treatment=True, cv=3, + mc_iters=2, mc_agg='median') + + scorer.fit(Y_val, T_val, X=X_val) + rscore = [scorer.score(mdl) for _, mdl in models] + rootpehe_score = [np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2)) + for _, mdl in models] + # Checking neg corr between rscore and rootpehe (precision in estimating heterogeneous effects) + assert LinearRegression().fit(np.array(rscore).reshape(-1, 1), np.array(rootpehe_score)).coef_ < 0.5 + mdl, _ = scorer.best_model([mdl for _, mdl in models]) + rootpehe_best = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2)) + # Checking best model selection behaves as intended + assert rootpehe_best < 1.5 * np.min(rootpehe_score) + 0.05 + mdl, _ = scorer.ensemble([mdl for _, mdl in models]) + rootpehe_ensemble = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2)) + # Checking cate ensembling behaves as intended + assert rootpehe_ensemble < 1.5 * np.min(rootpehe_score) + 0.05 From 0e6e294b076a5f874cc026d676e0057f7787183f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Komi=20Bi-Ay=C3=A9fo=20ATSOU?= Date: Fri, 29 Nov 2024 13:12:35 +0100 Subject: [PATCH 2/2] Correct "Help Wanted" url in readme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current link : https://github.com/py-why/EconML?tab=readme-ov-file#help-wanted Correct link : 
https://github.com/py-why/EconML?tab=readme-ov-file#finding-issues-to-help-with Signed-off-by: Komi Bi-Ayéfo ATSOU --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c293579d5..2a66f357a 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ For information on use cases and background material on causal inference and het # News -If you'd like to contribute to this project, see the [Help Wanted](#help-wanted) section below. +If you'd like to contribute to this project, see the [Help Wanted](#finding-issues-to-help-with) section below. **July 3, 2024:** Release v0.15.1, see release notes [here](https://github.com/py-why/EconML/releases/tag/v0.15.1)