PyFixest 0.10.10 (#212)

* attempt to fix #211 * cleanup * clarification * fix i() bug for i(var), not for i(var1, var2) * add drop_intercept argument to feols, fepois * fix error with did2s inference * add error when 0 or -1 in first stage * fix tests, output warning for i(var1, var2) syntax * format code * update figure + fix small bug * no 0 in second stage did test * bump version * deprecate i(var1, var2) syntax, update news * format * bring back i(var1, var2) * format
py-econometrics · Nov 10, 2023 · c683dd2 · c683dd2
1 parent f14e765
commit c683dd2
Show file tree

Hide file tree

Showing 12 changed files with 994 additions and 877 deletions.
diff --git a/docs/news.md b/docs/news.md
@@ -1,5 +1,9 @@
 # News
 
+## PyFixest `0.10.10`
+
+Fixes a bug with variable interactions via `i(var)` syntax. See [issue #211](https://github.com/s3alfisc/pyfixest/issues/211).
+
 ## PyFixest `0.10.9`
 
 Makes `etable()` prettier and more informative.

diff --git a/figures/event_study.svg b/figures/event_study.svg
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyfixest/FixestMulti.py b/pyfixest/FixestMulti.py
@@ -56,6 +56,7 @@ def _prepare_estimation(
         vcov: Union[None, str, Dict[str, str]] = None,
         ssc: Dict[str, str] = {},
         fixef_rm: str = "none",
+        drop_intercept: bool = False,
         i_ref1: Optional[Union[List, str]] = None,
         i_ref2: Optional[Union[List, str]] = None,
     ) -> None:
@@ -70,6 +71,7 @@ def _prepare_estimation(
             ssc (Dict[str, str], optional): A dictionary specifying the type of standard errors to use for inference. See `feols()` or `fepois()`.
             fixef_rm (str, optional): A string specifying whether singleton fixed effects should be dropped.
                 Options are "none" (default) and "singleton". If "singleton", singleton fixed effects are dropped.
+            drop_intercept (bool, optional): Whether to drop the intercept. Default is False.
             i_ref1 (Optional[Union[List, str]], optional): A list or string specifying the reference category for the first interaction variable.
             i_ref2 (Optional[Union[List, str]], optional): A list or string specifying the reference category for the second interaction variable.
 
@@ -85,6 +87,9 @@ def _prepare_estimation(
         self._drop_singletons = None
         self._fixef_keys = None
         self._is_multiple_estimation = None
+        self._i_ref1 = None
+        self._i_ref2 = None
+        self._drop_intercept = None
 
         # set i_ref1 and i_ref2 to list if not None
         if i_ref1 is not None:
@@ -110,8 +115,10 @@ def _prepare_estimation(
         self._ssc_dict = ssc
         self._drop_singletons = _drop_singletons(fixef_rm)
         self._fixef_keys = list(self._fml_dict.keys())
+
         self._i_ref1 = i_ref1
         self._i_ref2 = i_ref2
+        self._drop_intercept = drop_intercept
 
     def _estimate_all_models(
         self,
@@ -147,6 +154,7 @@ def _estimate_all_models(
         _method = self._method
         _drop_singletons = self._drop_singletons
         _ssc_dict = self._ssc_dict
+        _drop_intercept = self._drop_intercept
         _i_ref1 = self._i_ref1
         _i_ref2 = self._i_ref2
 
@@ -184,7 +192,11 @@ def _estimate_all_models(
                         _icovars,
                         X_is_empty,
                     ) = model_matrix_fixest(
-                        fml=fml, data=_data, i_ref1=_i_ref1, i_ref2=_i_ref2
+                        fml=fml,
+                        data=_data,
+                        drop_intercept=_drop_intercept,
+                        i_ref1=_i_ref1,
+                        i_ref2=_i_ref2,
                     )
 
                     weights = np.ones((Y.shape[0], 1))

diff --git a/pyfixest/estimation.py b/pyfixest/estimation.py
@@ -13,6 +13,7 @@ def feols(
     ssc=ssc(),
     fixef_rm: str = "none",
     collin_tol: float = 1e-10,
+    drop_intercept: bool = False,
     i_ref1: Optional[Union[list, str]] = None,
     i_ref2: Optional[Union[list, str]] = None,
 ) -> Union[Feols, FixestMulti]:
@@ -70,6 +71,9 @@ def feols(
                             via the diagonal cholesky decomposition of the correlation matrix of the variables.
                             If the tolerance is higher, more variables will be dropped.
 
+        drop_intercept (bool): Whether to drop the intercept from the model. False by default. If True, the intercept will be dropped **after** creating the model matrix via formulaic.
+                               This implies that reference levels for categorical variables will be dropped as well and are not recovered.
+
         i_ref1 (Optional[Union[list, str]]): A list of strings or a string specifying the reference category for the first set of categorical variables in the formula, interacted via "i()".
         i_ref2 (Optional[Union[list, str]]): A list of strings or a string specifying the reference category for the second set of categorical variables in the formula, interacted via "i()".
 
@@ -135,7 +139,9 @@ def feols(
     _estimation_input_checks(fml, data, vcov, ssc, fixef_rm, collin_tol, i_ref1)
 
     fixest = FixestMulti(data=data)
-    fixest._prepare_estimation("feols", fml, vcov, ssc, fixef_rm, i_ref1, i_ref2)
+    fixest._prepare_estimation(
+        "feols", fml, vcov, ssc, fixef_rm, drop_intercept, i_ref1, i_ref2
+    )
 
     # demean all models: based on fixed effects x split x missing value combinations
     fixest._estimate_all_models(vcov, fixest._fixef_keys, collin_tol=collin_tol)
@@ -155,6 +161,7 @@ def fepois(
     iwls_tol: float = 1e-08,
     iwls_maxiter: int = 25,
     collin_tol: float = 1e-10,
+    drop_intercept: bool = False,
     i_ref1: Optional[Union[list, str]] = None,
     i_ref2: Optional[Union[list, str]] = None,
 ) -> Union[Fepois, FixestMulti]:
@@ -210,6 +217,10 @@ def fepois(
 
         collin_tol (float): tolerance for collinearity check. 1e-10 by default. If collinear variables are detected, they will be dropped from the model. The performed check is
                             via the diagonal cholesky decomposition of the correlation matrix of the variables. If the tolerance is higher, more variables will be dropped.
+
+        drop_intercept (bool): Whether to drop the intercept from the model. False by default. If True, the intercept will be dropped **after** creating the model matrix via formulaic.
+                               This implies that reference levels for categorical variables will be dropped as well and are not recovered.
+
         i_ref1 (Optional[Union[list, str]]): A list of strings or a string specifying the reference category for the first set of categorical variables in the formula, interacted via "i()".
         i_ref2 (Optional[Union[list, str]]): A list of strings or a string specifying the reference category for the second set of categorical variables in the formula, interacted via "i()".
 
@@ -264,7 +275,9 @@ def fepois(
 
     fixest = FixestMulti(data=data)
 
-    fixest._prepare_estimation("fepois", fml, vcov, ssc, fixef_rm, i_ref1, i_ref2)
+    fixest._prepare_estimation(
+        "fepois", fml, vcov, ssc, fixef_rm, drop_intercept, i_ref1, i_ref2
+    )
     if fixest._is_iv:
         raise NotImplementedError(
             "IV Estimation is not supported for Poisson Regression"

diff --git a/pyfixest/experimental/did.py b/pyfixest/experimental/did.py
@@ -355,7 +355,7 @@ def _did2s_estimate(
     """
 
     _first_stage_full = f"{yname} {_first_stage}"
-    _second_stage_full = f"{yname}_hat {_second_stage} + 0"
+    _second_stage_full = f"{yname}_hat {_second_stage}"
 
     if treatment is not None:
         if treatment not in data.columns:
@@ -382,6 +382,13 @@ def _did2s_estimate(
     else:
         _not_yet_treated_data = data[data["ATT"] == False]
 
+    # check that the first stage formula contains fixed effects
+    if "|" not in _first_stage:
+        raise ValueError("First stage formula must contain fixed effects.")
+    # check that the second stage formula does not contain fixed effects
+    if "|" in _second_stage:
+        raise ValueError("Second stage formula must not contain fixed effects.")
+
     # estimate first stage
     fit1 = feols(
         fml=_first_stage_full,
@@ -399,8 +406,14 @@ def _did2s_estimate(
     _first_u = data[f"{yname}"].to_numpy().flatten() - Y_hat
     data[f"{yname}_hat"] = _first_u
 
+    # intercept needs to be dropped by hand due to the presence of fixed effects in the first stage
     fit2 = feols(
-        _second_stage_full, data=data, vcov="iid", i_ref1=i_ref1, i_ref2=i_ref2
+        _second_stage_full,
+        data=data,
+        vcov="iid",
+        drop_intercept=True,
+        i_ref1=i_ref1,
+        i_ref2=i_ref2,
     )
     _second_u = fit2.resid()
 
@@ -447,13 +460,23 @@ def _did2s_vcov(
     first_stage_fe = "+".join(first_stage_fe)
     first_stage = f"{first_stage_x}+{first_stage_fe}"
 
-    second_stage = f"{second_stage} + 0"
+    second_stage = f"{second_stage}"
 
+    # note for future Alex: intercept needs to be dropped! it is not as fixed effects are converted to
+    # dummies, hence has_fixed checks are False
     _, X1, _, _, _, _, _, _, _ = model_matrix_fixest(
-        fml=f"{yname} {first_stage}", data=data, i_ref1=i_ref1, i_ref2=i_ref2
+        fml=f"{yname} {first_stage}",
+        data=data,
+        drop_intercept=False,
+        i_ref1=i_ref1,
+        i_ref2=i_ref2,
     )
     _, X2, _, _, _, _, _, _, _ = model_matrix_fixest(
-        fml=f"{yname} {second_stage}", data=data, i_ref1=i_ref1, i_ref2=i_ref2
+        fml=f"{yname} {second_stage}",
+        data=data,
+        drop_intercept=True,
+        i_ref1=i_ref1,
+        i_ref2=i_ref2,
     )  # reference values not dropped, multicollinearity error
 
     X1 = csr_matrix(X1.values)
@@ -540,6 +563,12 @@ def did2s(
     assert first_stage[0] == "~", "First stage must start with ~"
     assert second_stage[0] == "~", "Second stage must start with ~"
 
+    # reject a second stage formula that contains '0' or '-1' (the spaced form '- 1' is not caught by the check below)
+    if "0" in second_stage or "-1" in second_stage:
+        raise ValueError(
+            "The second stage formula should not contain '0' or '-1'. Note that the intercept is dropped automatically due to the presence of fixed effects in the first stage."
+        )
+
     data = data.copy()
 
     fit, first_u, second_u = _did2s_estimate(