Add updates to wald_test method and unit testing files (#536)

* Update Wald Test and add testing py files * Update minor changes to wald test and unit testing * Add wald-test case and rename tests * Minor changes * Add an wald-test usage example * Add minor changes to test cases and usage example.
py-econometrics · Jul 11, 2024 · ca1200e · ca1200e
1 parent 83b7c01
commit ca1200e
Show file tree

Hide file tree

Showing 4 changed files with 380 additions and 56 deletions.
diff --git a/.coverage b/.coverage
diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py
@@ -6,10 +6,11 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 from formulaic import Formula
 from scipy.sparse import csr_matrix
 from scipy.sparse.linalg import lsqr, spsolve
-from scipy.stats import f, norm, t
+from scipy.stats import chi2, f, norm, t
 
 from pyfixest.errors import VcovTypeNotSupportedError
 from pyfixest.estimation.ritest import (
@@ -383,6 +384,10 @@ def vcov(
         Feols
             An instance of class [Feols(/reference/Feols.qmd) with updated inference.
         """
+        # Assuming `data` is the DataFrame in question
+        if isinstance(data, pl.DataFrame):
+            data = _polars_to_pandas(data)
+
         _data = self._data
         _has_fixef = self._has_fixef
         _is_iv = self._is_iv
@@ -439,7 +444,8 @@ def vcov(
             if data is not None:
                 # use input data set
                 self._cluster_df = _get_cluster_df(
-                    data=data, clustervar=self._clustervar
+                    data=data,
+                    clustervar=self._clustervar,
                 )
                 _check_cluster_df(cluster_df=self._cluster_df, data=data)
             else:
@@ -789,13 +795,29 @@ def add_fixest_multi_context(
         else:
             self._has_fixef = False
 
-    def wald_test(self, R=None, q=None, distribution="F") -> None:
+    def wald_test(self, R=None, q=None, distribution="F"):
         """
         Conduct Wald test.
 
-        Compute a Wald test for a linear hypothesis of the form Rb = q.
+        Compute a Wald test for a linear hypothesis of the form R * β = q.
+        where R is m x k matrix, β is a k x 1 vector of coefficients,
+        and q is m x 1 vector.
         By default, tests the joint null hypothesis that all coefficients are zero.
 
+        This method producues the following attriutes
+
+        _dfd : int
+            degree of freedom in denominator
+        _dfn : int
+            degree of freedom in numerator
+        _wald_statistic : scalar
+            Wald-statistics computed for hypothesis testing
+        _f_statistic : scalar
+            Wald-statistics(when R is an indentity matrix, and q being zero vector)
+            computed for hypothesis testing
+        _p_value : scalar
+            corresponding p-value for statistics
+
         Parameters
         ----------
         R : array-like, optional
@@ -812,65 +834,101 @@ def wald_test(self, R=None, q=None, distribution="F") -> None:
         -------
         pd.Series
             A pd.Series with the Wald statistic and p-value.
-        """
-        raise ValueError("wald_tests will be released as a feature with pyfixest 0.14.")
 
+        Examples
+        --------
+        import numpy as np
+        import pandas as pd
+
+        from pyfixest.estimation.estimation import feols
+
+        data = pd.read_csv("pyfixest/did/data/df_het.csv")
+        data = data.iloc[1:3000]
+
+        R = np.array([[1,-1]] )
+        q = np.array([0.0])
+
+        fml = "dep_var ~ treat"
+        fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))
+
+        # Wald test
+        fit.wald_test(R=R, q=q, distribution = "chi2")
+        f_stat = fit._f_statistic
+        p_stat = fit._p_value
+
+        print(f"Python f_stat: {f_stat}")
+        print(f"Python p_stat: {p_stat}")
+
+        # The code above produces the following results :
+        # Python f_stat: 256.55432910297003
+        # Python p_stat: 9.67406627744023e-58
+        """
         _vcov = self._vcov
         _N = self._N
         _k = self._k
         _beta_hat = self._beta_hat
         _k_fe = np.sum(self._k_fe.values) if self._has_fixef else 0
 
-        dfn = _N - _k_fe - _k
-        dfd = _k
-
-        # if R is not two dimensional, make it two dimensional
-        if R is not None:
-            if R.ndim == 1:
-                R = R.reshape((1, len(R)))
-            assert (
-                R.shape[1] == _k
-            ), "R must have the same number of columns as the number of coefficients."
-        else:
+        # If R is None, default to the identity matrix
+        if R is None:
             R = np.eye(_k)
 
-        if q is not None:
-            assert isinstance(
-                q, (int, float, np.ndarray)
-            ), "q must be a numeric scalar."
-            if isinstance(q, np.ndarray):
-                assert q.ndim == 1, "q must be a one-dimensional array or a scalar."
-                assert (
-                    q.shape[0] == R.shape[0]
-                ), "q must have the same number of rows as R."
-            warnings.warn(
-                "Note that the argument q is experimental and no unit tests are implemented. Please use with caution / take a look at the source code."
+        # Ensure R is two-dimensional
+        if R.ndim == 1:
+            R = R.reshape((1, len(R)))
+
+        if R.shape[1] != _k:
+            raise ValueError(
+                "The number of columns of R must be equal to the number of coefficients."
             )
-        else:
+
+        # If q is None, default to a vector of zeros
+        if q is None:
             q = np.zeros(R.shape[0])
+        else:
+            if not isinstance(q, (int, float, np.ndarray)):
+                raise ValueError("q must be a numeric scalar or array.")
+            if isinstance(q, np.ndarray):
+                if q.ndim != 1:
+                    raise ValueError("q must be a one-dimensional array or a scalar.")
+                if q.shape[0] != R.shape[0]:
+                    raise ValueError("q must have the same number of rows as R.")
 
-        assert distribution in [
-            "F",
-            "chi2",
-        ], "distribution must be either 'F' or 'chi2'."
+        n_restriction = R.shape[0]
+        self._dfn = n_restriction
+
+        if self._is_clustered:
+            self._dfd = np.min(np.array(self._G)) - 1
+        else:
+            self._dfd = _N - _k - _k_fe
 
         bread = R @ _beta_hat - q
         meat = np.linalg.inv(R @ _vcov @ R.T)
         W = bread.T @ meat @ bread
-
-        # this is chi-squared(k) distributed, with k = number of coefficients
         self._wald_statistic = W
-        self._f_statistic = W / dfd
+
+        # Check if distribution is "F" and R is not identity matrix
+        # or q is not zero vector
+        if distribution == "F" and (
+            not np.array_equal(R, np.eye(_k)) or not np.all(q == 0)
+        ):
+            warnings.warn(
+                "Distribution changed to chi2, as R is not an identity matrix and q is not a zero vector."
+            )
+            distribution = "chi2"
 
         if distribution == "F":
-            self._f_statistic_pvalue = f.sf(self._f_statistic, dfn=dfn, dfd=dfd)
-            # self._f_statistic_pvalue = 1 - chi2(df = _k).cdf(self._f_statistic)
+            self._f_statistic = W / self._dfn
+            self._p_value = 1 - f.cdf(self._f_statistic, dfn=self._dfn, dfd=self._dfd)
+            res = pd.Series({"statistic": self._f_statistic, "pvalue": self._p_value})
+        elif distribution == "chi2":
+            self._f_statistic = W / self._dfn
+            self._p_value = chi2.sf(self._wald_statistic, self._dfn)
             res = pd.Series(
-                {"statistic": self._f_statistic, "pvalue": self._f_statistic_pvalue}
+                {"statistic": self._wald_statistic, "pvalue": self._p_value}
             )
         else:
-            raise NotImplementedError("chi2 distribution not yet implemented.")
-            # self._wald_pvalue = 1 - chi2(df = _k).cdf(self._wald_statistic)
+            raise ValueError("Distribution must be F or chi2")
 
         return res
 
@@ -1529,12 +1587,12 @@ def tidy(self, alpha: Optional[float] = None) -> pd.DataFrame:
         Return a tidy pd.DataFrame with the point estimates, standard errors,
         t-statistics, and p-values.
 
-        Parameters 
+        Parameters
         ----------
         alpha: Optional[float]
-            The significance level for the confidence intervals. If None, 
-            computes a 95% confidence interval (`alpha = 0.05`). 
-        
+            The significance level for the confidence intervals. If None,
+            computes a 95% confidence interval (`alpha = 0.05`).
+
         Returns
         -------
         tidy_df : pd.DataFrame

diff --git a/tests/test_errors.py b/tests/test_errors.py
@@ -18,7 +18,7 @@
 from pyfixest.estimation.FormulaParser import FixestFormulaParser
 from pyfixest.estimation.multcomp import rwolf
 from pyfixest.report.summarize import etable, summary
-from pyfixest.utils.utils import get_data
+from pyfixest.utils.utils import get_data, ssc
 
 
 @pytest.fixture
@@ -407,3 +407,39 @@ def test_ritest_error(data):
         fit = pf.feols("Y ~ X1", data=data)
         fit.ritest(resampvar="X1", reps=100)
         fit.plot_ritest()
+
+
+def test_wald_test_invalid_distribution():
+    data = pd.read_csv("pyfixest/did/data/df_het.csv")
+    data = data.iloc[1:3000]
+
+    fml = "dep_var ~ treat"
+    fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))
+
+    with pytest.raises(ValueError):
+        fit.wald_test(R=np.array([[1, -1]]), distribution="abc")
+
+
+def test_wald_test_R_q_column_consistency():
+    data = pd.read_csv("pyfixest/did/data/df_het.csv")
+    data = data.iloc[1:3000]
+    fml = "dep_var ~ treat"
+    fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))
+
+    # Test with R.size[1] == number of coeffcients
+    with pytest.raises(ValueError):
+        fit.wald_test(R=np.array([[1, 0, 0]]))
+
+    # Test with q type
+    with pytest.raises(ValueError):
+        fit.wald_test(R=np.array([[1, 0]]), q="invalid type q")
+
+    # Test with q being a one-dimensional array or a scalar.
+    with pytest.raises(ValueError):
+        fit.wald_test(R=np.array([[1, 0], [0, 1]]), q=np.array([[0, 1]]))
+
+    # q must have the same number of rows as R
+    with pytest.raises(ValueError):
+        fit.wald_test(
+            R=np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), q=np.array([[0, 1]])
+        )