Skip to content

Commit

Permalink
Add updates to wald_test method and unit testing files (#536)
Browse files Browse the repository at this point in the history
* Update Wald Test and add testing py files

* Update minor changes to wald test and unit testing

* Add wald-test case and rename tests

* Minor changes

* Add an wald-test usage example

* Add minor changes to test cases and usage example.
  • Loading branch information
Jayhyung authored Jul 11, 2024
1 parent 83b7c01 commit ca1200e
Show file tree
Hide file tree
Showing 4 changed files with 380 additions and 56 deletions.
Binary file modified .coverage
Binary file not shown.
148 changes: 103 additions & 45 deletions pyfixest/estimation/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@

import numpy as np
import pandas as pd
import polars as pl
from formulaic import Formula
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import lsqr, spsolve
from scipy.stats import f, norm, t
from scipy.stats import chi2, f, norm, t

from pyfixest.errors import VcovTypeNotSupportedError
from pyfixest.estimation.ritest import (
Expand Down Expand Up @@ -383,6 +384,10 @@ def vcov(
Feols
An instance of class [Feols(/reference/Feols.qmd) with updated inference.
"""
# Assuming `data` is the DataFrame in question
if isinstance(data, pl.DataFrame):
data = _polars_to_pandas(data)

_data = self._data
_has_fixef = self._has_fixef
_is_iv = self._is_iv
Expand Down Expand Up @@ -439,7 +444,8 @@ def vcov(
if data is not None:
# use input data set
self._cluster_df = _get_cluster_df(
data=data, clustervar=self._clustervar
data=data,
clustervar=self._clustervar,
)
_check_cluster_df(cluster_df=self._cluster_df, data=data)
else:
Expand Down Expand Up @@ -789,13 +795,29 @@ def add_fixest_multi_context(
else:
self._has_fixef = False

def wald_test(self, R=None, q=None, distribution="F") -> None:
def wald_test(self, R=None, q=None, distribution="F"):
"""
Conduct Wald test.
Compute a Wald test for a linear hypothesis of the form Rb = q.
Compute a Wald test for a linear hypothesis of the form R * β = q.
where R is m x k matrix, β is a k x 1 vector of coefficients,
and q is m x 1 vector.
By default, tests the joint null hypothesis that all coefficients are zero.
This method producues the following attriutes
_dfd : int
degree of freedom in denominator
_dfn : int
degree of freedom in numerator
_wald_statistic : scalar
Wald-statistics computed for hypothesis testing
_f_statistic : scalar
Wald-statistics(when R is an indentity matrix, and q being zero vector)
computed for hypothesis testing
_p_value : scalar
corresponding p-value for statistics
Parameters
----------
R : array-like, optional
Expand All @@ -812,65 +834,101 @@ def wald_test(self, R=None, q=None, distribution="F") -> None:
-------
pd.Series
A pd.Series with the Wald statistic and p-value.
"""
raise ValueError("wald_tests will be released as a feature with pyfixest 0.14.")
Examples
--------
import numpy as np
import pandas as pd
from pyfixest.estimation.estimation import feols
data = pd.read_csv("pyfixest/did/data/df_het.csv")
data = data.iloc[1:3000]
R = np.array([[1,-1]] )
q = np.array([0.0])
fml = "dep_var ~ treat"
fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))
# Wald test
fit.wald_test(R=R, q=q, distribution = "chi2")
f_stat = fit._f_statistic
p_stat = fit._p_value
print(f"Python f_stat: {f_stat}")
print(f"Python p_stat: {p_stat}")
# The code above produces the following results :
# Python f_stat: 256.55432910297003
# Python p_stat: 9.67406627744023e-58
"""
_vcov = self._vcov
_N = self._N
_k = self._k
_beta_hat = self._beta_hat
_k_fe = np.sum(self._k_fe.values) if self._has_fixef else 0

dfn = _N - _k_fe - _k
dfd = _k

# if R is not two dimensional, make it two dimensional
if R is not None:
if R.ndim == 1:
R = R.reshape((1, len(R)))
assert (
R.shape[1] == _k
), "R must have the same number of columns as the number of coefficients."
else:
# If R is None, default to the identity matrix
if R is None:
R = np.eye(_k)

if q is not None:
assert isinstance(
q, (int, float, np.ndarray)
), "q must be a numeric scalar."
if isinstance(q, np.ndarray):
assert q.ndim == 1, "q must be a one-dimensional array or a scalar."
assert (
q.shape[0] == R.shape[0]
), "q must have the same number of rows as R."
warnings.warn(
"Note that the argument q is experimental and no unit tests are implemented. Please use with caution / take a look at the source code."
# Ensure R is two-dimensional
if R.ndim == 1:
R = R.reshape((1, len(R)))

if R.shape[1] != _k:
raise ValueError(
"The number of columns of R must be equal to the number of coefficients."
)
else:

# If q is None, default to a vector of zeros
if q is None:
q = np.zeros(R.shape[0])
else:
if not isinstance(q, (int, float, np.ndarray)):
raise ValueError("q must be a numeric scalar or array.")
if isinstance(q, np.ndarray):
if q.ndim != 1:
raise ValueError("q must be a one-dimensional array or a scalar.")
if q.shape[0] != R.shape[0]:
raise ValueError("q must have the same number of rows as R.")

assert distribution in [
"F",
"chi2",
], "distribution must be either 'F' or 'chi2'."
n_restriction = R.shape[0]
self._dfn = n_restriction

if self._is_clustered:
self._dfd = np.min(np.array(self._G)) - 1
else:
self._dfd = _N - _k - _k_fe

bread = R @ _beta_hat - q
meat = np.linalg.inv(R @ _vcov @ R.T)
W = bread.T @ meat @ bread

# this is chi-squared(k) distributed, with k = number of coefficients
self._wald_statistic = W
self._f_statistic = W / dfd

# Check if distribution is "F" and R is not identity matrix
# or q is not zero vector
if distribution == "F" and (
not np.array_equal(R, np.eye(_k)) or not np.all(q == 0)
):
warnings.warn(
"Distribution changed to chi2, as R is not an identity matrix and q is not a zero vector."
)
distribution = "chi2"

if distribution == "F":
self._f_statistic_pvalue = f.sf(self._f_statistic, dfn=dfn, dfd=dfd)
# self._f_statistic_pvalue = 1 - chi2(df = _k).cdf(self._f_statistic)
self._f_statistic = W / self._dfn
self._p_value = 1 - f.cdf(self._f_statistic, dfn=self._dfn, dfd=self._dfd)
res = pd.Series({"statistic": self._f_statistic, "pvalue": self._p_value})
elif distribution == "chi2":
self._f_statistic = W / self._dfn
self._p_value = chi2.sf(self._wald_statistic, self._dfn)
res = pd.Series(
{"statistic": self._f_statistic, "pvalue": self._f_statistic_pvalue}
{"statistic": self._wald_statistic, "pvalue": self._p_value}
)
else:
raise NotImplementedError("chi2 distribution not yet implemented.")
# self._wald_pvalue = 1 - chi2(df = _k).cdf(self._wald_statistic)
raise ValueError("Distribution must be F or chi2")

return res

Expand Down Expand Up @@ -1529,12 +1587,12 @@ def tidy(self, alpha: Optional[float] = None) -> pd.DataFrame:
Return a tidy pd.DataFrame with the point estimates, standard errors,
t-statistics, and p-values.
Parameters
Parameters
----------
alpha: Optional[float]
The significance level for the confidence intervals. If None,
computes a 95% confidence interval (`alpha = 0.05`).
The significance level for the confidence intervals. If None,
computes a 95% confidence interval (`alpha = 0.05`).
Returns
-------
tidy_df : pd.DataFrame
Expand Down
38 changes: 37 additions & 1 deletion tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pyfixest.estimation.FormulaParser import FixestFormulaParser
from pyfixest.estimation.multcomp import rwolf
from pyfixest.report.summarize import etable, summary
from pyfixest.utils.utils import get_data
from pyfixest.utils.utils import get_data, ssc


@pytest.fixture
Expand Down Expand Up @@ -407,3 +407,39 @@ def test_ritest_error(data):
fit = pf.feols("Y ~ X1", data=data)
fit.ritest(resampvar="X1", reps=100)
fit.plot_ritest()


def test_wald_test_invalid_distribution():
data = pd.read_csv("pyfixest/did/data/df_het.csv")
data = data.iloc[1:3000]

fml = "dep_var ~ treat"
fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))

with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, -1]]), distribution="abc")


def test_wald_test_R_q_column_consistency():
data = pd.read_csv("pyfixest/did/data/df_het.csv")
data = data.iloc[1:3000]
fml = "dep_var ~ treat"
fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))

# Test with R.size[1] == number of coeffcients
with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, 0, 0]]))

# Test with q type
with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, 0]]), q="invalid type q")

# Test with q being a one-dimensional array or a scalar.
with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, 0], [0, 1]]), q=np.array([[0, 1]]))

# q must have the same number of rows as R
with pytest.raises(ValueError):
fit.wald_test(
R=np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), q=np.array([[0, 1]])
)
Loading

0 comments on commit ca1200e

Please sign in to comment.