Skip to content

Commit

Permalink
Updated _cv_functions.py, _h_selection.py, _kernel_test.py, kernel_te…
Browse files Browse the repository at this point in the history
…st/_utils.py, _poisson_kernel_test.py

Incorporated changes in the test statistics (included U-statistics and V-statistics for the test), changed Uniformity Test Statistic, removed latex formatting from the h_selection plots, completed black formatting.
  • Loading branch information
rmj3197 committed Apr 29, 2024
1 parent 9ed3948 commit 61ca4a8
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 88 deletions.
9 changes: 2 additions & 7 deletions QuadratiK/kernel_test/_cv_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ def cv_normality(
sigma_hat,
num_iter=500,
quantile=0.95,
centering_type="param",
random_state=None,
n_jobs=8,
):
Expand Down Expand Up @@ -146,8 +145,6 @@ def cv_normality(
quantile : float, optional
The quantile of the distribution used to select the critical value.
centering_type : str, optional
random_state : int, None, optional.
Seed for random number generation. Defaults to None
Expand All @@ -164,12 +161,10 @@ def cv_normality(
Critical value for the specified dimension, size and quantile.
"""
results = Parallel(n_jobs=n_jobs)(
delayed(normal_cv_helper)(
size, h, mu_hat, sigma_hat, centering_type, i, random_state
)
delayed(normal_cv_helper)(size, h, mu_hat, sigma_hat, i, random_state)
for i in range(num_iter)
)
return np.quantile(results, quantile)
return np.quantile(results, quantile, axis=0)


def cv_ksample(
Expand Down
14 changes: 7 additions & 7 deletions QuadratiK/kernel_test/_h_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import pandas as pd
from scipy.stats import skew, skewnorm
from sklearn.utils.parallel import Parallel, delayed
import matplotlib

usetex = matplotlib.checkdep_usetex(True)
import matplotlib.pyplot as plt

from ._utils import stat_ksample, stat_two_sample, stat_normality_test
Expand Down Expand Up @@ -110,22 +113,19 @@ def _objective_one_sample(
random_state=random_state,
)

statistic = stat_normality_test(
xnew, h, np.array([mean_dat]), np.diag(s_dat), "param"
)
statistic = stat_normality_test(xnew, h, np.array([mean_dat]), np.diag(s_dat))
cv = cv_normality(
n,
h,
np.array([mean_dat]),
np.diag(s_dat),
num_iter,
quantile,
"param",
random_state=random_state,
n_jobs=n_jobs,
)
h0 = statistic < cv
return [rep_values, delta, h, h0]
return [rep_values, delta, h, h0[0]]


def _objective_two_sample(
Expand Down Expand Up @@ -759,11 +759,11 @@ def select_h(
group["power"],
marker="o",
linestyle="-",
label=f"$\\delta$={round(delta, 3)}",
label=f"delta={round(delta, 3)}",
)
plt.xlabel("h")
plt.ylabel("Power")
plt.title(r"h vs Power for different $\delta$")
plt.title("h vs Power for different delta")
plt.legend()
plt.close()
return (min_h, all_results, figure)
Expand Down
134 changes: 84 additions & 50 deletions QuadratiK/kernel_test/_kernel_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,49 +97,49 @@ class KernelTest:
Examples
--------
>>> # Example for normality test
>>> import numpy as np
>>> np.random.seed(78990)
>>> from QuadratiK.kernel_test import KernelTest
>>> np.random.seed(42)
>>> data = np.random.randn(100,5)
>>> normality_test = KernelTest(h=0.4, centering_type="param",random_state=42).test(data)
>>> print("Test : {}".format(normality_test.test_type_))
>>> print("Execution time: {:.3f}".format(normality_test.execution_time))
>>> print("H0 is Rejected : {}".format(normality_test.h0_rejected_))
>>> print("Test Statistic : {}".format(normality_test.test_statistic_))
>>> print("Critical Value (CV) : {}".format(normality_test.cv_))
>>> print("CV Method : {}".format(normality_test.cv_method_))
>>> print("Selected tuning parameter : {}".format(normality_test.h))
>>> # data generation
>>> data_norm = np.random.multivariate_normal(mean = np.zeros(4), cov = np.eye(4),size = 500)
>>> # performing the normality test
>>> normality_test = KernelTest(h=0.4, num_iter=150, method= "subsampling", random_state=42).test(data_norm)
>>> print(f"Test : {normality_test.test_type_}")
>>> print(f"Execution time: {normality_test.execution_time:.3f}")
>>> print(f"H0 is Rejected : {normality_test.un_h0_rejected_}")
>>> print(f"Test Statistic : {normality_test.un_test_statistic_}")
>>> print(f"Critical Value (CV) : {normality_test.un_cv_}")
>>> print(f"CV Method : {normality_test.cv_method_}")
... Test : Kernel-based quadratic distance Normality test
... Execution time: 0.096
... Execution time: 0.356
... H0 is Rejected : False
... Test Statistic : -8.588873037044384e-05
... Critical Value (CV) : 0.0004464111809800183
... Test Statistic : 0.01018599246239244
... Critical Value (CV) : 0.07765034009837886
... CV Method : Empirical
... Selected tuning parameter : 0.4
>>> # Example for two sample test
>>> import numpy as np
>>> np.random.seed(0)
>>> from scipy.stats import skewnorm
>>> from QuadratiK.kernel_test import KernelTest
>>> np.random.seed(42)
>>> X = np.random.randn(100,5)
>>> np.random.seed(42)
>>> Y = np.random.randn(100,5)
>>> two_sample_test = KernelTest(h=0.4, centering_type="param").test(X,Y)
>>> # data generation
>>> X_2 = np.random.multivariate_normal(mean = np.zeros(4), cov = np.eye(4), size=200)
>>> Y_2 = skewnorm.rvs(size=(200, 4),loc=np.zeros(4), scale=np.ones(4),a=np.repeat(0.5,4), random_state=0)
>>> # performing the two sample test
>>> two_sample_test = KernelTest(h = 2,num_iter = 150, random_state=42).test(X_2,Y_2)
>>> print("Test : {}".format(two_sample_test.test_type_))
>>> print("Execution time: {:.3f}".format(two_sample_test.execution_time))
>>> print("H0 is Rejected : {}".format(two_sample_test.h0_rejected_))
>>> print("Test Statistic : {}".format(two_sample_test.test_statistic_))
>>> print("Critical Value (CV) : {}".format(two_sample_test.cv_))
>>> print("Execution time: {:.3f} seconds".format(two_sample_test.execution_time))
>>> print("H0 is Rejected : {}".format(two_sample_test.un_h0_rejected_))
>>> print("Test Statistic : {}".format(two_sample_test.un_test_statistic_))
>>> print("Critical Value (CV) : {}".format(two_sample_test.un_cv_))
>>> print("CV Method : {}".format(two_sample_test.cv_method_))
>>> print("Selected tuning parameter : {}".format(two_sample_test.h))
... Test : Kernel-based quadratic distance two-sample test
... Execution time: 0.092
... H0 is Rejected : False
... Test Statistic : -0.019707895277270022
... Critical Value (CV) : 0.003842482597612725
... Execution time: 0.265 seconds
... H0 is Rejected : True
... Test Statistic : 0.035620906539451845
... Critical Value (CV) : 0.0023528111662230473
... CV Method : subsampling
... Selected tuning parameter : 0.4
... Selected tuning parameter : 2
"""

def __init__(
Expand Down Expand Up @@ -261,26 +261,27 @@ def test(self, x, y=None):
if k == 1:
self.sigma_hat = np.array([[np.take(self.sigma_hat, 0)]])

statistic = stat_normality_test(
self.x, self.h, self.mu_hat, self.sigma_hat, self.centering_type
)
statistic = stat_normality_test(self.x, self.h, self.mu_hat, self.sigma_hat)
cv = cv_normality(
size_x,
self.h,
self.mu_hat,
self.sigma_hat,
self.num_iter,
self.quantile,
self.centering_type,
self.random_state,
self.n_jobs,
)
h0 = statistic > cv
un_h0 = statistic[0] > cv[0]
vn_h0 = statistic[1] > cv[1]

self.test_type_ = "Kernel-based quadratic distance Normality test"
self.h0_rejected_ = h0
self.test_statistic_ = statistic
self.cv_ = cv
self.un_h0_rejected_ = un_h0
self.vn_h0_rejected_ = vn_h0
self.un_test_statistic_ = statistic[0]
self.vn_test_statistic_ = statistic[1]
self.un_cv_ = cv[0]
self.vn_cv_ = cv[1]
self.cv_method_ = "Empirical"
return self

Expand Down Expand Up @@ -342,14 +343,16 @@ def test(self, x, y=None):
h0 = statistic > cv

self.test_type_ = "Kernel-based quadratic distance two-sample test"
self.h0_rejected_ = h0
self.test_statistic_ = statistic
self.cv_ = cv
self.un_h0_rejected_ = h0
self.vn_h0_rejected_ = None
self.un_test_statistic_ = statistic
self.vn_test_statistic_ = None
self.un_cv_ = cv
self.vn_cv_ = None
self.cv_method_ = self.method
return self

else:
print("Entered this K Sample Else")
if (self.y is not None) and (self.x.shape[0] != self.y.shape[0]):
raise ValueError("'x' and 'y' must have the same number of rows.")

Expand All @@ -376,12 +379,16 @@ def test(self, x, y=None):
self.random_state,
self.n_jobs,
)
h0 = statistic[0] > cv[0]
un_h0 = statistic[0] > cv[0]
vn_h0 = statistic[1] > cv[1]

self.test_type_ = "Kernel-based quadratic distance K-sample test"
self.h0_rejected_ = h0
self.test_statistic_ = statistic
self.cv_ = cv
self.un_h0_rejected_ = un_h0
self.vn_h0_rejected_ = vn_h0
self.un_test_statistic_ = statistic[0]
self.vn_test_statistic_ = statistic[1]
self.un_cv_ = cv[0]
self.vn_cv_ = cv[1]
self.cv_method_ = self.method
return self

Expand Down Expand Up @@ -414,10 +421,37 @@ def summary(self, print_fmt="simple_grid"):
format with the kernel test results and summary statistics.
"""
res = pd.DataFrame()
res[""] = [self.test_type_, self.test_statistic_, self.cv_, self.h0_rejected_]
res = res.set_axis(
["Test Type", "Test Statistic", "Critical Value", "Reject H0"]
)
if self.vn_test_statistic_ is None:
res[""] = [
self.test_type_,
self.un_test_statistic_,
self.un_cv_,
self.un_h0_rejected_,
]
res = res.set_axis(
["Test Type", "Un Test Statistic", "Un Critical Value", "Reject H0"]
)
else:
res[""] = [
self.test_type_,
self.un_test_statistic_,
self.un_cv_,
self.un_h0_rejected_,
self.vn_test_statistic_,
self.vn_cv_,
self.vn_h0_rejected_,
]
res = res.set_axis(
[
"Test Type",
"Un Test Statistic",
"Un Critical Value",
"Un Reject H0",
"Vn Test Statistic",
"Vn Critical Value",
"Vn Reject H0",
]
)

summary_stats_df = self.stats()

Expand Down
41 changes: 21 additions & 20 deletions QuadratiK/kernel_test/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,14 @@ def nonparam_centering(kmat_zz, n_z):
centered kernel matrix : numpy.ndarray
Matrix of centered kernel.
"""
kmat = np.copy(kmat_zz)
np.fill_diagonal(kmat, 0)

k_center = (
kmat_zz
- np.sum(kmat_zz, axis=1, keepdims=True) / n_z
- np.sum(kmat_zz, axis=0, keepdims=True) / n_z
+ (np.sum(kmat_zz)) / (n_z * (n_z - 1))
- np.sum(kmat, axis=1, keepdims=True) / (n_z - 1)
- np.sum(kmat, axis=0, keepdims=True) / (n_z - 1)
+ (np.sum(kmat)) / (n_z * (n_z - 1))
)
return k_center

Expand Down Expand Up @@ -149,10 +152,10 @@ def stat_two_sample(x_mat, y_mat, h, mu_hat, sigma_hat, centering_type="nonparam
- 2 * (np.sum(k_center[:n_x, n_x : n_x + n_y]) / (n_x * n_y))
+ (np.sum(k_center[n_x : n_x + n_y, n_x : n_x + n_y]) / (n_y * (n_y - 1)))
)
return test_non_par
return n_z * test_non_par


def stat_normality_test(x_mat, h, mu_hat, sigma_hat, centering_type="param"):
def stat_normality_test(x_mat, h, mu_hat, sigma_hat):
"""
Compute kernel-based quadratic distance test for
Normality
Expand All @@ -179,15 +182,16 @@ def stat_normality_test(x_mat, h, mu_hat, sigma_hat, centering_type="param"):
k = x_mat.shape[1]
cov_h = (h**2) * np.identity(k)
kmat_zz = compute_kernel_matrix(x_mat, x_mat, cov_h)
np.fill_diagonal(kmat_zz, 0)

if centering_type == "nonparam":
k_center = nonparam_centering(kmat_zz, n_x)
elif centering_type == "param":
k_center = param_centering(kmat_zz, x_mat, cov_h, mu_hat, sigma_hat)
np.fill_diagonal(k_center, 0)
test_normality = np.sum(k_center) / (n_x * (n_x - 1))
return test_normality
k_center = param_centering(kmat_zz, x_mat, cov_h, mu_hat, sigma_hat)

# Compute the normality test V-statistic
Vn = n_x * np.sum(k_center) / (n_x) ** 2

# Compute the normality test U-statistic
Un = n_x * ((np.sum(k_center) - np.sum(np.diagonal(k_center))) / (n_x * (n_x - 1)))

return np.array([Un, Vn])


def stat_ksample(x, y, h):
Expand Down Expand Up @@ -245,12 +249,12 @@ def stat_ksample(x, y, h):
cum_size[l] : cum_size[l + 1], cum_size[r] : cum_size[r + 1]
]
)
stat1 = (k - 1) * trace_k + tn
stat2 = trace_k + tn / (k - 1)
stat1 = n * ((k - 1) * trace_k + tn)
stat2 = n * trace_k
return np.array([stat1, stat2])


def normal_cv_helper(size, h, mu_hat, sigma_hat, centering_type, n_rep, random_state):
def normal_cv_helper(size, h, mu_hat, sigma_hat, n_rep, random_state):
"""
Generate a sample from a multivariate normal distribution and perform a
kernel-based quadratic distance test for normality.
Expand All @@ -271,9 +275,6 @@ def normal_cv_helper(size, h, mu_hat, sigma_hat, centering_type, n_rep, random_s
sigma_hat : numpy.ndarray
Covariance matrix of the reference distribution.
centering_type : str
String indicating the method used for centering the normal kernel.
n_rep : int
The number of replication.
Expand All @@ -292,7 +293,7 @@ def normal_cv_helper(size, h, mu_hat, sigma_hat, centering_type, n_rep, random_s
generator = check_random_state(random_state + n_rep)

dat = generator.multivariate_normal(mu_hat.ravel(), sigma_hat, size)
return stat_normality_test(dat, h, mu_hat, sigma_hat, centering_type)
return stat_normality_test(dat, h, mu_hat, sigma_hat)


def bootstrap_helper_twosample(size_x, size_y, h, data_pool, n_rep, random_state):
Expand Down
4 changes: 2 additions & 2 deletions QuadratiK/poisson_kernel_test/_poisson_kernel_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,10 @@ def test(self, x):
quantile=self.quantile,
random_state=self.random_state,
n_jobs=self.n_jobs,
)
) / np.sqrt(var_un)

self.test_type_ = method
self.u_statistic_h0_ = pk[0] > cv_un
self.u_statistic_h0_ = (pk[0] / np.sqrt(var_un)) > cv_un
self.u_statistic_un_ = pk[0] / np.sqrt(var_un)
self.u_statistic_cv_ = cv_un

Expand Down
Loading

0 comments on commit 61ca4a8

Please sign in to comment.