Updated _cv_functions.py, _h_selection.py, _kernel_test.py, kernel_te…

…st/_utils.py, _poisson_kernel_test.py Incorporated changes in the test statistics (included U-statistics and V-statistics for the test), changed Uniformity Test Statistic, removed latex formatting from the h_selection plots, completed black formatting.
rmj3197 · Apr 29, 2024 · 61ca4a8 · 61ca4a8
1 parent 9ed3948
commit 61ca4a8
Show file tree

Hide file tree

Showing 6 changed files with 120 additions and 88 deletions.
diff --git a/QuadratiK/kernel_test/_cv_functions.py b/QuadratiK/kernel_test/_cv_functions.py
@@ -110,7 +110,6 @@ def cv_normality(
     sigma_hat,
     num_iter=500,
     quantile=0.95,
-    centering_type="param",
     random_state=None,
     n_jobs=8,
 ):
@@ -146,8 +145,6 @@ def cv_normality(
         quantile : float, optional
             The quantile of the distribution used to select the critical value.
 
-        centering_type : str, optional
-
         random_state : int, None, optional.
             Seed for random number generation. Defaults to None
 
@@ -164,12 +161,10 @@ def cv_normality(
             Critical value for the specified dimension, size and quantile.
     """
     results = Parallel(n_jobs=n_jobs)(
-        delayed(normal_cv_helper)(
-            size, h, mu_hat, sigma_hat, centering_type, i, random_state
-        )
+        delayed(normal_cv_helper)(size, h, mu_hat, sigma_hat, i, random_state)
         for i in range(num_iter)
     )
-    return np.quantile(results, quantile)
+    return np.quantile(results, quantile, axis=0)
 
 
 def cv_ksample(

diff --git a/QuadratiK/kernel_test/_h_selection.py b/QuadratiK/kernel_test/_h_selection.py
@@ -6,6 +6,9 @@
 import pandas as pd
 from scipy.stats import skew, skewnorm
 from sklearn.utils.parallel import Parallel, delayed
+import matplotlib
+
+usetex = matplotlib.checkdep_usetex(True)
 import matplotlib.pyplot as plt
 
 from ._utils import stat_ksample, stat_two_sample, stat_normality_test
@@ -110,22 +113,19 @@ def _objective_one_sample(
         random_state=random_state,
     )
 
-    statistic = stat_normality_test(
-        xnew, h, np.array([mean_dat]), np.diag(s_dat), "param"
-    )
+    statistic = stat_normality_test(xnew, h, np.array([mean_dat]), np.diag(s_dat))
     cv = cv_normality(
         n,
         h,
         np.array([mean_dat]),
         np.diag(s_dat),
         num_iter,
         quantile,
-        "param",
         random_state=random_state,
         n_jobs=n_jobs,
     )
     h0 = statistic < cv
-    return [rep_values, delta, h, h0]
+    return [rep_values, delta, h, h0[0]]
 
 
 def _objective_two_sample(
@@ -759,11 +759,11 @@ def select_h(
                 group["power"],
                 marker="o",
                 linestyle="-",
-                label=f"$\\delta$={round(delta, 3)}",
+                label=f"delta={round(delta, 3)}",
             )
         plt.xlabel("h")
         plt.ylabel("Power")
-        plt.title(r"h vs Power for different $\delta$")
+        plt.title("h vs Power for different delta")
         plt.legend()
         plt.close()
         return (min_h, all_results, figure)

diff --git a/QuadratiK/kernel_test/_kernel_test.py b/QuadratiK/kernel_test/_kernel_test.py
@@ -97,49 +97,49 @@ class KernelTest:
 
     Examples
     --------
-    >>> # Example for normality test
     >>> import numpy as np
+    >>> np.random.seed(78990)
     >>> from QuadratiK.kernel_test import KernelTest
-    >>> np.random.seed(42)
-    >>> data = np.random.randn(100,5)
-    >>> normality_test = KernelTest(h=0.4, centering_type="param",random_state=42).test(data)
-    >>> print("Test : {}".format(normality_test.test_type_))
-    >>> print("Execution time: {:.3f}".format(normality_test.execution_time))
-    >>> print("H0 is Rejected : {}".format(normality_test.h0_rejected_))
-    >>> print("Test Statistic : {}".format(normality_test.test_statistic_))
-    >>> print("Critical Value (CV) : {}".format(normality_test.cv_))
-    >>> print("CV Method : {}".format(normality_test.cv_method_))
-    >>> print("Selected tuning parameter : {}".format(normality_test.h))
+    >>> # data generation
+    >>> data_norm = np.random.multivariate_normal(mean = np.zeros(4), cov = np.eye(4),size = 500)
+    >>> # performing the normality test
+    >>> normality_test = KernelTest(h=0.4, num_iter=150, method= "subsampling", random_state=42).test(data_norm)
+    >>> print(f"Test : {normality_test.test_type_}")
+    >>> print(f"Execution time: {normality_test.execution_time:.3f}")
+    >>> print(f"H0 is Rejected : {normality_test.un_h0_rejected_}")
+    >>> print(f"Test Statistic : {normality_test.un_test_statistic_}")
+    >>> print(f"Critical Value (CV) : {normality_test.un_cv_}")
+    >>> print(f"CV Method : {normality_test.cv_method_}")
     ... Test : Kernel-based quadratic distance Normality test
-    ... Execution time: 0.096
+    ... Execution time: 0.356
     ... H0 is Rejected : False
-    ... Test Statistic : -8.588873037044384e-05
-    ... Critical Value (CV) : 0.0004464111809800183
+    ... Test Statistic : 0.01018599246239244
+    ... Critical Value (CV) : 0.07765034009837886
     ... CV Method : Empirical
-    ... Selected tuning parameter : 0.4
 
-    >>> # Example for two sample test
     >>> import numpy as np
+    >>> np.random.seed(0)
+    >>> from scipy.stats import skewnorm
     >>> from QuadratiK.kernel_test import KernelTest
-    >>> np.random.seed(42)
-    >>> X = np.random.randn(100,5)
-    >>> np.random.seed(42)
-    >>> Y = np.random.randn(100,5)
-    >>> two_sample_test = KernelTest(h=0.4, centering_type="param").test(X,Y)
+    >>> # data generation
+    >>> X_2 = np.random.multivariate_normal(mean = np.zeros(4), cov = np.eye(4), size=200)
+    >>> Y_2 = skewnorm.rvs(size=(200, 4),loc=np.zeros(4), scale=np.ones(4),a=np.repeat(0.5,4), random_state=0)
+    >>> # performing the two sample test
+    >>> two_sample_test = KernelTest(h = 2,num_iter = 150, random_state=42).test(X_2,Y_2)
     >>> print("Test : {}".format(two_sample_test.test_type_))
-    >>> print("Execution time: {:.3f}".format(two_sample_test.execution_time))
-    >>> print("H0 is Rejected : {}".format(two_sample_test.h0_rejected_))
-    >>> print("Test Statistic : {}".format(two_sample_test.test_statistic_))
-    >>> print("Critical Value (CV) : {}".format(two_sample_test.cv_))
+    >>> print("Execution time: {:.3f} seconds".format(two_sample_test.execution_time))
+    >>> print("H0 is Rejected : {}".format(two_sample_test.un_h0_rejected_))
+    >>> print("Test Statistic : {}".format(two_sample_test.un_test_statistic_))
+    >>> print("Critical Value (CV) : {}".format(two_sample_test.un_cv_))
     >>> print("CV Method : {}".format(two_sample_test.cv_method_))
     >>> print("Selected tuning parameter : {}".format(two_sample_test.h))
     ... Test : Kernel-based quadratic distance two-sample test
-    ... Execution time: 0.092
-    ... H0 is Rejected : False
-    ... Test Statistic : -0.019707895277270022
-    ... Critical Value (CV) : 0.003842482597612725
+    ... Execution time: 0.265 seconds
+    ... H0 is Rejected : True
+    ... Test Statistic : 0.035620906539451845
+    ... Critical Value (CV) : 0.0023528111662230473
     ... CV Method : subsampling
-    ... Selected tuning parameter : 0.4
+    ... Selected tuning parameter : 2
     """
 
     def __init__(
@@ -261,26 +261,27 @@ def test(self, x, y=None):
                 if k == 1:
                     self.sigma_hat = np.array([[np.take(self.sigma_hat, 0)]])
 
-            statistic = stat_normality_test(
-                self.x, self.h, self.mu_hat, self.sigma_hat, self.centering_type
-            )
+            statistic = stat_normality_test(self.x, self.h, self.mu_hat, self.sigma_hat)
             cv = cv_normality(
                 size_x,
                 self.h,
                 self.mu_hat,
                 self.sigma_hat,
                 self.num_iter,
                 self.quantile,
-                self.centering_type,
                 self.random_state,
                 self.n_jobs,
             )
-            h0 = statistic > cv
+            un_h0 = statistic[0] > cv[0]
+            vn_h0 = statistic[1] > cv[1]
 
             self.test_type_ = "Kernel-based quadratic distance Normality test"
-            self.h0_rejected_ = h0
-            self.test_statistic_ = statistic
-            self.cv_ = cv
+            self.un_h0_rejected_ = un_h0
+            self.vn_h0_rejected_ = vn_h0
+            self.un_test_statistic_ = statistic[0]
+            self.vn_test_statistic_ = statistic[1]
+            self.un_cv_ = cv[0]
+            self.vn_cv_ = cv[1]
             self.cv_method_ = "Empirical"
             return self
 
@@ -342,14 +343,16 @@ def test(self, x, y=None):
                 h0 = statistic > cv
 
                 self.test_type_ = "Kernel-based quadratic distance two-sample test"
-                self.h0_rejected_ = h0
-                self.test_statistic_ = statistic
-                self.cv_ = cv
+                self.un_h0_rejected_ = h0
+                self.vn_h0_rejected_ = None
+                self.un_test_statistic_ = statistic
+                self.vn_test_statistic_ = None
+                self.un_cv_ = cv
+                self.vn_cv_ = None
                 self.cv_method_ = self.method
                 return self
 
             else:
-                print("Entered this K Sample Else")
                 if (self.y is not None) and (self.x.shape[0] != self.y.shape[0]):
                     raise ValueError("'x' and 'y' must have the same number of rows.")
 
@@ -376,12 +379,16 @@ def test(self, x, y=None):
                     self.random_state,
                     self.n_jobs,
                 )
-                h0 = statistic[0] > cv[0]
+                un_h0 = statistic[0] > cv[0]
+                vn_h0 = statistic[1] > cv[1]
 
                 self.test_type_ = "Kernel-based quadratic distance K-sample test"
-                self.h0_rejected_ = h0
-                self.test_statistic_ = statistic
-                self.cv_ = cv
+                self.un_h0_rejected_ = un_h0
+                self.vn_h0_rejected_ = vn_h0
+                self.un_test_statistic_ = statistic[0]
+                self.vn_test_statistic_ = statistic[1]
+                self.un_cv_ = cv[0]
+                self.vn_cv_ = cv[1]
                 self.cv_method_ = self.method
                 return self
 
@@ -414,10 +421,37 @@ def summary(self, print_fmt="simple_grid"):
                 format with the kernel test results and summary statistics.
         """
         res = pd.DataFrame()
-        res[""] = [self.test_type_, self.test_statistic_, self.cv_, self.h0_rejected_]
-        res = res.set_axis(
-            ["Test Type", "Test Statistic", "Critical Value", "Reject H0"]
-        )
+        if self.vn_test_statistic_ is None:
+            res[""] = [
+                self.test_type_,
+                self.un_test_statistic_,
+                self.un_cv_,
+                self.un_h0_rejected_,
+            ]
+            res = res.set_axis(
+                ["Test Type", "Un Test Statistic", "Un Critical Value", "Reject H0"]
+            )
+        else:
+            res[""] = [
+                self.test_type_,
+                self.un_test_statistic_,
+                self.un_cv_,
+                self.un_h0_rejected_,
+                self.vn_test_statistic_,
+                self.vn_cv_,
+                self.vn_h0_rejected_,
+            ]
+            res = res.set_axis(
+                [
+                    "Test Type",
+                    "Un Test Statistic",
+                    "Un Critical Value",
+                    "Un Reject H0",
+                    "Vn Test Statistic",
+                    "Vn Critical Value",
+                    "Vn Reject H0",
+                ]
+            )
 
         summary_stats_df = self.stats()
 

diff --git a/QuadratiK/kernel_test/_utils.py b/QuadratiK/kernel_test/_utils.py
@@ -53,11 +53,14 @@ def nonparam_centering(kmat_zz, n_z):
     centered kernel matrix : numpy.ndarray
         Matrix of centered kernel.
     """
+    kmat = np.copy(kmat_zz)
+    np.fill_diagonal(kmat, 0)
+
     k_center = (
         kmat_zz
-        - np.sum(kmat_zz, axis=1, keepdims=True) / n_z
-        - np.sum(kmat_zz, axis=0, keepdims=True) / n_z
-        + (np.sum(kmat_zz)) / (n_z * (n_z - 1))
+        - np.sum(kmat, axis=1, keepdims=True) / (n_z - 1)
+        - np.sum(kmat, axis=0, keepdims=True) / (n_z - 1)
+        + (np.sum(kmat)) / (n_z * (n_z - 1))
     )
     return k_center
 
@@ -149,10 +152,10 @@ def stat_two_sample(x_mat, y_mat, h, mu_hat, sigma_hat, centering_type="nonparam
         - 2 * (np.sum(k_center[:n_x, n_x : n_x + n_y]) / (n_x * n_y))
         + (np.sum(k_center[n_x : n_x + n_y, n_x : n_x + n_y]) / (n_y * (n_y - 1)))
     )
-    return test_non_par
+    return n_z * test_non_par
 
 
-def stat_normality_test(x_mat, h, mu_hat, sigma_hat, centering_type="param"):
+def stat_normality_test(x_mat, h, mu_hat, sigma_hat):
     """
     Compute kernel-based quadratic distance test for
     Normality
@@ -179,15 +182,16 @@ def stat_normality_test(x_mat, h, mu_hat, sigma_hat, centering_type="param"):
     k = x_mat.shape[1]
     cov_h = (h**2) * np.identity(k)
     kmat_zz = compute_kernel_matrix(x_mat, x_mat, cov_h)
-    np.fill_diagonal(kmat_zz, 0)
 
-    if centering_type == "nonparam":
-        k_center = nonparam_centering(kmat_zz, n_x)
-    elif centering_type == "param":
-        k_center = param_centering(kmat_zz, x_mat, cov_h, mu_hat, sigma_hat)
-    np.fill_diagonal(k_center, 0)
-    test_normality = np.sum(k_center) / (n_x * (n_x - 1))
-    return test_normality
+    k_center = param_centering(kmat_zz, x_mat, cov_h, mu_hat, sigma_hat)
+
+    # Compute the normality test V-statistic
+    Vn = n_x * np.sum(k_center) / (n_x) ** 2
+
+    # Compute the normality test U-statistic
+    Un = n_x * ((np.sum(k_center) - np.sum(np.diagonal(k_center))) / (n_x * (n_x - 1)))
+
+    return np.array([Un, Vn])
 
 
 def stat_ksample(x, y, h):
@@ -245,12 +249,12 @@ def stat_ksample(x, y, h):
                         cum_size[l] : cum_size[l + 1], cum_size[r] : cum_size[r + 1]
                     ]
                 )
-    stat1 = (k - 1) * trace_k + tn
-    stat2 = trace_k + tn / (k - 1)
+    stat1 = n * ((k - 1) * trace_k + tn)
+    stat2 = n * trace_k
     return np.array([stat1, stat2])
 
 
-def normal_cv_helper(size, h, mu_hat, sigma_hat, centering_type, n_rep, random_state):
+def normal_cv_helper(size, h, mu_hat, sigma_hat, n_rep, random_state):
     """
     Generate a sample from a multivariate normal distribution and perform a
     kernel-based quadratic distance test for normality.
@@ -271,9 +275,6 @@ def normal_cv_helper(size, h, mu_hat, sigma_hat, centering_type, n_rep, random_s
     sigma_hat : numpy.ndarray
         Covariance matrix of the reference distribution.
 
-    centering_type : str
-        String indicating the method used for centering the normal kernel.
-
     n_rep : int
         The number of replication.
 
@@ -292,7 +293,7 @@ def normal_cv_helper(size, h, mu_hat, sigma_hat, centering_type, n_rep, random_s
         generator = check_random_state(random_state + n_rep)
 
     dat = generator.multivariate_normal(mu_hat.ravel(), sigma_hat, size)
-    return stat_normality_test(dat, h, mu_hat, sigma_hat, centering_type)
+    return stat_normality_test(dat, h, mu_hat, sigma_hat)
 
 
 def bootstrap_helper_twosample(size_x, size_y, h, data_pool, n_rep, random_state):

diff --git a/QuadratiK/poisson_kernel_test/_poisson_kernel_test.py b/QuadratiK/poisson_kernel_test/_poisson_kernel_test.py
@@ -172,10 +172,10 @@ def test(self, x):
             quantile=self.quantile,
             random_state=self.random_state,
             n_jobs=self.n_jobs,
-        )
+        ) / np.sqrt(var_un)
 
         self.test_type_ = method
-        self.u_statistic_h0_ = pk[0] > cv_un
+        self.u_statistic_h0_ = (pk[0] / np.sqrt(var_un)) > cv_un
         self.u_statistic_un_ = pk[0] / np.sqrt(var_un)
         self.u_statistic_cv_ = cv_un