included coef for mddc boxplot, updated find_optimal_coef

rmj3197 · Aug 24, 2024 · 7146e6c · 7146e6c
1 parent cec0dfc
commit 7146e6c
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 21 deletions.
diff --git a/MDDC/MDDC/_find_optimal_coef.py b/MDDC/MDDC/_find_optimal_coef.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from mddc_cpp_helper import getEijMat
 
-from ._helper import apply_func, compute_fdr
+from ._helper import apply_func, compute_fdr, compute_fdr_all
 
 
 def find_optimal_coef(
@@ -13,6 +13,7 @@ def find_optimal_coef(
     target_fdr=0.05,
     grid=0.1,
     exclude_small_count=True,
+    col_specific_cutoff=True,
     seed=None,
 ):
     """
@@ -42,6 +43,11 @@ def find_optimal_coef(
         Whether to exclude cells with counts smaller than or equal to five
         when computing boxplot statistics. Default is True.
 
+    col_specific_cutoff : bool, optional
+        If `True`, then a single value of the coefficient is returned for the
+        entire dataset, else when `False` specific values corresponding to each
+        of the columns are returned.
+
     Returns:
     --------
     OptimalCoef : namedtuple
@@ -85,14 +91,26 @@ def find_optimal_coef(
             lambda row: apply_func(row, n, m, True), 1, sim_tables
         )
 
-    c_vec = np.repeat(1.5, m)
-    fdr_vec = np.ones(m)
+    if col_specific_cutoff:
+        c_vec = np.repeat(1.5, m)
+        fdr_vec = np.ones(m)
+
+        for i in range(m):
+            while fdr_vec[i] > target_fdr:
+                c_vec[i] += grid
+                fdr_vec[i] = compute_fdr(res_list, c_vec[i], i)
+
+        OptimalCoef = namedtuple("OptimalCoef", ["coef", "FDR"])
+        oc = OptimalCoef(c_vec, fdr_vec)
+
+    else:
+        c = 1.5
+        fdr = 1
+        while fdr > target_fdr:
+            c += grid
+            fdr = compute_fdr_all(res_list, c)
 
-    for i in range(m):
-        while fdr_vec[i] > target_fdr:
-            c_vec[i] += grid
-            fdr_vec[i] = compute_fdr(res_list, c_vec[i], i)
+        OptimalCoef = namedtuple("OptimalCoef", ["coef", "FDR"])
+        oc = OptimalCoef(c, fdr)
 
-    OptimalCoef = namedtuple("OptimalCoef", ["coef", "FDR"])
-    oc = OptimalCoef(c_vec, fdr_vec)
     return oc
diff --git a/MDDC/MDDC/_helper.py b/MDDC/MDDC/_helper.py
@@ -357,7 +357,7 @@ def normalize_column(a):
     return (a - mean_a) / sd_a
 
 
-def compute_whishi1(z_ij_mat, contin_table, a):
+def compute_whishi1(z_ij_mat, contin_table, coef, a):
     """
     Computes the upper whisker value from the boxplot statistics of a specific column in `z_ij_mat`.
 
@@ -371,6 +371,8 @@ def compute_whishi1(z_ij_mat, contin_table, a):
         A 2D array or matrix from which the column values are extracted for boxplot statistics computation.
     contin_table : numpy.ndarray
         A 2D array or matrix containing the indices to determine which rows of `z_ij_mat` to consider.
+    coef : float
+        Coefficient used for computing boxplot statistics.
     a : int
         The index of the column in `contin_table` to be used for filtering and extracting values from `z_ij_mat`.
 
@@ -380,10 +382,10 @@ def compute_whishi1(z_ij_mat, contin_table, a):
         The upper whisker value from the boxplot statistics of the extracted values.
     """
 
-    return boxplot_stats(z_ij_mat[contin_table[:, a] != 0, a])[3]
+    return boxplot_stats(z_ij_mat[contin_table[:, a] != 0, a], coef=coef)[3]
 
 
-def compute_whishi2(vec):
+def compute_whishi2(vec, coef):
     """
     Computes the upper whisker value from the boxplot statistics of the given vector.
 
@@ -394,13 +396,15 @@ def compute_whishi2(vec):
     ----------
     vec : numpy.ndarray
         A 1D array or sequence of numerical values for which the boxplot statistics are to be computed.
+    coef : float
+        Coefficient used for computing boxplot statistics.
 
     Returns:
     -------
     upper_whisker : float
         The upper whisker value from the boxplot statistics of the input vector.
     """
-    return boxplot_stats(vec)[3]
+    return boxplot_stats(vec, coef=coef)[3]
 
 
 def compute_whislo1(z_ij_mat, contin_table, a):
@@ -639,3 +643,32 @@ def compute_fdr(res_list, c_j, j):
     sum_outliers = np.sum(outliers, axis=1)
     mean_outliers = np.mean(sum_outliers)
     return mean_outliers
+
+
+def compute_fdr_all(res_list, c):
+    """
+    Computes the mean number of outliers for a 3D array.
+
+    This function calculates the average number of outliers across a number of
+    datasets.
+
+    Parameters:
+    -----------
+    res_list : numpy.ndarray
+        A 3D array containing the result values, where outliers will be computed along the rows.
+    c : float
+        The scaling factor passed to `get_boxplot_outliers` for determining the outlier threshold.
+
+    Returns:
+    --------
+    mean_outliers : float
+        The mean number of outliers across the rows of the specified component.
+    """
+    outliers = np.apply_along_axis(
+        lambda x: get_boxplot_outliers(x, c),
+        axis=1,
+        arr=res_list.reshape(res_list.shape[0], res_list.shape[1] * res_list.shape[2]),
+    )
+    sum_outliers = np.sum(outliers, axis=1)
+    mean_outliers = np.mean(sum_outliers)
+    return mean_outliers
diff --git a/MDDC/MDDC/_mddc.py b/MDDC/MDDC/_mddc.py
@@ -16,6 +16,7 @@ def mddc(
     separate=True,
     if_col_corr=False,
     corr_lim=0.8,
+    coef=1.5,
     chunk_size=None,
     n_jobs=-1,
     seed=None,
@@ -62,6 +63,14 @@ def mddc(
     corr_lim : float, optional, default=0.8
         Correlation threshold used to select connected adverse events. Utilized in Step 3 of MDDC algorithm.
 
+    coef : int, float, list, numpy.ndarray, default = 1.5
+        Used only when `method` = `boxplot`.
+        A numeric value or a list of numeric values. If a single numeric
+        value is provided, it will be applied uniformly across all columns of the
+        contingency table. If a list is provided, its length must match the number
+        of columns in the contingency table, and each value will be used as the
+        coefficient for the corresponding column.
+
     chunk_size : int, optional, default=None
         Useful in scenarios when the dimensions of the contingency table is large as well as the number of Monte Carlo replications. In such scenario the Monte Carlo samples
         need to be generated sequentially such that the memory footprint is manageable (or rather the generated samples fit into the RAM).
@@ -216,7 +225,7 @@ def mddc(
 
     elif method == "boxplot":
         high_outlier, r_pval, r_pval_adj = _mddc_boxplot(
-            contin_table_mat, col_specific_cutoff, separate, if_col_corr, corr_lim
+            contin_table_mat, col_specific_cutoff, separate, if_col_corr, corr_lim, coef
         )
 
         if isinstance(contin_table, pd.DataFrame):

diff --git a/MDDC/MDDC/_mddc_boxplot.py b/MDDC/MDDC/_mddc_boxplot.py
@@ -20,6 +20,7 @@ def _mddc_boxplot(
     separate=True,
     if_col_corr=False,
     corr_lim=0.8,
+    coef=1.5,
     n_jobs=-1,
 ):
     """
@@ -51,6 +52,13 @@ def _mddc_boxplot(
     corr_lim : float, optional, default=0.8
         Correlation threshold used to select connected adverse events. Utilized in Step 3 of MDDC algorithm.
 
+    coef : int, float, list, numpy.ndarray, default = 1.5
+        A numeric value or a list of numeric values. If a single numeric
+        value is provided, it will be applied uniformly across all columns of the
+        contingency table. If a list is provided, its length must match the number
+        of columns in the contingency table, and each value will be used as the
+        coefficient for the corresponding column.
+
     n_jobs : int, optional, default=-1
         n_jobs specifies the maximum number of concurrently
         running workers. If 1 is given, no joblib parallelism
@@ -63,13 +71,42 @@ def _mddc_boxplot(
     result : tuple
         A tuple with the following members:
         - 'signal': np.ndarray
-            Matrix indicating significant signals with count greater than five and identified in the step 2 by the Monte Carlo method. 1 indicates a signal, 0 indicates non-signal.
+            Matrix indicating significant signals with count greater than five and identified in
+            the step 2 by the Monte Carlo method. 1 indicates a signal, 0 indicates non-signal.
         - 'corr_signal_pval': np.ndarray
-            p-values for each cell in the contingency table in the step 5, when the :math:`r_{ij}` (residual) values are mapped back to the standard normal distribution.
+            p-values for each cell in the contingency table in the step 5, when the :math:`r_{ij}`
+            (residual) values are mapped back to the standard normal distribution.
         - 'corr_signal_adj_pval': np.ndarray
             Benjamini-Hochberg adjusted p values for each cell in the step 5.
     """
 
+    if not isinstance(coef, (int, float, list, np.ndarray)):
+        raise TypeError("'coef' must be a numeric value, 1D numpy array, or list.")
+
+    if col_specific_cutoff:
+        if isinstance(coef, (int, float)):  # Check if coef is a numeric value
+            coef = [coef] * contin_table.shape[
+                1
+            ]  # Replicate the numeric value for each column
+        elif isinstance(coef, list):
+            if len(coef) != contin_table.shape[1]:
+                raise ValueError(
+                    "Length of 'coef' does not match the number of columns (n_col)."
+                )
+        elif isinstance(coef, np.ndarray):
+            # Flatten the array and check the length
+            coef = coef.flatten()
+            if len(coef) != contin_table.shape[1]:
+                raise ValueError(
+                    "Length of 'coef' does not match the number of columns in contin_table."
+                )
+    else:
+        if not isinstance(coef, (int, float)):
+            if len(coef) != 1:
+                raise ValueError(
+                    "'coef' must be a numeric value when 'col_specific_cutoff' is False"
+                )
+
     z_ij_mat = getZijMat(contin_table, na=False)[0]
     res_all = z_ij_mat.flatten(order="F")
     res_nonzero = z_ij_mat[contin_table != 0].flatten(order="F")
@@ -80,7 +117,7 @@ def _mddc_boxplot(
             c_univ_drug = np.array(
                 list(
                     map(
-                        lambda a: compute_whishi1(z_ij_mat, contin_table, a),
+                        lambda a: compute_whishi1(z_ij_mat, contin_table, coef[a], a),
                         range(contin_table.shape[1]),
                     )
                 )
@@ -94,19 +131,21 @@ def _mddc_boxplot(
                 )
             )
         else:
-            c_univ_drug = np.apply_along_axis(compute_whishi2, 0, z_ij_mat)
+            c_univ_drug = np.apply_along_axis(compute_whishi2, 0, z_ij_mat, coef)
             zero_drug_cutoff = np.apply_along_axis(compute_whislo2, 0, z_ij_mat)
     else:
         if separate:
             c_univ_drug = np.repeat(
-                boxplot_stats(res_nonzero)[3], contin_table.shape[1]
+                boxplot_stats(res_nonzero, coef=coef)[3], contin_table.shape[1]
             )
 
             zero_drug_cutoff = np.repeat(
                 boxplot_stats(res_zero)[1], contin_table.shape[1]
             )
         else:
-            c_univ_drug = np.repeat(boxplot_stats(res_all)[3], contin_table.shape[1])
+            c_univ_drug = np.repeat(
+                boxplot_stats(res_all, coef=coef)[3], contin_table.shape[1]
+            )
             zero_drug_cutoff = np.repeat(
                 boxplot_stats(res_all)[1], contin_table.shape[1]
             )

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -31,7 +31,7 @@
     "sphinx.ext.githubpages",
     "nbsphinx",
     "sphinx.ext.intersphinx",
-    "myst_parser"
+    "myst_parser",
 ]
 myst_enable_extensions = [
     "amsmath",

diff --git a/docs/source/user_guide/MDDC_Usage.ipynb b/docs/source/user_guide/MDDC_Usage.ipynb
@@ -209,6 +209,40 @@
     "- (Muscle Disorder, Rosuvastatin)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Function for Finding Optimal Boxplot Coefficient"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MDDC.MDDC.find_optimal_coef(\n",
+    "  contin_table = statin49,\n",
+    "  rep = 1000,\n",
+    "  target_fdr = 0.05,\n",
+    "  grid = 0.1,\n",
+    "  col_specific_cutoff = True,\n",
+    "  exclude_small_count = True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This function outputs a list with the following components:\n",
+    "\n",
+    "- `coef`: A numeric vector containing the optimal coefficient ‘coef’ for each column of the input contingency table.\n",
+    "\n",
+    "- `FDR`: A numeric vector with the corresponding false discovery rate (FDR) for each column."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},