Skip to content

Commit

Permalink
included coef for mddc boxplot, updated find_optimal_coef
Browse files Browse the repository at this point in the history
  • Loading branch information
rmj3197 committed Aug 24, 2024
1 parent cec0dfc commit 7146e6c
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 21 deletions.
36 changes: 27 additions & 9 deletions MDDC/MDDC/_find_optimal_coef.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from mddc_cpp_helper import getEijMat

from ._helper import apply_func, compute_fdr
from ._helper import apply_func, compute_fdr, compute_fdr_all


def find_optimal_coef(
Expand All @@ -13,6 +13,7 @@ def find_optimal_coef(
target_fdr=0.05,
grid=0.1,
exclude_small_count=True,
col_specific_cutoff=True,
seed=None,
):
"""
Expand Down Expand Up @@ -42,6 +43,11 @@ def find_optimal_coef(
Whether to exclude cells with counts smaller than or equal to five
when computing boxplot statistics. Default is True.
col_specific_cutoff : bool, optional
If `True`, then a single value of the coefficient is returned for the
entire dataset, else when `False` specific values corresponding to each
of the columns are returned.
Returns:
--------
OptimalCoef : namedtuple
Expand Down Expand Up @@ -85,14 +91,26 @@ def find_optimal_coef(
lambda row: apply_func(row, n, m, True), 1, sim_tables
)

c_vec = np.repeat(1.5, m)
fdr_vec = np.ones(m)
if col_specific_cutoff:
c_vec = np.repeat(1.5, m)
fdr_vec = np.ones(m)

for i in range(m):
while fdr_vec[i] > target_fdr:
c_vec[i] += grid
fdr_vec[i] = compute_fdr(res_list, c_vec[i], i)

OptimalCoef = namedtuple("OptimalCoef", ["coef", "FDR"])
oc = OptimalCoef(c_vec, fdr_vec)

else:
c = 1.5
fdr = 1
while fdr > target_fdr:
c += grid
fdr = compute_fdr_all(res_list, c)

for i in range(m):
while fdr_vec[i] > target_fdr:
c_vec[i] += grid
fdr_vec[i] = compute_fdr(res_list, c_vec[i], i)
OptimalCoef = namedtuple("OptimalCoef", ["coef", "FDR"])
oc = OptimalCoef(c, fdr)

OptimalCoef = namedtuple("OptimalCoef", ["coef", "FDR"])
oc = OptimalCoef(c_vec, fdr_vec)
return oc
41 changes: 37 additions & 4 deletions MDDC/MDDC/_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def normalize_column(a):
return (a - mean_a) / sd_a


def compute_whishi1(z_ij_mat, contin_table, a):
def compute_whishi1(z_ij_mat, contin_table, coef, a):
"""
Computes the upper whisker value from the boxplot statistics of a specific column in `z_ij_mat`.
Expand All @@ -371,6 +371,8 @@ def compute_whishi1(z_ij_mat, contin_table, a):
A 2D array or matrix from which the column values are extracted for boxplot statistics computation.
contin_table : numpy.ndarray
A 2D array or matrix containing the indices to determine which rows of `z_ij_mat` to consider.
coef : float
Coefficient used for computing boxplot statistics.
a : int
The index of the column in `contin_table` to be used for filtering and extracting values from `z_ij_mat`.
Expand All @@ -380,10 +382,10 @@ def compute_whishi1(z_ij_mat, contin_table, a):
The upper whisker value from the boxplot statistics of the extracted values.
"""

return boxplot_stats(z_ij_mat[contin_table[:, a] != 0, a])[3]
return boxplot_stats(z_ij_mat[contin_table[:, a] != 0, a], coef=coef)[3]


def compute_whishi2(vec):
def compute_whishi2(vec, coef):
"""
Computes the upper whisker value from the boxplot statistics of the given vector.
Expand All @@ -394,13 +396,15 @@ def compute_whishi2(vec):
----------
vec : numpy.ndarray
A 1D array or sequence of numerical values for which the boxplot statistics are to be computed.
coef : float
Coefficient used for computing boxplot statistics.
Returns:
-------
upper_whisker : float
The upper whisker value from the boxplot statistics of the input vector.
"""
return boxplot_stats(vec)[3]
return boxplot_stats(vec, coef=coef)[3]


def compute_whislo1(z_ij_mat, contin_table, a):
Expand Down Expand Up @@ -639,3 +643,32 @@ def compute_fdr(res_list, c_j, j):
sum_outliers = np.sum(outliers, axis=1)
mean_outliers = np.mean(sum_outliers)
return mean_outliers


def compute_fdr_all(res_list, c):
"""
Computes the mean number of outliers for a 3D array.
This function calculates the average number of outliers across a number of
datasets.
Parameters:
-----------
res_list : numpy.ndarray
A 3D array containing the result values, where outliers will be computed along the rows.
c : float
The scaling factor passed to `get_boxplot_outliers` for determining the outlier threshold.
Returns:
--------
mean_outliers : float
The mean number of outliers across the rows of the specified component.
"""
outliers = np.apply_along_axis(
lambda x: get_boxplot_outliers(x, c),
axis=1,
arr=res_list.reshape(res_list.shape[0], res_list.shape[1] * res_list.shape[2]),
)
sum_outliers = np.sum(outliers, axis=1)
mean_outliers = np.mean(sum_outliers)
return mean_outliers
11 changes: 10 additions & 1 deletion MDDC/MDDC/_mddc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def mddc(
separate=True,
if_col_corr=False,
corr_lim=0.8,
coef=1.5,
chunk_size=None,
n_jobs=-1,
seed=None,
Expand Down Expand Up @@ -62,6 +63,14 @@ def mddc(
corr_lim : float, optional, default=0.8
Correlation threshold used to select connected adverse events. Utilized in Step 3 of MDDC algorithm.
coef : int, float, list, numpy.ndarray, default = 1.5
Used only when `method` = `boxplot`.
A numeric value or a list of numeric values. If a single numeric
value is provided, it will be applied uniformly across all columns of the
contingency table. If a list is provided, its length must match the number
of columns in the contingency table, and each value will be used as the
coefficient for the corresponding column.
chunk_size : int, optional, default=None
Useful in scenarios when the dimensions of the contingency table is large as well as the number of Monte Carlo replications. In such scenario the Monte Carlo samples
need to be generated sequentially such that the memory footprint is manageable (or rather the generated samples fit into the RAM).
Expand Down Expand Up @@ -216,7 +225,7 @@ def mddc(

elif method == "boxplot":
high_outlier, r_pval, r_pval_adj = _mddc_boxplot(
contin_table_mat, col_specific_cutoff, separate, if_col_corr, corr_lim
contin_table_mat, col_specific_cutoff, separate, if_col_corr, corr_lim, coef
)

if isinstance(contin_table, pd.DataFrame):
Expand Down
51 changes: 45 additions & 6 deletions MDDC/MDDC/_mddc_boxplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def _mddc_boxplot(
separate=True,
if_col_corr=False,
corr_lim=0.8,
coef=1.5,
n_jobs=-1,
):
"""
Expand Down Expand Up @@ -51,6 +52,13 @@ def _mddc_boxplot(
corr_lim : float, optional, default=0.8
Correlation threshold used to select connected adverse events. Utilized in Step 3 of MDDC algorithm.
coef : int, float, list, numpy.ndarray, default = 1.5
A numeric value or a list of numeric values. If a single numeric
value is provided, it will be applied uniformly across all columns of the
contingency table. If a list is provided, its length must match the number
of columns in the contingency table, and each value will be used as the
coefficient for the corresponding column.
n_jobs : int, optional, default=-1
n_jobs specifies the maximum number of concurrently
running workers. If 1 is given, no joblib parallelism
Expand All @@ -63,13 +71,42 @@ def _mddc_boxplot(
result : tuple
A tuple with the following members:
- 'signal': np.ndarray
Matrix indicating significant signals with count greater than five and identified in the step 2 by the Monte Carlo method. 1 indicates a signal, 0 indicates non-signal.
Matrix indicating significant signals with count greater than five and identified in
the step 2 by the Monte Carlo method. 1 indicates a signal, 0 indicates non-signal.
- 'corr_signal_pval': np.ndarray
p-values for each cell in the contingency table in the step 5, when the :math:`r_{ij}` (residual) values are mapped back to the standard normal distribution.
p-values for each cell in the contingency table in the step 5, when the :math:`r_{ij}`
(residual) values are mapped back to the standard normal distribution.
- 'corr_signal_adj_pval': np.ndarray
Benjamini-Hochberg adjusted p values for each cell in the step 5.
"""

if not isinstance(coef, (int, float, list, np.ndarray)):
raise TypeError("'coef' must be a numeric value, 1D numpy array, or list.")

if col_specific_cutoff:
if isinstance(coef, (int, float)): # Check if coef is a numeric value
coef = [coef] * contin_table.shape[
1
] # Replicate the numeric value for each column
elif isinstance(coef, list):
if len(coef) != contin_table.shape[1]:
raise ValueError(
"Length of 'coef' does not match the number of columns (n_col)."
)
elif isinstance(coef, np.ndarray):
# Flatten the array and check the length
coef = coef.flatten()
if len(coef) != contin_table.shape[1]:
raise ValueError(
"Length of 'coef' does not match the number of columns in contin_table."
)
else:
if not isinstance(coef, (int, float)):
if len(coef) != 1:
raise ValueError(
"'coef' must be a numeric value when 'col_specific_cutoff' is False"
)

z_ij_mat = getZijMat(contin_table, na=False)[0]
res_all = z_ij_mat.flatten(order="F")
res_nonzero = z_ij_mat[contin_table != 0].flatten(order="F")
Expand All @@ -80,7 +117,7 @@ def _mddc_boxplot(
c_univ_drug = np.array(
list(
map(
lambda a: compute_whishi1(z_ij_mat, contin_table, a),
lambda a: compute_whishi1(z_ij_mat, contin_table, coef[a], a),
range(contin_table.shape[1]),
)
)
Expand All @@ -94,19 +131,21 @@ def _mddc_boxplot(
)
)
else:
c_univ_drug = np.apply_along_axis(compute_whishi2, 0, z_ij_mat)
c_univ_drug = np.apply_along_axis(compute_whishi2, 0, z_ij_mat, coef)
zero_drug_cutoff = np.apply_along_axis(compute_whislo2, 0, z_ij_mat)
else:
if separate:
c_univ_drug = np.repeat(
boxplot_stats(res_nonzero)[3], contin_table.shape[1]
boxplot_stats(res_nonzero, coef=coef)[3], contin_table.shape[1]
)

zero_drug_cutoff = np.repeat(
boxplot_stats(res_zero)[1], contin_table.shape[1]
)
else:
c_univ_drug = np.repeat(boxplot_stats(res_all)[3], contin_table.shape[1])
c_univ_drug = np.repeat(
boxplot_stats(res_all, coef=coef)[3], contin_table.shape[1]
)
zero_drug_cutoff = np.repeat(
boxplot_stats(res_all)[1], contin_table.shape[1]
)
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"sphinx.ext.githubpages",
"nbsphinx",
"sphinx.ext.intersphinx",
"myst_parser"
"myst_parser",
]
myst_enable_extensions = [
"amsmath",
Expand Down
34 changes: 34 additions & 0 deletions docs/source/user_guide/MDDC_Usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,40 @@
"- (Muscle Disorder, Rosuvastatin)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Function for Finding Optimal Boxplot Coefficient"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MDDC.MDDC.find_optimal_coef(\n",
" contin_table = statin49,\n",
" rep = 1000,\n",
" target_fdr = 0.05,\n",
" grid = 0.1,\n",
" col_specific_cutoff = True,\n",
" exclude_small_count = True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This function outputs a list with the following components:\n",
"\n",
"- `coef`: A numeric vector containing the optimal coefficient ‘coef’ for each column of the input contingency table.\n",
"\n",
"- `FDR`: A numeric vector with the corresponding false discovery rate (FDR) for each column."
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down

0 comments on commit 7146e6c

Please sign in to comment.