From d0a097c9afb4f365c8e85a5682538e86f902bfcf Mon Sep 17 00:00:00 2001 From: lshpaner Date: Fri, 26 Jul 2024 21:00:18 -0700 Subject: [PATCH] added sig. figs to metrics, removed CKD data generator, versioned to 0.1.8a4 --- setup.py | 2 +- src/kfre/__init__.py | 2 +- src/kfre/perform_eval.py | 193 +++++---------------------------------- 3 files changed, 26 insertions(+), 171 deletions(-) diff --git a/setup.py b/setup.py index 538ebf1..540272c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="kfre", - version="0.1.8a3", + version="0.1.8a4", author="Leonid Shpaner", author_email="lshpaner@ucla.edu", description="A Python library for estimating kidney failure risk using the KFRE model developed by Tangri et al.", diff --git a/src/kfre/__init__.py b/src/kfre/__init__.py index 6ac879c..2c3dd6d 100644 --- a/src/kfre/__init__.py +++ b/src/kfre/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.8a3" +__version__ = "0.1.8a4" from .perform_eval import * diff --git a/src/kfre/perform_eval.py b/src/kfre/perform_eval.py index fd8a548..33ef850 100644 --- a/src/kfre/perform_eval.py +++ b/src/kfre/perform_eval.py @@ -49,7 +49,7 @@ def calc_esrd_outcome( if create_years_col: # Create a 'years' column based on the duration_col years_col = "ESRD_duration_years" - df[years_col] = round(df[duration_col] / 365.25) + df[years_col] = df[duration_col] / 365.25 else: # Use the provided duration_col directly @@ -151,6 +151,7 @@ def plot_kfre_metrics( show_years=[2, 5], plot_combinations=False, show_grids=False, + decimal_places=2, ): """ Generate the true labels and predicted probabilities for 2-year and 5-year @@ -203,6 +204,10 @@ def plot_kfre_metrics( show_grids : bool, optional Whether to show grid plots of all combinations. Default is False. + decimal_places : int, optional + Number of decimal places for AUC and AP scores in the plot legends. + Default is 2. + Returns: ------- tuple (optional) @@ -321,7 +326,7 @@ def plot_kfre_metrics( plt.plot( fpr, tpr, - label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.02f})", + label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.{decimal_places}f})", ) # Plot ROC curve plt.plot( [0, 1], [0, 1], linestyle="--", color="red" @@ -370,7 +375,7 @@ def plot_kfre_metrics( plt.plot( recall, precision, - label=f"{n}-variable {outcome} outcome (AP = {ap_score:.02f})", + label=f"{n}-variable {outcome} outcome (AP = {ap_score:.{decimal_places}f})", ) # Plot PR curve plt.xlabel("Recall") plt.ylabel("Precision") @@ -418,7 +423,7 @@ def plot_kfre_metrics( plt.plot( fpr, tpr, - label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.02f})", + label=f"{n}-variable {outcome} outcome (AUC = {auc_score:.{decimal_places}f})", ) # Plot ROC curve plt.plot( [0, 1], [0, 1], linestyle="--", color="red" @@ -465,7 +470,7 @@ def plot_kfre_metrics( plt.plot( recall, precision, - label=f"{n}-variable {outcome} outcome (AP = {ap_score:.02f})", + label=f"{n}-variable {outcome} outcome (AP = {ap_score:.{decimal_places}f})", ) # Plot PR curve plt.xlabel("Recall") plt.ylabel("Precision") @@ -562,7 +567,12 @@ def plot_kfre_metrics( ################################################################################ -def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]): +def eval_kfre_metrics( + df, + n_var_list, + outcome_years=[2, 5], + decimal_places=6, +): """ Calculate metrics for multiple outcomes and store the results in a DataFrame. @@ -581,6 +591,8 @@ def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]): List of variable numbers to consider, e.g., [4, 6, 8]. outcome_years : list of int, optional List of outcome years to consider, default is [2, 5]. + decimal_places : int, optional + Number of decimal places for the calculated metrics. Default is 6. Returns: ------- @@ -662,12 +674,12 @@ def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]): # Create a dictionary to store the calculated metrics metrics = { - "Precision/PPV": precision, - "Average Precision": average_precision, - "Sensitivity": sensitivity, - "Specificity": specificity, - "AUC ROC": auc_roc, - "Brier Score": brier, + "Precision/PPV": round(precision, decimal_places), + "Average Precision": round(average_precision, decimal_places), + "Sensitivity": round(sensitivity, decimal_places), + "Specificity": round(specificity, decimal_places), + "AUC ROC": round(auc_roc, decimal_places), + "Brier Score": round(brier, decimal_places), "Outcome": f"{outcome}_{n_var}_var_kfre", } @@ -686,160 +698,3 @@ def eval_kfre_metrics(df, n_var_list, outcome_years=[2, 5]): # Return the resulting DataFrame containing the performance metrics return metrics_df_n_var - - -################################################################################ -######################### CKD Random Data Generator ############################ -################################################################################ - - -class CKDDataGenerator: - def __new__(cls, *args, **kwargs): - instance = super(CKDDataGenerator, cls).__new__(cls) - instance.__init__(*args, **kwargs) - return instance.df - - def __init__( - self, - n_samples=100, - n=1000, - random_state=42, - use_bootstrap=True, - ranges=None, - ): - self.n_samples = n_samples - self.n = n - self.random_state = random_state - self.use_bootstrap = use_bootstrap - self.samples = [] - self.ranges = ranges if ranges else self.default_ranges() - self.df = ( - self._generate_data() - ) # Automatically generate data upon initialization - - def default_ranges(self): - return { - "Age": (18, 100), - "eGFR-EPI": (2, 120), - "uACR": (0.1, 2000), - "Albumin_g_dl": (3.0, 5.5), - "Phosphorous_mg_dl": (2.5, 5.0), - "Bicarbonate (mmol/L)": (16, 32), - "Calcium_mg_dl": (8.5, 10.5), - } - - def generate_ckd_data(self, n, random_state): - """ - Generate a DataFrame with random CKD data. - - Parameters: - ---------- - n : int - Number of random rows to generate. - random_state : int - Random seed for reproducibility. - - Returns: - ------- - pd.DataFrame - A DataFrame with random CKD data, including ESRD status and duration. - """ - # Set random seed for reproducibility - np.random.seed(random_state) - - # Generate random values that are clinically relevant for CKD patients - data = { - "Age": np.random.randint( - self.ranges["Age"][0], self.ranges["Age"][1], size=n - ), # Age in years, typical range for CKD patients - "SEX": np.random.choice(["male", "female"], size=n), - "eGFR-EPI": np.random.uniform( - self.ranges["eGFR-EPI"][0], self.ranges["eGFR-EPI"][1], size=n - ), # eGFR in mL/min/1.73 m^2, covering all CKD stages - "uACR": np.random.uniform( - self.ranges["uACR"][0], self.ranges["uACR"][1], size=n - ), # uACR in mg/g, covering all CKD stages - "Diabetes (1=yes; 0=no)": np.random.choice( - [0, 1], size=n - ), # Prevalence of diabetes in CKD - "Hypertension (1=yes; 0=no)": np.random.choice( - [0, 1], size=n - ), # Prevalence of hypertension in CKD - "Albumin_g_dl": np.random.uniform( - self.ranges["Albumin_g_dl"][0], - self.ranges["Albumin_g_dl"][1], - size=n, - ), # Serum albumin in g/dL, lower range for CKD - "Phosphorous_mg_dl": np.random.uniform( - self.ranges["Phosphorous_mg_dl"][0], - self.ranges["Phosphorous_mg_dl"][1], - size=n, - ), # Serum phosphorus in mg/dL, can be elevated in CKD - "Bicarbonate (mmol/L)": np.random.uniform( - self.ranges["Bicarbonate (mmol/L)"][0], - self.ranges["Bicarbonate (mmol/L)"][1], - size=n, - ), # Serum bicarbonate in mEq/L, often lower in CKD - "Calcium_mg_dl": np.random.uniform( - self.ranges["Calcium_mg_dl"][0], - self.ranges["Calcium_mg_dl"][1], - size=n, - ), # Serum calcium in mg/dL, slightly adjusted for CKD - } - - df = pd.DataFrame(data) - - # Ensure no values fall below their specified ranges - for col in df.columns: - if col in self.ranges: - min_val = self.ranges[col][0] - df[col] = df[col].clip(lower=min_val) - - # Define ESRD based on eGFR value - df["ESRD (1=yes; 0=no)"] = df["eGFR-EPI"].apply(lambda x: 1 if x < 15 else 0) - - # Create a column with random ESRD duration in years between 0 and 10 - df["ESRD_duration_years"] = np.random.uniform(0, 10, size=n) - - return df - - def bootstrap_ckd_data(self): - """ - Bootstrap CKD data multiple times and store all samples. - - Parameters: - ---------- - None - - Returns: - ------- - None - """ - np.random.seed(self.random_state) - self.samples = [] - - for i in tqdm(range(self.n_samples)): - sample_random_state = self.random_state + i - sample = self.generate_ckd_data( - self.n, - random_state=sample_random_state, - ) - self.samples.append(sample) - - def _generate_data(self): - """ - Generate CKD data, optionally using bootstrapping. - - Parameters: - ---------- - None - - Returns: - ------- - pd.DataFrame - """ - if self.use_bootstrap: - self.bootstrap_ckd_data() - return self.samples[np.random.randint(0, self.n_samples)] - else: - return self.generate_ckd_data(self.n, self.random_state)