diff --git a/src/dimet/__main__.py b/src/dimet/__main__.py index a371634..96919d6 100644 --- a/src/dimet/__main__.py +++ b/src/dimet/__main__.py @@ -25,7 +25,6 @@ def main_run_analysis(cfg: DictConfig) -> None: config=hydra.utils.instantiate(cfg.analysis.dataset)) dataset.preload() dataset.split_datafiles_by_compartment() - dataset.save_datafiles_split_by_compartment() method: Method = hydra.utils.instantiate( cfg.analysis.method).build() # method factory diff --git a/src/dimet/config/analysis/method/bivariate_analysis.yaml b/src/dimet/config/analysis/method/bivariate_analysis.yaml new file mode 100644 index 0000000..2ec0fa5 --- /dev/null +++ b/src/dimet/config/analysis/method/bivariate_analysis.yaml @@ -0,0 +1,26 @@ +_target_: dimet.method.BivariateAnalysisConfig + +label: bivariate analysis +name: Computation of the correlation of MDV profiles, or the metabolite time course profiles + +# (**) : automatically will run + +conditions_MDV_comparison: # (**) if >= 2 conditions and >=1 timepoint (timepoints run separately) + isotopologue_proportions: pearson + +timepoints_MDV_comparison: # (**) if >= 1 condition and >=2 timepoints + isotopologue_proportions: pearson + +conditions_metabolite_time_profiles: # (**) if >= 2 conditions AND >=2 time points in data + abundances: pearson + mean_enrichment: pearson + +correction_method: fdr_bh + +impute_values: + abundances: "min" + mean_enrichment: "min" + isotopologues: "min" + isotopologue_proportions: "min" + +output_include_gmean_arr_columns: True # if False, the 'gmean_arr_.." columns are excluded \ No newline at end of file diff --git a/src/dimet/data/__init__.py b/src/dimet/data/__init__.py index a25da3c..7603192 100644 --- a/src/dimet/data/__init__.py +++ b/src/dimet/data/__init__.py @@ -47,7 +47,6 @@ def build(self) -> "Dataset": class Dataset(BaseModel): config: DatasetConfig raw_data_folder: str = None - processed_data_folder: str = None sub_folder_absolute: str = None metadata_df: Optional[pd.DataFrame] = None abundances_df: Optional[pd.DataFrame] = None @@ -77,8 +76,7 @@ def preload(self): else: self.sub_folder_absolute = self.config.subfolder self.raw_data_folder = os.path.join(self.sub_folder_absolute, "raw") - self.processed_data_folder = os.path.join(self.sub_folder_absolute, - "processed") + # start loading the dataframes file_paths = [ ("metadata", os.path.join(self.raw_data_folder, @@ -106,10 +104,14 @@ def preload(self): dfs.append(pd.read_csv(file_path, sep="\t", header=0)) self.available_datasets.add(label) except FileNotFoundError: - logger.critical( - "File %s not found, continuing, " - "but this might fail miserably", - file_path) + if file_path.endswith(self.config.isotopologues + ".csv"): + message_detail = "isotopologue absolute values missing" + logger.critical( + "File %s not found (%s), continuing" + % (file_path, message_detail)) + else: + logger.critical("File %s not found, continuing", + file_path) dfs.append(None) except Exception as e: logger.error( @@ -169,19 +171,6 @@ def split_datafiles_by_compartment(self) -> None: frames_dict = set_samples_names(frames_dict, self.metadata_df) self.compartmentalized_dfs = frames_dict - def save_datafiles_split_by_compartment(self) -> None: - os.makedirs(self.processed_data_folder, exist_ok=True) - out_data_path = self.processed_data_folder - for file_name in self.compartmentalized_dfs.keys(): - for compartment in self.compartmentalized_dfs[file_name].keys(): - df = self.compartmentalized_dfs[file_name][compartment] - tmp_file_name = self.get_file_for_label(file_name) - 
output_file_name = f"{tmp_file_name}-{compartment}.csv" - df.to_csv(os.path.join(out_data_path, output_file_name), - sep="\t", header=True, index=False) - logger.info( - f"Saved the {compartment} compartment version " - f"of {file_name} in {out_data_path}") def get_file_for_label(self, label): if label == "abundances": @@ -210,7 +199,6 @@ class DataIntegration(Dataset): def set_dataset_integration_config(self): self.preload() self.split_datafiles_by_compartment() - self.save_datafiles_split_by_compartment() self.integration_files_folder_absolute = os.path.join( self.sub_folder_absolute, "integration_files") diff --git a/src/dimet/method/__init__.py b/src/dimet/method/__init__.py index f77014c..0e2b45f 100644 --- a/src/dimet/method/__init__.py +++ b/src/dimet/method/__init__.py @@ -15,6 +15,7 @@ metabolites_values_for_metabologram) from dimet.data import DataIntegration, Dataset from dimet.helpers import flatten +from dimet.processing.bivariate_analysis import bivariate_comparison from dimet.processing.differential_analysis import (differential_comparison, multi_group_compairson, time_course_analysis) @@ -184,6 +185,18 @@ def build(self) -> "MetabologramIntegration": return MetabologramIntegration(config=self) +class BivariateAnalysisConfig(MethodConfig): + """ + Sets default values or fills them for the bi-variate analysis + """ + correction_method: str = "fdr_bh" + output_include_gmean_arr_columns: bool = True + + def build(self) -> "BivariateAnalysis": + return BivariateAnalysis(config=self) + + + class Method(BaseModel): config: MethodConfig @@ -852,3 +865,78 @@ def check_expectations_config_metabo( except ValueError as e: logger.error(f"Data inconsistency: {e}") sys.exit(1) + + +class BivariateAnalysis(Method): + config: BivariateAnalysisConfig + + def run(self, cfg: DictConfig, dataset: Dataset) -> None: + """ + Runs bivariate analysis, the 'behavior' is the type of comparison: + - conditions_MDV_comparison + - timepoints_MDV_comparison + - conditions_metabolite_time_profiles + """ + logger.info( + "Will compute bi-variate analysis, with the following config: %s", + self.config) + + out_table_dir = os.path.join(os.getcwd(), cfg.table_path) + os.makedirs(out_table_dir, exist_ok=True) + self.check_expectations(cfg, dataset) + + datatype = "isotopologue_proportions" + if datatype in dataset.compartmentalized_dfs.keys(): + logger.info(f"Running bi-variate analysis with " + f"{datatype}:") + if len(cfg.analysis.conditions) >= 2: + logger.info("assessing MDV (Mass Distribution Vector) " + "between conditions") + bivariate_comparison( + datatype, dataset, cfg, + behavior="conditions_MDV_comparison", + out_table_dir=out_table_dir) + if len(dataset.metadata_df["timepoint"].unique()) >= 2: + logger.info("assessing MDV (Mass Distribution Vector) " + "between time-points") + bivariate_comparison( + datatype, dataset, cfg, + behavior="timepoints_MDV_comparison", + out_table_dir=out_table_dir) + + if (len(cfg.analysis.conditions) >= 2) and ( + len(dataset.metadata_df["timepoint"].unique()) >= 2): + for datatype in ["abundances", "mean_enrichment"]: + if datatype in dataset.compartmentalized_dfs.keys(): + logger.info(f"Running bi-variate analysis with " + f"{datatype} to compare " + f"time course profiles between conditions") + bivariate_comparison( + datatype, dataset, cfg, + behavior="conditions_metabolite_time_profiles", + out_table_dir=out_table_dir) + + def check_expectations(self, cfg: DictConfig, dataset: Dataset) -> None: + # check that necessary information is provided in the analysis 
config + try: + if ((len(cfg.analysis.conditions) < 2) and + (len(dataset.metadata_df["timepoint"].unique()) < 2)): + raise ValueError("Less than 2 conditions, " + "AND less than 2 timepoints, " + "impossible to run bi-variate analysis, " + "aborting") + if not set(cfg.analysis.conditions).issubset( + set(dataset.metadata_df['condition'])): + raise ValueError( + "Conditions provided for bi-variate analysis " + "in the config file " + "are not present in the metadata file, aborting" + ) + except ConfigAttributeError as e: + logger.error( + f"Mandatory parameter not provided in the config file:{e}, " + f"aborting") + sys.exit(1) + except ValueError as e: + logger.error(f"Data inconsistency:{e}") + sys.exit(1) \ No newline at end of file diff --git a/src/dimet/processing/bivariate_analysis.py b/src/dimet/processing/bivariate_analysis.py new file mode 100644 index 0000000..f712bfd --- /dev/null +++ b/src/dimet/processing/bivariate_analysis.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +@author: Johanna Galvis, Macha Nikolski +""" +import logging +import operator +import os + +from typing import List, Dict + +import numpy as np +import pandas as pd +import scipy.stats as stats +from dimet.constants import (assert_literal, availtest_methods_type, + data_files_keys_type) +from dimet.data import Dataset +from dimet.helpers import (arg_repl_zero2value, + compute_padj, + row_wise_nanstd_reduction) +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) + + +def compute_statistical_correlation(df: pd.DataFrame, + test: str) -> pd.DataFrame: + """ + computes correlation test, e.g. pearson + row-wise for each dataframe + """ + stat_list = [] + pvalue_list = [] + for i, metabolite in enumerate(list(df['metabolite'])): + # array of n-(timepoints or m+x) geometrical means values + array_1 = df.loc[metabolite, "gmean_arr_1"] + array_2 = df.loc[metabolite, "gmean_arr_2"] + if test == "pearson": + try: + stat_res, pvalue = stats.pearsonr(array_1, array_2) + except ValueError: + stat_res, pvalue = (np.nan, np.nan) + + elif test == "spearman": + try: + stat_res, pvalue = stats.spearmanr(array_1, array_2) + except ValueError: + stat_res, pvalue = (np.nan, np.nan) + + stat_list.append(stat_res) + if np.isnan(stat_res): + pvalue_list.append(np.nan) + else: + pvalue_list.append(pvalue) + + df["correlation_coefficient"] = stat_list + df["pvalue"] = pvalue_list + + return df + + +def compute_test_for_df_dict(df_dict: Dict[str, pd.DataFrame], + test: str) -> Dict[str, pd.DataFrame]: + """parses a dictionary of dataframes + computes bivariate test : e.g. pearson + """ + for akey in df_dict.keys(): + df = df_dict[akey].copy() + df.index = df['metabolite'] + + df_dict[akey] = compute_statistical_correlation(df, test) + + return df_dict + + +def compute_bivariate_by_behavior( + df: pd.DataFrame, metadata_df: pd.DataFrame, comparison: List[str], + behavior: str, test: str) -> Dict[str, pd.DataFrame]: + """ + performs two steps: + 1. calls functions to compute geometric means, obtaining df's inside dict + 2. 
computes the bivariate statistical test (pearson by default) + """ + if behavior == "conditions_MDV_comparison": + df_dict = conditions_MDV_gmean_df_dict(df, metadata_df, comparison) + + elif behavior == "timepoints_MDV_comparison": + df_dict = timepoints_MDV_gmean_df_dict(df, metadata_df, comparison) + + elif behavior == "conditions_metabolite_time_profiles": + # only abundances or mean enrichment processed + df_dict = metabolite_time_profiles_gmean_df_dict( + df, metadata_df, comparison) + + # 2. compute the statistical test + df_dict = compute_test_for_df_dict(df_dict, test) + + return df_dict + + +def conditions_MDV_gmean_df_dict(df: pd.DataFrame, metadata_df: pd.DataFrame, + comparison: List[str] + ) -> Dict[str, pd.DataFrame]: + """ + Note: e.g. comparison [Ctl, Treated1] + Separately by time-point: + Using isotopologue proportions, by metabolite and condition, + computes the arrays of geometric means, ordered by m+x. + Outputs dict of df, each df with columns arr_1 and arr_2 (np arrays); + the keys of the dict are the time-points + """ + clue_isotopologue_df = compute_isotopologue_meaning(list(df.index)) + metabolites_uniq = clue_isotopologue_df["metabolite"].unique() + df_dict = dict() + for timepoint in list(metadata_df["timepoint"].unique()): + inner_gmean_dict = {"metabolite": metabolites_uniq, + "gmean_arr_1": list(), "gmean_arr_2": list()} + metadata_df_tp = metadata_df.loc[ + metadata_df["timepoint"] == timepoint, :] + df_timepoint = df[metadata_df_tp['name_to_plot']].copy() + for k, condition in enumerate(comparison): + metadata_df_tp_cond = metadata_df_tp.loc[ + metadata_df_tp["condition"] == condition, :] + df_one_group = df_timepoint[ + metadata_df_tp_cond['name_to_plot']].copy() + + inner_gmean_dict = inner_gmean_dict_filler( + k, inner_gmean_dict, df_one_group, clue_isotopologue_df) + + df_dict[timepoint] = pd.DataFrame(inner_gmean_dict) + + return df_dict + + +def timepoints_MDV_gmean_df_dict(df: pd.DataFrame, metadata_df: pd.DataFrame, + comparison: List[str] + ) -> Dict[str, pd.DataFrame]: + """ + Note: e.g. comparison [T1, T0] + Separately by condition: + Using isotopologue proportions, by metabolite and time-point, + computes the arrays of geometric means, ordered by m+x. 
+ Outputs dict of df, the keys of the dict are the conditions + """ + clue_isotopologue_df = compute_isotopologue_meaning(list(df.index)) + metabolites_uniq = clue_isotopologue_df["metabolite"].unique() + df_dict = dict() + for condition in list(metadata_df['condition'].unique()): + metadata_df_sub1 = metadata_df.loc[ + metadata_df["condition"] == condition, :] + + df_sub1 = df[metadata_df_sub1['name_to_plot']].copy() + inner_gmean_dict = {"metabolite": metabolites_uniq, + "gmean_arr_1": list(), "gmean_arr_2": list()} + + for k, timepoint in enumerate(comparison): + metadata_df_a_gr = metadata_df_sub1.loc[ + metadata_df_sub1["timepoint"] == timepoint, :] + df_a_group = df_sub1[metadata_df_a_gr['name_to_plot']].copy() + inner_gmean_dict = inner_gmean_dict_filler( + k, inner_gmean_dict, df_a_group, clue_isotopologue_df) + + df_dict[condition] = pd.DataFrame(inner_gmean_dict) + + return df_dict + + +def inner_gmean_dict_filler( + k: int, inner_gmean_dict: Dict[str, List[None]], + df_a_group: pd.DataFrame, clue_isotopologue_df: pd.DataFrame +) -> Dict[str, List[np.array]]: + """ + For MDV, fills the dictionary (of one single condition or time point) + inner_gmean_dict = {"metabolite": metabolites_uniq, + "gmean_arr_1": list(), "gmean_arr_2": list()}, where + this function replaces (fills) 'gmean_arr_..' lists only. + output: inner_gmean_dict, filled: + 'gmean_arr_..' list of np.arrays (one np.array by metabolite), + (respects the order of the metabolites in the 'metabolite' key). + Note: each elem in np. array respects the order of isotopologues m+x. + """ + assert k in [0, 1], "k can only take value 0 or 1" + metabolites_uniq = clue_isotopologue_df["metabolite"].unique() + # compute the arrays of geometric means + df_a_group["gmean"] = np.around(df_a_group.apply( + lambda x: stats.gmean(x.dropna()), axis=1), decimals=6) + df_a_group['gmean'] = modify_gmean_by_sanity(df_a_group) + df_a_group["isotopologue_name"] = list(df_a_group.index) + # order these geometric means by m+x + merged_df = pd.merge(df_a_group, clue_isotopologue_df, how='left', + on="isotopologue_name").sort_values( + by=["metabolite", "m+x"]) + MDV_ordered_list_gmeans = list() + for i, metabolite in enumerate(metabolites_uniq): + mdv_arr = merged_df.loc[merged_df["metabolite"] == metabolite, + "gmean"].values + MDV_ordered_list_gmeans.append(np.array(mdv_arr)) + key_name = f'gmean_arr_{k + 1}' + inner_gmean_dict[key_name] = MDV_ordered_list_gmeans + + return inner_gmean_dict + + +def modify_gmean_by_sanity(df: pd.DataFrame) -> np.array: + """ + Accepts the df of samples of one single group, with gmean column (last). + If at least 2 no-NaN samples, gmean value is preserved, + otherwise gmean value is replaced by np.nan. + """ + gmean_values = df["gmean"].to_numpy() + df_samples = df.drop(columns=["gmean"]) + # Count non-NaN cells for each row + no_nan_count = df_samples.count(axis=1).to_numpy() + for i in range(len(gmean_values)): + if no_nan_count[i] < 2: + gmean_values[i] = np.nan + return gmean_values + + +def metabolite_time_profiles_gmean_df_dict( + df: pd.DataFrame, metadata_df: pd.DataFrame, comparison: List[str] +) -> Dict[str, pd.DataFrame]: + """ + Using mean enrichment or abundances, + computes the arrays of geometric means, ordered by time. + Outputs dict of df of arrays, for comparing 2 condition time profiles. 
+ """ + metadata_df = metadata_df.loc[ + metadata_df['condition'].isin(comparison), :] + metadata_df['timenum'] = metadata_df['timenum'].astype(float) + metadata_df_sorted = metadata_df.sort_values(by="timenum") + df = df[metadata_df_sorted['name_to_plot']] + metabolites = list(df.index) + inner_gmean_dict = {"metabolite": metabolites, + "gmean_arr_1": list(), "gmean_arr_2": list()} + for k, condition in enumerate(comparison): + metadata_sorted2 = metadata_df_sorted.loc[ + metadata_df_sorted["condition"] == condition, :].copy() + tmp_gmean_time_dict = {} + for curr_time in list(metadata_sorted2['timepoint'].unique()): + metadata_sorted3 = metadata_sorted2.loc[ + metadata_sorted2['timepoint'] == curr_time, :].copy() + data_time_df = df[metadata_sorted3['name_to_plot']].copy() + data_time_df = data_time_df.assign( + gmean=data_time_df.apply(lambda x: stats.gmean(x.dropna()), + axis=1)) + data_time_df['gmean'] = modify_gmean_by_sanity(data_time_df) + tmp_gmean_time_dict[curr_time] = data_time_df["gmean"].tolist() + tmp_df = pd.DataFrame(tmp_gmean_time_dict) + # tmp_df : T0 T2h ... + # 0 0.471528 0.719920 ... + # 1 3.692766 1.743812 ... + key_name = f'gmean_arr_{k + 1}' + inner_gmean_dict[key_name] = list(np.around(tmp_df.to_numpy(), 6)) + + return {'metabo_time_profile': pd.DataFrame(inner_gmean_dict)} + + +def compute_isotopologue_meaning(isotopologues_list: List[str] + ) -> pd.DataFrame: + """output: df : + isotopologue_name metabolite m+x + Cit_m+0 Cit 0 + ... + """ + isotopologues_uniq = sorted(list(np.unique(np.array(isotopologues_list)))) + clue_isotopologue = pd.DataFrame({ + "isotopologue_name": isotopologues_uniq + }) + clue_isotopologue[["metabolite", "m+x"]] = ( + clue_isotopologue["isotopologue_name"].str.split( + "_m+", expand=True, regex=False)) + clue_isotopologue["m+x"] = clue_isotopologue["m+x"].astype(int) + clue_isotopologue = clue_isotopologue.sort_values( + by=["metabolite", "m+x"]) + return clue_isotopologue + + +def conditions_to_comparisons(conditions: List[str]) -> List[List[str]]: + """ + for the bi-variate analysis generates comparisons ONLY FOR CONDITIONS + example: input: conditions = [A, B, C] + output : [[A, B], [A, C], [B, C]] + The order of the comparison in each inner list, e.g. [A, B] or [B, A] + does not affect the result of the bi-variate test (verified) + """ + comparisons = list() + for i in conditions: + for j in conditions: + if i != j: + pair = np.sort(np.array([i, j]), axis=None) + if list(pair) not in comparisons: + comparisons.append(list(pair)) + return comparisons + + +def timepoints_to_comparisons(metadata_df: pd.DataFrame) -> List[List[str]]: + """generates comparisons specifically as consecutive time points + output: e.g. [[T0, T1], [T1, T2]]""" + # order the time by the numeric part + time_df = metadata_df[["timepoint", "timenum"]].copy().drop_duplicates() + time_df["timenum"] = time_df["timenum"].astype(float) + time_df = time_df.sort_values(by='timenum') + comparisons = list() + for i in range(time_df.shape[0] - 1): + j = i + 1 + comparisons.append([ + time_df["timepoint"].tolist()[j], time_df["timepoint"].tolist()[i]]) + + return comparisons + + +def set_comparisons_by_behavior(behavior: str, cfg: DictConfig, + metadata_df: pd.DataFrame) -> List[List[str]]: + """ + For the bi-variate analysis, builds a list of lists, each list is a + comparison. 
The 'behavior' determines if conditions (1) or time-points (2) + Output: + e.g.(1): comparisons = [['Ctl', 'Treated1'], ['Ctl', 'Treated2']] + or e.g.(2): comparisons = [[T1, T0], [T2, T1]] + """ + if behavior in ["conditions_metabolite_time_profiles", + "conditions_MDV_comparison"]: + comparisons = conditions_to_comparisons(cfg.analysis.conditions) + elif behavior == "timepoints_MDV_comparison": + comparisons = timepoints_to_comparisons(metadata_df) + return comparisons + + +def save_output(result_df: pd.DataFrame, compartment: str, dataset: Dataset, + file_name: data_files_keys_type, out_file_name_str: str, + test: str, out_table_dir: str, cfg: DictConfig) -> None: + """saves result to tab delimited file, for one comparison""" + + result = result_df.sort_values(["correlation_coefficient", "padj"], + ascending=True) + for name_col in ['correlation_coefficient', 'pvalue', 'padj']: + result[name_col] = np.around(result[name_col].to_numpy(), 6) + + result['compartment'] = compartment + out_order_columns = ['correlation_coefficient', 'pvalue', + 'padj', 'compartment', 'gmean_arr_1', 'gmean_arr_2'] + result = result[out_order_columns] # fix the order of columns + if not cfg.analysis.method.output_include_gmean_arr_columns: + result = result.drop(columns=['gmean_arr_1', 'gmean_arr_2']) + + base_file_name = dataset.get_file_for_label(file_name) + base_file_name += f"--{compartment}-{out_file_name_str}-{test}" + output_file_name = os.path.join(out_table_dir, + f"{base_file_name}.tsv") + result.to_csv( + output_file_name, + index_label="metabolite", header=True, sep="\t" + ) + logger.info(f"Saved the result in {output_file_name}") + + +def bivariate_run_and_save_current_comparison( + file_name: data_files_keys_type, df: pd.DataFrame, + metadata_df: pd.DataFrame, compartment: str, dataset: Dataset, + cfg: DictConfig, comparison: List[str], + behavior: str, test: str, out_table_dir: str) -> None: + """ + Runs a bivariate analysis for blocks of values, handling 3 behavior types: + a - conditions_MDV_comparison: + comparison of the MDV arrays between 2 user specified conditions (separately for each time point) + b - timepoints_MDV_comparison: + comparison of the MDV arrays between 2 consecutive time points (separately for each condition) + c - conditions_metabolite_time_profiles: + comparison of the time course profiles of the metabolites + total abundances and mean enrichment, between two conditions + Finally, computes correction for multiple tests, and saves results + """ + df_dict = compute_bivariate_by_behavior( + df, metadata_df, comparison, behavior, test + ) + for akey in df_dict.keys(): + df = df_dict[akey] + + df.set_index("metabolite", inplace=True) + + # apply multiple test correction method to df results + result_df = compute_padj(df, 0.05, + cfg.analysis.method.correction_method) + + comparison_str = "-".join(comparison) + out_file_name_str = f"-MDV-{comparison_str}--{akey}" + if akey == "metabo_time_profile": + out_file_name_str = f"{comparison_str}" + + save_output(result_df, compartment, dataset, file_name, + out_file_name_str, test, out_table_dir, cfg) + + +def bivariate_comparison( + file_name: data_files_keys_type, dataset: Dataset, cfg: DictConfig, + behavior: str, out_table_dir: str +) -> None: + """ + Bi-variate analysis is performed on compartmentalized versions + of data files + Attention: we replace zero values using the provided method + Writes the table with computed statistics in the relevant output directory + """ + assert_literal(file_name, data_files_keys_type, "file 
name") + assert behavior in ["conditions_metabolite_time_profiles", + "conditions_MDV_comparison", + "timepoints_MDV_comparison"], "wrong behavior chosen" + impute_value = cfg.analysis.method.impute_values[file_name] + test = cfg.analysis.method[behavior][file_name] # e.g. pearson + + for compartment, compartmentalized_df in \ + dataset.compartmentalized_dfs[file_name].items(): + df = compartmentalized_df + metadata_df_subset = dataset.metadata_df.loc[ + dataset.metadata_df['compartment'] == compartment, :] + metadata_df_subset = metadata_df_subset.loc[ + metadata_df_subset['condition'].isin(cfg.analysis.conditions), :] + + df = df[metadata_df_subset['name_to_plot']] + + val_instead_zero = arg_repl_zero2value(impute_value, df) + df = df.replace(to_replace=0, value=val_instead_zero) + # note: do not drop rows all zero or all nan, blocks can break ! + if file_name == "abundances": # only reduction values if abundances + df = row_wise_nanstd_reduction(df) + df = df.round(decimals=6) + + automatic_comparisons = set_comparisons_by_behavior( + behavior, cfg, metadata_df_subset + ) + for comparison in automatic_comparisons: + # e.g. comparison = ['A', 'B'], list of exactly two elements + bivariate_run_and_save_current_comparison( + file_name, df, metadata_df_subset, compartment, + dataset, cfg, comparison, + behavior, # specifies the type of comparison + test, out_table_dir) diff --git a/src/dimet/processing/differential_analysis.py b/src/dimet/processing/differential_analysis.py index 01504e9..61dd37c 100644 --- a/src/dimet/processing/differential_analysis.py +++ b/src/dimet/processing/differential_analysis.py @@ -81,30 +81,6 @@ def distance_or_overlap(df: pd.DataFrame, groups: List) -> pd.DataFrame: return df -def select_rows_with_sufficient_non_nan_values___previous_version( - df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame): - """ - Identifies rows in the input DataFrame that have enough replicates. - Separates the DataFrame into two parts based on the presence or absence - of enough replicates, - returning them as separate DataFrames. - """ - # TODO: this is too stringent --> ! - # ! 
-- > changed to: len([not NaN values]) >= 2 in each group - # see the re-writen function select_rows_with_sufficient_non_nan_values - try: - bad_df = df[(df["count_nan_samples_group1"] > 0) | ( - df["count_nan_samples_group2"] > 0)] - good_df = df[(df["count_nan_samples_group1"] == 0) & ( - df["count_nan_samples_group2"] == 0)] - - except Exception as e: - print(e) - print("Error in separate_before_stats (enough replicates ?)") - - return good_df, bad_df - - def select_rows_with_sufficient_non_nan_values( df: pd.DataFrame, groups: List[List[str]] ) -> (pd.DataFrame, pd.DataFrame): diff --git a/tests/test_bivariate_analysis.py b/tests/test_bivariate_analysis.py new file mode 100644 index 0000000..22586ca --- /dev/null +++ b/tests/test_bivariate_analysis.py @@ -0,0 +1,144 @@ +from unittest import TestCase + +import numpy as np +import pandas as pd +from scipy import stats + +from dimet.processing import bivariate_analysis + + +class TestBivariateAnalysis(TestCase): + + def test_conditions_to_comparisons(self): + conditions_list = ["A", "B", "C", "D"] + result = bivariate_analysis.conditions_to_comparisons( + conditions_list + ) + self.assertListEqual(result[0], ['A', 'B']) + self.assertListEqual(result[1], ['A', 'C']) + self.assertListEqual(result[2], ['A', 'D']) + self.assertListEqual(result[3], ['B', 'C']) + self.assertListEqual(result[4], ['B', 'D']) + + def test_metabolite_time_profiles_gmean_df_dict(self): + df = pd.DataFrame({'metabolite': [1, 2], + 'A-T0-1': [4.6, 4.6], 'A-T0-2': [6.5, 6.5], + 'A-T1-1': [2, 2], + 'A-T1-2': [10.1, 8.4], 'A-T4-1': [5.6, 3.6], + 'A-T4-1': [1.6, 1.8], + 'A-T7-1': [7.6, 7.6], 'A-T7-1': [8.1, 9.3], + 'B-T0-1': [8.1, 9.3], 'B-T0-2': [1.6, 1.8], + 'B-T1-1': [6, 7], + 'B-T1-2': [10.1, 8.4], 'B-T4-1': [3.2, 3.6], + 'B-T4-1': [6.5, 6.5], + 'B-T7-1': [4.6, 4.6], 'B-T7-1': [7.6, 7.6]}) + metadata_df = pd.DataFrame({ + 'condition': ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'], + 'timenum': [0, 0, 1.5, 1.5, 4, 4, 7, 7, + 0, 0, 1.5, 1.5, 4, 4, 7, 7], + 'timepoint': ['0h', '0h', '1.5h', '1.5h', '4h', '4h', '7h', '7h', + '0h', '0h', '1.5h', '1.5h', '4h', '4h', '7h', '7h'], + 'name_to_plot': ['A-T0-1', 'A-T0-2', 'A-T1-1', 'A-T1-2', + 'A-T4-1', 'A-T4-1', 'A-T7-1', 'A-T7-1', + 'B-T0-1', 'B-T0-2', 'B-T1-1', 'B-T1-2', + 'B-T4-1', 'B-T4-1', 'B-T7-1', 'B-T7-1']}) + comparison = ["A", "B"] + result = bivariate_analysis.metabolite_time_profiles_gmean_df_dict( + df, metadata_df, comparison + ) # result : + # {'metabo_time_profile': + # metabolite gmean_arr_1 gmean_arr_2 + # 0 [5.468089, 4.494441, 1.6, 8.1] [3.6, 7.7846, 6.5, 7.6] + # 1 [5.468089, 4.09878, 1.8, 9.3] [4.091455, 7.668116, 6.5, 7.6]} + self.assertTrue(list(result.keys())[0] == 'metabo_time_profile') + self.assertListEqual( + result['metabo_time_profile'].iloc[0, 1].tolist(), + [5.468089, 4.494441, 1.6, 8.1]) + + def test_modify_gmean_by_sanity(self): + df = pd.DataFrame({'A-T0-1': [4.6, np.nan], + 'A-T0-2': [6.5, np.nan], + 'A-T0-3': [np.nan, 2]}) + df.index = ["metabolite1", "metabolite2"] + df = df.assign(gmean=df.apply(lambda x: stats.gmean(x.dropna()), + axis=1)) + # df.gmean.to_numpy() : [5.46808925, 2. 
] + result = bivariate_analysis.modify_gmean_by_sanity(df) + # result : [5.46808925, nan] np array + self.assertFalse(np.all(df["gmean"].to_numpy() == result)) + self.assertAlmostEqual(result[0], 5.46808925, places=5) + self.assertTrue(np.isnan(result[1])) + + def test_inner_gmean_dict_filler(self): + k = 0 + inner_gmean_dict = {"metabolite": ['CoA', 'Ala'], + "gmean_arr_1": list()} + + df_a_group = pd.DataFrame({ + 'A-T0-1': [0.2, 0.3, 0.5, 0.1, 0.2, 0.3, 0.4], + 'A-T0-2': [0.17, 0.33, 0.5, 0.1, 0.22, 0.27, 0.41], + 'A-T0-3': [0.2, 0.28, 0.52, 0.11, 0.18, 0.29, 0.42]}) + df_a_group.index = ['CoA_m+0', 'CoA_m+1', 'CoA_m+2', + 'Ala_m+0', 'Ala_m+1', 'Ala_m+2', 'Ala_m+3'] + + clue_isotopologue_df = pd.DataFrame({ + 'isotopologue_name': ['CoA_m+0', 'CoA_m+1', 'CoA_m+2', + 'Ala_m+0', 'Ala_m+1', 'Ala_m+2', 'Ala_m+3'], + 'metabolite': ['CoA', 'CoA', 'CoA', 'Ala', 'Ala', 'Ala', 'Ala'], + 'm+x': [0, 1, 2, 0, 1, 2, 3]}) + clue_isotopologue_df['m+x'] = clue_isotopologue_df['m+x'].astype(int) + + result = bivariate_analysis.inner_gmean_dict_filler( + k, inner_gmean_dict, df_a_group, clue_isotopologue_df + ) # result: + # {'metabolite': ['CoA', 'Ala'], + # ' gmean_arr_1': [array([0.189454, 0.302643, 0.50658 ]), + # array([0.103228, 0.199331, 0.286392, 0.409919])]} + self.assertTrue(np.allclose(np.array( + [0.189454, 0.302643, 0.50658]), + result['gmean_arr_1'][0], rtol=1e-6)) + self.assertTrue(np.allclose(np.array( + [0.103228, 0.199331, 0.286392, 0.409919]), + result['gmean_arr_1'][1], rtol=1e-6)) + + def test_compute_test_for_df_dict(self): + df = pd.DataFrame({ + 'metabolite': ['CoA', 'Ala'], + 'gmean_arr_1': [np.array([0.1894, 0.3026, 0.506]), + np.array([0.1032, 0.1993, 0.2863, 0.4099])], + 'gmean_arr_2': [np.array([0.506, 0.3026, 0.1894]), + np.array([0.4099, 0.2863, 0.1993, 0.1032])], + }) + df_dict = {'T0': df} # df comparing two conditions A vs B, at T0 + test = "pearson" + result = bivariate_analysis.compute_test_for_df_dict(df_dict, test) + self.assertEqual('T0', list(result.keys())[0]) + self.assertTrue( + set(list(['metabolite', 'pvalue', 'correlation_coefficient'] + )).issubset(set(list(result['T0'].columns))) + ) + + def test_compute_statistical_correlation(self): + df = pd.DataFrame({ + 'metabolite': ['CoA', 'Ala'], + 'gmean_arr_1': [np.array([0.1894, 0.3026, 0.506]), + np.array([0.1032, 0.1993, 0.2863, 0.4099])], + 'gmean_arr_2': [np.array([0.506, 0.3026, 0.1894]), + np.array([0.4099, 0.2863, 0.1993, 0.1032])], + }) + df.index = ['CoA', 'Ala'] + test = "pearson" + result = bivariate_analysis.compute_statistical_correlation(df, test) + + self.assertAlmostEqual(result.loc['CoA', 'correlation_coefficient'], + -0.947312, places=5) + self.assertAlmostEqual(result.loc['CoA', 'pvalue'], + 0.207574, places=5) + self.assertAlmostEqual(result.loc['Ala', 'correlation_coefficient'], + -0.992587, places=5) + self.assertAlmostEqual(result.loc['Ala', 'pvalue'], + 0.007413, places=5) + + +
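The conditions_to_comparisons helper enumerates every unordered pair of conditions. The short sketch below (a hypothetical pairwise_condition_comparisons, not the shipped function) shows that itertools.combinations over a sorted condition list yields the same pairs the new test test_conditions_to_comparisons expects.

from itertools import combinations
from typing import List


def pairwise_condition_comparisons(conditions: List[str]) -> List[List[str]]:
    # each pair appears once, sorted within the pair, matching
    # conditions_to_comparisons(["A", "B", "C"]) == [["A", "B"], ["A", "C"], ["B", "C"]]
    return [sorted(pair) for pair in combinations(sorted(set(conditions)), 2)]


print(pairwise_condition_comparisons(["A", "B", "C", "D"]))
# [['A', 'B'], ['A', 'C'], ['A', 'D'], ['B', 'C'], ['B', 'D'], ['C', 'D']]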
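timepoints_to_comparisons pairs only consecutive time points, ordered by the numeric timenum column and written with the later point first. A minimal equivalent sketch, assuming a metadata frame that carries timepoint and timenum columns as in the patch:

import pandas as pd


def consecutive_timepoint_comparisons(metadata_df: pd.DataFrame) -> list:
    # keep one row per time point, order by the numeric value, pair neighbours
    time_df = (metadata_df[["timepoint", "timenum"]]
               .drop_duplicates()
               .assign(timenum=lambda d: d["timenum"].astype(float))
               .sort_values("timenum"))
    ordered = time_df["timepoint"].tolist()
    return [[ordered[i + 1], ordered[i]] for i in range(len(ordered) - 1)]


metadata = pd.DataFrame({"timepoint": ["T0", "T0", "T1", "T2"],
                         "timenum": [0, 0, 1.5, 4]})
print(consecutive_timepoint_comparisons(metadata))  # [['T1', 'T0'], ['T2', 'T1']]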
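The MDV comparisons reduce each replicate group to one geometric mean per isotopologue (kept only when at least two replicates are not NaN, the rule implemented by modify_gmean_by_sanity), order the values by m+x and correlate the two resulting vectors. A self-contained sketch on made-up data, not the module's own df_dict machinery:

import numpy as np
import pandas as pd
import scipy.stats as stats


def group_mdv(df: pd.DataFrame) -> np.ndarray:
    # geometric mean of the replicates in each row, NaN when fewer than
    # two replicates carry a value
    gmeans = df.apply(lambda row: stats.gmean(row.dropna()), axis=1)
    enough = df.count(axis=1) >= 2
    return np.where(enough, gmeans, np.nan)


# two replicate groups for one metabolite, rows already ordered m+0, m+1, m+2
ctl = pd.DataFrame({"Ctl-1": [0.20, 0.30, 0.50], "Ctl-2": [0.18, 0.32, 0.50]},
                   index=["Cit_m+0", "Cit_m+1", "Cit_m+2"])
trt = pd.DataFrame({"Trt-1": [0.60, 0.25, 0.15], "Trt-2": [0.58, 0.27, 0.15]},
                   index=["Cit_m+0", "Cit_m+1", "Cit_m+2"])

arr_1, arr_2 = group_mdv(ctl), group_mdv(trt)
try:
    corr, pvalue = stats.pearsonr(arr_1, arr_2)
except ValueError:
    # mirrors the patch's fallback when the correlation cannot be computed
    corr, pvalue = np.nan, np.nan
print(np.round(corr, 6), np.round(pvalue, 6))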
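compute_isotopologue_meaning relies on the isotopologue naming convention metabolite_m+x. A small illustration with hypothetical names, using the same literal split on "_m+" with regex=False so the "+" is not read as a regex operator:

import pandas as pd

names = pd.Series(["Cit_m+0", "Cit_m+2", "Ala_m+1"])
parsed = names.str.split("_m+", expand=True, regex=False)
parsed.columns = ["metabolite", "m+x"]
parsed["m+x"] = parsed["m+x"].astype(int)
print(parsed.sort_values(by=["metabolite", "m+x"]))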
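correction_method: fdr_bh in the new YAML selects a Benjamini-Hochberg adjustment of the per-comparison p-values; the patch delegates this to dimet.helpers.compute_padj. Assuming that helper applies the standard procedure, the effect can be illustrated with statsmodels:

import numpy as np
from statsmodels.stats.multitest import multipletests

pvalues = np.array([0.007, 0.02, 0.2, 0.5])
reject, padj, _, _ = multipletests(pvalues, alpha=0.05, method="fdr_bh")
print(np.round(padj, 6))  # adjusted p-values, same order as the input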