Merge pull request #20 from cbib/new-bivar

New: bivariate analysis
cbib · Jan 22, 2024 · e5ba409 · e5ba409
2 parents 64211e2 + 5931508
commit e5ba409
Show file tree

Hide file tree

Showing 7 changed files with 705 additions and 46 deletions.
diff --git a/src/dimet/__main__.py b/src/dimet/__main__.py
@@ -25,7 +25,6 @@ def main_run_analysis(cfg: DictConfig) -> None:
         config=hydra.utils.instantiate(cfg.analysis.dataset))
     dataset.preload()
     dataset.split_datafiles_by_compartment()
-    dataset.save_datafiles_split_by_compartment()
     method: Method = hydra.utils.instantiate(
         cfg.analysis.method).build()  # method factory
 

diff --git a/src/dimet/config/analysis/method/bivariate_analysis.yaml b/src/dimet/config/analysis/method/bivariate_analysis.yaml
@@ -0,0 +1,26 @@
+_target_: dimet.method.BivariateAnalysisConfig
+
+label: bivariate analysis
+name: Computation of the correlation of MDV profiles, or the metabolite time course profiles
+
+# (**) : runs automatically when the stated requirement is met
+
+conditions_MDV_comparison: # (**) if >= 2 conditions and >= 1 timepoint (each timepoint is run separately)
+  isotopologue_proportions: pearson
+
+timepoints_MDV_comparison:  # (**) if >= 1 condition and >= 2 timepoints
+  isotopologue_proportions: pearson
+
+conditions_metabolite_time_profiles:  # (**) if >= 2 conditions AND >= 2 timepoints in the data
+  abundances: pearson
+  mean_enrichment: pearson
+
+correction_method: fdr_bh
+
+impute_values:
+  abundances: "min"
+  mean_enrichment: "min"
+  isotopologues: "min"
+  isotopologue_proportions: "min"
+
+output_include_gmean_arr_columns: True # if False, the 'gmean_arr_...' columns are excluded
diff --git a/src/dimet/data/__init__.py b/src/dimet/data/__init__.py
@@ -47,7 +47,6 @@ def build(self) -> "Dataset":
 class Dataset(BaseModel):
     config: DatasetConfig
     raw_data_folder: str = None
-    processed_data_folder: str = None
     sub_folder_absolute: str = None
     metadata_df: Optional[pd.DataFrame] = None
     abundances_df: Optional[pd.DataFrame] = None
@@ -77,8 +76,7 @@ def preload(self):
         else:
             self.sub_folder_absolute = self.config.subfolder
         self.raw_data_folder = os.path.join(self.sub_folder_absolute, "raw")
-        self.processed_data_folder = os.path.join(self.sub_folder_absolute,
-                                                  "processed")
+
         # start loading the dataframes
         file_paths = [
             ("metadata", os.path.join(self.raw_data_folder,
@@ -106,10 +104,14 @@ def preload(self):
                     dfs.append(pd.read_csv(file_path, sep="\t", header=0))
                 self.available_datasets.add(label)
             except FileNotFoundError:
-                logger.critical(
-                    "File %s not found, continuing, "
-                    "but this might fail miserably",
-                    file_path)
+                if file_path.endswith(self.config.isotopologues + ".csv"):
+                    message_detail = "isotopologue absolute values missing"
+                    logger.critical(
+                        "File %s not found (%s), continuing"
+                        % (file_path, message_detail))
+                else:
+                    logger.critical("File %s not found, continuing",
+                                    file_path)
                 dfs.append(None)
             except Exception as e:
                 logger.error(
@@ -169,19 +171,6 @@ def split_datafiles_by_compartment(self) -> None:
         frames_dict = set_samples_names(frames_dict, self.metadata_df)
         self.compartmentalized_dfs = frames_dict
 
-    def save_datafiles_split_by_compartment(self) -> None:
-        os.makedirs(self.processed_data_folder, exist_ok=True)
-        out_data_path = self.processed_data_folder
-        for file_name in self.compartmentalized_dfs.keys():
-            for compartment in self.compartmentalized_dfs[file_name].keys():
-                df = self.compartmentalized_dfs[file_name][compartment]
-                tmp_file_name = self.get_file_for_label(file_name)
-                output_file_name = f"{tmp_file_name}-{compartment}.csv"
-                df.to_csv(os.path.join(out_data_path, output_file_name),
-                          sep="\t", header=True, index=False)
-                logger.info(
-                    f"Saved the {compartment} compartment version "
-                    f"of {file_name} in {out_data_path}")
 
     def get_file_for_label(self, label):
         if label == "abundances":
@@ -210,7 +199,6 @@ class DataIntegration(Dataset):
     def set_dataset_integration_config(self):
         self.preload()
         self.split_datafiles_by_compartment()
-        self.save_datafiles_split_by_compartment()
 
         self.integration_files_folder_absolute = os.path.join(
             self.sub_folder_absolute, "integration_files")

diff --git a/src/dimet/method/__init__.py b/src/dimet/method/__init__.py
@@ -15,6 +15,7 @@
                              metabolites_values_for_metabologram)
 from dimet.data import DataIntegration, Dataset
 from dimet.helpers import flatten
+from dimet.processing.bivariate_analysis import bivariate_comparison
 from dimet.processing.differential_analysis import (differential_comparison,
                                                     multi_group_compairson,
                                                     time_course_analysis)
@@ -184,6 +185,18 @@ def build(self) -> "MetabologramIntegration":
         return MetabologramIntegration(config=self)
 
 
+class BivariateAnalysisConfig(MethodConfig):
+    """
+    Configuration of the bi-variate analysis, with default option values.
+    """
+    correction_method: str = "fdr_bh"  # default when the config omits it
+    output_include_gmean_arr_columns: bool = True  # keep 'gmean_arr_*' columns
+
+    def build(self) -> "BivariateAnalysis":  # factory for the matching Method
+        return BivariateAnalysis(config=self)
+
+
+
 class Method(BaseModel):
     config: MethodConfig
 
@@ -852,3 +865,78 @@ def check_expectations_config_metabo(
         except ValueError as e:
             logger.error(f"Data inconsistency: {e}")
             sys.exit(1)
+
+
+class BivariateAnalysis(Method):
+    config: BivariateAnalysisConfig
+
+    def run(self, cfg: DictConfig, dataset: Dataset) -> None:
+        """
+        Runs the bi-variate analysis; 'behavior' names the comparison type:
+        - conditions_MDV_comparison (needs >= 2 conditions in the config)
+        - timepoints_MDV_comparison (needs >= 2 timepoints in the metadata)
+        - conditions_metabolite_time_profiles (needs both of the above)
+        """
+        logger.info(
+            "Will compute bi-variate analysis, with the following config: %s",
+            self.config)
+
+        out_table_dir = os.path.join(os.getcwd(), cfg.table_path)
+        os.makedirs(out_table_dir, exist_ok=True)
+        self.check_expectations(cfg, dataset)  # exits on invalid input
+
+        datatype = "isotopologue_proportions"  # used by both MDV comparisons
+        if datatype in dataset.compartmentalized_dfs.keys():
+            logger.info(f"Running bi-variate analysis with "
+                        f"{datatype}:")
+            if len(cfg.analysis.conditions) >= 2:
+                logger.info("assessing MDV (Mass Distribution Vector) "
+                            "between conditions")
+                bivariate_comparison(
+                    datatype, dataset, cfg,
+                    behavior="conditions_MDV_comparison",
+                    out_table_dir=out_table_dir)
+            if len(dataset.metadata_df["timepoint"].unique()) >= 2:
+                logger.info("assessing MDV (Mass Distribution Vector) "
+                            "between time-points")
+                bivariate_comparison(
+                    datatype, dataset, cfg,
+                    behavior="timepoints_MDV_comparison",
+                    out_table_dir=out_table_dir)
+
+        if (len(cfg.analysis.conditions) >= 2) and (
+           len(dataset.metadata_df["timepoint"].unique()) >= 2):
+            for datatype in ["abundances", "mean_enrichment"]:
+                if datatype in dataset.compartmentalized_dfs.keys():
+                    logger.info(f"Running bi-variate analysis with "
+                                f"{datatype} to compare "
+                                f"time course profiles between conditions")
+                    bivariate_comparison(
+                        datatype, dataset, cfg,
+                        behavior="conditions_metabolite_time_profiles",
+                        out_table_dir=out_table_dir)
+
+    def check_expectations(self, cfg: DictConfig, dataset: Dataset) -> None:
+        # validate config and metadata; on failure log the error and exit(1)
+        try:
+            if ((len(cfg.analysis.conditions) < 2) and
+                    (len(dataset.metadata_df["timepoint"].unique()) < 2)):
+                raise ValueError("Less than 2 conditions, "
+                                 "AND less than 2 timepoints, "
+                                 "impossible to run bi-variate analysis, "
+                                 "aborting")
+            if not set(cfg.analysis.conditions).issubset(
+                        set(dataset.metadata_df['condition'])):
+                raise ValueError(
+                    "Conditions provided for bi-variate analysis "
+                    "in the config file "
+                    "are not present in the metadata file, aborting"
+                )
+        except ConfigAttributeError as e:
+            logger.error(
+                f"Mandatory parameter not provided in the config file:{e}, "
+                f"aborting")
+            sys.exit(1)
+        except ValueError as e:
+            logger.error(f"Data inconsistency:{e}")
+            sys.exit(1)