Merge pull request #27 from cbib/access-test-bivar

Access test bivar
cbib · Mar 21, 2024 · 05e5152 · 05e5152
2 parents 8f347e3 + ab9722a
commit 05e5152
Show file tree

Hide file tree

Showing 12 changed files with 136 additions and 41 deletions.
diff --git a/src/dimet/__main__.py b/src/dimet/__main__.py
@@ -21,7 +21,8 @@
 @hydra.main(config_path="./config", config_name="config", version_base=None)
 def main_run_analysis(cfg: DictConfig) -> None:
     logger.info(f"The current working directory is {os.getcwd()}")
-    logger.info("Current configuration is %s", OmegaConf.to_yaml(cfg))
+    logger.info("Current configuration and defaults, %s",
+                OmegaConf.to_yaml(cfg))
 
     dataset: Dataset = Dataset(
         config=hydra.utils.instantiate(cfg.analysis.dataset))

diff --git a/src/dimet/config/analysis/method/bivariate_analysis.yaml b/src/dimet/config/analysis/method/bivariate_analysis.yaml
@@ -6,14 +6,16 @@ name: Computation of the correlation of MDV profiles, or the metabolite time cou
 # (**) : automatically will run
 
 conditions_MDV_comparison: # (**) if >= 2 conditions and >=1 timepoint (timepoints run separately)
-  isotopologue_proportions: pearson
+  isotopologue_proportions: spearman
 
 timepoints_MDV_comparison:  # (**) if >= 1 condition and >=2 timepoints
-  isotopologue_proportions: pearson
+  isotopologue_proportions: spearman
 
 conditions_metabolite_time_profiles:  # (**) if >= 2 conditions  AND >=2 time points in data
-  abundances: pearson
-  mean_enrichment: pearson
+  abundances: spearman
+  mean_enrichment: spearman
+
+# the tests above can be overridden through the external config key 'statistical_test'
 
 correction_method: fdr_bh
 

diff --git a/src/dimet/config/analysis/method/differential_analysis.yaml b/src/dimet/config/analysis/method/differential_analysis.yaml
@@ -26,4 +26,6 @@ disfit_tail_option: "auto"
 
 # Note: the disfit_tail_option depends on the comparison and the data:
 #    if advanced knowledge of both, set "two-sided" or "right-tailed"
-#    otherwise leave "auto" as default
+#    otherwise leave "auto" as default
+
+# Note2: the statistical_test options can be modified using the external config (not here)
diff --git a/src/dimet/config/analysis/method/time_course_analysis.yaml b/src/dimet/config/analysis/method/time_course_analysis.yaml
@@ -27,3 +27,5 @@ disfit_tail_option: "auto"
 # Note: the disfit_tail_option depends on the comparison and the data:
 #    if advanced knowledge of both, set "two-sided" or "right-tailed"
 #    otherwise leave "auto" as default
+
+# Note2: the statistical_test options can be modified using the external config (not here)
diff --git a/src/dimet/constants.py b/src/dimet/constants.py
@@ -25,6 +25,8 @@ def assert_literal(value: str, lit_type, check: Optional[str] = None):
     "isotopologues",
 ]
 
+supported_file_extension = ["csv", "tsv"]
+
 availtest_methods = ["MW", "KW", "ranksum", "Wcox", "Tt", "BrMu", "prm-scipy",
                      "disfit", "none"]
 
@@ -64,3 +66,6 @@ def assert_literal(value: str, lit_type, check: Optional[str] = None):
 columns_transcripts_config_keys = ['ID', 'values']
 
 metabolites_values_for_metabologram = ['log2FC', 'FC']
+
+# minimum non-zero value tolerated when values are fractions or proportions
+minimum_tolerated_fraction_value = 1e-4
diff --git a/src/dimet/data/__init__.py b/src/dimet/data/__init__.py
@@ -11,7 +11,7 @@
 from dimet.constants import molecular_types_for_metabologram
 from dimet.helpers import (df_to_dict_by_compartment,
                            drop_all_nan_metabolites_on_comp_frames,
-                           set_samples_names,
+                           extfind, set_samples_names,
                            verify_metadata_sample_not_duplicated,
                            message_bad_separator_input)
 
@@ -76,20 +76,23 @@ def preload(self):
             self.sub_folder_absolute = self.config.subfolder
 
         # start loading the dataframes
+        ext = self.get_files_extension_as_dict()  # extension str by file
         file_paths = [
-            ("metadata", os.path.join(self.sub_folder_absolute,
-                                      self.config.metadata + ".csv")),
-            ("abundances", os.path.join(self.sub_folder_absolute,
-                                        self.config.abundances + ".csv")),
+            ("metadata", os.path.join(
+                self.sub_folder_absolute,
+                self.config.metadata + "." + ext['metadata'])),
+            ("abundances", os.path.join(
+                self.sub_folder_absolute,
+                self.config.abundances + "." + ext['abundances'])),
             ("mean_enrichment", os.path.join(
                 self.sub_folder_absolute,
-                self.config.mean_enrichment + ".csv")),
+                self.config.mean_enrichment + "." + ext['mean_enrichment'])),
             ("isotopologue_proportions", os.path.join(
                 self.sub_folder_absolute,
-                self.config.isotopologue_proportions + ".csv")),
+                self.config.isotopologue_proportions + "." + ext['isotopologue_proportions'])),
             ("isotopologues", os.path.join(
                 self.sub_folder_absolute,
-                self.config.isotopologues + ".csv")),
+                self.config.isotopologues + "." + ext['isotopologues'])),
         ]
         dfs = []
         for label, file_path in file_paths:
@@ -103,13 +106,13 @@ def preload(self):
                     dfs.append(pd.read_csv(file_path, sep="\t", header=0))
                 self.available_datasets.add(label)
             except FileNotFoundError:
-                if file_path.endswith(self.config.isotopologues + ".csv"):
+                if file_path.endswith(self.config.isotopologues + "." + ext['isotopologues']):
                     message_detail = "isotopologue absolute values missing"
                     logger.critical(
-                        "File %s not found (%s), continuing"
+                        "File %s not found (%s), continue"
                         % (file_path, message_detail))
                 else:
-                    logger.critical("File %s not found, continuing",
+                    logger.critical("File %s not found, continue",
                                     file_path)
                 dfs.append(None)
             except Exception as e:
@@ -182,6 +185,22 @@ def get_file_for_label(self, label):
         else:
             raise ValueError(f"Unknown label {label}")
 
+    def get_files_extension_as_dict(self):
+        """returns dict of dataset label -> file extension, via extfind (helpers)"""
+        extension_dict: Dict[str, str] = dict()
+        extension_dict['metadata'] = extfind(self.sub_folder_absolute,
+                                             self.config.metadata)
+        extension_dict['abundances'] = extfind(self.sub_folder_absolute,
+                                               self.config.abundances)
+        extension_dict['mean_enrichment'] = extfind(
+            self.sub_folder_absolute, self.config.mean_enrichment)
+        extension_dict['isotopologues'] = extfind(self.sub_folder_absolute,
+                                                  self.config.isotopologues)
+        extension_dict['isotopologue_proportions'] = extfind(
+            self.sub_folder_absolute,
+            self.config.isotopologue_proportions)
+        return extension_dict
+
 
 class DataIntegrationConfig(DatasetConfig):
     transcripts: ListConfig
@@ -225,32 +244,39 @@ def load_deg_dfs(self):
         # the keys are integers, with the order of files in the dataset yml
         for i, file_name in enumerate(self.config.transcripts):
             try:
+                file_extension = extfind(self.sub_folder_absolute, file_name)
                 path_deg_file = os.path.join(
                     self.sub_folder_absolute,
-                    f"{file_name}.csv")
+                    f"{file_name}.{file_extension}")
                 deg_df = pd.read_csv(path_deg_file, sep='\t', header=0)
                 self.deg_dfs[i] = deg_df
             except FileNotFoundError:
-                logger.info(f"{file_name}.csv: file not found")
+                logger.info(f"{file_name}.{file_extension}: file not found")
             except Exception as e:
-                logger.info(f'Error while opening file {file_name}.csv {e}')
+                logger.info(
+                    f'Error while opening file {file_name}.{file_extension} '
+                    f' \n {e}')
 
         logger.info("Finished loading transcripts dataframes: "
                     "%s", self.config.transcripts)
 
     def load_pathways_dfs(self):
         for k in self.config.pathways.keys():
             try:
+                file_extension = extfind(
+                    self.sub_folder_absolute, self.config.pathways[k])
                 path_file = os.path.join(
                     self.sub_folder_absolute,
-                    f"{self.config.pathways[k]}.csv")
+                    f"{self.config.pathways[k]}.{file_extension}")
                 pathway_df = pd.read_csv(path_file, sep='\t', header=0)
                 self.pathways_dfs[k] = pathway_df
             except FileNotFoundError:
-                logger.info(f"{self.config.pathways[k]}.csv: file not found")
+                logger.info(
+                    f"{self.config.pathways[k]}.{file_extension}: not found")
             except Exception as e:
-                logger.info(f'Error while opening file '
-                            f'{self.config.pathways[k]}.csv {e}')
+                logger.info(
+                    f'{e}. Error while opening file '
+                    f'{self.config.pathways[k]}.{file_extension} \n {e}')
 
         logger.info("Finished loading pathways dataframes: "
                     "%s", self.config.pathways)

diff --git a/src/dimet/helpers.py b/src/dimet/helpers.py
@@ -3,12 +3,16 @@
 """
 @author: Johanna Galvis, Florian Specque, Macha Nikolski
 """
+import os
 import logging
 from collections.abc import Iterable
 from functools import reduce
 from typing import Dict, List
 
-from dimet.constants import assert_literal, overlap_methods_types
+from dimet.constants import (assert_literal,
+                             minimum_tolerated_fraction_value,
+                             overlap_methods_types,
+                             supported_file_extension)
 
 import numpy as np
 
@@ -73,7 +77,7 @@ def concatenate_dataframes(df1: pd.DataFrame, df2: pd.DataFrame,
     df2 = df2.reindex(columns=df1.columns, fill_value=np.nan)
     df3 = df3.reindex(columns=df1.columns, fill_value=np.nan)
     # please leave ignore_index as False:
-    # otherwise numbers and not metabolites appear in .csv exported results:
+    # otherwise numbers and not metabolites appear in exported results:
     result = pd.concat([df1, df2, df3], ignore_index=False)
     return result
 
@@ -197,7 +201,9 @@ def arg_repl_zero2value(how: str, df: pd.DataFrame) -> float:
             except Exception as e:
                 logger.info(f"{e}. {err_msg}")
                 raise ValueError(err_msg)
-        min_value = df[df > 0].min(skipna=True).min(skipna=True)
+        min_value = df[
+            df >= minimum_tolerated_fraction_value  # '> 0' prone to errors if values < 1e-6 exist in the data (and round is 6 places!)
+        ].min(skipna=True).min(skipna=True)
         output_value = min_value / denominator
     else:
         try:
@@ -488,3 +494,18 @@ def msg_correction_method_not_suitable(filename: str, test: str) -> str:
                f" for multiple tests correction (e.g. Bonferroni, "
                f" B-H, or other), is unsuitable and will be omitted")
     return message
+
+
+def extfind(parent_folder_absolute, file_name):
+    repertoire_extension = supported_file_extension
+    out_str = ""
+    for x in repertoire_extension:
+        if not os.path.exists(
+                os.path.join(parent_folder_absolute,
+                             file_name + f".{x}")):
+            continue
+        else:
+            out_str = x
+    if out_str not in repertoire_extension:
+        out_str = "csv"  # if none of the supported extensions, try this one
+    return out_str
diff --git a/src/dimet/method/__init__.py b/src/dimet/method/__init__.py
@@ -1,7 +1,7 @@
 import logging
 import os
 import sys
-from typing import Union
+from typing import Dict, Union
 
 import hydra
 from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict
@@ -191,6 +191,14 @@ class BivariateAnalysisConfig(MethodConfig):
     """
     correction_method: str = "fdr_bh"
     output_include_gmean_arr_columns: bool = True
+    conditions_MDV_comparison: Dict[str, str] = {
+        'isotopologue_proportions': 'spearman'}
+    timepoints_MDV_comparison: Dict[str, str] = {
+        'isotopologue_proportions': 'spearman'}
+    conditions_metabolite_time_profiles: Dict[str, str] = {
+        'abundances': 'spearman',
+        'mean_enrichment': 'spearman'
+    }
 
     def build(self) -> "BivariateAnalysis":
         return BivariateAnalysis(config=self)
@@ -873,37 +881,64 @@ def check_expectations_config_metabo(
 class BivariateAnalysis(Method):
     config: BivariateAnalysisConfig
 
+    @staticmethod
+    def replace_test__if_user_external_config(cfg):
+        """
+        replaces the test in the bivariate internal config if the user set
+         another test in the external config ('statistical_test')
+        """
+        if "statistical_test" in list(cfg.analysis.keys()):
+            if cfg.analysis.statistical_test is not None:
+                user_test = cfg.analysis.statistical_test
+                cfg.analysis.method.conditions_MDV_comparison[
+                    'isotopologue_proportions'] = user_test
+                cfg.analysis.method.timepoints_MDV_comparison[
+                    'isotopologue_proportions'] = user_test
+                cfg.analysis.method.conditions_metabolite_time_profiles[
+                    'abundances'] = user_test
+                cfg.analysis.method.conditions_metabolite_time_profiles[
+                    'mean_enrichment'] = user_test
+        else:
+            pass
+        return cfg
+
     def run(self, cfg: DictConfig, dataset: Dataset) -> None:
         """
         Runs bivariate analysis, the 'behavior' is the type of comparison:
         - conditions_MDV_comparison
         - timepoints_MDV_comparison
         - conditions_metabolite_time_profiles
         """
+        logger.info(f"The current working directory is {os.getcwd()}")
+
+        cfg = self.replace_test__if_user_external_config(cfg)
+        logger.info("Current configuration is %s", OmegaConf.to_yaml(cfg))
+
         logger.info(
             "Will compute bi-variate analysis, with the following config: %s",
             self.config)
 
         out_table_dir = os.path.join(os.getcwd(), cfg.table_path)
         os.makedirs(out_table_dir, exist_ok=True)
+
         self.check_expectations(cfg, dataset)
 
-        datatype = "isotopologue_proportions"
-        if datatype in dataset.compartmentalized_dfs.keys():
+        datatype_mdv = "isotopologue_proportions"
+        if datatype_mdv in dataset.compartmentalized_dfs.keys():
             logger.info(f"Running bi-variate analysis with "
-                        f"{datatype}:")
+                        f"{datatype_mdv}:")
             if len(cfg.analysis.conditions) >= 2:
                 logger.info("assessing MDV (Mass Distribution Vector) "
                             "between conditions")
                 bivariate_comparison(
-                    datatype, dataset, cfg,
+                    datatype_mdv, dataset, cfg,
                     behavior="conditions_MDV_comparison",
                     out_table_dir=out_table_dir)
             if len(dataset.metadata_df["timepoint"].unique()) >= 2:
                 logger.info("assessing MDV (Mass Distribution Vector) "
                             "between time-points")
                 bivariate_comparison(
-                    datatype, dataset, cfg,
+                    datatype_mdv, dataset, cfg,
                     behavior="timepoints_MDV_comparison",
                     out_table_dir=out_table_dir)
 

diff --git a/src/dimet/processing/bivariate_analysis.py b/src/dimet/processing/bivariate_analysis.py
@@ -32,7 +32,7 @@ def compute_statistical_correlation(df: pd.DataFrame,
     stat_list = []
     pvalue_list = []
     for i, metabolite in enumerate(list(df['metabolite'])):
-        # array of n-(timepoints or m+x) geometrical means values
+        # array of n (timepoints or m+x) geometric mean values
         array_1 = df.loc[metabolite, "gmean_arr_1"]
         array_2 = df.loc[metabolite, "gmean_arr_2"]
         if test == "pearson":
@@ -79,7 +79,7 @@ def compute_bivariate_by_behavior(
     """
     performs two steps:
     1. calls functions to compute geometric means, obtaining df's inside dict
-    2. computes the bivariate statistical test (pearson by default)
+    2. computes the bivariate statistical test
     """
     if behavior == "conditions_MDV_comparison":
         df_dict = conditions_MDV_gmean_df_dict(df, metadata_df, comparison)

diff --git a/src/dimet/processing/pca_analysis.py b/src/dimet/processing/pca_analysis.py
@@ -121,12 +121,12 @@ def pca_global_compartment_dataset(df: pd.DataFrame,
 
 def send_to_tables(pca_results_compartment_dict: dict,
                    out_table_dir: str) -> None:
-    """ Save each result to .csv files """
+    """ Save each result to tab delimited files """
     for tup in pca_results_compartment_dict.keys():
         out_table = "--".join(list(tup))
         for df in pca_results_compartment_dict[tup].keys():
             pca_results_compartment_dict[tup][df].to_csv(
-                os.path.join(out_table_dir, f"{out_table}_{df}.csv"),
+                os.path.join(out_table_dir, f"{out_table}_{df}.tsv"),
                 sep='\t', index=False)
     logger.info(f"Saved pca tables in {out_table_dir}")
 
@@ -136,7 +136,7 @@ def run_pca_analysis(file_name: data_files_keys_type,
                      out_table_dir: str, mode: str) -> Union[None, dict]:
     """
     Generates all PCA results, both global (default) and with split data.
-     - mode='save_tables', the PCA tables are saved to .csv;
+     - mode='save_tables', the PCA tables are saved to tab delimited files;
      or
      - mode='return_results_dict', returns the results object (dict)
     """

diff --git a/src/dimet/visualization/abundance_bars.py b/src/dimet/visualization/abundance_bars.py
@@ -62,8 +62,9 @@ def plot_one_metabolite(df: pd.DataFrame,
         palette=palette_choice,
         alpha=1,
         edgecolor="black",
-        errcolor="black",
-        errwidth=1.7,
+        # errcolor="black",   # deprecated in seaborn0.13.2
+        # errwidth=1.7,   # deprecated in seaborn0.13.2
+        err_kws={'linewidth': 1.7, 'color': 'black'},
         capsize=0.12,
     )
     if do_stripplot: