Skip to content

Commit

Permalink
Merge pull request #27 from cbib/access-test-bivar
Browse files Browse the repository at this point in the history
Access test bivar
  • Loading branch information
johaGL authored Mar 21, 2024
2 parents 8f347e3 + ab9722a commit 05e5152
Show file tree
Hide file tree
Showing 12 changed files with 136 additions and 41 deletions.
3 changes: 2 additions & 1 deletion src/dimet/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
@hydra.main(config_path="./config", config_name="config", version_base=None)
def main_run_analysis(cfg: DictConfig) -> None:
logger.info(f"The current working directory is {os.getcwd()}")
logger.info("Current configuration is %s", OmegaConf.to_yaml(cfg))
logger.info("Current configuration and defaults, %s",
OmegaConf.to_yaml(cfg))

dataset: Dataset = Dataset(
config=hydra.utils.instantiate(cfg.analysis.dataset))
Expand Down
10 changes: 6 additions & 4 deletions src/dimet/config/analysis/method/bivariate_analysis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@ name: Computation of the correlation of MDV profiles, or the metabolite time cou
# (**) : automatically will run

conditions_MDV_comparison: # (**) if >= 2 conditions and >=1 timepoint (timepoints run separately)
isotopologue_proportions: pearson
isotopologue_proportions: spearman

timepoints_MDV_comparison: # (**) if >= 1 condition and >=2 timepoints
isotopologue_proportions: pearson
isotopologue_proportions: spearman

conditions_metabolite_time_profiles: # (**) if >= 2 conditions AND >=2 time points in data
abundances: pearson
mean_enrichment: pearson
abundances: spearman
mean_enrichment: spearman

# the test set above can be overridden through the 'statistical_test' key of the external (user) config

correction_method: fdr_bh

Expand Down
4 changes: 3 additions & 1 deletion src/dimet/config/analysis/method/differential_analysis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@ disfit_tail_option: "auto"

# Note: the disfit_tail_option depends on the comparison and the data:
# if advanced knowledge of both, set "two-sided" or "right-tailed"
# otherwise leave "auto" as default
# otherwise leave "auto" as default

# Note 2: the statistical_test options are meant to be modified in the external (user) config, not in this file
2 changes: 2 additions & 0 deletions src/dimet/config/analysis/method/time_course_analysis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,5 @@ disfit_tail_option: "auto"
# Note: the disfit_tail_option depends on the comparison and the data:
# if advanced knowledge of both, set "two-sided" or "right-tailed"
# otherwise leave "auto" as default

# Note 2: the statistical_test options are meant to be modified in the external (user) config, not in this file
5 changes: 5 additions & 0 deletions src/dimet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def assert_literal(value: str, lit_type, check: Optional[str] = None):
"isotopologues",
]

supported_file_extension = ["csv", "tsv"]

availtest_methods = ["MW", "KW", "ranksum", "Wcox", "Tt", "BrMu", "prm-scipy",
"disfit", "none"]

Expand Down Expand Up @@ -64,3 +66,6 @@ def assert_literal(value: str, lit_type, check: Optional[str] = None):
columns_transcripts_config_keys = ['ID', 'values']

metabolites_values_for_metabologram = ['log2FC', 'FC']

# minimum non-zero value tolerated when values are fractions or proportions
minimum_tolerated_fraction_value = 1e-4
62 changes: 44 additions & 18 deletions src/dimet/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from dimet.constants import molecular_types_for_metabologram
from dimet.helpers import (df_to_dict_by_compartment,
drop_all_nan_metabolites_on_comp_frames,
set_samples_names,
extfind, set_samples_names,
verify_metadata_sample_not_duplicated,
message_bad_separator_input)

Expand Down Expand Up @@ -76,20 +76,23 @@ def preload(self):
self.sub_folder_absolute = self.config.subfolder

# start loading the dataframes
ext = self.get_files_extension_as_dict() # extension str by file
file_paths = [
("metadata", os.path.join(self.sub_folder_absolute,
self.config.metadata + ".csv")),
("abundances", os.path.join(self.sub_folder_absolute,
self.config.abundances + ".csv")),
("metadata", os.path.join(
self.sub_folder_absolute,
self.config.metadata + "." + ext['metadata'])),
("abundances", os.path.join(
self.sub_folder_absolute,
self.config.abundances + "." + ext['abundances'])),
("mean_enrichment", os.path.join(
self.sub_folder_absolute,
self.config.mean_enrichment + ".csv")),
self.config.mean_enrichment + "." + ext['mean_enrichment'])),
("isotopologue_proportions", os.path.join(
self.sub_folder_absolute,
self.config.isotopologue_proportions + ".csv")),
self.config.isotopologue_proportions + "." + ext['isotopologue_proportions'])),
("isotopologues", os.path.join(
self.sub_folder_absolute,
self.config.isotopologues + ".csv")),
self.config.isotopologues + "." + ext['isotopologues'])),
]
dfs = []
for label, file_path in file_paths:
Expand All @@ -103,13 +106,13 @@ def preload(self):
dfs.append(pd.read_csv(file_path, sep="\t", header=0))
self.available_datasets.add(label)
except FileNotFoundError:
if file_path.endswith(self.config.isotopologues + ".csv"):
if file_path.endswith(self.config.isotopologues + "." + ext['isotopologues']):
message_detail = "isotopologue absolute values missing"
logger.critical(
"File %s not found (%s), continuing"
"File %s not found (%s), continue"
% (file_path, message_detail))
else:
logger.critical("File %s not found, continuing",
logger.critical("File %s not found, continue",
file_path)
dfs.append(None)
except Exception as e:
Expand Down Expand Up @@ -182,6 +185,22 @@ def get_file_for_label(self, label):
else:
raise ValueError(f"Unknown label {label}")

def get_files_extension_as_dict(self):
    """
    Builds the dictionary giving, for each input file label, the file
    extension that extfind (helpers) returns for that file when searched
    in self.sub_folder_absolute
    """
    # the file name (without extension) of each label is stored in the
    # config attribute bearing the same name as the label
    labels = ('metadata', 'abundances', 'mean_enrichment',
              'isotopologues', 'isotopologue_proportions')
    extension_dict: Dict[str, str] = {
        label: extfind(self.sub_folder_absolute,
                       getattr(self.config, label))
        for label in labels
    }
    return extension_dict


class DataIntegrationConfig(DatasetConfig):
transcripts: ListConfig
Expand Down Expand Up @@ -225,32 +244,39 @@ def load_deg_dfs(self):
# the keys are integers, with the order of files in the dataset yml
for i, file_name in enumerate(self.config.transcripts):
try:
file_extension = extfind(self.sub_folder_absolute, file_name)
path_deg_file = os.path.join(
self.sub_folder_absolute,
f"{file_name}.csv")
f"{file_name}.{file_extension}")
deg_df = pd.read_csv(path_deg_file, sep='\t', header=0)
self.deg_dfs[i] = deg_df
except FileNotFoundError:
logger.info(f"{file_name}.csv: file not found")
logger.info(f"{file_name}.{file_extension}: file not found")
except Exception as e:
logger.info(f'Error while opening file {file_name}.csv {e}')
logger.info(
f'Error while opening file {file_name}.{file_extension} '
f' \n {e}')

logger.info("Finished loading transcripts dataframes: "
"%s", self.config.transcripts)

def load_pathways_dfs(self):
for k in self.config.pathways.keys():
try:
file_extension = extfind(
self.sub_folder_absolute, self.config.pathways[k])
path_file = os.path.join(
self.sub_folder_absolute,
f"{self.config.pathways[k]}.csv")
f"{self.config.pathways[k]}.{file_extension}")
pathway_df = pd.read_csv(path_file, sep='\t', header=0)
self.pathways_dfs[k] = pathway_df
except FileNotFoundError:
logger.info(f"{self.config.pathways[k]}.csv: file not found")
logger.info(
f"{self.config.pathways[k]}.{file_extension}: not found")
except Exception as e:
logger.info(f'Error while opening file '
f'{self.config.pathways[k]}.csv {e}')
logger.info(
f'{e}. Error while opening file '
f'{self.config.pathways[k]}.{file_extension} \n {e}')

logger.info("Finished loading pathways dataframes: "
"%s", self.config.pathways)
Expand Down
27 changes: 24 additions & 3 deletions src/dimet/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
"""
@author: Johanna Galvis, Florian Specque, Macha Nikolski
"""
import os
import logging
from collections.abc import Iterable
from functools import reduce
from typing import Dict, List

from dimet.constants import assert_literal, overlap_methods_types
from dimet.constants import (assert_literal,
minimum_tolerated_fraction_value,
overlap_methods_types,
supported_file_extension)

import numpy as np

Expand Down Expand Up @@ -73,7 +77,7 @@ def concatenate_dataframes(df1: pd.DataFrame, df2: pd.DataFrame,
df2 = df2.reindex(columns=df1.columns, fill_value=np.nan)
df3 = df3.reindex(columns=df1.columns, fill_value=np.nan)
# please leave ignore_index as False:
# otherwise numbers and not metabolites appear in .csv exported results:
# otherwise numbers and not metabolites appear in exported results:
result = pd.concat([df1, df2, df3], ignore_index=False)
return result

Expand Down Expand Up @@ -197,7 +201,9 @@ def arg_repl_zero2value(how: str, df: pd.DataFrame) -> float:
except Exception as e:
logger.info(f"{e}. {err_msg}")
raise ValueError(err_msg)
min_value = df[df > 0].min(skipna=True).min(skipna=True)
min_value = df[
df >= minimum_tolerated_fraction_value # '> 0' prone to errors if values < 1e-6 exist in the data (and round is 6 places!)
].min(skipna=True).min(skipna=True)
output_value = min_value / denominator
else:
try:
Expand Down Expand Up @@ -488,3 +494,18 @@ def msg_correction_method_not_suitable(filename: str, test: str) -> str:
f" for multiple tests correction (e.g. Bonferroni, "
f" B-H, or other), is unsuitable and will be omitted")
return message


def extfind(parent_folder_absolute, file_name):
    """
    Returns the extension (without the dot) of an existing input file.

    Searches `parent_folder_absolute` for `file_name` followed by each of
    the extensions listed in `supported_file_extension`.

    :param parent_folder_absolute: folder expected to contain the file
    :param file_name: name of the file, without extension
    :return: the supported extension for which the file exists; if the file
        exists with several supported extensions, the one listed last in
        `supported_file_extension`; "csv" if no such file exists
    """
    # walking the list backwards and stopping at the first hit gives the
    # last matching entry of the list, without probing the remaining ones
    for extension in reversed(supported_file_extension):
        candidate = os.path.join(parent_folder_absolute,
                                 file_name + f".{extension}")
        if os.path.exists(candidate):
            return extension
    # if none of the supported extensions was found, let the caller try "csv"
    return "csv"
47 changes: 41 additions & 6 deletions src/dimet/method/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import os
import sys
from typing import Union
from typing import Dict, Union

import hydra
from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict
Expand Down Expand Up @@ -191,6 +191,14 @@ class BivariateAnalysisConfig(MethodConfig):
"""
correction_method: str = "fdr_bh"
output_include_gmean_arr_columns: bool = True
conditions_MDV_comparison: Dict[str, str] = {
'isotopologue_proportions': 'spearman'}
timepoints_MDV_comparison: Dict[str, str] = {
'isotopologue_proportions': 'spearman'}
conditions_metabolite_time_profiles: Dict[str, str] = {
'abundances': 'spearman',
'mean_enrichment': 'spearman'
}

def build(self) -> "BivariateAnalysis":
return BivariateAnalysis(config=self)
Expand Down Expand Up @@ -873,37 +881,64 @@ def check_expectations_config_metabo(
class BivariateAnalysis(Method):
config: BivariateAnalysisConfig

@staticmethod
def replace_test__if_user_external_config(cfg):
    """
    Replaces the test in the bivariate internal config if the user set
    another test in the external config.

    When `cfg.analysis` has a `statistical_test` key whose value is not
    None, that value is written into every comparison entry of
    `cfg.analysis.method` listed below; otherwise nothing is changed.

    :param cfg: the run configuration; it is modified in place
    :return: the same `cfg` object
    """
    # nothing to replace if the key is absent or its value is null
    if "statistical_test" not in cfg.analysis.keys():
        return cfg
    user_test = cfg.analysis.statistical_test
    if user_test is None:
        return cfg

    # (comparison, data type) entries of the method config receiving the test
    entries_to_replace = (
        ("conditions_MDV_comparison", "isotopologue_proportions"),
        ("timepoints_MDV_comparison", "isotopologue_proportions"),
        ("conditions_metabolite_time_profiles", "abundances"),
        ("conditions_metabolite_time_profiles", "mean_enrichment"),
    )
    method_cfg = cfg.analysis.method
    for comparison, datatype in entries_to_replace:
        getattr(method_cfg, comparison)[datatype] = user_test
    return cfg

def run(self, cfg: DictConfig, dataset: Dataset) -> None:
"""
Runs bivariate analysis, the 'behavior' is the type of comparison:
- conditions_MDV_comparison
- timepoints_MDV_comparison
- conditions_metabolite_time_profiles
"""
logger.info(f"The current working directory is {os.getcwd()}")

cfg = self.replace_test__if_user_external_config(cfg)
logger.info("Current configuration is %s", OmegaConf.to_yaml(cfg))

logger.info(
"Will compute bi-variate analysis, with the following config: %s",
self.config)

out_table_dir = os.path.join(os.getcwd(), cfg.table_path)
os.makedirs(out_table_dir, exist_ok=True)

self.check_expectations(cfg, dataset)

datatype = "isotopologue_proportions"
if datatype in dataset.compartmentalized_dfs.keys():
datatype_mdv = "isotopologue_proportions"
if datatype_mdv in dataset.compartmentalized_dfs.keys():
logger.info(f"Running bi-variate analysis with "
f"{datatype}:")
f"{datatype_mdv}:")
if len(cfg.analysis.conditions) >= 2:
logger.info("assessing MDV (Mass Distribution Vector) "
"between conditions")
bivariate_comparison(
datatype, dataset, cfg,
datatype_mdv, dataset, cfg,
behavior="conditions_MDV_comparison",
out_table_dir=out_table_dir)
if len(dataset.metadata_df["timepoint"].unique()) >= 2:
logger.info("assessing MDV (Mass Distribution Vector) "
"between time-points")
bivariate_comparison(
datatype, dataset, cfg,
datatype_mdv, dataset, cfg,
behavior="timepoints_MDV_comparison",
out_table_dir=out_table_dir)

Expand Down
4 changes: 2 additions & 2 deletions src/dimet/processing/bivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def compute_statistical_correlation(df: pd.DataFrame,
stat_list = []
pvalue_list = []
for i, metabolite in enumerate(list(df['metabolite'])):
# array of n-(timepoints or m+x) geometrical means values
# array of n (timepoints or m+x) geometrical means values
array_1 = df.loc[metabolite, "gmean_arr_1"]
array_2 = df.loc[metabolite, "gmean_arr_2"]
if test == "pearson":
Expand Down Expand Up @@ -79,7 +79,7 @@ def compute_bivariate_by_behavior(
"""
performs two steps:
1. calls functions to compute geometric means, obtaining df's inside dict
2. computes the bivariate statistical test (pearson by default)
2. computes the bivariate statistical test
"""
if behavior == "conditions_MDV_comparison":
df_dict = conditions_MDV_gmean_df_dict(df, metadata_df, comparison)
Expand Down
6 changes: 3 additions & 3 deletions src/dimet/processing/pca_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,12 @@ def pca_global_compartment_dataset(df: pd.DataFrame,

def send_to_tables(pca_results_compartment_dict: dict,
out_table_dir: str) -> None:
""" Save each result to .csv files """
""" Save each result to tab delimited files """
for tup in pca_results_compartment_dict.keys():
out_table = "--".join(list(tup))
for df in pca_results_compartment_dict[tup].keys():
pca_results_compartment_dict[tup][df].to_csv(
os.path.join(out_table_dir, f"{out_table}_{df}.csv"),
os.path.join(out_table_dir, f"{out_table}_{df}.tsv"),
sep='\t', index=False)
logger.info(f"Saved pca tables in {out_table_dir}")

Expand All @@ -136,7 +136,7 @@ def run_pca_analysis(file_name: data_files_keys_type,
out_table_dir: str, mode: str) -> Union[None, dict]:
"""
Generates all PCA results, both global (default) and with splited data.
- mode='save_tables', the PCA tables are saved to .csv;
- mode='save_tables', the PCA tables are saved to tab delimited files;
or
- mode='return_results_dict', returns the results object (dict)
"""
Expand Down
5 changes: 3 additions & 2 deletions src/dimet/visualization/abundance_bars.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ def plot_one_metabolite(df: pd.DataFrame,
palette=palette_choice,
alpha=1,
edgecolor="black",
errcolor="black",
errwidth=1.7,
# errcolor="black", # deprecated in seaborn0.13.2
# errwidth=1.7, # deprecated in seaborn0.13.2
err_kws={'linewidth': 1.7, 'color': 'black'},
capsize=0.12,
)
if do_stripplot:
Expand Down
Loading

0 comments on commit 05e5152

Please sign in to comment.