Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New: bivariate analysis #20

Merged
merged 3 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/dimet/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def main_run_analysis(cfg: DictConfig) -> None:
config=hydra.utils.instantiate(cfg.analysis.dataset))
dataset.preload()
dataset.split_datafiles_by_compartment()
dataset.save_datafiles_split_by_compartment()
method: Method = hydra.utils.instantiate(
cfg.analysis.method).build() # method factory

Expand Down
26 changes: 26 additions & 0 deletions src/dimet/config/analysis/method/bivariate_analysis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
_target_: dimet.method.BivariateAnalysisConfig

label: bivariate analysis
name: Computation of the correlation of MDV profiles, or the metabolite time course profiles

# (**) : the comparison runs automatically when the stated requirement is met

conditions_MDV_comparison:  # (**) if >= 2 conditions and >= 1 timepoint (timepoints run separately)
  isotopologue_proportions: pearson

timepoints_MDV_comparison:  # (**) if >= 1 condition and >= 2 timepoints
  isotopologue_proportions: pearson

conditions_metabolite_time_profiles:  # (**) if >= 2 conditions AND >= 2 timepoints in data
  abundances: pearson
  mean_enrichment: pearson

correction_method: fdr_bh

impute_values:
  abundances: "min"
  mean_enrichment: "min"
  isotopologues: "min"
  isotopologue_proportions: "min"

output_include_gmean_arr_columns: True  # if False, the 'gmean_arr_..' columns are excluded
30 changes: 9 additions & 21 deletions src/dimet/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def build(self) -> "Dataset":
class Dataset(BaseModel):
config: DatasetConfig
raw_data_folder: str = None
processed_data_folder: str = None
sub_folder_absolute: str = None
metadata_df: Optional[pd.DataFrame] = None
abundances_df: Optional[pd.DataFrame] = None
Expand Down Expand Up @@ -77,8 +76,7 @@ def preload(self):
else:
self.sub_folder_absolute = self.config.subfolder
self.raw_data_folder = os.path.join(self.sub_folder_absolute, "raw")
self.processed_data_folder = os.path.join(self.sub_folder_absolute,
"processed")

# start loading the dataframes
file_paths = [
("metadata", os.path.join(self.raw_data_folder,
Expand Down Expand Up @@ -106,10 +104,14 @@ def preload(self):
dfs.append(pd.read_csv(file_path, sep="\t", header=0))
self.available_datasets.add(label)
except FileNotFoundError:
logger.critical(
"File %s not found, continuing, "
"but this might fail miserably",
file_path)
if file_path.endswith(self.config.isotopologues + ".csv"):
message_detail = "isotopologue absolute values missing"
logger.critical(
"File %s not found (%s), continuing"
% (file_path, message_detail))
else:
logger.critical("File %s not found, continuing",
file_path)
dfs.append(None)
except Exception as e:
logger.error(
Expand Down Expand Up @@ -169,19 +171,6 @@ def split_datafiles_by_compartment(self) -> None:
frames_dict = set_samples_names(frames_dict, self.metadata_df)
self.compartmentalized_dfs = frames_dict

def save_datafiles_split_by_compartment(self) -> None:
    """
    Write every compartmentalized dataframe to the processed data folder.

    The folder is created when it does not exist yet. Each dataframe is
    saved tab-separated, with a header and without the index, under the
    name '<file name for the label>-<compartment>.csv'.
    """
    target_dir = self.processed_data_folder
    os.makedirs(target_dir, exist_ok=True)
    for label, per_compartment in self.compartmentalized_dfs.items():
        for compartment, frame in per_compartment.items():
            # the label is mapped to its configured file name
            base_name = self.get_file_for_label(label)
            frame.to_csv(
                os.path.join(target_dir, f"{base_name}-{compartment}.csv"),
                sep="\t", header=True, index=False)
            logger.info(
                f"Saved the {compartment} compartment version "
                f"of {label} in {target_dir}")

def get_file_for_label(self, label):
if label == "abundances":
Expand Down Expand Up @@ -210,7 +199,6 @@ class DataIntegration(Dataset):
def set_dataset_integration_config(self):
self.preload()
self.split_datafiles_by_compartment()
self.save_datafiles_split_by_compartment()

self.integration_files_folder_absolute = os.path.join(
self.sub_folder_absolute, "integration_files")
Expand Down
88 changes: 88 additions & 0 deletions src/dimet/method/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
metabolites_values_for_metabologram)
from dimet.data import DataIntegration, Dataset
from dimet.helpers import flatten
from dimet.processing.bivariate_analysis import bivariate_comparison
from dimet.processing.differential_analysis import (differential_comparison,
multi_group_compairson,
time_course_analysis)
Expand Down Expand Up @@ -184,6 +185,18 @@ def build(self) -> "MetabologramIntegration":
return MetabologramIntegration(config=self)


class BivariateAnalysisConfig(MethodConfig):
    """
    Configuration of the bi-variate analysis method.

    Both fields declared here carry a default value, so either of them
    may be omitted from the analysis configuration.
    """
    correction_method: str = "fdr_bh"
    output_include_gmean_arr_columns: bool = True

    def build(self) -> "BivariateAnalysis":
        """Return the BivariateAnalysis method bound to this configuration."""
        return BivariateAnalysis(config=self)



class Method(BaseModel):
config: MethodConfig

Expand Down Expand Up @@ -852,3 +865,78 @@ def check_expectations_config_metabo(
except ValueError as e:
logger.error(f"Data inconsistency: {e}")
sys.exit(1)


class BivariateAnalysis(Method):
    """Method that runs the bi-variate comparisons on a dataset."""
    config: BivariateAnalysisConfig

    def run(self, cfg: DictConfig, dataset: Dataset) -> None:
        """
        Run every bi-variate comparison that the data allows.

        Each call to bivariate_comparison receives a 'behavior' naming
        the type of comparison:
        - conditions_MDV_comparison: isotopologue proportions, when at
          least 2 conditions are configured
        - timepoints_MDV_comparison: isotopologue proportions, when the
          metadata holds at least 2 distinct timepoints
        - conditions_metabolite_time_profiles: abundances and mean
          enrichment, when both requirements above are met

        The output directory (cfg.table_path joined to the current
        working directory) is created first and handed to every
        comparison; the expectations are checked right after.
        """
        logger.info(
            "Will compute bi-variate analysis, with the following config: %s",
            self.config)

        out_table_dir = os.path.join(os.getcwd(), cfg.table_path)
        os.makedirs(out_table_dir, exist_ok=True)
        self.check_expectations(cfg, dataset)

        # The requirements are evaluated lazily, at the point where each
        # one is needed, rather than once up front.
        def enough_conditions() -> bool:
            return len(cfg.analysis.conditions) >= 2

        def enough_timepoints() -> bool:
            return len(dataset.metadata_df["timepoint"].unique()) >= 2

        def compare(data_kind: str, behavior: str) -> None:
            bivariate_comparison(
                data_kind, dataset, cfg,
                behavior=behavior,
                out_table_dir=out_table_dir)

        mdv_kind = "isotopologue_proportions"
        if mdv_kind in dataset.compartmentalized_dfs:
            logger.info(f"Running bi-variate analysis with {mdv_kind}:")
            if enough_conditions():
                logger.info("assessing MDV (Mass Distribution Vector) "
                            "between conditions")
                compare(mdv_kind, "conditions_MDV_comparison")
            if enough_timepoints():
                logger.info("assessing MDV (Mass Distribution Vector) "
                            "between time-points")
                compare(mdv_kind, "timepoints_MDV_comparison")

        # Time course profiles need several conditions AND several timepoints
        if not (enough_conditions() and enough_timepoints()):
            return
        for profile_kind in ("abundances", "mean_enrichment"):
            if profile_kind not in dataset.compartmentalized_dfs:
                continue
            logger.info(f"Running bi-variate analysis with {profile_kind} "
                        f"to compare time course profiles between conditions")
            compare(profile_kind, "conditions_metabolite_time_profiles")

    def check_expectations(self, cfg: DictConfig, dataset: Dataset) -> None:
        """
        Check that the config and the metadata allow a bi-variate analysis.

        An error is logged and the program exits with status 1 when:
        - a parameter read from the config raises ConfigAttributeError
        - there are fewer than 2 conditions and fewer than 2 timepoints
        - a configured condition is absent from the metadata
        """
        try:
            too_few_conditions = len(cfg.analysis.conditions) < 2
            if too_few_conditions and (
                    len(dataset.metadata_df["timepoint"].unique()) < 2):
                raise ValueError("Less than 2 conditions, "
                                 "AND less than 2 timepoints, "
                                 "impossible to run bi-variate analysis, "
                                 "aborting")
            # conditions named in the config but unknown to the metadata
            unknown_conditions = set(cfg.analysis.conditions) - set(
                dataset.metadata_df['condition'])
            if unknown_conditions:
                raise ValueError(
                    "Conditions provided for bi-variate analysis "
                    "in the config file "
                    "are not present in the metadata file, aborting"
                )
        except ConfigAttributeError as e:
            logger.error(
                f"Mandatory parameter not provided in the config file:{e}, "
                f"aborting")
            sys.exit(1)
        except ValueError as e:
            logger.error(f"Data inconsistency:{e}")
            sys.exit(1)
Loading