From 2f09a8565a51bed1c028ef6d83bd717f0fefa5c3 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 11:23:25 +0800 Subject: [PATCH 001/203] minor change --- dance/pipeline.py | 4 ++-- dance/transforms/filter.py | 4 ++-- dance/transforms/graph/dstg_graph.py | 14 +++++++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index d831ab10..8a5d02d8 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1086,9 +1086,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. - step3_start_k = default(step2_pipeline_planer.config.step3_start_k, 0) + step3_start_k = step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids + step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 78e6d83f..1388d86d 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -145,9 +145,9 @@ def prepCounts(self, x): elif self._FILTER_TARGET == "cells": n_counts = np.sum(x, axis=1) if isinstance(self.min_counts, float) and 0 <= self.min_counts <= 1: - min_counts = np.percentile(n_counts, self.min_counts) + min_counts = np.percentile(n_counts, self.min_counts * 100) else: - max_counts = np.percentile(n_counts, self.max_counts) + max_counts = np.percentile(n_counts, self.max_counts * 100) return min_counts, max_counts else: return self.min_counts, self.max_counts diff --git a/dance/transforms/graph/dstg_graph.py b/dance/transforms/graph/dstg_graph.py index b261fede..e51a3393 100644 --- a/dance/transforms/graph/dstg_graph.py +++ b/dance/transforms/graph/dstg_graph.py @@ -1,3 +1,5 @@ +from typing import Sequence + import networkx as nx import numpy as np import pandas as pd @@ -32,17 +34,23 @@ class DSTGraph(BaseTransform): _DISPLAY_ATTRS = ("k_filter", "num_cc", "ref_split", "inf_split") - def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", **kwargs): + def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", + channels: Sequence[str | None] = (None, None), channel_types: Sequence[str | None] = ("obsm", "obsm"), + **kwargs): super().__init__(**kwargs) self.k_filter = k_filter self.num_cc = num_cc self.ref_split = ref_split self.inf_split = inf_split + self.channels = channels + self.channel_types = channel_types def __call__(self, data): - x_ref = data.get_feature(return_type="numpy", split_name=self.ref_split) - x_inf = data.get_feature(return_type="numpy", split_name=self.inf_split) + x_ref = data.get_feature(return_type="numpy", split_name=self.ref_split, channel=self.channels[0], + channel_type=self.channel_types[0]) + x_inf = data.get_feature(return_type="numpy", split_name=self.inf_split, 
channel=self.channels[1], + channel_type=self.channel_types[1]) adj = compute_dstg_adj(x_ref, x_inf, k_filter=self.k_filter, num_cc=self.num_cc) data.data.obsp[self.out] = adj From 15c29dbd0e1b79f6f213b45687d94b09ae9b77d4 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 11:29:06 +0800 Subject: [PATCH 002/203] minor change --- dance/pipeline.py | 4 ++-- dance/transforms/filter.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index d831ab10..542e32a1 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1086,9 +1086,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. - step3_start_k = default(step2_pipeline_planer.config.step3_start_k, 0) + step3_start_k=step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids + step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 78e6d83f..fe420093 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -145,9 +145,9 @@ def prepCounts(self, x): elif self._FILTER_TARGET == "cells": n_counts = np.sum(x, axis=1) if isinstance(self.min_counts, float) and 0 <= self.min_counts <= 1: - min_counts = np.percentile(n_counts, self.min_counts) + min_counts = np.percentile(n_counts, self.min_counts*100) else: - max_counts = np.percentile(n_counts, self.max_counts) + max_counts = np.percentile(n_counts, self.max_counts*100) return min_counts, max_counts else: return self.min_counts, self.max_counts From 675e0bfdfef65bb8e60bce73852fe32defd959cb Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 11:33:53 +0800 Subject: [PATCH 003/203] minor change --- dance/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index 59a7aa28..8a5d02d8 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1086,10 +1086,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. 
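Aside on the FilterScanpy percentile fix in the filter.py hunks above: numpy's percentile expects q on a 0-100 scale, so a fractional threshold has to be multiplied by 100 before the lookup. A minimal sketch with toy counts — the variable names are illustrative, not taken from the codebase:

import numpy as np

n_counts = np.array([120, 300, 80, 950, 410])   # toy per-cell total counts
min_counts = 0.05                                # meant as "bottom 5% of cells"

# np.percentile expects q in [0, 100]; a fraction in [0, 1] must be scaled by 100.
threshold = np.percentile(n_counts, min_counts * 100)
keep = n_counts >= threshold                     # cells passing the filter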
- step3_start_k=step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 + step3_start_k = step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) From 1eb305c43a8c73a39fff64db83c93dc06e35d980 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 14:50:00 +0800 Subject: [PATCH 004/203] minor changes --- dance/datasets/multimodality.py | 11 + .../multi_modality/joint_embedding/dcca.py | 7 +- dance/pipeline.py | 10 +- dance/transforms/cell_feature.py | 18 +- dance/transforms/filter.py | 77 +++++-- dance/transforms/misc.py | 22 ++ dance/transforms/normalize.py | 50 ++++- dance/utils/wrappers.py | 50 +++++ .../multi_modality/joint_embedding/dcca.py | 3 +- examples/tuning/joint_embedding_dcca/main.py | 202 ++++++++++++++++++ .../tuning/predict_modality_babel/main.py | 114 ++++++++++ 11 files changed, 533 insertions(+), 31 deletions(-) create mode 100644 examples/tuning/joint_embedding_dcca/main.py create mode 100644 examples/tuning/predict_modality_babel/main.py diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 477aff94..7df32f7d 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -7,7 +7,10 @@ import mudata as md import numpy as np import scanpy as sc +import scipy import scipy.sparse as sp +import sklearn +from sklearn.utils import issparse from dance import logger from dance.data import Data @@ -572,6 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) + self.to_array([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -581,6 +585,13 @@ def _raw_to_dance(self, raw_data): return data + def to_array(self, datas): + for data in datas: + if scipy.sparse.issparse(data.X): + data.X = np.array(data.X.todense()).astype(float) + if "counts" in data.layers and scipy.sparse.issparse(data.layers["counts"]): + data.layers["counts"] = np.array(data.layers["counts"].todense()).astype(float) + def _maybe_preprocess(self, raw_data): if self.preprocess is None: return raw_data diff --git a/dance/modules/multi_modality/joint_embedding/dcca.py b/dance/modules/multi_modality/joint_embedding/dcca.py index b5356ada..bb6d9a69 100644 --- a/dance/modules/multi_modality/joint_embedding/dcca.py +++ b/dance/modules/multi_modality/joint_embedding/dcca.py @@ -11,6 +11,7 @@ import collections import math import os +import sys import time import warnings from collections import OrderedDict @@ -385,7 +386,7 @@ def fit(self, train_loader, test_loader, total_loader, model_pre, args, criterio train_loss_list = [] reco_epoch_test = 0 - test_like_max = 100000 + test_like_max = sys.maxsize flag_break = 0 patience_epoch = 0 @@ -394,7 +395,7 @@ def fit(self, train_loader, test_loader, total_loader, 
model_pre, args, criterio model_pre.eval() start = time.time() - + best_dict = None for epoch in range(1, args.max_epoch + 1): self.train() @@ -636,7 +637,7 @@ def fit(self, train_loader, test_loader, total_loader, model_pre, args, criterio break duration = time.time() - start - self.load_state_dict(best_dict) + self.load_state_dict(best_dict if best_dict is not None else self.state_dict()) print('Finish training, total time is: ' + str(duration) + 's') self.eval() diff --git a/dance/pipeline.py b/dance/pipeline.py index d831ab10..09595c52 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1056,6 +1056,12 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for target, d_p in p1.default_params.items(): if target == p2["target"]: p2["params"] = d_p + for p1, p2 in zip(step2_pipeline_planer.config.pipeline, pipeline): #need order + if "params" in p1: + for key, value in p1.params.items(): + if "params" not in p2: + p2.params = {} + p2.params[key] = value temp_conf = conf.copy() temp_conf.pipeline = pipeline temp_conf.wandb = step2_pipeline_planer.config.wandb @@ -1086,9 +1092,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. - step3_start_k = default(step2_pipeline_planer.config.step3_start_k, 0) + step3_start_k = step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids + step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) diff --git a/dance/transforms/cell_feature.py b/dance/transforms/cell_feature.py index 465a12b0..4e90c97d 100644 --- a/dance/transforms/cell_feature.py +++ b/dance/transforms/cell_feature.py @@ -8,9 +8,11 @@ from dance.typing import Optional, Union from dance.utils.matrix import normalize from dance.utils.status import deprecated +from dance.utils.wrappers import add_mod_and_transform @register_preprocessor("feature", "cell") +@add_mod_and_transform class WeightedFeaturePCA(BaseTransform): """Compute the weighted gene PCA as cell features. @@ -66,6 +68,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") +@add_mod_and_transform class WeightedFeatureSVD(BaseTransform): """Compute the weighted gene SVD as cell features. @@ -127,6 +130,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") +@add_mod_and_transform class CellPCA(BaseTransform): """Reduce cell feature matrix with PCA. 
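For context on the cell-feature transforms being decorated above, a CellPCA-style reduction is essentially PCA on the cell-by-gene matrix with the result stored in obsm. A rough standalone sketch — the toy data and the obsm/varm key names are assumptions, not the transform's actual output keys:

import anndata as ad
import numpy as np
from sklearn.decomposition import PCA

adata = ad.AnnData(np.random.rand(100, 2000).astype(np.float32))  # toy cells x genes

pca = PCA(n_components=50)
adata.obsm["cell_pca"] = pca.fit_transform(adata.X)      # per-cell reduced features
adata.varm["gene_loadings"] = pca.components_.T           # per-gene loadings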
@@ -145,10 +149,9 @@ def __init__(self, n_components: Union[float, int] = 400, *, channel: Optional[s self.n_components = n_components self.channel = channel - self.mod = mod def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel) if self.n_components > min(feat.shape): self.logger.warning( f"n_components={self.n_components} must be between 0 and min(n_samples, n_features)={min(feat.shape)} with svd_solver='full'" @@ -167,6 +170,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") +@add_mod_and_transform class CellSVD(BaseTransform): """Reduce cell feature matrix with SVD. @@ -185,10 +189,9 @@ def __init__(self, n_components: Union[float, int] = 400, *, channel: Optional[s self.n_components = n_components self.channel = channel - self.mod = mod def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel) if isinstance(self.n_components, float): n_components = min(feat.shape) - 1 svd = TruncatedSVD(n_components=n_components) @@ -215,7 +218,8 @@ def __call__(self, data): @register_preprocessor("feature", "cell") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FeatureCellPlaceHolder(BaseTransform): """Used as a placeholder to skip the process. @@ -229,13 +233,12 @@ class FeatureCellPlaceHolder(BaseTransform): def __init__(self, n_components: int = 400, *, channel: Optional[str] = None, mod: Optional[str] = None, **kwargs): super().__init__(**kwargs) self.channel = channel - self.mod = mod self.logger.info( "n_components in FeatureCellPlaceHolder is used to make the parameters consistent and will not have any actual effect." 
) def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel) cell_feat = feat gene_feat = feat.T data.data.obsm[self.out] = cell_feat @@ -305,6 +308,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") # NOTE: register any custom preprocessing function to be used for tuning +@add_mod_and_transform class GaussRandProjFeature(BaseTransform): """Custom preprocessing to extract cell feature via Gaussian random projection.""" diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 78e6d83f..8ec0ac1f 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -3,6 +3,7 @@ from typing import get_args import anndata as ad +import mudata as md import numpy as np import pandas as pd import scanpy as sc @@ -20,6 +21,7 @@ from dance.typing import Dict, GeneSummaryMode, List, Literal, Logger, Optional, Tuple, Union from dance.utils import default from dance.utils.status import deprecated +from dance.utils.wrappers import add_mod_and_transform def get_count(count_or_ratio: Optional[Union[float, int]], total: int) -> Optional[int]: @@ -48,6 +50,7 @@ def get_count(count_or_ratio: Optional[Union[float, int]], total: int) -> Option @register_preprocessor("filter") +@add_mod_and_transform class FilterScanpy(BaseTransform): """Scanpy filtering transformation with additional options.""" @@ -145,9 +148,9 @@ def prepCounts(self, x): elif self._FILTER_TARGET == "cells": n_counts = np.sum(x, axis=1) if isinstance(self.min_counts, float) and 0 <= self.min_counts <= 1: - min_counts = np.percentile(n_counts, self.min_counts) + min_counts = np.percentile(n_counts, self.min_counts * 100) else: - max_counts = np.percentile(n_counts, self.max_counts) + max_counts = np.percentile(n_counts, self.max_counts * 100) return min_counts, max_counts else: return self.min_counts, self.max_counts @@ -268,6 +271,31 @@ def __init__( inplace=inplace, **kwargs) +@register_preprocessor("filter", "cell") +@add_mod_and_transform +class FilterCellsCommonMod(BaseTransform): + + def __init__(self, mod1: str, mod2: str, sol: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + self.mod1 = mod1 + self.mod2 = mod2 + self.sol = sol + + def __call__(self, data: Data): + md_data = data.data + data_mod1 = md_data.mod[self.mod1] + data_mod2 = md_data.mod[self.mod2] + common_cells = list(set(data_mod1.obs.index) & set(data_mod2.obs.index)) + data_mod1 = data_mod1[common_cells, :] + data_mod2 = data_mod2[common_cells, :] + data.data.mod[self.mod1] = data_mod1 + data.data.mod[self.mod2] = data_mod2 + if self.sol is not None: + test_sol = md_data.mod[self.sol] + test_sol = test_sol[common_cells, :] + data.data.mod[self.sol] = test_sol + + @register_preprocessor("filter", "gene") class FilterGenesCommon(BaseTransform): """Filter genes by taking the common genes across batches or splits. @@ -472,6 +500,7 @@ def __call__(self, data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesPercentile(FilterGenes): """Filter genes based on percentiles of the summarized gene expressions. @@ -540,6 +569,7 @@ def _get_preserve_mask(self, gene_summary): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesTopK(FilterGenes): """Select top/bottom genes based on the summarized gene expressions. 
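The new FilterCellsCommonMod above amounts to intersecting cell barcodes across two modalities and subsetting each one to the shared cells; a standalone sketch of that idea with synthetic AnnData objects (all names illustrative):

import anndata as ad
import numpy as np

mod1 = ad.AnnData(np.ones((4, 5), dtype=np.float32))
mod2 = ad.AnnData(np.ones((3, 8), dtype=np.float32))
mod1.obs_names = ["cellA", "cellB", "cellC", "cellD"]
mod2.obs_names = ["cellB", "cellD", "cellE"]

# Keep only cells present in both modalities (sorted here for determinism).
common = sorted(set(mod1.obs_names) & set(mod2.obs_names))
mod1, mod2 = mod1[common, :], mod2[common, :]   # AnnData supports obs-name indexing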
@@ -708,6 +738,7 @@ def __call__(self, data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesRegression(BaseTransform): """Select genes based on regression. @@ -733,18 +764,19 @@ class FilterGenesRegression(BaseTransform): _DISPLAY_ATTRS = ("num_genes", ) def __init__(self, method: str = "enclasc", num_genes: int = 1000, *, channel: Optional[str] = None, - mod: Optional[str] = None, skip_count_check: bool = False, inplace=True, **kwargs): + channel_type: Optional[str] = None, mod: Optional[str] = None, skip_count_check: bool = False, + inplace=True, **kwargs): super().__init__(**kwargs) self.num_genes = num_genes self.channel = channel - self.mod = mod self.method = method self.skip_count_check = skip_count_check self.inplace = inplace + self.channel_type = channel_type def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel, channel_type=self.channel_type) if not self.skip_count_check and np.mod(feat, 1).sum(): warnings.warn("Expecting count data as input, but the input feature matrix does not appear to be count." @@ -995,6 +1027,7 @@ def gini_func(x, weights=None): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesScanpyOrder(BaseTransform): """Scanpy filtering gene transformation with additional options. @@ -1084,6 +1117,7 @@ def __call__(self, data: Data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class HighlyVariableGenesRawCount(AnnDataTransform): """Filter for highly variable genes using raw count matrix. @@ -1120,9 +1154,10 @@ class HighlyVariableGenesRawCount(AnnDataTransform): """ - def __init__(self, layer: Optional[str] = None, n_top_genes: Optional[int] = 1000, span: Optional[float] = 0.3, - subset: bool = True, inplace: bool = True, batch_key: Optional[str] = None, check_values: bool = True, - **kwargs): + def __init__(self, channel: Optional[str] = None, channel_type: Optional[str] = None, + n_top_genes: Optional[int] = 1000, span: Optional[float] = 0.3, subset: bool = True, + inplace: bool = True, batch_key: Optional[str] = None, check_values: bool = True, **kwargs): + layer = channel if channel_type == "layers" else None super().__init__(sc.pp.highly_variable_genes, layer=layer, n_top_genes=n_top_genes, batch_key=batch_key, check_values=check_values, span=span, subset=subset, inplace=inplace, flavor="seurat_v3", **kwargs) @@ -1158,6 +1193,7 @@ def __call__(self, data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class HighlyVariableGenesLogarithmizedByTopGenes(AnnDataTransform): """Filter for highly variable genes based on top genes. 
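The channel/channel_type plumbing added to the highly-variable-gene transforms above only decides which matrix scanpy reads; stripped down, the call looks roughly like this (toy counts, layer name made up; the seurat_v3 flavor additionally needs scikit-misc installed):

import anndata as ad
import numpy as np
import scanpy as sc

adata = ad.AnnData(np.random.poisson(1.0, size=(200, 500)).astype(np.float32))
adata.layers["counts"] = adata.X.copy()

channel, channel_type = "counts", "layers"
layer = channel if channel_type == "layers" else None   # same mapping as in the patch

# seurat_v3 works on raw counts and keeps only the selected genes when subset=True.
sc.pp.highly_variable_genes(adata, layer=layer, n_top_genes=100,
                            flavor="seurat_v3", subset=True)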
@@ -1197,16 +1233,19 @@ class HighlyVariableGenesLogarithmizedByTopGenes(AnnDataTransform): """ - def __init__(self, layer: Optional[str] = None, n_top_genes: Optional[int] = 1000, n_bins: int = 20, - flavor: Literal["seurat", "cell_ranger"] = "seurat", subset: bool = True, inplace: bool = True, - batch_key: Optional[str] = None, **kwargs): + def __init__(self, channel: Optional[str] = None, channel_type: Optional[str] = None, + n_top_genes: Optional[int] = 1000, n_bins: int = 20, flavor: Literal["seurat", + "cell_ranger"] = "seurat", + subset: bool = True, inplace: bool = True, batch_key: Optional[str] = None, **kwargs): + layer = channel if channel_type == "layers" else None super().__init__(sc.pp.highly_variable_genes, layer=layer, n_top_genes=n_top_genes, n_bins=n_bins, flavor=flavor, subset=subset, inplace=inplace, batch_key=batch_key, **kwargs) self.logger.info("Expects logarithmized data") @register_preprocessor("filter", "gene") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FilterGenesPlaceHolder(BaseTransform): """Used as a placeholder to skip the process.""" @@ -1237,10 +1276,11 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("filter", "gene") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FilterGenesNumberPlaceHolder(BaseTransform): - def __init__(self, **kwargs): + def __init__(self, channel=None, channel_type=None, **kwargs): super().__init__(**kwargs) def __call__(self, data: Data) -> Data: @@ -1248,6 +1288,7 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("filter", "gene") +@add_mod_and_transform class HighlyVariableGenesLogarithmizedByMeanAndDisp(AnnDataTransform): """Filter for highly variable genes based on mean and dispersion. @@ -1293,10 +1334,12 @@ class HighlyVariableGenesLogarithmizedByMeanAndDisp(AnnDataTransform): """ - def __init__(self, layer: Optional[str] = None, min_disp: Optional[float] = 0.5, max_disp: Optional[float] = np.inf, + def __init__(self, channel: Optional[str] = None, channel_type: Optional[str] = None, + min_disp: Optional[float] = 0.5, max_disp: Optional[float] = np.inf, min_mean: Optional[float] = 0.0125, max_mean: Optional[float] = 3, n_bins: int = 20, flavor: Literal["seurat", "cell_ranger"] = "seurat", subset: bool = True, inplace: bool = True, batch_key: Optional[str] = None, **kwargs): + layer = channel if channel_type == "layers" else None super().__init__(sc.pp.highly_variable_genes, layer=layer, min_disp=min_disp, max_disp=max_disp, min_mean=min_mean, max_mean=max_mean, n_bins=n_bins, flavor=flavor, subset=subset, inplace=inplace, batch_key=batch_key, **kwargs) @@ -1304,7 +1347,8 @@ def __init__(self, layer: Optional[str] = None, min_disp: Optional[float] = 0.5, @register_preprocessor("filter", "cell") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FilterCellsPlaceHolder(BaseTransform): """Used as a placeholder to skip the process.""" @@ -1335,6 +1379,7 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("filter", "cell") +@add_mod_and_transform class FilterCellsScanpyOrder(BaseTransform): """Scanpy filtering cell transformation with additional options. 
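For the cell-level filters above, scanpy accepts only one threshold per call, which is why an ordered sequence of criteria turns into one filter_cells call each; the thresholds below are arbitrary toy values:

import anndata as ad
import numpy as np
import scanpy as sc

adata = ad.AnnData(np.random.poisson(1.0, size=(300, 400)).astype(np.float32))

sc.pp.filter_cells(adata, min_counts=200)   # each call applies a single criterion
sc.pp.filter_cells(adata, min_genes=50)
sc.pp.filter_cells(adata, max_counts=10_000)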
diff --git a/dance/transforms/misc.py b/dance/transforms/misc.py index fa765928..8b47c8b5 100644 --- a/dance/transforms/misc.py +++ b/dance/transforms/misc.py @@ -1,6 +1,10 @@ from pprint import pformat +from typing import Optional + +import mudata as md from dance import logger +from dance.data.base import Data from dance.registry import register_preprocessor from dance.transforms.base import BaseTransform from dance.typing import Any, Dict, Tuple @@ -153,3 +157,21 @@ def __init__(self, *, split_name: str, **kwargs): def __call__(self, data): self.logger.info("Popping split: {self.split_name!r}") data.pop(split_name=self.split_name) + + +@register_preprocessor("misc") +class AlignMod(BaseTransform): + """Aligning mods and metadata in multimodal data.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, data: Data) -> Data: + mod1, mod2, meta1, meta2, test_sol = data.data.mod.values() + meta1 = meta1[:, mod1.var.index] + meta2 = meta2[:, mod2.var.index] + test_sol = test_sol[:, mod1.var.index] + data.data.mod["meta1"] = meta1 + data.data.mod["meta2"] = meta2 + data.data.mod["test_sol"] = test_sol + return data diff --git a/dance/transforms/normalize.py b/dance/transforms/normalize.py index f2deedae..b0fb7af6 100644 --- a/dance/transforms/normalize.py +++ b/dance/transforms/normalize.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import scanpy as sc +import scipy import scipy.sparse as sp import statsmodels.discrete.discrete_model import statsmodels.nonparametric.kernel_regression @@ -18,9 +19,11 @@ from dance.typing import Dict, Iterable, List, Literal, LogLevel, NormMode, Number, Optional, Union from dance.utils.matrix import normalize from dance.utils.status import deprecated +from dance.utils.wrappers import add_mod_and_transform @register_preprocessor("normalize") +@add_mod_and_transform class ScaleFeature(BaseTransform): """Scale the feature matrix in the AnnData object. @@ -169,6 +172,37 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("normalize") +@add_mod_and_transform +class tfidfTransform(BaseTransform): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.idf = None + self.fitted = False + + def fit(self, X): + self.idf = X.shape[0] / X.sum(axis=0) + self.fitted = True + + def transform(self, X): + if not self.fitted: + raise RuntimeError('Transformer was not fitted on any data') + if scipy.sparse.issparse(X): + tf = X.multiply(1 / X.sum(axis=1)) + return tf.multiply(self.idf) + else: + tf = X / X.sum(axis=1, keepdims=True) + return tf * self.idf + + def __call__(self, data): + X = data.data.X + self.fit(X) + data.data.X = self.transform(X) + return data + + +@register_preprocessor("normalize") +@add_mod_and_transform class ScTransform(BaseTransform): """ScTransform normalization and variance stabiliation. 
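A compact dense-only version of the tfidfTransform added above (the patched class also handles the sparse case); the toy count matrix is made up:

import numpy as np

X = np.array([[3., 0., 1., 2.],
              [1., 2., 0., 4.],
              [0., 1., 5., 1.]])             # 3 cells x 4 peaks, toy counts

idf = X.shape[0] / X.sum(axis=0)             # inverse document frequency per peak
tf = X / X.sum(axis=1, keepdims=True)        # term frequency within each cell
X_tfidf = tf * idf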
@@ -399,7 +433,8 @@ def __call__(self, data: Data): z[gn[genes_step1]] = 1 w = pd.Series(index=gn, data=np.zeros(gn.size, dtype='int')) - w[gn] = genes_log_gmean + # w[gn] = genes_log_gmean + w[gn] = genes_log_gmean.astype(int) #need to think selected_data.var['genes_step1_sct'] = z selected_data.var['log10_gmean_sct'] = w @@ -453,6 +488,8 @@ def _parallel_init(igenes_bin_regress, iumi_bin, ign, imm, ips): def _parallel_wrapper(j): name = gn[genes_bin_regress[j]] y = umi_bin[:, j].A.flatten() + y[np.isinf(y) | np.isnan(y)] = 0 + mm[np.isinf(mm) | np.isnan(mm)] = 0 pr = statsmodels.discrete.discrete_model.Poisson(y, mm) res = pr.fit(disp=False) mu = res.predict() @@ -490,6 +527,7 @@ def info(n, th, mu, y, w): @register_preprocessor("normalize") +@add_mod_and_transform class Log1P(AnnDataTransform): """Logarithmize the data matrix. @@ -527,6 +565,7 @@ def __init__(self, base: Optional[Number] = None, copy: bool = False, chunked: b @register_preprocessor("normalize") +@add_mod_and_transform class NormalizeTotal(AnnDataTransform): """Normalize counts per cell. @@ -583,9 +622,15 @@ def __init__(self, target_sum: Optional[float] = None, max_fraction: float = 0.0 if max_fraction == 1.0: self.logger.info("max_fraction set to 1.0, this is equivalent to setting exclude_highly_expressed=False.") + def __call__(self, data): + if scipy.sparse.issparse(data.data.X): + data.data.X = np.array(data.data.X.todense()) + return super().__call__(data) + @register_preprocessor("normalize") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class NormalizePlaceHolder(BaseTransform): """Used as a placeholder to skip the process.""" @@ -597,6 +642,7 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("normalize") +@add_mod_and_transform class NormalizeTotalLog1P(BaseTransform): """Normalize total counts followed by log1p transformation. 
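The normalization tweaks above (densifying before normalize_total, and the combined total+log1p transform) map onto plain scanpy calls roughly as below; the toy data is made up, and the densify step is carried over from the patch rather than being a general scanpy requirement:

import anndata as ad
import numpy as np
import scanpy as sc
import scipy.sparse as sp

adata = ad.AnnData(sp.random(100, 50, density=0.2, format="csr", dtype=np.float32))

if sp.issparse(adata.X):                      # mirror of the patched NormalizeTotal.__call__
    adata.X = np.asarray(adata.X.todense())

sc.pp.normalize_total(adata, target_sum=1e4)  # library-size normalization per cell
sc.pp.log1p(adata)                            # followed by log(1 + x)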
diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index eb20eff7..11314694 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -1,11 +1,15 @@ import datetime import functools import time +from typing import Union +import anndata +import mudata import numpy as np import torch from dance import logger +from dance.data.base import Data from dance.typing import Any, Callable @@ -85,3 +89,49 @@ def wrapped_func(*args): return func(*new_args) return wrapped_func + + +import functools + + +def add_mod_and_transform(cls): + original_init = cls.__init__ + original_call = cls.__call__ + cls.add_mod_and_transform = "add_mod_and_transform" + + @functools.wraps(original_init) + def new_init(self, *args, **kwargs): + mod = kwargs.pop('mod', None) + original_init(self, *args, **kwargs) + self.mod = mod + + @functools.wraps(original_call) + def new_call(self, data: Data, *args, **kwargs): + if hasattr(self, 'mod') and self.mod is not None: + md_data = data.data + ad_data = Data(data=transform_mod_to_anndata(md_data, self.mod)) + res = original_call(self, ad_data, *args, **kwargs) + data.data.mod[self.mod] = ad_data.data + else: + return original_call(self, data, *args, **kwargs) + + cls.__init__ = new_init + cls.__call__ = new_call + return cls + + +def transform_mod_to_anndata(mod_data: mudata.MuData, mod_key: str): + return mod_data.mod[mod_key] + + +# 使用装饰器 +@add_mod_and_transform +class MyClass: + + def __init__(self, x, **kwargs): + self.x = x + print("-------") + print(**kwargs) + + def __call__(self, data): + return data diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 73668160..beefdeb0 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -71,7 +71,8 @@ def parameter_setting(): le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense(), 1) / 100 - data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense(), 1) / 100 data.mod["mod1"].obsm["labels"] = labels data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py new file mode 100644 index 00000000..86d6d1f9 --- /dev/null +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -0,0 +1,202 @@ +import argparse +import gc +import os +import pprint +import sys +from pathlib import Path + +import anndata as ad +import numpy as np +import pandas as pd +import scipy +import torch +import torch.utils.data as data_utils +from sklearn import preprocessing + +import dance.utils.metrics as metrics +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.dcca import DCCA +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.utils import set_seed + + +def parameter_setting(): + parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") + + parser.add_argument("--latent_fusion", "-olf1", type=str, default="First_simulate_fusion.csv", + help="fusion latent code file") + parser.add_argument("--latent_1", 
"-ol1", type=str, default="scRNA_latent_combine.csv", + help="first latent code file") + parser.add_argument("--latent_2", "-ol2", type=str, default="scATAC_latent.csv", help="seconde latent code file") + parser.add_argument("--denoised_1", "-od1", type=str, default="scRNA_seq_denoised.csv", + help="outfile for denoised file1") + parser.add_argument("--normalized_1", "-on1", type=str, default="scRNA_seq_normalized_combine.tsv", + help="outfile for normalized file1") + parser.add_argument("--denoised_2", "-od2", type=str, default="scATAC_seq_denoised.csv", + help="outfile for denoised file2") + + parser.add_argument("--workdir", "-wk", type=str, default="./new_test/", help="work path") + parser.add_argument("--outdir", "-od", type=str, default="./new_test/", help="Output path") + + parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") + parser.add_argument("--weight_decay", type=float, default=1e-6, help="weight decay") + parser.add_argument("--eps", type=float, default=0.01, help="eps") + + parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") + + parser.add_argument("--seed", type=int, default=1, help="Random seed for repeat results") + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") + parser.add_argument("--max_epoch", "-me", type=int, default=10, help="Max epoches") + parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") + parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") + parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") + parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("--final_rate", type=float, default=1e-4) + parser.add_argument("--scale_factor", type=float, default=4) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + return parser + + +if __name__ == "__main__": + parser = parameter_setting() + args = parser.parse_args() + + args.sf1 = 5 + args.sf2 = 1 + args.cluster1 = args.cluster2 = 4 + args.lr1 = 0.01 + args.flr1 = 0.001 + args.lr2 = 0.005 + args.flr2 = 0.0005 + + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + # model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=4, layer_d_1=[4, 128], hidden2_1=128, + # layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=4, 
layer_d_2=[4], hidden2_2=4, args=args, + # Type_1="NB", Type_2="Bernoulli", ground_truth1=torch.cat([train_labels, test_labels]), cycle=1, + # attention_loss="Eucli") # yapf: disable + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense() if scipy.sparse.issparse(data.mod["mod2"].X) else data.mod["mod2"].X, 1) / 100 + # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense() if scipy.sparse.issparse(data.mod["mod1"].X) else data.mod["mod1"].X, 1) / 100 + data.mod["mod1"].obsm["labels"] = labels + + # data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", + # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], + # feature_channel=["counts", "counts", None, None, "size_factors", + # "size_factors"], label_channel="labels") + (x_train, y_train, x_train_raw, y_train_raw, x_train_size, + y_train_size), train_labels = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, y_test_raw, x_test_size, + y_test_size), test_labels = data.get_test_data(return_type="torch") + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + + device = torch.device(args.device) + train = data_utils.TensorDataset(x_train.float(), x_train_raw, x_train_size.float(), y_train.float(), y_train_raw, + y_train_size.float()) + + train_loader = data_utils.DataLoader(train, batch_size=args.batch_size, shuffle=True) + + test = data_utils.TensorDataset(x_test.float(), x_test_raw, x_test_size.float(), y_test.float(), y_test_raw, + y_test_size.float()) + + test_loader = data_utils.DataLoader(test, batch_size=args.batch_size, shuffle=False) + + total = data_utils.TensorDataset( + torch.cat([x_train, x_test]).float(), torch.cat([x_train_raw, x_test_raw]), + torch.cat([x_train_size, x_test_size]).float(), + torch.cat([y_train, y_test]).float(), torch.cat([y_train_raw, y_test_raw]), + torch.cat([y_train_size, y_test_size]).float()) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=50, layer_d_1=[50, 128], hidden2_1=128, + layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=50, layer_d_2=[50], hidden2_2=50, + args=args, ground_truth1=torch.cat([train_labels, test_labels]), Type_1="NB", Type_2="Bernoulli", + cycle=1, attention_loss="Eucli").to(device) + model.to(device) + model.fit(train_loader, test_loader, total_loader, "RNA") + + emb1, emb2 = model.predict(total_loader) + embeds = np.concatenate([emb1, emb2], 1) + print(embeds) + + adata = ad.AnnData( + X=embeds, + obs=data.mod["mod1"].obs, + ) + adata_sol = data.mod["test_sol"] + adata = adata[adata_sol.obs_names] + adata_sol.obsm['X_emb'] = adata.X + score = metrics.labeled_clustering_evaluate(adata, adata_sol) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) + score.update({ + # 'seed': args.seed + k, + 'subtask': args.subtask, + 'method': 'dcca', + }) + + # if res is not None: + # 
res = res.append(score, ignore_index=True) + # else: + # for s in score: + # score[s] = [score[s]] + # res = pd.DataFrame(score) + wandb.log({"ARI":score["dance_ari"]}) + wandb.finish() + torch.cuda.empty_cache() + #主要是报错时没有执行这些命令导致的,我感觉 + del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + del labels,le,dataset,score + gc.collect() + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, required_funs=["AlignMod","FilterCellsCommonMod","FilterCellsCommonMod","SetConfig"], + required_indexes=[2,11,14,sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) + +"""To reproduce DCCA on other samples, please refer to command lines belows: + +GEX-ADT: +$ python dcca.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python dcca.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py new file mode 100644 index 00000000..4079608e --- /dev/null +++ b/examples/tuning/predict_modality_babel/main.py @@ -0,0 +1,114 @@ +import argparse +import logging +import os +import sys +from pathlib import Path + +import pandas as pd +import torch + +import wandb +from dance import logger +from dance.datasets.multimodality import ModalityPredictionDataset +from dance.modules.multi_modality.predict_modality.babel import BabelWrapper +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.utils import set_seed + +if __name__ == "__main__": + OPTIMIZER_DICT = { + "adam": torch.optim.Adam, + "rmsprop": torch.optim.RMSprop, + } + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2_rna") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-cpu", "--cpus", default=1, type=int) + parser.add_argument("-seed", "--seed", default=1, type=int) + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("-m", "--model_folder", default="./models") + parser.add_argument("--outdir", "-o", default="./logs", help="Directory to output to") + parser.add_argument("--lossweight", type=float, default=1., help="Relative loss weight") + parser.add_argument("--lr", "-l", type=float, default=0.01, help="Learning rate") + parser.add_argument("--batchsize", "-b", type=int, default=64, help="Batch size") + parser.add_argument("--hidden", type=int, default=64, help="Hidden dimensions") + parser.add_argument("--earlystop", type=int, default=20, help="Early stopping after N epochs") + parser.add_argument("--naive", "-n", action="store_true", help="Use a naive model 
instead of lego model") + parser.add_argument("--resume", action="store_true") + parser.add_argument("--max_epochs", type=int, default=500) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + args = parser.parse_args() + args.resume = True + + torch.set_num_threads(args.cpus) + args.outdir = os.path.abspath(args.outdir) + os.makedirs(args.model_folder, exist_ok=True) + os.makedirs(args.outdir, exist_ok=True) + # Specify output log file + fh = logging.FileHandler(f"{args.outdir}/training_{args.subtask}_{args.seed}.log", "w") + fh.setLevel(logging.INFO) + logger.addHandler(fh) + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + for arg in vars(args): + logger.info(f"Parameter {arg}: {getattr(args, arg)}") + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + rndseed = args.seed + set_seed(rndseed) + dataset = ModalityPredictionDataset(args.subtask, preprocess=None) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + + # Obtain training and testing data + x_train, y_train = data.get_train_data(return_type="torch") + x_test, y_test = data.get_test_data(return_type="torch") + x_train, y_train, x_test, y_test = x_train.float(), y_train.float(), x_test.float(), y_test.float() + # Train and evaluate the model + #突然想到,或许有些算法可以降维,而有些算法不能降维,所以还是要依据算法而定 + model = BabelWrapper(args, dim_in=x_train.shape[1], dim_out=y_train.shape[1]) + model.fit(x_train, y_train, val_ratio=0.15) + wandb.log({'rmse': model.score(x_test, y_test)}) + wandb.finish() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce BABEL on other samples, please refer to command lines belows: + +GEX to ADT (subset): +$ python babel.py --subtask openproblems_bmmc_cite_phase2_rna_subset --device cuda + +GEX to 
ADT: +$ python babel.py --subtask openproblems_bmmc_cite_phase2_rna --device cuda + +ADT to GEX: +$ python babel.py --subtask openproblems_bmmc_cite_phase2_mod2 --device cuda + +GEX to ATAC: +$ python babel.py --subtask openproblems_bmmc_multiome_phase2_rna --device cuda + +ATAC to GEX: +$ python babel.py --subtask openproblems_bmmc_multiome_phase2_mod2 --device cuda + +""" From 44577317c4317e74ed75a23b8f472a136498add6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 08:47:32 +0000 Subject: [PATCH 005/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- examples/tuning/predict_modality_babel/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 86d6d1f9..9f6fe897 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py index 4079608e..112317f7 100644 --- a/examples/tuning/predict_modality_babel/main.py +++ b/examples/tuning/predict_modality_babel/main.py @@ -6,8 +6,8 @@ import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import ModalityPredictionDataset from dance.modules.multi_modality.predict_modality.babel import BabelWrapper From b3e0c1b382a32f7f269c69f31909c91f65b76317 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 19:01:39 +0800 Subject: [PATCH 006/203] minor change --- dance/transforms/graph/dstg_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/transforms/graph/dstg_graph.py b/dance/transforms/graph/dstg_graph.py index e51a3393..f5cefe46 100644 --- a/dance/transforms/graph/dstg_graph.py +++ b/dance/transforms/graph/dstg_graph.py @@ -1,4 +1,4 @@ -from typing import Sequence +from typing import Sequence, Union import networkx as nx import numpy as np @@ -35,7 +35,7 @@ class DSTGraph(BaseTransform): _DISPLAY_ATTRS = ("k_filter", "num_cc", "ref_split", "inf_split") def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", - channels: Sequence[str | None] = (None, None), channel_types: Sequence[str | None] = ("obsm", "obsm"), + channels: Sequence[Union[str , None]] = (None, None), channel_types: Sequence[Union[str , None]] = ("obsm", "obsm"), **kwargs): super().__init__(**kwargs) From 16f3dc1367475772024b6e275037b42deef8109c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:02:49 +0000 Subject: [PATCH 007/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/transforms/graph/dstg_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/transforms/graph/dstg_graph.py b/dance/transforms/graph/dstg_graph.py index f5cefe46..5b6b792f 100644 --- a/dance/transforms/graph/dstg_graph.py 
+++ b/dance/transforms/graph/dstg_graph.py @@ -35,8 +35,8 @@ class DSTGraph(BaseTransform): _DISPLAY_ATTRS = ("k_filter", "num_cc", "ref_split", "inf_split") def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", - channels: Sequence[Union[str , None]] = (None, None), channel_types: Sequence[Union[str , None]] = ("obsm", "obsm"), - **kwargs): + channels: Sequence[Union[str, None]] = (None, None), + channel_types: Sequence[Union[str, None]] = ("obsm", "obsm"), **kwargs): super().__init__(**kwargs) self.k_filter = k_filter From 83032fc1456ee26e3db6859740e69999d47e3425 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 23:38:55 +0800 Subject: [PATCH 008/203] minor change --- .../tuning/joint_embedding_scmvae/main.py | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 examples/tuning/joint_embedding_scmvae/main.py diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py new file mode 100644 index 00000000..6017eec2 --- /dev/null +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -0,0 +1,177 @@ +import argparse +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +import torch.utils.data as data_utils +from sklearn import preprocessing + +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.transforms.preprocess import calculate_log_library_size +from dance.utils import set_seed + + +def parameter_setting(): + parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") + + parser.add_argument("--workdir", "-wk", type=str, default="./new_test", help="work path") + parser.add_argument("--outdir", "-od", type=str, default="./new_test", help="Output path") + + parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") + parser.add_argument("--weight_decay", type=float, default=1e-6, help="weight decay") + parser.add_argument("--eps", type=float, default=0.01, help="eps") + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") + parser.add_argument('-seed', '--seed', type=int, default=1, help='Random seed for repeat results') + parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") + parser.add_argument("--max_epoch", "-me", type=int, default=25, help="Max epoches") + parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") + parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") + parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, + help="Epoch per test, must smaller than max iteration.") + parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("--final_rate", type=float, default=1e-4) + parser.add_argument("--scale_factor", type=float, default=4) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", 
"params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + return parser + + +if __name__ == "__main__": + parser = parameter_setting() + args = parser.parse_args() + assert args.max_iteration > args.epoch_per_test + device = torch.device(args.device) + args.lr = 0.001 + args.anneal_epoch = 200 + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod1"].obsm["labels"] = labels + data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["layers", "layers"], + feature_channel=["counts", "counts"], label_channel="labels") + + (x_train, y_train), _ = data.get_train_data(return_type="torch") + (x_test, y_test), labels = data.get_test_data(return_type="torch") + + lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) + lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) + lib_mean1 = torch.from_numpy(lib_mean1) + lib_var1 = torch.from_numpy(lib_var1) + lib_mean2 = torch.from_numpy(lib_mean2) + lib_var2 = torch.from_numpy(lib_var2) + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + train_size = len(data.get_split_idx("train")) + train = data_utils.TensorDataset(x_train, lib_mean1[:train_size], lib_var1[:train_size], lib_mean2[:train_size], + lib_var2[:train_size], y_train) + + valid = data_utils.TensorDataset(x_test, lib_mean1[train_size:], lib_var1[train_size:], lib_mean2[train_size:], + lib_var2[train_size:], y_test) + + total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + + x_test = torch.cat([x_train, x_test]) + y_test = torch.cat([y_train, y_test]) + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) + model = scMVAE( + encoder_1=[Nfeature1, 1024, 128, 128], + hidden_1=128, + Z_DIMS=22, + decoder_share=[22, 128, 256], + share_hidden=128, + decoder_1=[128, 128, 1024], + hidden_2=1024, + encoder_l=[Nfeature1, 128], + hidden3=128, + encoder_2=[Nfeature2, 1024, 128, 128], + hidden_4=128, + encoder_l1=[Nfeature2, 128], + hidden3_1=128, + decoder_2=[128, 128, 1024], + hidden_5=1024, + drop_rate=0.1, + log_variational=True, + Type="ZINB", + device=device, + n_centroids=22, + penality="GMM", + model=1, + ) + model.to(device) + model.init_gmm_params(total_loader) + model.fit(args, train, valid, args.final_rate, args.scale_factor, device) + + # embeds = model.predict(x_test, y_test).cpu().numpy() + score = model.score(x_test, y_test, labels) + 
score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + torch.cuda.empty_cache() + # score.update({ + # 'seed': args.seed + k, + # 'subtask': args.subtask, + # 'method': 'scmvae', + # }) + + # if res is not None: + # res = res.append(score, ignore_index=True) + # else: + # for s in score: + # score[s] = [score[s]] + # res = pd.DataFrame(score) + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce scMVAE on other samples, please refer to command lines belows: + +GEX-ADT: +$ python scmvae.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python scmvae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From 4bbe9b71aaab48e0ecd027edfc0cdbdcbad97023 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:39:25 +0000 Subject: [PATCH 009/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 6017eec2..a5d21251 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -8,9 +8,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From d2a4d0b364853b2a2a68be603a65bdf0115028f7 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 19:32:50 +0800 Subject: [PATCH 010/203] minor change --- examples/tuning/joint_embedding_scmvae/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 6017eec2..bdd3f029 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -77,8 +77,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels - data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["layers", "layers"], - feature_channel=["counts", "counts"], label_channel="labels") + 
data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["obsm", "obsm"], + feature_channel=["feature.cell", "feature.cell"], label_channel="labels") (x_train, y_train), _ = data.get_train_data(return_type="torch") (x_test, y_test), labels = data.get_test_data(return_type="torch") From 1a310205a0c8fd4dd20f99a1383b8dc8fedbc769 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 19:39:24 +0800 Subject: [PATCH 011/203] change score --- examples/tuning/joint_embedding_dcca/main.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 9f6fe897..032bf0b2 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -159,12 +159,12 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - # score.update(metrics.integration_openproblems_evaluate(adata_sol)) - score.update({ - # 'seed': args.seed + k, - 'subtask': args.subtask, - 'method': 'dcca', - }) + score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update({ + # 'seed': args.seed + k, + # 'subtask': args.subtask, + # 'method': 'dcca', + # }) # if res is not None: # res = res.append(score, ignore_index=True) @@ -172,7 +172,9 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # for s in score: # score[s] = [score[s]] # res = pd.DataFrame(score) - wandb.log({"ARI":score["dance_ari"]}) + score["ARI"]=score["dance_ari"] + del score["dance_ari"] + wandb.log(score) wandb.finish() torch.cuda.empty_cache() #主要是报错时没有执行这些命令导致的,我感觉 From 2419e244bf238c764ff07ea9972bf17b6e48f8ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:43:31 +0000 Subject: [PATCH 012/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 032bf0b2..0efe7567 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 18187e5e0618e07bc65dd6e68449ee5fd11447f8 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:11:00 +0800 Subject: [PATCH 013/203] minor change --- examples/tuning/joint_embedding_scmvae/main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 
6cbedb88..996f8093 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -77,8 +77,13 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels - data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["obsm", "obsm"], - feature_channel=["feature.cell", "feature.cell"], label_channel="labels") + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + (x_train, y_train), _ = data.get_train_data(return_type="torch") (x_test, y_test), labels = data.get_test_data(return_type="torch") @@ -105,7 +110,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))#There is probably a problem here model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From 77ad75ae9c34fa56a2eeb246a6c297d57d58eb20 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:11:29 +0000 Subject: [PATCH 014/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 996f8093..3b2abcd2 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -77,14 +77,13 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels - + # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - (x_train, y_train), _ = data.get_train_data(return_type="torch") (x_test, y_test), labels = data.get_test_data(return_type="torch") @@ -110,7 +109,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))#There is probably a problem here + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #There is probably a problem here model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From ea7dff5bc80f9e0c609897d56be0ab9738b8eacd Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:13:12 +0800 Subject: [PATCH 015/203] minor changes --- examples/tuning/joint_embedding_dcca/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 032bf0b2..96628e68 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ 
b/examples/tuning/joint_embedding_dcca/main.py @@ -114,6 +114,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], # feature_channel=["counts", "counts", None, None, "size_factors", # "size_factors"], label_channel="labels") + #TODO 感觉layers中的counts才是raw (x_train, y_train, x_train_raw, y_train_raw, x_train_size, y_train_size), train_labels = data.get_train_data(return_type="torch") (x_test, y_test, x_test_raw, y_test_raw, x_test_size, From 72c0b71a0878ccb55a0f0d66dd7320374e08d800 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:15:18 +0800 Subject: [PATCH 016/203] minor changes --- examples/tuning/joint_embedding_jae/main.py | 114 ++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 examples/tuning/joint_embedding_jae/main.py diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py new file mode 100644 index 00000000..48059370 --- /dev/null +++ b/examples/tuning/joint_embedding_jae/main.py @@ -0,0 +1,114 @@ +import argparse +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch + +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.utils import set_seed + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") + parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") + parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") + parser.add_argument("-seed", "--seed", default=1, type=int) + parser.add_argument("-cpu", "--cpus", default=1, type=int) + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-bs", "--batch_size", default=128, type=int) + parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + args = parser.parse_args() + + device = args.device + pre_normalize = bool(args.normalize) + torch.set_num_threads(args.cpus) + rndseed = args.seed + set_seed(rndseed) + + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, 
pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + + (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, + G2M_score) = data.get_train_data(return_type="torch") + (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") + X_train = torch.cat([X_mod1_train, X_mod2_train], dim=1) + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + X_test = torch.cat([X_mod1_test, X_mod2_test], dim=1) + X_test = torch.cat([X_train, X_test]).float().to(device) + test_id = np.arange(X_test.shape[0]) + labels = torch.cat([cell_type, cell_type_test]).numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + + model = JAEWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=X_train.shape[1]) + model.fit(X_train, cell_type, batch_label, phase_score, max_epochs=50) + + embeds = model.predict(X_test, test_id).cpu().numpy() + print(embeds) + + score = model.score(X_test, test_id, labels, metric="clustering") + score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'jae', + }) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + torch.cuda.empty_cache() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce JAE on other samples, please refer to command lines belows: + +GEX-ADT: +$ python jae.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python jae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From 4571f5ce7d24b1fce70eb81c9b48a48d69e1f016 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:20:34 +0000 Subject: [PATCH 017/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 48059370..a3108167 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ 
b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 87c7b4e0847f9cec48a0726dd5ad6a18bf633097 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:24:18 +0800 Subject: [PATCH 018/203] minor change --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 3b2abcd2..c52f2108 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -109,7 +109,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #There is probably a problem here + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #There is probably a problem here; most likely it is the dimensionality reduction model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From 0f4f027739dd2fa1708108d0071b30a9a4872f60 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 22:38:01 +0800 Subject: [PATCH 019/203] minor change --- .../tuning/joint_embedding_scmogcn/main.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 examples/tuning/joint_embedding_scmogcn/main.py diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py new file mode 100644 index 00000000..d1f681d1 --- /dev/null +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -0,0 +1,121 @@ +import argparse +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch + +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.transforms.graph.cell_feature_graph import CellFeatureBipartiteGraph +from dance.utils import set_seed + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") + parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") + parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") + parser.add_argument("-l", "--layers", default=3, type=int, choices=[3, 4, 5, 6, 7]) + parser.add_argument("-dis", "--disable_propagation", default=0, type=int, choices=[0, 1, 2]) + parser.add_argument("-seed", "--seed", default=1, type=int) + parser.add_argument("-cpu", "--cpus", default=1, type=int) + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-bs", "--batch_size", default=512, type=int) + parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + 
parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + args = parser.parse_args() + + device = args.device + pre_normalize = bool(args.normalize) + torch.set_num_threads(args.cpus) + rndseed = args.seed + set_seed(rndseed) + + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + data = dataset.load_data() + train_size = len(data.get_split_idx("train")) + + data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod1")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod2")(data) + # data.set_config( + # feature_mod=["mod1", "mod2"], + # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], + # feature_channel=["X_pca", "X_pca"], + # label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], + # ) + (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + test_id = np.arange(x_mod1.shape[0]) + labels = cell_type.numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) + model.fit( + g_mod1=data.data["mod1"].uns["g"], + g_mod2=data.data["mod2"].uns["g"], + train_size=train_size, + cell_type=cell_type, + batch_label=batch_label, + phase_score=phase_score, + ) + + embeds = model.predict(test_id).cpu().numpy() + score = model.score(test_id, labels, metric="clustering") + score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'scmogcn', + }) + + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + torch.cuda.empty_cache() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if 
args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce scMoGCN on other samples, please refer to command lines belows: + +GEX-ADT: +$ python scmogcn.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python scmogcn.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From 90c01c6b9b1d58066966d8e3098681aa8ae6c778 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:38:40 +0000 Subject: [PATCH 020/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index d1f681d1..10edd63b 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper From 6072fc318024f63f6cc9bb0582da5ecd86352b58 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 00:11:57 +0800 Subject: [PATCH 021/203] minor change --- examples/tuning/joint_embedding_scmogcn/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 10edd63b..b2b8c1ce 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -19,7 +19,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + choices=["GSE140203_BRAIN_atac2gex","openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -57,12 +57,12 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) data = dataset.load_data() train_size = len(data.get_split_idx("train")) - data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod1")(data) - data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod2")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # data.set_config( # feature_mod=["mod1", "mod2"], # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], From 597071eb0fe5ee26bebdd1f42f2f91712cb36cb4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:12:27 +0000 Subject: [PATCH 
022/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index b2b8c1ce..cbc17295 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -18,8 +18,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex","openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument( + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") From e2532e7b1ee7e76e70650146bd9f650a65e8d5e2 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 00:12:34 +0800 Subject: [PATCH 023/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index a3108167..e4b480c6 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -17,8 +17,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument( + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") From ca7b029149dfcc86962f43c751e97565ac61d92c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:14:36 +0000 Subject: [PATCH 024/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index e4b480c6..71308475 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 
1aecb98a7fa40e614ffdfaca1570e8a74eabdaac Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 09:31:27 +0800 Subject: [PATCH 025/203] minor changes --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index e4b480c6..0b58c1cb 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -62,7 +62,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - + logger.warning(data) (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_train_data(return_type="torch") (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") From 7d595898d0c808d1d7ae7fdb68150a48e8a1d601 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 11:35:08 +0800 Subject: [PATCH 026/203] minor change --- .../tuning/joint_embedding_scmogcn/main.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index cbc17295..8fdbe10f 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -32,6 +32,7 @@ parser.add_argument("-bs", "--batch_size", default=512, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--preprocess", type=str, default=None) parser.add_argument("--cache", action="store_true", help="Cache processed data.") parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) @@ -58,10 +59,25 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder,preprocess=args.preprocess) data = dataset.load_data() - train_size = len(data.get_split_idx("train")) + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + if args.preprocess!="aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"]=np.zeros(data.data['mod1'].shape[0]) + train_size = len(data.get_split_idx("train")) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # 
data.set_config( From 9d035f744055958cc18f64d425c83b3de877a1f9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 03:36:04 +0000 Subject: [PATCH 027/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 8fdbe10f..cf593088 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -59,25 +59,25 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder,preprocess=args.preprocess) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) data = dataset.load_data() # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - if args.preprocess!="aux": + if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) data.data['mod1'].obsm["cell_type"] = c_labels - data.data["mod1"].obsm["S_scores"]=np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["G2M_scores"]=np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["batch_label"]=np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["phase_labels"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) train_size = len(data.get_split_idx("train")) - + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # data.set_config( From 86d56f7273fae37b9cb243dc917284e98b2bc0ab Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 11:36:31 +0800 Subject: [PATCH 028/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index afde44f6..591b71ba 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -29,6 +29,7 @@ parser.add_argument("-bs", "--batch_size", default=128, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of 
repetitions") + parser.add_argument("--preprocess", type=str, default=None) parser.add_argument("--cache", action="store_true", help="Cache processed data.") parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) @@ -55,14 +56,23 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - logger.warning(data) + if args.preprocess != "aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_train_data(return_type="torch") (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") From 8cfcf15f2cad9bf783e69b92cc168caa11ef8cc0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 03:39:13 +0000 Subject: [PATCH 029/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 591b71ba..77e662e5 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 514465bd7683afebe028d57a741de3558d9d548d Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:29:13 +0800 Subject: [PATCH 030/203] minor change --- examples/tuning/joint_embedding_dcca/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 0efe7567..20924128 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -159,7 +159,7 @@ def 
evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From ac137c3164d678902769a676930eb0d2b4ac67a8 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:29:16 +0800 Subject: [PATCH 031/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 591b71ba..be7b962b 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -92,7 +92,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) print(embeds) score = model.score(X_test, test_id, labels, metric="clustering") - score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'subtask': args.subtask, 'method': 'jae', From 3c717cbc9e9091ce5e38c797f672d7388f4ebe81 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 14:30:51 +0000 Subject: [PATCH 032/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 4a6c3317..b2a9bf8b 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 648b3c154c36039a4d0ae6f72d0568a9a8b2f37e Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:31:00 +0800 Subject: [PATCH 033/203] minor change --- examples/tuning/joint_embedding_dcca/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index da28136e..4a6c3317 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -160,7 +160,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # 
score.update(metrics.integration_openproblems_evaluate(adata_sol)) # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From 7a7f6a165f3f5c127edcaaae0e2d558323bbb5c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 14:31:58 +0000 Subject: [PATCH 034/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 4a6c3317..b2a9bf8b 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 4518d5f56705f1397221ea8110cc59bd2c6a6136 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:36:48 +0800 Subject: [PATCH 035/203] minor change --- examples/tuning/joint_embedding_scmogcn/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index cf593088..90ec2e96 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -104,7 +104,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) embeds = model.predict(test_id).cpu().numpy() score = model.score(test_id, labels, metric="clustering") - score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'subtask': args.subtask, 'method': 'scmogcn', From 44e5b47095323ca5334eee467dc4d664f3b75c8a Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 25 Jul 2024 21:47:27 +0800 Subject: [PATCH 036/203] minor change --- examples/tuning/joint_embedding_dcca/main.py | 190 ++++++++++--------- 1 file changed, 101 insertions(+), 89 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index b2a9bf8b..d81b6929 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -3,6 +3,7 @@ import os import pprint import sys +from copy import deepcopy from pathlib import Path import anndata as ad @@ -11,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -94,95 +95,106 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=4, layer_d_2=[4], hidden2_2=4, args=args, # Type_1="NB", Type_2="Bernoulli", ground_truth1=torch.cat([train_labels, test_labels]), cycle=1, # attention_loss="Eucli") # yapf: disable - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - # Prepare 
preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense() if scipy.sparse.issparse(data.mod["mod2"].X) else data.mod["mod2"].X, 1) / 100 - # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] - data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense() if scipy.sparse.issparse(data.mod["mod1"].X) else data.mod["mod1"].X, 1) / 100 - data.mod["mod1"].obsm["labels"] = labels - - # data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", - # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], - # feature_channel=["counts", "counts", None, None, "size_factors", - # "size_factors"], label_channel="labels") - #TODO 感觉layers中的counts才是raw - (x_train, y_train, x_train_raw, y_train_raw, x_train_size, - y_train_size), train_labels = data.get_train_data(return_type="torch") - (x_test, y_test, x_test_raw, y_test_raw, x_test_size, - y_test_size), test_labels = data.get_test_data(return_type="torch") - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - - device = torch.device(args.device) - train = data_utils.TensorDataset(x_train.float(), x_train_raw, x_train_size.float(), y_train.float(), y_train_raw, - y_train_size.float()) - - train_loader = data_utils.DataLoader(train, batch_size=args.batch_size, shuffle=True) - - test = data_utils.TensorDataset(x_test.float(), x_test_raw, x_test_size.float(), y_test.float(), y_test_raw, - y_test_size.float()) - - test_loader = data_utils.DataLoader(test, batch_size=args.batch_size, shuffle=False) - - total = data_utils.TensorDataset( - torch.cat([x_train, x_test]).float(), torch.cat([x_train_raw, x_test_raw]), - torch.cat([x_train_size, x_test_size]).float(), - torch.cat([y_train, y_test]).float(), torch.cat([y_train_raw, y_test_raw]), - torch.cat([y_train_size, y_test_size]).float()) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=50, layer_d_1=[50, 128], hidden2_1=128, - layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=50, layer_d_2=[50], hidden2_2=50, - args=args, ground_truth1=torch.cat([train_labels, test_labels]), Type_1="NB", Type_2="Bernoulli", - cycle=1, attention_loss="Eucli").to(device) - model.to(device) - model.fit(train_loader, test_loader, total_loader, "RNA") - - emb1, emb2 = model.predict(total_loader) - embeds = np.concatenate([emb1, emb2], 1) - print(embeds) - - adata = ad.AnnData( - X=embeds, - obs=data.mod["mod1"].obs, - ) - adata_sol = data.mod["test_sol"] - adata = adata[adata_sol.obs_names] - adata_sol.obsm['X_emb'] = adata.X - score = metrics.labeled_clustering_evaluate(adata, adata_sol) - # score.update(metrics.integration_openproblems_evaluate(adata_sol)) - # score.update({ - # 'seed': args.seed + k, - # 'subtask': args.subtask, - # 'method': 'dcca', - # }) - - # if res is not None: - # res = res.append(score, ignore_index=True) - # else: - # for s in score: - # score[s] = [score[s]] - # res = pd.DataFrame(score) - score["ARI"]=score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + try: + dataset = 
JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense() if scipy.sparse.issparse(data.mod["mod2"].X) else data.mod["mod2"].X, 1) / 100 + # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense() if scipy.sparse.issparse(data.mod["mod1"].X) else data.mod["mod1"].X, 1) / 100 + data.mod["mod1"].obsm["labels"] = labels + + # data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", + # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], + # feature_channel=["counts", "counts", None, None, "size_factors", + # "size_factors"], label_channel="labels") + #TODO 感觉layers中的counts才是raw + #TODO 的确感觉layers中的counts才是raw,不知道反过来影响大不大 + (x_train, y_train, x_train_raw, y_train_raw, x_train_size, + y_train_size), train_labels = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, y_test_raw, x_test_size, + y_test_size), test_labels = data.get_test_data(return_type="torch") + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + + device = torch.device(args.device) + train = data_utils.TensorDataset(x_train.float(), x_train_raw, x_train_size.float(), y_train.float(), y_train_raw, + y_train_size.float()) + + train_loader = data_utils.DataLoader(train, batch_size=args.batch_size, shuffle=True) + + test = data_utils.TensorDataset(x_test.float(), x_test_raw, x_test_size.float(), y_test.float(), y_test_raw, + y_test_size.float()) + + test_loader = data_utils.DataLoader(test, batch_size=args.batch_size, shuffle=False) + + total = data_utils.TensorDataset( + torch.cat([x_train, x_test]).float(), torch.cat([x_train_raw, x_test_raw]), + torch.cat([x_train_size, x_test_size]).float(), + torch.cat([y_train, y_test]).float(), torch.cat([y_train_raw, y_test_raw]), + torch.cat([y_train_size, y_test_size]).float()) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=50, layer_d_1=[50, 128], hidden2_1=128, + layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=50, layer_d_2=[50], hidden2_2=50, + args=args, ground_truth1=torch.cat([train_labels, test_labels]), Type_1="NB", Type_2="Bernoulli", + cycle=1, attention_loss="Eucli").to(device) + model.to(device) + model.fit(train_loader, test_loader, total_loader, "RNA") + + emb1, emb2 = model.predict(total_loader) + embeds = np.concatenate([emb1, emb2], 1) + print(embeds) + + adata = ad.AnnData( + X=embeds, + obs=data.mod["mod1"].obs, + ) + adata_sol = data.mod["test_sol"] + adata = adata[adata_sol.obs_names] + adata_sol.obsm['X_emb'] = adata.X + score = metrics.labeled_clustering_evaluate(adata, adata_sol) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update({ + # 'seed': args.seed + k, + # 'subtask': args.subtask, + # 'method': 'dcca', + # }) + + # if res is not None: + # res = res.append(score, ignore_index=True) + # else: + # for s in score: + # score[s] = [score[s]] + # res = 
pd.DataFrame(score) + score["ARI"]=score["dance_ari"] + del score["dance_ari"] + wandb.log(score.copy()) + wandb.finish() + finally: + # del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + # del labels,le,dataset,score + # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] + locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() #主要是报错时没有执行这些命令导致的,我感觉 - del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 - del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels - del labels,le,dataset,score - gc.collect() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) From aad76dc41ad9a14bc0724ade398a313ab249829d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 13:50:14 +0000 Subject: [PATCH 037/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index d81b6929..7a1ed2b3 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 4b2c0f6ab86996262ea09ecbb2182331297ad2de Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 25 Jul 2024 21:53:22 +0800 Subject: [PATCH 038/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 109 +++++++++++--------- 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 4a3b70a6..1c115617 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -7,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from 
dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -56,52 +57,66 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) - data = dataset.load_data() - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - if args.preprocess != "aux": - cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() - cell_type_labels_unique = list(np.unique(cell_type_labels)) - c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) - data.data['mod1'].obsm["cell_type"] = c_labels - data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) - (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, - G2M_score) = data.get_train_data(return_type="torch") - (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") - X_train = torch.cat([X_mod1_train, X_mod2_train], dim=1) - phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) - X_test = torch.cat([X_mod1_test, X_mod2_test], dim=1) - X_test = torch.cat([X_train, X_test]).float().to(device) - test_id = np.arange(X_test.shape[0]) - labels = torch.cat([cell_type, cell_type_test]).numpy() - adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] - - model = JAEWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), - num_phases=phase_score.shape[1], num_features=X_train.shape[1]) - model.fit(X_train, cell_type, batch_label, phase_score, max_epochs=50) - - embeds = model.predict(X_test, test_id).cpu().numpy() - print(embeds) - - score = model.score(X_test, test_id, labels, metric="clustering") - # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) - score.update({ - 'subtask': args.subtask, - 'method': 'jae', - }) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + if args.preprocess != "aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + 
data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) + (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, + G2M_score) = data.get_train_data(return_type="torch") + (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") + X_train = torch.cat([X_mod1_train, X_mod2_train], dim=1) + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + X_test = torch.cat([X_mod1_test, X_mod2_test], dim=1) + X_test = torch.cat([X_train, X_test]).float().to(device) + test_id = np.arange(X_test.shape[0]) + labels = torch.cat([cell_type, cell_type_test]).numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + + model = JAEWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=X_train.shape[1]) + model.fit(X_train, cell_type, batch_label, phase_score, max_epochs=50) + + embeds = model.predict(X_test, test_id).cpu().numpy() + print(embeds) + + score = model.score(X_test, test_id, labels, metric="clustering") + # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'jae', + }) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + finally: + # del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + # del labels,le,dataset,score + # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] + locals_keys = list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch From d834e89dbd05a887f5ba779b8b8b0f38ac066764 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 13:54:00 +0000 Subject: [PATCH 039/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 1c115617..1485c6fa 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From ea0e7feebd2be3c72b8573cfdd2b1641c7f7b121 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 29 Jul 2024 17:34:07 +0800 Subject: [PATCH 
040/203] minor change --- dance/datasets/multimodality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 7df32f7d..090509d6 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -575,7 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) - self.to_array([mod1, mod2, meta1, meta2, test_sol]) + # self.to_array([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -755,7 +755,7 @@ def _maybe_preprocess(self, raw_data): if mod1.shape[1] > self.selection_threshold: sc.pp.highly_variable_genes(mod1, layer="counts", flavor="seurat_v3", n_top_genes=self.selection_threshold, span=self.span) - mod1 = mod1[:, mod1.var["highly_variable"]] + mod1 = mod1[:, mod1.var["highly_variable"]] # Equivalent to subset=True and _inplace_subset_var if mod2.shape[1] > self.selection_threshold: sc.pp.highly_variable_genes(mod2, layer="counts", flavor="seurat_v3", n_top_genes=self.selection_threshold, span=self.span) From 50da51ec08fa6ee776702791035305767fa5531c Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 29 Jul 2024 21:05:49 +0800 Subject: [PATCH 041/203] minor changes --- dance/datasets/multimodality.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 090509d6..14e9c5c6 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -575,7 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) - # self.to_array([mod1, mod2, meta1, meta2, test_sol]) + self.to_array([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -588,9 +588,13 @@ def _raw_to_dance(self, raw_data): def to_array(self, datas): for data in datas: if scipy.sparse.issparse(data.X): - data.X = np.array(data.X.todense()).astype(float) + if not isinstance(data.X, scipy.sparse.csr_matrix): + data.X = data.X.tocsr() + # data.X = np.array(data.X.todense()).astype(float) if "counts" in data.layers and scipy.sparse.issparse(data.layers["counts"]): - data.layers["counts"] = np.array(data.layers["counts"].todense()).astype(float) + if not isinstance(data.layers["counts"], scipy.sparse.csr_matrix): + data.layers["counts"] = data.layers["counts"].tocsr() + # data.layers["counts"] = np.array(data.layers["counts"].todense()).astype(float) def _maybe_preprocess(self, raw_data): if self.preprocess is None: From 8fa1959f63fea66b0de3d9e4c3221a89f93aaf83 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 31 Jul 2024 21:10:42 +0800 Subject: [PATCH 042/203] minor change --- dance/transforms/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/transforms/normalize.py b/dance/transforms/normalize.py index b0fb7af6..71ab6622 100644 --- a/dance/transforms/normalize.py +++ b/dance/transforms/normalize.py @@ -189,7 +189,7 @@ def transform(self, X): raise RuntimeError('Transformer was not fitted on any data') if 
scipy.sparse.issparse(X): tf = X.multiply(1 / X.sum(axis=1)) - return tf.multiply(self.idf) + return tf.multiply(self.idf).tocsr() else: tf = X / X.sum(axis=1, keepdims=True) return tf * self.idf From 46ccfbf86a4cf9a56a0646dc03d67f62c32b81e7 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 1 Aug 2024 21:59:45 +0800 Subject: [PATCH 043/203] minor change --- dance/pipeline.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index 09595c52..a1aa2173 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -8,6 +8,7 @@ from operator import mul from pprint import pformat +import omegaconf import pandas as pd from omegaconf import DictConfig, OmegaConf @@ -793,6 +794,14 @@ def _params_search_space(self) -> Dict[str, Dict[str, Optional[Union[str, float] def wandb_sweep_config(self) -> Dict[str, Any]: if self.wandb_config is None: raise ValueError("wandb config not specified in the raw config.") + if "run_kwargs" in self.config: + return { + **self.wandb_config, "parameters": { + "run_kwargs": { + "values": omegaconf.OmegaConf.to_object(self.config.run_kwargs) + } + } + } return {**self.wandb_config, "parameters": self.search_space()} def wandb_sweep(self) -> Tuple[str, str, str]: @@ -807,7 +816,7 @@ def wandb_sweep(self) -> Tuple[str, str, str]: f"'entity' and 'project': {wandb_entity=!r}, {wandb_project=!r}") sweep_config = self.wandb_sweep_config() - logger.info(f"Sweep config:\n{pformat(sweep_config)}") + # logger.info(f"Sweep config:\n{pformat(sweep_config)}") wandb_sweep_id = wandb.sweep(sweep=sweep_config, entity=wandb_entity, project=wandb_project) logger.info(Color("blue")(f"\n\n\t[*] Sweep ID: {wandb_sweep_id}\n")) From ada544b21eaba34c0d99017ddc146abb63a1038a Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 1 Aug 2024 22:03:11 +0800 Subject: [PATCH 044/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 1485c6fa..8feedca7 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -19,8 +19,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", choices=[ + "GSE140203_BRAIN_atac2gex", "GSE140203_SKIN_atac2gex", "openproblems_bmmc_cite_phase2", + "openproblems_bmmc_multiome_phase2" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -57,12 +59,19 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in 
pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return try: dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) data = dataset.load_data() # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} + kwargs = {tune_mode: dict(wandb_config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) @@ -113,6 +122,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) try: exec(f"del {var}") logger.info(f"Deleted '{var}'") + except NameError: logger.info(f"Variable '{var}' does not exist, continuing...") torch.cuda.empty_cache() From c0cfd3a11de749cca4d79876cbf1d394cd6cfd6f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 14:03:53 +0000 Subject: [PATCH 045/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 8feedca7..3ba455d0 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 0507539c00526c92d5654bc320af8532faba409f Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 12 Aug 2024 20:47:59 +0800 Subject: [PATCH 046/203] minor change --- examples/tuning/step3_default_params.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 2a010328..91ba94c8 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -37,6 +37,8 @@ pipeline: base: min: 1.0 max: 10.0 + - type: normalize + target: tfidfTransform - type: normalize target: NormalizeTotal params_to_tune: From 250dfb4793871502995c7c46eb53358eedb1cc77 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:31:43 +0800 Subject: [PATCH 047/203] minor change --- dance/pipeline.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index a1aa2173..0841eed7 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1041,7 +1041,9 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ conf = OmegaConf.load(conf_load_path) pipeline_top_k = default(step2_pipeline_planer.config.pipeline_tuning_top_k, DEFAULT_PIPELINE_TUNING_TOP_K) result = pd.read_csv(result_load_path).sort_values(by=metric, ascending=ascending).head(pipeline_top_k) - columns = sorted([col for col in result.columns if col.startswith("pipeline")]) + columns = sorted( + [col for col in result.columns if (col.startswith("pipeline") or col.startswith("run_kwargs_pipeline"))], + key=lambda x: float(x.split('.')[1])) pipeline_names = result.loc[:, columns].values count = 0 for row in pipeline_names: @@ -1050,11 +1052,12 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for x in row: for k 
in conf.pipeline: if k["target"] == x: - pipeline.append(k) + pipeline.append(deepcopy(k)) for i, f in zip(required_indexes, required_funs): for k in step2_pipeline_planer.config.pipeline: if "target" in k and k["target"] == f: - pipeline.insert(i, k) + pipeline.insert(i, deepcopy(k)) + break for p1 in step2_pipeline_planer.config.pipeline: if "step3_frozen" in p1 and p1["step3_frozen"]: for p2 in pipeline: @@ -1065,12 +1068,16 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for target, d_p in p1.default_params.items(): if target == p2["target"]: p2["params"] = d_p - for p1, p2 in zip(step2_pipeline_planer.config.pipeline, pipeline): #need order + #The order is wrong here; fix it by following _sanitize_pipeline TODO + step2_pipeline = step2_pipeline_planer.config.pipeline + # step2_pipeline=sorted(step2_pipeline_planer.config.pipeline,key=lambda x: float(x.split('.')[1])) + for p1, p2 in zip(step2_pipeline, pipeline): #need order if "params" in p1: - for key, value in p1.params.items(): - if "params" not in p2: - p2.params = {} - p2.params[key] = value + p2.params = p1.params + # for key, value in p1.params.items(): + # if "params" not in p2: + # p2.params = {} + # p2.params[key] = value temp_conf = conf.copy() temp_conf.pipeline = pipeline temp_conf.wandb = step2_pipeline_planer.config.wandb From 1a6f14f4b6c9c5bf729b2609cd3010285d30ca31 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:34:34 +0800 Subject: [PATCH 048/203] minor change --- examples/tuning/step3_default_params.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 91ba94c8..69135b02 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -219,19 +219,21 @@ pipeline: - [max_genes, max_counts, min_counts, min_genes] - [max_genes, max_counts, min_genes, min_counts] min_counts: - min: 3 - max: 1000 + min: 0.0 # Change occurs when joint embedding + max: 0.05 min_genes: min: 0.0 max: 0.05 max_counts: - min: 10000 - max: 100000 + min: 0.95 + max: 1.0 max_genes: min: 0.95 max: 1.0 - type: filter.cell target: FilterCellsPlaceHolder + - type: filter.cell + target: FilterCellsCommonMod - type: feature.cell target: CellPCA params: From 0776e07ee43e2318481b9330a639511113cca1da Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:35:54 +0800 Subject: [PATCH 049/203] minor change --- examples/tuning/step3_default_params.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 2a010328..70da888c 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -54,6 +54,8 @@ pipeline: values: [null, 1e3, 1e4, 1e5, 1e6] max_fraction: values: [0.01, 0.05, 0.5, 0.7, 1.0] + - type: normalize + target: tfidfTransform - type: normalize target: NormalizePlaceHolder - type: filter.gene From e2eba29013404914eda7c45a641f13fa667d3e59 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:37:51 +0800 Subject: [PATCH 050/203] minor change --- examples/tuning/step3_default_params.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index a711c951..70da888c 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -37,8 +37,6 @@ pipeline: base: min: 1.0 max: 10.0 - - type: normalize - target:
tfidfTransform - type: normalize target: NormalizeTotal params_to_tune: From a35cbb001e852603659636a9c1f43da5158c6e39 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 14 Aug 2024 10:50:30 +0800 Subject: [PATCH 051/203] minor change --- dance/utils/wrappers.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 11314694..7dd756ba 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -124,14 +124,4 @@ def transform_mod_to_anndata(mod_data: mudata.MuData, mod_key: str): return mod_data.mod[mod_key] -# 使用装饰器 -@add_mod_and_transform -class MyClass: - def __init__(self, x, **kwargs): - self.x = x - print("-------") - print(**kwargs) - - def __call__(self, data): - return data From eae8bd225e1bb9949b6cc953f838aced011e95f9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 02:51:19 +0000 Subject: [PATCH 052/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/utils/wrappers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 7dd756ba..28ffefe8 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -122,6 +122,3 @@ def new_call(self, data: Data, *args, **kwargs): def transform_mod_to_anndata(mod_data: mudata.MuData, mod_key: str): return mod_data.mod[mod_key] - - - From 5e49b3f00277310a4fd29d1022ac118a0e5be2bf Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 14 Aug 2024 11:01:34 +0800 Subject: [PATCH 053/203] minor change --- dance/transforms/cell_feature.py | 40 +++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/dance/transforms/cell_feature.py b/dance/transforms/cell_feature.py index 4e90c97d..cb7b2ed9 100644 --- a/dance/transforms/cell_feature.py +++ b/dance/transforms/cell_feature.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.decomposition import PCA, TruncatedSVD,SparsePCA from sklearn.random_projection import GaussianRandomProjection from dance.registry import register_preprocessor @@ -168,6 +168,44 @@ def __call__(self, data): return data +@register_preprocessor("feature", "cell") +@add_mod_and_transform +class SparsePCA(BaseTransform): + """Reduce cell feature matrix with SparsePCA. + + Parameters + ---------- + n_components + Number of SparsePCA components to use. 
+ + """ + + _DISPLAY_ATTRS = ("n_components", ) + + def __init__(self, n_components: Union[float, int] = 400, *, channel: Optional[str] = None, + mod: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + + self.n_components = n_components + self.channel = channel + + def __call__(self, data): + feat = data.get_feature(return_type="numpy", channel=self.channel) + # if self.n_components > min(feat.shape): + # self.logger.warning( + # f"n_components={self.n_components} must be between 0 and min(n_samples, n_features)={min(feat.shape)} with svd_solver='full'" + # ) + # self.n_components = min(feat.shape) + pca = SparsePCA(n_components=self.n_components) + cell_feat = pca.fit_transform(feat) + self.logger.info(f"Generating cell SparsePCA features {feat.shape} (k={pca.n_components_})") + # evr = pca.explained_variance_ratio_ + # self.logger.info(f"Top 10 explained variances: {evr[:10]}") + # self.logger.info(f"Total explained variance: {evr.sum():.2%}") + data.data.obsm[self.out] = cell_feat + + return data + @register_preprocessor("feature", "cell") @add_mod_and_transform From 685582f4e86c890b42d0ccbf57094ed8ef88b4f9 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 14 Aug 2024 11:02:25 +0800 Subject: [PATCH 054/203] minor change --- examples/tuning/step3_default_params.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 4db51aa8..aaca441b 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -250,6 +250,14 @@ pipeline: n_components: min: 100 max: 1000 + - type: feature.cell + target: SparsePCA + params: + out: feature.cell + params_to_tune: + n_components: + min: 100 + max: 1000 - type: feature.cell target: WeightedFeaturePCA params: From 250fd999c3c5e7221e9741bd4e75f0c786cd7f76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 03:02:28 +0000 Subject: [PATCH 055/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/transforms/cell_feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dance/transforms/cell_feature.py b/dance/transforms/cell_feature.py index cb7b2ed9..2af1c3a6 100644 --- a/dance/transforms/cell_feature.py +++ b/dance/transforms/cell_feature.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from sklearn.decomposition import PCA, TruncatedSVD,SparsePCA +from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD from sklearn.random_projection import GaussianRandomProjection from dance.registry import register_preprocessor @@ -168,6 +168,7 @@ def __call__(self, data): return data + @register_preprocessor("feature", "cell") @add_mod_and_transform class SparsePCA(BaseTransform): From b6a034f60066553d42d594ab182fa3572c0f99b7 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 16:06:18 +0800 Subject: [PATCH 056/203] minor change --- dance/datasets/singlemodality.py | 80 +++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index 60998be7..5be20d10 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -53,7 +53,7 @@ class CellTypeAnnotationDataset(BaseDataset): def __init__(self, full_download=False, train_dataset=None, test_dataset=None, species=None, tissue=None, valid_dataset=None, train_dir="train", 
test_dir="test", valid_dir="valid", map_path="map", - data_dir="./", train_as_valid=False, val_size=0.2): + data_dir="./", train_as_valid=False, val_size=0.2,test_size=None,filetype: str = "csv"): super().__init__(data_dir, full_download) self.data_dir = data_dir @@ -73,7 +73,8 @@ def __init__(self, full_download=False, train_dataset=None, test_dataset=None, s self.valid_dataset = train_dataset self.train2valid() self.val_size = val_size - + self.test_size=test_size + self.filetype=filetype def train2valid(self): logger.info("Copy train_dataset and use it as valid_dataset") temp_ava_data = self.available_data.copy() @@ -109,12 +110,12 @@ def download_all(self): pass os.rename(download_path, move_path) - def get_all_filenames(self, filetype: str = "csv", feat_suffix: str = "data", label_suffix: str = "celltype"): + def get_all_filenames(self, feat_suffix: str = "data", label_suffix: str = "celltype"): filenames = [] - for id in self.train_dataset + self.test_dataset + (self.valid_dataset + for id in self.train_dataset + (self.test_dataset if self.test_dataset is not None else []) + (self.valid_dataset if self.valid_dataset is not None else []): - filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{filetype}") - filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{filetype}") + filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{self.filetype}") + filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{self.filetype}") return filenames def download(self, download_map=True): @@ -175,6 +176,8 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se species = self.species tissue = self.tissue valid_feat = None + if self.test_dataset is None: + return self._load_raw_data_single_h5ad() if self.valid_dataset is not None: train_dataset_ids = self.train_dataset test_dataset_ids = self.test_dataset @@ -270,6 +273,71 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se return adata, labels, idx_to_label, train_size, 0 + def _load_raw_data_single_h5ad(self, ct_col: str = "cell_type") -> Tuple[ad.AnnData, List[Set[str]], List[str], int]: + species = self.species + tissue = self.tissue + valid_feat = None + data_dir = self.data_dir + train_dir = osp.join(data_dir, self.train_dir) + data_path=osp.join(train_dir, species, f"{species}_{tissue}{self.train_dataset[0]}_data.h5ad") + adata=sc.read_h5ad(data_path) + map_path = osp.join(data_dir, self.map_path, self.species) + X_train_temp, X_test = train_test_split(adata, test_size=0.2) + X_train, X_val = train_test_split(X_train_temp, test_size=0.25) + train_feat,valid_feat,test_feat=X_train.X,X_val.X,X_test.X + train_label,valid_label,test_label=X_train.obs,X_val.obs,X_test.obs + if valid_feat is not None: + # Combine features (only use features that are present in the training data) + train_size = train_feat.shape[0] + valid_size = valid_feat.shape[0] + # Convert cell type labels and map test cell type names to train + cell_types = set(train_label[ct_col].unique()) + idx_to_label = sorted(cell_types) + cell_type_mappings: Dict[str, Set[str]] = self.get_map_dict(map_path, tissue) + train_labels, valid_labels, test_labels = train_label[ct_col].tolist(), [], [] + for i in valid_label[ct_col]: + valid_labels.append(i if i in cell_types else cell_type_mappings.get(i)) + for i in test_label[ct_col]: + test_labels.append(i if i in cell_types else cell_type_mappings.get(i)) + labels: List[Set[str]] = train_labels + valid_labels + test_labels + + 
logger.debug("Mapped valid cell-types:") + for i, j, k in zip(valid_label.index, valid_label[ct_col], valid_labels): + logger.debug(f"{i}:{j}\t-> {k}") + + logger.debug("Mapped test cell-types:") + for i, j, k in zip(test_label.index, test_label[ct_col], test_labels): + logger.debug(f"{i}:{j}\t-> {k}") + + logger.info(f"Loaded expression data: {adata}") + logger.info(f"Number of training samples: {train_feat.shape[0]:,}") + logger.info(f"Number of valid samples: {valid_feat.shape[0]:,}") + logger.info(f"Number of testing samples: {test_feat.shape[0]:,}") + logger.info(f"Cell-types (n={len(idx_to_label)}):\n{pprint.pformat(idx_to_label)}") + + return adata, labels, idx_to_label, train_size, valid_size + else: + # Combine features (only use features that are present in the training data) + train_size = train_feat.shape[0] + cell_types = set(train_label[ct_col].unique()) + idx_to_label = sorted(cell_types) + cell_type_mappings: Dict[str, Set[str]] = self.get_map_dict(map_path, tissue) + train_labels, test_labels = train_label[ct_col].tolist(), [] + for i in test_label[ct_col]: + test_labels.append(i if i in cell_types else cell_type_mappings.get(i)) + labels: List[Set[str]] = train_labels + test_labels + + logger.debug("Mapped test cell-types:") + for i, j, k in zip(test_label.index, test_label[ct_col], test_labels): + logger.debug(f"{i}:{j}\t-> {k}") + + logger.info(f"Loaded expression data: {adata}") + logger.info(f"Number of training samples: {train_feat.shape[0]:,}") + logger.info(f"Number of testing samples: {test_feat.shape[0]:,}") + logger.info(f"Cell-types (n={len(idx_to_label)}):\n{pprint.pformat(idx_to_label)}") + + return adata, labels, idx_to_label, train_size, 0 + def _raw_to_dance(self, raw_data): adata, cell_labels, idx_to_label, train_size, valid_size = raw_data adata.obsm["cell_type"] = cell_label_to_df(cell_labels, idx_to_label, index=adata.obs.index) From 455aaff4409e31278f23d6f26fa2baae5d61f081 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 08:06:51 +0000 Subject: [PATCH 057/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/datasets/singlemodality.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index 5be20d10..e515d466 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -53,7 +53,7 @@ class CellTypeAnnotationDataset(BaseDataset): def __init__(self, full_download=False, train_dataset=None, test_dataset=None, species=None, tissue=None, valid_dataset=None, train_dir="train", test_dir="test", valid_dir="valid", map_path="map", - data_dir="./", train_as_valid=False, val_size=0.2,test_size=None,filetype: str = "csv"): + data_dir="./", train_as_valid=False, val_size=0.2, test_size=None, filetype: str = "csv"): super().__init__(data_dir, full_download) self.data_dir = data_dir @@ -73,8 +73,9 @@ def __init__(self, full_download=False, train_dataset=None, test_dataset=None, s self.valid_dataset = train_dataset self.train2valid() self.val_size = val_size - self.test_size=test_size - self.filetype=filetype + self.test_size = test_size + self.filetype = filetype + def train2valid(self): logger.info("Copy train_dataset and use it as valid_dataset") temp_ava_data = self.available_data.copy() @@ -112,8 +113,8 @@ def download_all(self): def get_all_filenames(self, feat_suffix: str = "data", 
label_suffix: str = "celltype"): filenames = [] - for id in self.train_dataset + (self.test_dataset if self.test_dataset is not None else []) + (self.valid_dataset - if self.valid_dataset is not None else []): + for id in self.train_dataset + (self.test_dataset if self.test_dataset is not None else + []) + (self.valid_dataset if self.valid_dataset is not None else []): filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{self.filetype}") filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{self.filetype}") return filenames @@ -273,19 +274,20 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se return adata, labels, idx_to_label, train_size, 0 - def _load_raw_data_single_h5ad(self, ct_col: str = "cell_type") -> Tuple[ad.AnnData, List[Set[str]], List[str], int]: + def _load_raw_data_single_h5ad(self, + ct_col: str = "cell_type") -> Tuple[ad.AnnData, List[Set[str]], List[str], int]: species = self.species tissue = self.tissue valid_feat = None data_dir = self.data_dir train_dir = osp.join(data_dir, self.train_dir) - data_path=osp.join(train_dir, species, f"{species}_{tissue}{self.train_dataset[0]}_data.h5ad") - adata=sc.read_h5ad(data_path) + data_path = osp.join(train_dir, species, f"{species}_{tissue}{self.train_dataset[0]}_data.h5ad") + adata = sc.read_h5ad(data_path) map_path = osp.join(data_dir, self.map_path, self.species) X_train_temp, X_test = train_test_split(adata, test_size=0.2) X_train, X_val = train_test_split(X_train_temp, test_size=0.25) - train_feat,valid_feat,test_feat=X_train.X,X_val.X,X_test.X - train_label,valid_label,test_label=X_train.obs,X_val.obs,X_test.obs + train_feat, valid_feat, test_feat = X_train.X, X_val.X, X_test.X + train_label, valid_label, test_label = X_train.obs, X_val.obs, X_test.obs if valid_feat is not None: # Combine features (only use features that are present in the training data) train_size = train_feat.shape[0] From c58de0a9c60eb3413ad56b8f00b120b9903cf3dd Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 16:14:48 +0800 Subject: [PATCH 058/203] update metadata --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 79dfc5c1..473c37af 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,3 +66,4 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 +human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 \ No newline at end of file From 03a1109e6c4cb96d517c93a213a3318902c57bee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 08:16:14 +0000 Subject: [PATCH 059/203] [pre-commit.ci] auto fixes from pre-commit.com hooks 
for more information, see https://pre-commit.ci --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 473c37af..326229fb 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,4 +66,4 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 -human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 \ No newline at end of file +human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 From 57d643a7b882d6be7dccd0c17b871369bd53fff2 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 22:43:17 +0800 Subject: [PATCH 060/203] minor change --- examples/tuning/cta_actinn/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 506f71b9..77b963cc 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -30,7 +30,7 @@ parser.add_argument("--num_epochs", type=int, default=50, help="Number of epochs") parser.add_argument("--print_cost", action="store_true", help="Print cost when training") parser.add_argument("--species", default="mouse") - parser.add_argument("--test_dataset", nargs="+", default=[1759], help="List of testing dataset ids.") + parser.add_argument("--test_dataset", nargs="+", default=[], help="List of testing dataset ids.") parser.add_argument("--tissue", default="Spleen") parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") @@ -41,13 +41,14 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"\n{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if (dataset is not None and dataset !=[]) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") 
@@ -68,7 +69,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, valid_dataset=args.valid_dataset, data_dir="./temp_data", tissue=args.tissue, - species=args.species).load_data() + species=args.species,filetype=args.filetype).load_data() print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 730ae275bb4af43b6f3c5d2ba02b1784e3f255f0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:45:26 +0000 Subject: [PATCH 061/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_actinn/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 77b963cc..616c0d10 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -48,7 +48,8 @@ file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if (dataset is not None and dataset !=[]) + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") @@ -69,7 +70,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, valid_dataset=args.valid_dataset, data_dir="./temp_data", tissue=args.tissue, - species=args.species,filetype=args.filetype).load_data() + species=args.species, filetype=args.filetype).load_data() print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 8983aec81547915a693a6c5334b4e1c5b04ada2a Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 22:49:47 +0800 Subject: [PATCH 062/203] minor change --- dance/datasets/singlemodality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index e515d466..c85a293b 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -53,7 +53,7 @@ class CellTypeAnnotationDataset(BaseDataset): def __init__(self, full_download=False, train_dataset=None, test_dataset=None, species=None, tissue=None, valid_dataset=None, train_dir="train", test_dir="test", valid_dir="valid", map_path="map", - data_dir="./", train_as_valid=False, val_size=0.2, test_size=None, filetype: str = "csv"): + data_dir="./", train_as_valid=False, val_size=0.2, test_size=0.2, filetype: str = "csv"): super().__init__(data_dir, full_download) self.data_dir = data_dir @@ -177,7 +177,7 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se species = self.species tissue = self.tissue valid_feat = None - if self.test_dataset is None: + if self.test_dataset is None or self.test_dataset==[]: return self._load_raw_data_single_h5ad() if self.valid_dataset is not None: train_dataset_ids = self.train_dataset From 2432c2c58197532826e66012ef4cae00565ec79a Mon 
Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:51:19 +0000 Subject: [PATCH 063/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/datasets/singlemodality.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index c85a293b..302b44b8 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -177,7 +177,7 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se species = self.species tissue = self.tissue valid_feat = None - if self.test_dataset is None or self.test_dataset==[]: + if self.test_dataset is None or self.test_dataset == []: return self._load_raw_data_single_h5ad() if self.valid_dataset is not None: train_dataset_ids = self.train_dataset From 52b9df597e0af067545652cc1440594ac0a91e77 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Sep 2024 22:49:55 +0800 Subject: [PATCH 064/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 326229fb..dda0c5b4 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,4 +66,4 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 -human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 +human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 From 24434eb70411a7055d172b689d3d8aaaa775f57e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:50:51 +0000 Subject: [PATCH 065/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index bec823f5..5d1fcfb2 100644 --- a/README.md +++ b/README.md @@ -193,14 +193,14 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | ------------------- | ------------ | ------------------------------------------------------------------------------------------------------------ | ---- | ------- | -| GNN | GraphSCI | Imputing Single-cell RNA-seq data by combining Graph Convolution and Autoencoder Neural Networks | 2021 | ✅ | +| GNN | GraphSCI | Imputing Single-cell RNA-seq data by combining Graph Convolution and Autoencoder Neural Networks | 2021 | ✅ | | GNN | scGNN (2020) | SCGNN: scRNA-seq Dropout Imputation via Induced Hierarchical Cell Similarity Graph | 2020 | P1 | -| GNN | scGNN (2021) | scGNN is a novel graph neural network framework for single-cell RNA-Seq analyses | 2021 | ✅ | +| GNN | scGNN (2021) | scGNN is a novel graph neural network framework for single-cell RNA-Seq analyses | 2021 | ✅ | | GNN | GNNImpute | An efficient scRNA-seq dropout imputation method using graph attention network | 2021 | P1 | | Graph Diffusion | MAGIC | MAGIC: A diffusion-based imputation method reveals gene-gene interactions in single-cell RNA-sequencing data | 2018 | P1 | | Probabilistic Model | scImpute | An accurate and robust imputation method scImpute for single-cell RNA-seq data | 2018 | P1 | | GAN | scGAIN | scGAIN: Single Cell RNA-seq Data Imputation using Generative Adversarial Networks | 2019 | P1 | -| NN | DeepImpute | DeepImpute: an accurate, fast, and scalable deep neural network method to impute single-cell RNA-seq data | 2019 | ✅ | +| NN | DeepImpute | DeepImpute: an accurate, fast, and scalable deep neural network method to impute single-cell RNA-seq data | 2019 | ✅ | | NN + TF | Saver-X | Transfer learning in single-cell transcriptomics improves data denoising and pattern discovery | 2019 | P1 | | Model | Evaluation Metric | Mouse Brain (current/reported) | Mouse Embryo (current/reported) | PBMC (current/reported) | @@ -215,12 +215,12 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | ----------------------- | ------------- | ------------------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScDeepsort | Single-cell transcriptomics with weighted GNN | 2021 | ✅ | -| Logistic Regression | Celltypist | Cross-tissue immune cell analysis reveals tissue-specific features in humans. | 2021 | ✅ | -| Random Forest | singleCellNet | SingleCellNet: a computational tool to classify single cell RNA-Seq data across platforms and across species. | 2019 | ✅ | -| Neural Network | ACTINN | ACTINN: automated identification of cell types in single cell RNA sequencing. | 2020 | ✅ | +| GNN | ScDeepsort | Single-cell transcriptomics with weighted GNN | 2021 | ✅ | +| Logistic Regression | Celltypist | Cross-tissue immune cell analysis reveals tissue-specific features in humans. | 2021 | ✅ | +| Random Forest | singleCellNet | SingleCellNet: a computational tool to classify single cell RNA-Seq data across platforms and across species. | 2019 | ✅ | +| Neural Network | ACTINN | ACTINN: automated identification of cell types in single cell RNA sequencing. | 2020 | ✅ | | Hierarchical Clustering | SingleR | Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage. | 2019 | P1 | -| SVM | SVM | A comparison of automatic cell identification methods for single-cell RNA sequencing data. | 2018 | ✅ | +| SVM | SVM | A comparison of automatic cell identification methods for single-cell RNA sequencing data. 
| 2018 | ✅ | | Model | Evaluation Metric | Mouse Brain 2695 (current/reported) | Mouse Spleen 1759 (current/reported) | Mouse Kidney 203 (current/reported) | | ------------- | ----------------- | ----------------------------------- | ------------------------------------ | ----------------------------------- | @@ -234,12 +234,12 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | ----------- | ------------- | ------------------------------------------------------------------------------------------------------------ | ---- | ------- | -| GNN | graph-sc | GNN-based embedding for clustering scRNA-seq data | 2022 | ✅ | -| GNN | scTAG | ZINB-based Graph Embedding Autoencoder for Single-cell RNA-seq Interpretations | 2022 | ✅ | -| GNN | scDSC | Deep structural clustering for single-cell RNA-seq data jointly through autoencoder and graph neural network | 2022 | ✅ | +| GNN | graph-sc | GNN-based embedding for clustering scRNA-seq data | 2022 | ✅ | +| GNN | scTAG | ZINB-based Graph Embedding Autoencoder for Single-cell RNA-seq Interpretations | 2022 | ✅ | +| GNN | scDSC | Deep structural clustering for single-cell RNA-seq data jointly through autoencoder and graph neural network | 2022 | ✅ | | GNN | scGAC | scGAC: a graph attentional architecture for clustering single-cell RNA-seq data | 2022 | P1 | -| AutoEncoder | scDeepCluster | Clustering single-cell RNA-seq data with a model-based deep learning approach | 2019 | ✅ | -| AutoEncoder | scDCC | Model-based deep embedding for constrained clustering analysis of single cell RNA-seq data | 2021 | ✅ | +| AutoEncoder | scDeepCluster | Clustering single-cell RNA-seq data with a model-based deep learning approach | 2019 | ✅ | +| AutoEncoder | scDCC | Model-based deep embedding for constrained clustering analysis of single cell RNA-seq data | 2021 | ✅ | | AutoEncoder | scziDesk | Deep soft K-means clustering with self-training for single-cell RNA sequence data | 2020 | P1 | | Model | Evaluation Metric | 10x PBMC (current/reported) | Mouse ES (current/reported) | Worm Neuron (current/reported) | Mouse Bladder (current/reported) | @@ -256,12 +256,12 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | ---------------- | ------------------------ | -------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | +| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | | GNN | ScMoLP | Link Prediction Variant of ScMoGCN | 2022 | P1 | | GNN | GRAPE | Handling Missing Data with Graph Representation Learning | 2020 | P1 | -| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | -| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | -| Auto-encoder | BABEL | BABEL enables cross-modality translation between multiomic profiles at single-cell resolution | 2021 | ✅ | +| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | +| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | +| Auto-encoder | BABEL | BABEL enables cross-modality translation between multiomic profiles at single-cell resolution | 2021 | ✅ | | Model | Evaluation Metric | GEX2ADT (current/reported) | ADT2GEX (current/reported) | GEX2ATAC (current/reported) | ATAC2GEX (current/reported) | | ------------------------ | ----------------- | -------------------------- | -------------------------- | --------------------------- | --------------------------- | @@ -274,10 +274,10 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | ---------------- | ------------------------ | -------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | +| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | | GNN/Auto-ecnoder | GLUE | Multi-omics single-cell data integration and regulatory inference with graph-linked embedding | 2021 | P1 | -| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | -| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | +| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | +| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | | Model | Evaluation Metric | GEX2ADT (current/reported) | GEX2ATAC (current/reported) | | ------------------------ | ----------------- | -------------------------- | --------------------------- | @@ -289,11 +289,11 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | ---------------- | ------- | ----------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | -| Auto-encoder | scMVAE | Deep-joint-learning analysis model of single cell transcriptome and open chromatin accessibility data | 2020 | ✅ | -| Auto-encoder | scDEC | Simultaneous deep generative modelling and clustering of single-cell genomic data | 2021 | ✅ | +| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | +| Auto-encoder | scMVAE | Deep-joint-learning analysis model of single cell transcriptome and open chromatin accessibility data | 2020 | ✅ | +| Auto-encoder | scDEC | Simultaneous deep generative modelling and clustering of single-cell genomic data | 2021 | ✅ | | GNN/Auto-ecnoder | GLUE | Multi-omics single-cell data integration and regulatory inference with graph-linked embedding | 2021 | P1 | -| Auto-encoder | DCCA | Deep cross-omics cycle attention model for joint analysis of single-cell multi-omics data | 2021 | ✅ | +| Auto-encoder | DCCA | Deep cross-omics cycle attention model for joint analysis of single-cell multi-omics data | 2021 | ✅ | | Model | Evaluation Metric | GEX2ADT (current/reported) | GEX2ATAC (current/reported) | | ---------- | ----------------- | -------------------------- | --------------------------- | @@ -329,11 +329,11 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | -------------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | SpaGCN | SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network | 2021 | ✅ | -| GNN | STAGATE | Deciphering spatial domains from spatially resolved transcriptomics with adaptive graph attention auto-encoder | 2021 | ✅ | +| GNN | SpaGCN | SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network | 2021 | ✅ | +| GNN | STAGATE | Deciphering spatial domains from spatially resolved transcriptomics with adaptive graph attention auto-encoder | 2021 | ✅ | | Bayesian | BayesSpace | Spatial transcriptomics at subspot resolution with BayesSpace | 2021 | P1 | -| Pseudo-space-time (PST) Distance | stLearn | stLearn: integrating spatial location, tissue morphology and gene expression to find cell types, cell-cell interactions and spatial trajectories within undissociated tissues | 2020 | ✅ | -| Heuristic | Louvain | Fast unfolding of community hierarchies in large networks | 2008 | ✅ | +| Pseudo-space-time (PST) Distance | stLearn | stLearn: integrating spatial location, tissue morphology and gene expression to find cell types, cell-cell interactions and spatial trajectories within undissociated tissues | 2020 | ✅ | +| Heuristic | Louvain | Fast unfolding of community hierarchies in large networks | 2008 | ✅ | | Model | Evaluation Metric | 151673 (current/reported) | 151676 (current/reported) | 151507 (current/reported) | | ------- | ----------------- | ------------------------- | ------------------------- | ------------------------- | @@ -346,10 +346,10 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | -------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | DSTG | DSTG: deconvoluting spatial transcriptomics data through graph-based artificial intelligence | 2021 | ✅ | -| logNormReg | SpatialDecon | Advances in mixed cell deconvolution enable quantification of cell types in spatial transcriptomic data | 2022 | ✅ | -| NNMFreg | SPOTlight | SPOTlight: seeded NMF regression to deconvolute spatial transcriptomics spots with single-cell transcriptomes | 2021 | ✅ | -| NN Linear + CAR assumption | CARD | Spatially informed cell-type deconvolution for spatial transcriptomics | 2022 | ✅ | +| GNN | DSTG | DSTG: deconvoluting spatial transcriptomics data through graph-based artificial intelligence | 2021 | ✅ | +| logNormReg | SpatialDecon | Advances in mixed cell deconvolution enable quantification of cell types in spatial transcriptomic data | 2022 | ✅ | +| NNMFreg | SPOTlight | SPOTlight: seeded NMF regression to deconvolute spatial transcriptomics spots with single-cell transcriptomes | 2021 | ✅ | +| NN Linear + CAR assumption | CARD | Spatially informed cell-type deconvolution for spatial transcriptomics | 2022 | ✅ | | Model | Evaluation Metric | GSE174746 (current/reported) | CARD Synthetic (current/reported) | SPOTlight Synthetic (current/reported) | | ------------ | ----------------- | ---------------------------- | --------------------------------- | -------------------------------------- | From a80eecb172527d3ba13f55f4bde6a798f1a3108b Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Sep 2024 23:24:06 +0800 Subject: [PATCH 066/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index dda0c5b4..4d25e1db 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -67,3 +67,4 @@ mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/ mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 +human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 From 87bedf66a438b4b535c41b41b1969f49d1326563 Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 16:38:59 +0800 Subject: [PATCH 067/203] minor change --- dance/metadata/scdeepsort.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 4d25e1db..6464ba33 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -68,3 +68,5 @@ mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox. 
mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 +human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 +human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 From d44b4bfedf78ea25a7975c004124fbdba389977e Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 20:56:30 +0800 Subject: [PATCH 068/203] minor change --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 6464ba33..e5d04770 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -70,3 +70,4 @@ human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_d human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 +human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 From 27b2e9314a756423b9129e022c367265f75e5454 Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 21:23:37 +0800 Subject: [PATCH 069/203] minor change --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index e5d04770..f2fa3b4a 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -71,3 +71,4 @@ human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_d human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 
human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 +human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 From b71b4d1cf26daa9c7fc55c56042c76e4f9791bbf Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 23:02:22 +0800 Subject: [PATCH 070/203] minor change --- dance/metadata/scdeepsort.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index e5d04770..952e43fc 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,8 +66,9 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 -human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 +human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=656m2nhu&dl=1 human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 
+human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=z5djqf31&dl=1 From 2a92cf7523feb92c6443d8d686714686a28c6a75 Mon Sep 17 00:00:00 2001 From: xzy Date: Sun, 8 Sep 2024 11:07:38 +0800 Subject: [PATCH 071/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 1f90cc6c..417aabb7 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -72,3 +72,4 @@ human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_d human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 +human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/k8cgud4rd93lm8cjwbqhy/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=6jztgtxdd2u28cyoeikt4glo8&st=wihnoizk&dl=1 From a05b1a0d3ff28eeb9c20e9367ac0844a2cbf9712 Mon Sep 17 00:00:00 2001 From: xzy Date: Sun, 8 Sep 2024 20:14:05 +0800 Subject: [PATCH 072/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 417aabb7..e5b239db 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -73,3 +73,5 @@ human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_d human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/k8cgud4rd93lm8cjwbqhy/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=6jztgtxdd2u28cyoeikt4glo8&st=wihnoizk&dl=1 +human,Blood,10000,train,,,train_human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/2vavg60b8kcs63idtley7/human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=zcphpx6fgip8iuqrdzeyq7r29&st=3og2rjo0&dl=1 +human,Blood,10000,train,,,train_human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad,https://www.dropbox.com/scl/fi/gice80zbl1ljei80la4g4/human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad?rlkey=ha3xj7w79u6ogo0djcuklapd5&st=bk1zjffw&dl=1 From 
ccd4168da532817bce665868d2af5ca5266ff8bf Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 9 Sep 2024 11:11:49 +0800 Subject: [PATCH 073/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index e5b239db..8b1c2c44 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -73,5 +73,13 @@ human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_d human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/k8cgud4rd93lm8cjwbqhy/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=6jztgtxdd2u28cyoeikt4glo8&st=wihnoizk&dl=1 -human,Blood,10000,train,,,train_human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/2vavg60b8kcs63idtley7/human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=zcphpx6fgip8iuqrdzeyq7r29&st=3og2rjo0&dl=1 -human,Blood,10000,train,,,train_human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad,https://www.dropbox.com/scl/fi/gice80zbl1ljei80la4g4/human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad?rlkey=ha3xj7w79u6ogo0djcuklapd5&st=bk1zjffw&dl=1 +human,Blood,10000,train,,,train_human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/wqfbtld761iu33xydjih1/human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=6fpf8eu9lm6orczc4lzlhu22f&st=joo51ad1&dl=1 +human,Blood,10000,train,,,train_human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad,https://www.dropbox.com/scl/fi/d60o68h6gr42di9x1ogv7/human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad?rlkey=hpofxqr2gdue3avafqa09fyw7&st=ykswto0h&dl=1 +human,Blood,10000,train,,,train_human_Blood2a498ace-872a-4935-984b-1afa70fd9886_data.h5ad,https://www.dropbox.com/scl/fi/krlqfjj15wrw8qvz5ra0s/human_Blood2a498ace-872a-4935-984b-1afa70fd9886_data.h5ad?rlkey=4jxlruy78g3hp9yr608pmvs9s&st=org7vf5t&dl=1 +human,Blood,10000,train,,,train_human_Blood2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/xfw2youc9jb2ob9oun26c/human_Blood2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=imvru6vfwile7rrwqjedevhnm&st=khi36f85&dl=1 +human,Blood,10000,train,,,train_human_Blood3faad104-2ab8-4434-816d-474d8d2641db_data.h5ad,https://www.dropbox.com/scl/fi/gygnlfkfdxd0av7vrvub3/human_Blood3faad104-2ab8-4434-816d-474d8d2641db_data.h5ad?rlkey=s3uobbze9qhbrrd2u3qkqbtw6&st=2ignoyi8&dl=1 +human,Blood,10000,train,,,train_human_Blood4c4cd77c-8fee-4836-9145-16562a8782fe_data.h5ad,https://www.dropbox.com/scl/fi/lgqrq59631rxrwrajkii5/human_Blood4c4cd77c-8fee-4836-9145-16562a8782fe_data.h5ad?rlkey=3a1176le2rorzd82rf4mmswyz&st=2bpcrwsq&dl=1 +human,Blood,10000,train,,,train_human_Bloodae29ebd0-1973-40a4-a6af-d15a5f77a80f_data.h5ad,https://www.dropbox.com/scl/fi/ax97ls2ojm3x5asoip2ji/human_Bloodae29ebd0-1973-40a4-a6af-d15a5f77a80f_data.h5ad?rlkey=k51mhhavmzfy4xq8gx52tjtq8&st=14tfycvk&dl=1 
+human,Blood,10000,train,,,train_human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad,https://www.dropbox.com/scl/fi/md5mqb2dh0w9655v0c281/human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad?rlkey=afdyzlpcmd44lo7tl5gnzw8u3&st=gt2fdipz&dl=1 +human,Blood,10000,train,,,train_human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad,https://www.dropbox.com/scl/fi/1cih4y8h03dboijqieheg/human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad?rlkey=yupm1kblpt9a8qlmksz1u3xob&st=9jurnfn4&dl=1 +human,Blood,10000,train,,,train_human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad,https://www.dropbox.com/scl/fi/b2hvwk1xmbc6ifhouz4kv/human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad?rlkey=82vzr0qcii75tm4sjsn2xw89g&st=6pwjmxnk&dl=1 From 1c19f369719d7124a4e408242afa971f61b5ad91 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 10 Sep 2024 15:44:21 +0800 Subject: [PATCH 074/203] update main --- examples/tuning/cta_celltypist/main.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index d3539816..c7a049ec 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,8 +7,8 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist @@ -25,7 +25,7 @@ help="Whether to refine the predicted labels via majority voting after over-clustering.") parser.add_argument("--n_jobs", type=int, help="Number of jobs", default=10) parser.add_argument("--species", default="mouse", type=str) - parser.add_argument("--test_dataset", nargs="+", default=[1759], help="List of testing dataset ids.") + parser.add_argument("--test_dataset", nargs="+", default=[], help="List of testing dataset ids.") parser.add_argument("--tissue", default="Spleen", type=str) parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") @@ -38,13 +38,15 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"Running Celltypist with the following parameters:\n{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") MAINDIR = Path(__file__).resolve().parent @@ -64,7 +66,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, species=args.species, tissue=args.tissue, valid_dataset=args.valid_dataset, - data_dir="../temp_data").load_data() + data_dir="../temp_data", filetype=args.filetype).load_data() print(f"Pipeline 
config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 6d1f47874336706a9e6f06021fad189298c1b5a7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:45:13 +0000 Subject: [PATCH 075/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_celltypist/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index c7a049ec..c625065f 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,8 +7,8 @@ import numpy as np import torch - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist From 1c86b2bec947261212279546aa775901e9f76db4 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 10 Sep 2024 15:49:26 +0800 Subject: [PATCH 076/203] update main --- examples/tuning/cta_actinn/main.py | 4 ++-- examples/tuning/cta_scdeepsort/main.py | 13 +++++++------ examples/tuning/cta_singlecellnet/main.py | 13 +++++++------ examples/tuning/imputation_deepimpute/main.py | 2 +- examples/tuning/imputation_graphsci/main.py | 2 +- examples/tuning/imputation_scgnn2/main.py | 2 +- examples/tuning/joint_embedding_dcca/main.py | 15 ++++++++++++--- 7 files changed, 31 insertions(+), 20 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 616c0d10..84c30745 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -5,8 +5,8 @@ from typing import get_args import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN @@ -55,7 +55,7 @@ pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") logger.setLevel(args.log_level) - logger.info(f"Running SVM with the following parameters:\n{pprint.pformat(vars(args))}") + logger.info(f"Running ACTINN with the following parameters:\n{pprint.pformat(vars(args))}") def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) diff --git a/examples/tuning/cta_scdeepsort/main.py b/examples/tuning/cta_scdeepsort/main.py index 86cc5d52..1d928be6 100644 --- a/examples/tuning/cta_scdeepsort/main.py +++ b/examples/tuning/cta_scdeepsort/main.py @@ -7,8 +7,8 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.scdeepsort import ScDeepSort @@ -29,10 +29,10 @@ parser.add_argument("--n_epochs", type=int, default=100, help="number of training epochs") parser.add_argument("--n_layers", type=int, default=1, help="number of hidden gcn layers") parser.add_argument("--species", default="mouse", type=str) - parser.add_argument("--test_dataset", nargs="+", type=int, default=[1759], help="Testing dataset IDs") + parser.add_argument("--test_dataset", nargs="+", type=int, default=[], help="Testing dataset IDs") parser.add_argument("--test_rate", type=float, default=0.2) parser.add_argument("--tissue", default="Spleen", type=str) - 
parser.add_argument("--train_dataset", nargs="+", type=int, default=[1970], help="List of training dataset ids.") + parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") parser.add_argument("--weight_decay", type=float, default=5e-4, help="Weight for L2 loss") parser.add_argument("--seed", type=int, default=42) @@ -42,14 +42,15 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) - + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"Running ScDeepSort with the following parameters:\n{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") @@ -61,7 +62,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(species=args.species, tissue=args.tissue, test_dataset=args.test_dataset, train_dataset=args.train_dataset, valid_dataset=args.valid_dataset, - data_dir="../temp_data").load_data() + data_dir="../temp_data", filetype=args.filetype).load_data() # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index 892c6633..b1b87d5a 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,8 +6,8 @@ from typing import get_args import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet @@ -28,10 +28,9 @@ parser.add_argument("--num_trees", type=int, default=1000) parser.add_argument("--species", default="mouse", type=str) parser.add_argument("--stratify", type=bool, default=True) - parser.add_argument("--test_dataset", type=int, nargs="+", default=[1759], - help="List testing training dataset ids.") + parser.add_argument("--test_dataset", nargs="+", default=[], help="List testing training dataset ids.") parser.add_argument("--tissue", default="Spleen", type=str) - parser.add_argument("--train_dataset", type=int, nargs="+", default=[1970], help="List of training dataset ids.") + parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") parser.add_argument("--seed", type=int, default=10) @@ -40,13 +39,15 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) 
parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") @@ -62,7 +63,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, species=args.species, tissue=args.tissue, valid_dataset=args.valid_dataset, - data_dir="../temp_data").load_data(cache=args.cache) + data_dir="../temp_data", filetype=args.filetype).load_data(cache=args.cache) kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") diff --git a/examples/tuning/imputation_deepimpute/main.py b/examples/tuning/imputation_deepimpute/main.py index 35989205..041ee8b9 100644 --- a/examples/tuning/imputation_deepimpute/main.py +++ b/examples/tuning/imputation_deepimpute/main.py @@ -5,8 +5,8 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.deepimpute import DeepImpute diff --git a/examples/tuning/imputation_graphsci/main.py b/examples/tuning/imputation_graphsci/main.py index 59705b7a..353cb246 100644 --- a/examples/tuning/imputation_graphsci/main.py +++ b/examples/tuning/imputation_graphsci/main.py @@ -7,9 +7,9 @@ import anndata as ad import numpy as np import torch -import wandb import dance.transforms.normalize as NormFuncs +import wandb from dance import logger from dance.data import Data from dance.datasets.singlemodality import ImputationDataset diff --git a/examples/tuning/imputation_scgnn2/main.py b/examples/tuning/imputation_scgnn2/main.py index aa5f31f9..ee75b386 100644 --- a/examples/tuning/imputation_scgnn2/main.py +++ b/examples/tuning/imputation_scgnn2/main.py @@ -5,8 +5,8 @@ from pprint import pformat import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.scgnn2 import ScGNN2 diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 7a1ed2b3..a17d10f6 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -95,11 +95,19 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=4, layer_d_2=[4], 
hidden2_2=4, args=args, # Type_1="NB", Type_2="Bernoulli", ground_truth1=torch.cat([train_labels, test_labels]), cycle=1, # attention_loss="Eucli") # yapf: disable + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return try: dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") data = dataset.load_data() # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} + kwargs = {tune_mode: dict(wandb_config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) @@ -120,7 +128,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) y_train_size), train_labels = data.get_train_data(return_type="torch") (x_test, y_test, x_test_raw, y_test_raw, x_test_size, y_test_size), test_labels = data.get_test_data(return_type="torch") - + train_idx=data.get_split_idx("train") + test_idx=data.get_split_idx("test") Nfeature1 = x_train.shape[1] Nfeature2 = y_train.shape[1] From 9cc76e433ae978788acc8e0fa69d9bc56e47631d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:50:43 +0000 Subject: [PATCH 077/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_actinn/main.py | 2 +- examples/tuning/cta_scdeepsort/main.py | 2 +- examples/tuning/cta_singlecellnet/main.py | 2 +- examples/tuning/imputation_deepimpute/main.py | 2 +- examples/tuning/imputation_graphsci/main.py | 2 +- examples/tuning/imputation_scgnn2/main.py | 2 +- examples/tuning/joint_embedding_dcca/main.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 84c30745..230b3295 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -5,8 +5,8 @@ from typing import get_args import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN diff --git a/examples/tuning/cta_scdeepsort/main.py b/examples/tuning/cta_scdeepsort/main.py index 1d928be6..40e1e7af 100644 --- a/examples/tuning/cta_scdeepsort/main.py +++ b/examples/tuning/cta_scdeepsort/main.py @@ -7,8 +7,8 @@ import numpy as np import torch - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.scdeepsort import ScDeepSort diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index b1b87d5a..cc5406d9 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,8 +6,8 @@ from typing import get_args import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet diff --git a/examples/tuning/imputation_deepimpute/main.py b/examples/tuning/imputation_deepimpute/main.py index 041ee8b9..35989205 100644 --- 
a/examples/tuning/imputation_deepimpute/main.py +++ b/examples/tuning/imputation_deepimpute/main.py @@ -5,8 +5,8 @@ import numpy as np import torch - import wandb + from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.deepimpute import DeepImpute diff --git a/examples/tuning/imputation_graphsci/main.py b/examples/tuning/imputation_graphsci/main.py index 353cb246..59705b7a 100644 --- a/examples/tuning/imputation_graphsci/main.py +++ b/examples/tuning/imputation_graphsci/main.py @@ -7,9 +7,9 @@ import anndata as ad import numpy as np import torch +import wandb import dance.transforms.normalize as NormFuncs -import wandb from dance import logger from dance.data import Data from dance.datasets.singlemodality import ImputationDataset diff --git a/examples/tuning/imputation_scgnn2/main.py b/examples/tuning/imputation_scgnn2/main.py index ee75b386..aa5f31f9 100644 --- a/examples/tuning/imputation_scgnn2/main.py +++ b/examples/tuning/imputation_scgnn2/main.py @@ -5,8 +5,8 @@ from pprint import pformat import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.scgnn2 import ScGNN2 diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index a17d10f6..dd8f9f76 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From e3ba672e679ee47ff2ff1846786e27db3d344456 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 13 Sep 2024 23:29:36 +0800 Subject: [PATCH 078/203] update scn --- dance/transforms/scn_feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/transforms/scn_feature.py b/dance/transforms/scn_feature.py index da86e5d1..9a959a3a 100644 --- a/dance/transforms/scn_feature.py +++ b/dance/transforms/scn_feature.py @@ -49,7 +49,7 @@ def __call__(self, data): # sc.pp.scale(adata, max_value=10) # Filtering shouldn't be here norm_exp_df = adata.to_df() - cell_type_df = cell_type_df.loc[adata.obs_names] # not necessary, but kept here in case we subsample cells + # cell_type_df = cell_type_df.loc[adata.obs_names] # not necessary, but kept here in case we subsample cells # Get differentially expressed genes and gene pairs cell_type_array = cell_type_df.columns.values[cell_type_df.values.argmax(1)] From 04a3a4bac72c9b4f5e1fbb9bcbe394f6559c48c1 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Sep 2024 17:19:26 +0800 Subject: [PATCH 079/203] minor change --- get_result_web.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 get_result_web.py diff --git a/get_result_web.py b/get_result_web.py new file mode 100644 index 00000000..8d79d857 --- /dev/null +++ b/get_result_web.py @@ -0,0 +1,85 @@ +import json +import os + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from dance.utils import try_import + +# os.environ["http_proxy"]="http://121.250.209.147:7890" +# os.environ["https_proxy"]="http://121.250.209.147:7890" +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" 
+collect_datasets = {
+    "cta_actinn": [
+        "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab",
+        "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+        "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+        "d9b4bc69-ed90-4f5f-99b2-61b0681ba436"
+    ],
+    "cta_celltypist": [
+        "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+        "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+        "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031",
+        "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886",
+        "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db"
+    ],
+    "cta_singlecellnet": [
+        "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+        "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+        "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031",
+        "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886",
+        "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db"
+    ]
+}
+file_root = "/home/zyxing/dance/examples/tuning"
+
+
+def check_identical_strings(string_list):
+    if not string_list:
+        raise ValueError("The list is empty")
+
+    arr = np.array(string_list)
+    if not np.all(arr == arr[0]):
+        raise ValueError("Found differing strings")
+
+    return string_list[0]
+
+
+    # if not string_list:
+    #     raise ValueError("The list is empty")
+    #     first_string = string_list[0]
+    #     for s in string_list[1:]:
+    #         if s != first_string:
+    #             raise ValueError(f"Found differing strings: '{first_string}' and '{s}'")
+    #     return first_string
+def get_sweep_url(step_csv: pd.DataFrame):
+    ids = step_csv["id"]
+    sweep_urls = []
+    for run_id in tqdm(ids, leave=False):
+        api = wandb.Api()
+        run = api.run(f"/{entity}/{project}/runs/{run_id}")
+        sweep_urls.append(run.sweep.url)
+    sweep_url = check_identical_strings(sweep_urls)
+    return sweep_url
+
+
+def write_ans():
+    ans = []
+    for method_folder in tqdm(collect_datasets):
+        for dataset_id in collect_datasets[method_folder]:
+            file_path = f"{file_root}/{method_folder}/{dataset_id}/results"
+            step2_url = get_sweep_url(pd.read_csv(f"{file_path}/pipeline/best_test_acc.csv"))
+            step3_urls = []
+            for i in range(3):
+                step3_urls.append(get_sweep_url(pd.read_csv(f"{file_path}/params/{i}_best_test_acc.csv")))
+            step3_str = ",".join(step3_urls)
+            step_str = f"step2:{step2_url}|step3:{step3_str}"
+            ans.append({"Dataset_id": dataset_id, method_folder: step_str})
+    with open('temp_ans.json', 'w') as f:
+        json.dump(ans, f)
+
+
+write_ans()

From ab7e193de5833450e0db3df9dde010c7ced8fc21 Mon Sep 17 00:00:00 2001
From: xzy
Date: Thu, 19 Sep 2024 23:36:13 +0800
Subject: [PATCH 080/203] update_get_result_web

---
 get_result_web.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/get_result_web.py b/get_result_web.py
index 8d79d857..cb5018ef 100644
--- a/get_result_web.py
+++ b/get_result_web.py
@@ -7,8 +7,8 @@
 
 from dance.utils import try_import
 
-# os.environ["http_proxy"]="http://121.250.209.147:7890"
-# os.environ["https_proxy"]="http://121.250.209.147:7890"
+os.environ["http_proxy"] = "http://121.250.209.147:7890"
+os.environ["https_proxy"] = "http://121.250.209.147:7890"
 wandb = try_import("wandb")
 entity = "xzy11632"
 project = "dance-dev"
@@ -55,13 +55,16 @@ def check_identical_strings(string_list):
     #         if s != first_string:
     #             raise ValueError(f"Found differing strings: '{first_string}' and '{s}'")
     #     return first_string
-def get_sweep_url(step_csv: pd.DataFrame):
+def get_sweep_url(step_csv: pd.DataFrame, single=True):
     ids = step_csv["id"]
     sweep_urls = []
-    for run_id in tqdm(ids, leave=False):
+    for run_id in tqdm(reversed(ids),
+                       leave=False):  #The reversal of order is related to additional_sweep_ids.append(sweep_id)
         api = wandb.Api()
         run = api.run(f"/{entity}/{project}/runs/{run_id}")
         sweep_urls.append(run.sweep.url)
+        if single:
+            break
     sweep_url = check_identical_strings(sweep_urls)
     return sweep_url
 
@@ -74,7 +77,11 @@ def write_ans():
             step2_url = get_sweep_url(pd.read_csv(f"{file_path}/pipeline/best_test_acc.csv"))
             step3_urls = []
             for i in range(3):
-                step3_urls.append(get_sweep_url(pd.read_csv(f"{file_path}/params/{i}_best_test_acc.csv")))
+                file_csv = f"{file_path}/params/{i}_best_test_acc.csv"
+                if not os.path.exists(file_csv):
+                    print(f"File {file_csv} does not exist, skipping.")
+                    continue
+                step3_urls.append(get_sweep_url(pd.read_csv(file_csv)))
             step3_str = ",".join(step3_urls)
             step_str = f"step2:{step2_url}|step3:{step3_str}"
             ans.append({"Dataset_id": dataset_id, method_folder: step_str})

From f1e85b07225d2a8b74936dc4f4f2f4b9e8445b86 Mon Sep 17 00:00:00 2001
From: xzy
Date: Sun, 29 Sep 2024 09:21:22 +0800
Subject: [PATCH 081/203] update 159

---
 .../multi_modality/joint_embedding/scmogcn.py |   2 +-
 .../multi_modality/joint_embedding/scmvae.py  |   5 +-
 .../tuning/joint_embedding_scmogcn/main.py    | 149 +++++++++------
 .../tuning/joint_embedding_scmvae/main.py     | 177 ++++++++++--------
 4 files changed, 197 insertions(+), 136 deletions(-)

diff --git a/dance/modules/multi_modality/joint_embedding/scmogcn.py b/dance/modules/multi_modality/joint_embedding/scmogcn.py
index f0567293..a5fd4bd0 100644
--- a/dance/modules/multi_modality/joint_embedding/scmogcn.py
+++ b/dance/modules/multi_modality/joint_embedding/scmogcn.py
@@ -116,7 +116,7 @@ def fit(self, g_mod1, g_mod2, train_size, cell_type, batch_label, phase_score):
             Bipartite expression feature graph for modality 1.
         g_mod2 : dgl.DGLGraph
             Bipartite expression feature graph for modality 2.
-        train_size : int
+        train_size : int or array_like
            Number of training samples.
         labels : torch.Tensor
            Labels for training samples.
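Note on the docstring change above: train_size is widened from int to "int or array_like",
which matches the updated example script later in this patch where the call passes
train_size=train_idx (an explicit list of training indices) instead of a sample count.
The snippet below is an illustrative sketch only, not part of the patch; the helper name
resolve_train_indices is hypothetical and simply shows how an int-or-index-array argument
might be normalized into an index array before building a training mask.

    import numpy as np

    def resolve_train_indices(train_size, num_samples):
        # Hypothetical helper: accept an int (use the first train_size samples)
        # or an array-like of explicit sample indices, and return an index array.
        if np.isscalar(train_size):
            assert int(train_size) <= num_samples, "train_size exceeds the number of samples"
            return np.arange(int(train_size))
        idx = np.asarray(train_size, dtype=int)
        assert idx.size == 0 or idx.max() < num_samples, "training index out of range"
        return idx

    print(resolve_train_indices(3, 10))          # [0 1 2]
    print(resolve_train_indices([0, 4, 7], 10))  # [0 4 7]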
diff --git a/dance/modules/multi_modality/joint_embedding/scmvae.py b/dance/modules/multi_modality/joint_embedding/scmvae.py index 3905f2fe..36837053 100644 --- a/dance/modules/multi_modality/joint_embedding/scmvae.py +++ b/dance/modules/multi_modality/joint_embedding/scmvae.py @@ -369,7 +369,7 @@ def _inference(self, X1=None, X2=None): if X1 is not None: if self.log_variational: - X1_ = torch.log(X1_ + 1) + X1_ = torch.log(torch.clamp(X1_,min=1e-7)+ 1) mean_l, logvar_l, library = self.X1_encoder_l(X1_) @@ -380,7 +380,8 @@ def _inference(self, X1=None, X2=None): if self.Type == 'ZINB': if self.log_variational: - X2_ = torch.log(X2_ + 1) + # X2_ = torch.log(X2_ + 1) + X2_ = torch.log(torch.clamp(X2_,min=1e-7)+ 1) mean_l2, logvar_l2, library2 = self.X2_encoder_l(X2_) means, logvar = self._encode_modalities(X1_, X2_) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 90ec2e96..bbb6d890 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -20,7 +21,7 @@ parser = argparse.ArgumentParser() parser.add_argument( "-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_SKIN_atac2gex","openproblems_2022_multi_atac2gex"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -55,66 +56,98 @@ logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" - + os.environ["CUDA_LAUNCH_BLOCKING"]="1" + os.environ["WANDB_AGENT_DISABLE_FLAPPING"] = "True" def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) - data = dataset.load_data() - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - if args.preprocess != "aux": - cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() - cell_type_labels_unique = list(np.unique(cell_type_labels)) - c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) - data.data['mod1'].obsm["cell_type"] = c_labels - data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) - - train_size = len(data.get_split_idx("train")) - - data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) - data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) - # 
data.set_config( - # feature_mod=["mod1", "mod2"], - # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], - # feature_channel=["X_pca", "X_pca"], - # label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], - # ) - (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") - phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) - test_id = np.arange(x_mod1.shape[0]) - labels = cell_type.numpy() - adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] - model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), - num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) - model.fit( - g_mod1=data.data["mod1"].uns["g"], - g_mod2=data.data["mod2"].uns["g"], - train_size=train_size, - cell_type=cell_type, - batch_label=batch_label, - phase_score=phase_score, - ) - - embeds = model.predict(test_id).cpu().numpy() - score = model.score(test_id, labels, metric="clustering") - # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) - score.update({ - 'subtask': args.subtask, - 'method': 'scmogcn', - }) - - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb_config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + # train_idx=list(set(data.mod["meta1"].obs_names) & set(data.mod["mod1"].obs_names)) + train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) + + # train_size=data.mod["meta1"].shape[0] + # test_size=data.mod["mod1"].shape[0]-train_size + data.set_split_idx("train",train_idx) + data.set_split_idx("test",test_idx) + if args.preprocess != "aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) + + # train_size = len(data.get_split_idx("train")) + #按理说meta1应该包括mod1前半部分的所有内容,可能中途打乱了顺序 + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) + # data.set_config( + # feature_mod=["mod1", "mod2"], + # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], + # 
feature_channel=["X_pca", "X_pca"], + # label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], + # ) + (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + test_id = np.arange(x_mod1.shape[0]) + labels = cell_type.numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) + model.fit( + g_mod1=data.data["mod1"].uns["g"], + g_mod2=data.data["mod2"].uns["g"], + train_size=train_idx, + cell_type=cell_type, + batch_label=batch_label, + phase_score=phase_score, + ) + + embeds = model.predict(test_id).cpu().numpy() + score = model.score(test_id, labels, metric="clustering") + # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'scmogcn', + }) + + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + finally: + # del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + # del labels,le,dataset,score + # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] + locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index c52f2108..b3070b40 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -71,81 +72,107 @@ def parameter_setting(): def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - - (x_train, y_train), _ = data.get_train_data(return_type="torch") - (x_test, y_test), labels = data.get_test_data(return_type="torch") - - lib_mean1, lib_var1 = 
calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) - lib_mean1 = torch.from_numpy(lib_mean1) - lib_var1 = torch.from_numpy(lib_var1) - lib_mean2 = torch.from_numpy(lib_mean2) - lib_var2 = torch.from_numpy(lib_var2) - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - train_size = len(data.get_split_idx("train")) - train = data_utils.TensorDataset(x_train, lib_mean1[:train_size], lib_var1[:train_size], lib_mean2[:train_size], - lib_var2[:train_size], y_train) - - valid = data_utils.TensorDataset(x_test, lib_mean1[train_size:], lib_var1[train_size:], lib_mean2[train_size:], - lib_var2[train_size:], y_test) - - total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - - # embeds = model.predict(x_test, y_test).cpu().numpy() - score = model.score(x_test, y_test, labels) - score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod1"].obsm["labels"] = labels + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb_config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) + + # train_size=data.mod["meta1"].shape[0] + # test_size=data.mod["mod1"].shape[0]-train_size + data.set_split_idx("train",train_idx) + data.set_split_idx("test",test_idx) + (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch") + (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch") + # 
x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) + lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) + lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) + lib_mean1 = torch.from_numpy(lib_mean1) + lib_var1 = torch.from_numpy(lib_var1) + lib_mean2 = torch.from_numpy(lib_mean2) + lib_var2 = torch.from_numpy(lib_var2) + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + # train_size = len(data.get_split_idx("train")) + # train_size=x_train.shape[0] + train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], + lib_var2[train_idx], y_train) + + valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], + lib_var2[test_idx], y_test) + + total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + + x_test = torch.cat([x_train, x_test]) + y_test = torch.cat([y_train, y_test]) + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 + model = scMVAE( + encoder_1=[Nfeature1, 1024, 128, 128], + hidden_1=128, + Z_DIMS=22, + decoder_share=[22, 128, 256], + share_hidden=128, + decoder_1=[128, 128, 1024], + hidden_2=1024, + encoder_l=[Nfeature1, 128], + hidden3=128, + encoder_2=[Nfeature2, 1024, 128, 128], + hidden_4=128, + encoder_l1=[Nfeature2, 128], + hidden3_1=128, + decoder_2=[128, 128, 1024], + hidden_5=1024, + drop_rate=0.1, + log_variational=True, + Type="ZINB", + device=device, + n_centroids=22, + penality="GMM", + model=1, + ) + model.to(device) + model.init_gmm_params(total_loader) + model.fit(args, train, valid, args.final_rate, args.scale_factor, device) + + # embeds = model.predict(x_test, y_test).cpu().numpy() + score = model.score(x_test, y_test, labels) + # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + finally: + locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From ab0b5dfd5793be26c71bf10f800801312770c864 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Sep 2024 01:21:54 +0000 Subject: [PATCH 082/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_modality/joint_embedding/scmvae.py | 4 +-- .../tuning/joint_embedding_scmogcn/main.py | 31 +++++++++++-------- .../tuning/joint_embedding_scmvae/main.py | 22 ++++++------- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/dance/modules/multi_modality/joint_embedding/scmvae.py b/dance/modules/multi_modality/joint_embedding/scmvae.py index 36837053..0173ea39 100644 --- a/dance/modules/multi_modality/joint_embedding/scmvae.py +++ b/dance/modules/multi_modality/joint_embedding/scmvae.py @@ -369,7 +369,7 @@ def _inference(self, X1=None, X2=None): if X1 is not None: if 
self.log_variational: - X1_ = torch.log(torch.clamp(X1_,min=1e-7)+ 1) + X1_ = torch.log(torch.clamp(X1_, min=1e-7) + 1) mean_l, logvar_l, library = self.X1_encoder_l(X1_) @@ -381,7 +381,7 @@ def _inference(self, X1=None, X2=None): if self.Type == 'ZINB': if self.log_variational: # X2_ = torch.log(X2_ + 1) - X2_ = torch.log(torch.clamp(X2_,min=1e-7)+ 1) + X2_ = torch.log(torch.clamp(X2_, min=1e-7) + 1) mean_l2, logvar_l2, library2 = self.X2_encoder_l(X2_) means, logvar = self._encode_modalities(X1_, X2_) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index bbb6d890..39152202 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -20,8 +20,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_SKIN_atac2gex","openproblems_2022_multi_atac2gex"]) + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", choices=[ + "GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", + "GSE140203_SKIN_atac2gex", "openproblems_2022_multi_atac2gex" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -56,8 +58,9 @@ logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" - os.environ["CUDA_LAUNCH_BLOCKING"]="1" + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" os.environ["WANDB_AGENT_DISABLE_FLAPPING"] = "True" + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) @@ -78,14 +81,14 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) # train_idx=list(set(data.mod["meta1"].obs_names) & set(data.mod["mod1"].obs_names)) - train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - + train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) + # train_size=data.mod["meta1"].shape[0] # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train",train_idx) - data.set_split_idx("test",test_idx) + data.set_split_idx("train", train_idx) + data.set_split_idx("test", test_idx) if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) @@ -106,13 +109,15 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # feature_channel=["X_pca", "X_pca"], # label_channel=["cell_type", "batch_label", "phase_labels", 
"S_scores", "G2M_scores"], # ) - (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") + (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, + G2M_score) = data.get_data(return_type="torch") phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) test_id = np.arange(x_mod1.shape[0]) labels = cell_type.numpy() adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] - model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), - num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) + model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), + num_batches=int(batch_label.max() + 1), num_phases=phase_score.shape[1], + num_features=x_mod1.shape[1] + x_mod2.shape[1]) model.fit( g_mod1=data.data["mod1"].uns["g"], g_mod2=data.data["mod2"].uns["g"], @@ -139,7 +144,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels # del labels,le,dataset,score # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] - locals_keys=list(locals().keys()) + locals_keys = list(locals().keys()) for var in locals_keys: try: exec(f"del {var}") diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index b3070b40..9fb85885 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -93,16 +93,16 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - + train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) + # train_size=data.mod["meta1"].shape[0] # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train",train_idx) - data.set_split_idx("test",test_idx) - (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch") - (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch") + data.set_split_idx("train", train_idx) + data.set_split_idx("test", test_idx) + (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, y_test_raw), labels = data.get_test_data(return_type="torch") # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) 
lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) @@ -116,10 +116,10 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # train_size = len(data.get_split_idx("train")) # train_size=x_train.shape[0] train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], - lib_var2[train_idx], y_train) + lib_var2[train_idx], y_train) valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], - lib_var2[test_idx], y_test) + lib_var2[test_idx], y_test) total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) @@ -164,7 +164,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) wandb.log(score) wandb.finish() finally: - locals_keys=list(locals().keys()) + locals_keys = list(locals().keys()) for var in locals_keys: try: exec(f"del {var}") From cbc4469aa3b7a923fb658107c2809a562511d319 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 3 Oct 2024 00:09:55 +0800 Subject: [PATCH 083/203] update metadata --- dance/metadata/scdeepsort.csv | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 8b1c2c44..fc732dad 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -83,3 +83,8 @@ human,Blood,10000,train,,,train_human_Bloodae29ebd0-1973-40a4-a6af-d15a5f77a80f_ human,Blood,10000,train,,,train_human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad,https://www.dropbox.com/scl/fi/md5mqb2dh0w9655v0c281/human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad?rlkey=afdyzlpcmd44lo7tl5gnzw8u3&st=gt2fdipz&dl=1 human,Blood,10000,train,,,train_human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad,https://www.dropbox.com/scl/fi/1cih4y8h03dboijqieheg/human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad?rlkey=yupm1kblpt9a8qlmksz1u3xob&st=9jurnfn4&dl=1 human,Blood,10000,train,,,train_human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad,https://www.dropbox.com/scl/fi/b2hvwk1xmbc6ifhouz4kv/human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad?rlkey=82vzr0qcii75tm4sjsn2xw89g&st=6pwjmxnk&dl=1 +human,Blood,549,train,,,train_human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=af7wxs06&dl=1 +human,Blood,1324,train,,,train_human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad,https://www.dropbox.com/scl/fi/kbuvlttd8dfmvx1v94fr4/human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad?rlkey=v1ne1dg2gl8b4j6qj3j3ry6fy&st=gy9vb5q6&dl=1 +human,Blood,10000,train,,,train_human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad,https://www.dropbox.com/scl/fi/8wq8eaod0xuvgwhsjoapa/human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad?rlkey=b6u3b7335l7baricjlgbwthb3&st=cw7mjmx5&dl=1 +human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=cfgc3m7s&dl=1 
+human,Blood,10000,train,,,train_human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad,https://www.dropbox.com/scl/fi/kgay0bhk4er6qjx96okrz/human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad?rlkey=m5ax0vhx3vh7ylo4pc74tx9ky&st=sbhonz18&dl=1 From 26c73bafacc4f4a5bad4ce06c1ce3bda73d64344 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 3 Oct 2024 00:11:19 +0800 Subject: [PATCH 084/203] minor change --- get_result_web.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 8d79d857..726e4426 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -7,31 +7,32 @@ from dance.utils import try_import -# os.environ["http_proxy"]="http://121.250.209.147:7890" -# os.environ["https_proxy"]="http://121.250.209.147:7890" +os.environ["http_proxy"] = "http://121.250.209.147:7890" +os.environ["https_proxy"] = "http://121.250.209.147:7890" wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" collect_datasets = { "cta_actinn": [ - "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab", - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436" - ], - "cta_celltypist": [ - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", + "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" ], + "cta_celltypist": [ + "471647b3-04fe-4c76-8372-3264feb950e8", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + ], + "cta_scdeepsort": [ + "471647b3-04fe-4c76-8372-3264feb950e8", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + ], "cta_singlecellnet": [ - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" + "471647b3-04fe-4c76-8372-3264feb950e8", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", ] } file_root = "/home/zyxing/dance/examples/tuning" @@ -55,13 +56,16 @@ def check_identical_strings(string_list): # if s != first_string: # raise ValueError(f"发现不同的字符串: '{first_string}' 和 '{s}'") # return first_string -def get_sweep_url(step_csv: pd.DataFrame): +def get_sweep_url(step_csv: pd.DataFrame, single=True): ids = step_csv["id"] sweep_urls = [] - for run_id in tqdm(ids, leave=False): + for run_id in tqdm(reversed(ids), + leave=False): #The reversal of order is related to additional_sweep_ids.append(sweep_id) api = wandb.Api() run = api.run(f"/{entity}/{project}/runs/{run_id}") sweep_urls.append(run.sweep.url) + if single: + break sweep_url = check_identical_strings(sweep_urls) return sweep_url From 
4f45b1753deb614b1416007cb660ecbdaad3c1b9 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 2 Oct 2024 12:22:56 -0400 Subject: [PATCH 085/203] minor change --- get_result_web.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 8d79d857..2a5df8b8 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -13,13 +13,12 @@ entity = "xzy11632" project = "dance-dev" collect_datasets = { - "cta_actinn": [ - "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab", - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436" - ], "cta_celltypist": [ + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "d3566d6a-a455-4a15-980f-45eb29114cab", + ], + "cta_scdeepsort": [ + "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab", "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", @@ -27,14 +26,11 @@ "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" ], "cta_singlecellnet": [ - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "d3566d6a-a455-4a15-980f-45eb29114cab", ] } -file_root = "/home/zyxing/dance/examples/tuning" +file_root = "/egr/research-dselab/dingjia5/zhongyu/dance/examples/tuning" def check_identical_strings(string_list): @@ -58,7 +54,8 @@ def check_identical_strings(string_list): def get_sweep_url(step_csv: pd.DataFrame): ids = step_csv["id"] sweep_urls = [] - for run_id in tqdm(ids, leave=False): + for run_id in tqdm(reversed(ids), + leave=False): #The reversal of order is related to additional_sweep_ids.append(sweep_id) api = wandb.Api() run = api.run(f"/{entity}/{project}/runs/{run_id}") sweep_urls.append(run.sweep.url) From 436c7e51f8565a4246d0c9d4b065336d521d5c47 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 7 Oct 2024 21:05:48 +0800 Subject: [PATCH 086/203] update get_result_web --- get_result_web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/get_result_web.py b/get_result_web.py index cb5018ef..4232fdd9 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -78,7 +78,7 @@ def write_ans(): step3_urls = [] for i in range(3): file_csv = f"{file_path}/params/{i}_best_test_acc.csv" - if not os.path.exists(file_csv): + if not os.path.exists(file_csv): #no parameter print(f"文件 {file_csv} 不存在,跳过。") continue step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) From b1e205d898564cca733379d77255524d864771d5 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 23 Oct 2024 15:49:27 +0800 Subject: [PATCH 087/203] sc_sim --- sc_similarity/anndata_similarity.py | 180 ++++++++++++++++++++++++++++ sc_similarity/example_usage.py | 89 ++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 sc_similarity/anndata_similarity.py create mode 100644 sc_similarity/example_usage.py diff --git 
a/sc_similarity/anndata_similarity.py b/sc_similarity/anndata_similarity.py new file mode 100644 index 00000000..eb8d355c --- /dev/null +++ b/sc_similarity/anndata_similarity.py @@ -0,0 +1,180 @@ +# anndata_similarity.py +# TODO translate notes +import warnings +from typing import Callable, Dict, List + +import anndata +import numpy as np +import pandas as pd +from scipy.spatial.distance import jaccard +from scipy.stats import pearsonr, wasserstein_distance +from sklearn.metrics.pairwise import cosine_similarity + +# Suppress scipy warnings for constant input in Pearson correlation +warnings.filterwarnings("ignore", message="An input array is constant") + + +class AnnDataSimilarity: + + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData): + """Initialize the AnnDataSimilarity object and preprocess the data.""" + self.adata1 = adata1.copy() + self.adata2 = adata2.copy() + self.preprocess() + self.results = {} + self.results_score = {} + + def preprocess(self): + """Preprocess the data: compute the per-cell-type average expression and normalize it to probability distributions.""" + # Ensure cell type labels are strings + self.adata1.obs['celltype'] = self.adata1.obs['celltype'].astype(str) + self.adata2.obs['celltype'] = self.adata2.obs['celltype'].astype(str) + + # Average expression per cell type + self.avg_expr1 = self._compute_average_expression(self.adata1) + self.avg_expr2 = self._compute_average_expression(self.adata2) + + # Normalize to probability distributions for JS divergence etc. + self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) + self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) + + def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: + """Compute the average gene expression of each cell type.""" + return adata.to_df().groupby(adata.obs['celltype']).mean() + + def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: + """Normalize the expression matrix to a probability distribution (each cell type sums to 1).""" + return df.div(df.sum(axis=1), axis=0).fillna(0) + + def cosine_sim(self) -> pd.DataFrame: + """Cosine similarity between the two datasets. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) + return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) + + def pearson_corr(self) -> pd.DataFrame: + """Pearson correlation between the two datasets. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + corr_matrix.at[ct1, ct2] = corr + + return corr_matrix.astype(float) + + def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: + """Jaccard similarity between the two datasets, computed on expression binarized at the given threshold. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + # Binarize the expression matrices + binary_expr1 = (self.avg_expr1 > threshold).astype(int) + binary_expr2 = (self.avg_expr2 > threshold).astype(int) + + celltypes1 = binary_expr1.index + celltypes2 = binary_expr2.index + sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) + sim_matrix.at[ct1, ct2] = sim + + return sim_matrix.astype(float) + + def js_distance(self) -> pd.DataFrame: + """Jensen-Shannon similarity (1 - JS distance) between the two datasets, computed on the probability-normalized expression. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + # def jsd(p, q): + # """ + # Jensen-Shannon divergence of the probability distributions p and q. + # """ + # p = p + 1e-12 + # q = q + 1e-12 + # m = 0.5 * (p + q) + # return 0.5 * (entropy(p, m) + entropy(q, m)) + + # from scipy.stats import entropy + + celltypes1 = self.prob_expr1.index + celltypes2 = self.prob_expr2.index + js_matrix = 
pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, + self.prob_expr2.loc[ct2].values) + js_matrix.at[ct1, ct2] = jsd_value + + return js_matrix.astype(float) + + def _jensen_shannon_divergence(self, p, q) -> float: + """Jensen-Shannon distance between the probability distributions p and q (scipy's jensenshannon).""" + from scipy.spatial.distance import jensenshannon + return jensenshannon(p, q) + + def otdd(self): + """Optimal transport dataset distance (OTDD) between the two datasets.""" + raise NotImplementedError("OTDD!") + + def wasserstein_dist(self) -> pd.DataFrame: + """Wasserstein distance between the two datasets. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + wasserstein_matrix.at[ct1, ct2] = wd + + return wasserstein_matrix.astype(float) + + def compute_similarity( + self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance', + 'otdd']) -> Dict[str, pd.DataFrame]: + """Compute the requested similarity measures. Args: + + methods: list of similarity measures to compute. Supported: 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein', 'otdd' + Returns: + a dict mapping each method name to its similarity matrix + + """ + results = {} + for method in methods: + if method == 'cosine': + results['cosine'] = self.cosine_sim() + elif method == 'pearson': + results['pearson'] = self.pearson_corr() + elif method == 'jaccard': + results['jaccard'] = self.jaccard_sim() + elif method == 'js_distance': + results['js_distance'] = self.js_distance() + elif method == 'wasserstein': + results['wasserstein'] = self.wasserstein_dist() + elif method == "otdd": + results['otdd'] = self.otdd() + else: + raise ValueError(f"Unsupported similarity method: {method}") + return results + + def get_similarity_matrix( + self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance']) -> Dict[str, pd.DataFrame]: + """Same as compute_similarity; kept for naming consistency.""" + self.results = self.compute_similarity(methods) + return self.results + + def get_max_similarity_A_to_B(self): + if not self.results: + raise ValueError("No similarity results yet; call get_similarity_matrix first.") + else: + self.results_score = {} + for key in self.results: + self.results_score[key] = self._get_max_similarity(self.results[key]) + return self.results_score + + def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + """Best-match average similarity: the mean of the row-wise maximum similarities.""" + max_similarity = similarity_matrix.max(axis=1) + overall_similarity = max_similarity.mean() + return overall_similarity diff --git a/sc_similarity/example_usage.py b/sc_similarity/example_usage.py new file mode 100644 index 00000000..3fedb874 --- /dev/null +++ b/sc_similarity/example_usage.py @@ -0,0 +1,89 @@ +# test_anndata_similarity.py + +import anndata +import numpy as np +import pandas as pd +from anndata_similarity import AnnDataSimilarity + + +def create_test_ann_data(): + # Define genes and cell types + genes = ['gene1', 'gene2'] + celltypes1 = ['A', 'B'] + celltypes2 = ['A', 'B'] + + # Create dataset 1 + data1 = np.array([ + [10, 0], # cell type A + [0, 10] # cell type B + ]) + obs1 = pd.DataFrame({'celltype': celltypes1}, index=['cell1', 'cell2']) + adata1 = anndata.AnnData(X=data1, obs=obs1, var=pd.DataFrame(index=genes)) + + # Create dataset 2 + data2 = np.array([ + [10, 0], # cell type A + [10, 0] # cell type B + ]) + obs2 = pd.DataFrame({'celltype': celltypes2}, index=['cell3', 'cell4']) + adata2 = anndata.AnnData(X=data2, obs=obs2, var=pd.DataFrame(index=genes)) + + return adata1, adata2 + + +def 
run_test_case(): + # Create the test data + adata1, adata2 = create_test_ann_data() + + # Initialize the similarity calculator + similarity_calculator = AnnDataSimilarity(adata1, adata2) + + # Compute the similarities + similarity_matrices = similarity_calculator.compute_similarity( + methods=['cosine', 'pearson', 'jaccard', 'js_distance']) + + # Expected results + expected_cosine = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) + + expected_pearson = pd.DataFrame([[1.0, 1.0], [-1.0, -1.0]], index=['A', 'B'], columns=['A', 'B']) + + expected_jaccard = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) + + expected_js = pd.DataFrame([[1.0, 1.0], [0.167445, 0.167445]], index=['A', 'B'], columns=['A', 'B']) + + # Print the results + print("Computed Cosine Similarity:") + print(similarity_matrices['cosine']) + print("\nExpected Cosine Similarity:") + print(expected_cosine) + + print("\nComputed Pearson Correlation:") + print(similarity_matrices['pearson']) + print("\nExpected Pearson Correlation:") + print(expected_pearson) + + print("\nComputed Jaccard Similarity:") + print(similarity_matrices['jaccard']) + print("\nExpected Jaccard Similarity:") + print(expected_jaccard) + + print("\nComputed Jensen-Shannon distance:") + print(similarity_matrices['js_distance']) + print("\nExpected Jensen-Shannon distance:") + print(expected_js) + + # Verify that the results match the expected values + assert similarity_matrices['cosine'].equals(expected_cosine), "Cosine similarity does not match expected values." + assert similarity_matrices['pearson'].equals( + expected_pearson), "Pearson correlation does not match expected values." + assert similarity_matrices['jaccard'].equals(expected_jaccard), "Jaccard similarity does not match expected values." + + # Use an approximate comparison because of floating point precision + assert np.allclose(similarity_matrices['js_distance'], expected_js, + atol=1e-4), "JS distance does not match expected values." 
+ + print("\nAll tests passed successfully!") + + +if __name__ == "__main__": + run_test_case() From ea5bb9b096ab222f47b4533cd746f7aaa724c7c3 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 20:18:04 +0800 Subject: [PATCH 088/203] update get_result_web --- get_result_web.py | 133 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 104 insertions(+), 29 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 37e694ec..e9fce0fc 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -3,38 +3,20 @@ import numpy as np import pandas as pd +from omegaconf import OmegaConf +from sympy import im from tqdm import tqdm from dance.utils import try_import +# get yaml of best method os.environ["http_proxy"] = "http://121.250.209.147:7890" os.environ["https_proxy"] = "http://121.250.209.147:7890" wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" -collect_datasets = { - "cta_actinn": [ - "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" - ], - "cta_celltypist": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_scdeepsort": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_singlecellnet": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ] -} +with open("dataset_server.json") as f: + collect_datasets = json.load(f) file_root = "/home/zyxing/dance/examples/tuning" @@ -70,24 +52,117 @@ def get_sweep_url(step_csv: pd.DataFrame, single=True): return sweep_url +import re + + +def spilt_web(url: str): + pattern = r"https://wandb\.ai/([^/]+)/([^/]+)/sweeps/([^/]+)" + + match = re.search(pattern, url) + + if match: + entity = match.group(1) + project = match.group(2) + sweep_id = match.group(3) + + return entity, project, sweep_id + else: + print(url) + print("No match found") + + +def get_best_method(urls, metric_col="test_acc"): + all_best_run = None + all_best_step_name = None + step_names = ["step2", "step3_0", "step3_1", "step3_2"] + + def get_metric(run): + if metric_col not in run.summary: + return float('-inf') + else: + return run.summary[metric_col] + + for step_name, url in zip(step_names, urls): + _, _, sweep_id = spilt_web(url) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + goal = sweep.config["metric"]["goal"] + if goal == "maximize": + best_run = max(sweep.runs, key=get_metric) + elif goal == "minimize": + best_run = min(sweep.runs, key=get_metric) + else: + raise RuntimeError("choose goal in ['minimize','maximize']") + if metric_col not in best_run.summary: + continue + if all_best_run is None: + all_best_run = best_run + all_best_step_name = step_name + elif all_best_run.summary[metric_col] < best_run.summary[metric_col] and goal == "maximize": + all_best_run = best_run + all_best_step_name = step_name + elif all_best_run.summary[metric_col] > best_run.summary[metric_col] and goal == "minimize": + all_best_run = best_run + all_best_step_name = step_name + return all_best_step_name, all_best_run + + +def get_best_yaml(step_name, best_run, file_path): + if step_name == "step2": + conf = 
OmegaConf.load(f"{file_path}/pipeline_params_tuning_config.yaml") + for i, fun in enumerate(conf["pipeline"]): + if "include" not in fun: + continue + type_fun = fun["type"] + prefix = f"pipeline.{i}.{type_fun}" + # filtered_dict = {k: v for k, v in b_run.config.items() if k==prefix}.items()[0] + fun_name = best_run.config[prefix] + fun['target'] = fun_name + if 'params' not in fun: + fun['params'] = {} + if "default_params" in fun and fun_name in fun["default_params"]: + fun['params'].update(fun["default_params"][fun_name]) + del fun["include"] + del fun["default_params"] + else: + step3_number = step_name.split("_")[1] + conf = OmegaConf.load(f"{file_path}/config_yamls/params/{step3_number}_test_acc_params_tuning_config.yaml") + for i, fun in enumerate(conf['pipeline']): + if 'params_to_tune' not in fun: + continue + target = fun["target"] + prefix = f"params.{i}.{target}" + filtered_dict = {k: v for k, v in best_run.config.items() if k.startswith(prefix)} + for k, v in filtered_dict.items(): + param_name = k.split(".")[-1] + fun['params_to_tune'][param_name] = v + if "params" not in fun: + fun["params"] = {} + fun["params"].update(fun['params_to_tune']) + del fun["params_to_tune"] + return OmegaConf.to_yaml(conf["pipeline"]) + + def write_ans(): ans = [] for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: - file_path = f"{file_root}/{method_folder}/{dataset_id}/results" - step2_url = get_sweep_url(pd.read_csv(f"{file_path}/pipeline/best_test_acc.csv")) + file_path = f"{file_root}/{method_folder}/{dataset_id}" + step2_url = get_sweep_url(pd.read_csv(f"{file_path}/results/pipeline/best_test_acc.csv")) step3_urls = [] for i in range(3): - file_csv = f"{file_path}/params/{i}_best_test_acc.csv" + file_csv = f"{file_path}/results/params/{i}_best_test_acc.csv" if not os.path.exists(file_csv): #no parameter print(f"文件 {file_csv} 不存在,跳过。") continue step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) step3_str = ",".join(step3_urls) step_str = f"step2:{step2_url}|step3:{step3_str}" - ans.append({"Dataset_id": dataset_id, method_folder: step_str}) - with open('temp_ans.json', 'w') as f: - json.dump(ans, f) + step_name, best_run = get_best_method([step2_url] + step3_urls) + best_yaml = get_best_yaml(step_name, best_run, file_path) + ans.append({"Dataset_id": dataset_id, method_folder: step_str, "best_yaml": best_yaml}) + # with open('temp_ans.json', 'w') as f: + # json.dump(ans, f,indent=4) + pd.DataFrame(ans).to_csv("temp_ans.csv") write_ans() From ef68af1f8163136f75cc96c38a7bee6e4faacf0a Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 20:52:14 +0800 Subject: [PATCH 089/203] minor change --- get_result_web.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/get_result_web.py b/get_result_web.py index e9fce0fc..97717245 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from natsort import os_sort_key from omegaconf import OmegaConf from sympy import im from tqdm import tqdm @@ -142,11 +143,22 @@ def get_best_yaml(step_name, best_run, file_path): return OmegaConf.to_yaml(conf["pipeline"]) +def check_exist(file_path): + file_path = f"{file_path}/results/params/" + if os.path.exists(file_path) and os.path.isdir(file_path): + file_num = len(os.listdir(file_path)) + return file_num > 1 + else: + return False + + def write_ans(): ans = [] for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: file_path = 
f"{file_root}/{method_folder}/{dataset_id}" + if not check_exist(file_path): + continue step2_url = get_sweep_url(pd.read_csv(f"{file_path}/results/pipeline/best_test_acc.csv")) step3_urls = [] for i in range(3): From 2b918df2513f5cef6303040db2ef1bba4514828d Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 20:52:44 +0800 Subject: [PATCH 090/203] minor change --- dataset_server.json | 92 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 dataset_server.json diff --git a/dataset_server.json b/dataset_server.json new file mode 100644 index 00000000..10a279da --- /dev/null +++ b/dataset_server.json @@ -0,0 +1,92 @@ +{ + "cta_actinn": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ] + , + "cta_celltypist": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ], + "cta_scdeepsort": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ] + , + "cta_singlecellnet": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", 
+ "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ] +} From 86d000df330499468a9530aa669fa06aea99cefb Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 21:07:50 +0800 Subject: [PATCH 091/203] minor_change --- get_result_web.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 37e694ec..ca1dc1f2 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -13,26 +13,26 @@ entity = "xzy11632" project = "dance-dev" collect_datasets = { - "cta_actinn": [ - "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" - ], - "cta_celltypist": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_scdeepsort": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], + # "cta_actinn": [ + # "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", + # "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", + # "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", + # "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" + # ], + # "cta_celltypist": [ + # "471647b3-04fe-4c76-8372-3264feb950e8", + # "8a554710-08bc-4005-87cd-da9675bdc2e7", + # "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + # ], + # "cta_scdeepsort": [ + # "471647b3-04fe-4c76-8372-3264feb950e8", + # "8a554710-08bc-4005-87cd-da9675bdc2e7", + # "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + # ], "cta_singlecellnet": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" ] } file_root = "/home/zyxing/dance/examples/tuning" @@ -87,7 +87,7 @@ def write_ans(): step_str = f"step2:{step2_url}|step3:{step3_str}" ans.append({"Dataset_id": dataset_id, method_folder: step_str}) with open('temp_ans.json', 'w') as f: - json.dump(ans, f) + json.dump(ans, f, indent=4) write_ans() From 5d32d2eacca45fc82811e45ad7e1aeebf21728f3 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 08:11:55 -0500 Subject: [PATCH 092/203] minor --- get_result_web.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 7d3ed259..fe3c54ec 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -7,32 +7,14 @@ from dance.utils import try_import -os.environ["http_proxy"] = 
"http://121.250.209.147:7890" -os.environ["https_proxy"] = "http://121.250.209.147:7890" wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" collect_datasets = { - "cta_actinn": [ - "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" - ], "cta_celltypist": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_scdeepsort": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_singlecellnet": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" ] } file_root = "/egr/research-dselab/dingjia5/zhongyu/dance/examples/tuning" @@ -87,7 +69,7 @@ def write_ans(): step_str = f"step2:{step2_url}|step3:{step3_str}" ans.append({"Dataset_id": dataset_id, method_folder: step_str}) with open('temp_ans.json', 'w') as f: - json.dump(ans, f) + json.dump(ans, f, indent=4) write_ans() From 2408e5428d563214528a7fd37dbf415cd2c4de5d Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 21:15:14 +0800 Subject: [PATCH 093/203] minor --- get_result_web.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 97717245..bed6753b 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -11,8 +11,7 @@ from dance.utils import try_import # get yaml of best method -os.environ["http_proxy"] = "http://121.250.209.147:7890" -os.environ["https_proxy"] = "http://121.250.209.147:7890" + wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" From 7d30ad76bcf8a2410ff17655f9b807af5cbe6124 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 21:21:42 +0800 Subject: [PATCH 094/203] minor change --- get_result_web.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 97717245..18fc3708 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -104,7 +104,7 @@ def get_metric(run): elif all_best_run.summary[metric_col] > best_run.summary[metric_col] and goal == "minimize": all_best_run = best_run all_best_step_name = step_name - return all_best_step_name, all_best_run + return all_best_step_name, all_best_run, all_best_run.summary[metric_col] def get_best_yaml(step_name, best_run, file_path): @@ -169,9 +169,14 @@ def write_ans(): step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) step3_str = ",".join(step3_urls) step_str = f"step2:{step2_url}|step3:{step3_str}" - step_name, best_run = get_best_method([step2_url] + step3_urls) + step_name, best_run, best_res = get_best_method([step2_url] + step3_urls) best_yaml = get_best_yaml(step_name, best_run, file_path) - ans.append({"Dataset_id": dataset_id, method_folder: step_str, "best_yaml": best_yaml}) + ans.append({ + "Dataset_id": dataset_id, + method_folder: step_str, + f"{method_folder}_best_yaml": best_yaml, + f"{method_folder}_best_res": best_res + }) # with open('temp_ans.json', 'w') as f: # 
json.dump(ans, f,indent=4) pd.DataFrame(ans).to_csv("temp_ans.csv") From 05d995dc809849f5b99e6075db910116277c1c9b Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 9 Nov 2024 10:58:21 +0800 Subject: [PATCH 095/203] minor --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index fc732dad..d8fa18ce 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -86,5 +86,5 @@ human,Blood,10000,train,,,train_human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_ human,Blood,549,train,,,train_human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=af7wxs06&dl=1 human,Blood,1324,train,,,train_human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad,https://www.dropbox.com/scl/fi/kbuvlttd8dfmvx1v94fr4/human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad?rlkey=v1ne1dg2gl8b4j6qj3j3ry6fy&st=gy9vb5q6&dl=1 human,Blood,10000,train,,,train_human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad,https://www.dropbox.com/scl/fi/8wq8eaod0xuvgwhsjoapa/human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad?rlkey=b6u3b7335l7baricjlgbwthb3&st=cw7mjmx5&dl=1 -human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=cfgc3m7s&dl=1 +human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/7gszhapz281uah6ytc615/human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad?rlkey=28ywz595f00ppjqwg6054tdqj&st=7bxft78n&dl=1 human,Blood,10000,train,,,train_human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad,https://www.dropbox.com/scl/fi/kgay0bhk4er6qjx96okrz/human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad?rlkey=m5ax0vhx3vh7ylo4pc74tx9ky&st=sbhonz18&dl=1 From fb148f3599dcd6bebe77bdb081223d3ac55cb066 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 13 Nov 2024 15:42:38 +0800 Subject: [PATCH 096/203] minor change --- .../dataset_server.json | 0 .../get_result_web.py | 9 +- .../sc_similarity_examples/sim_query_atlas.py | 92 +++++++++ sc_similarity/anndata_similarity.py | 180 ------------------ sc_similarity/example_usage.py | 89 --------- 5 files changed, 98 insertions(+), 272 deletions(-) rename dataset_server.json => examples/dataset_server.json (100%) rename get_result_web.py => examples/get_result_web.py (97%) create mode 100644 examples/sc_similarity_examples/sim_query_atlas.py delete mode 100644 sc_similarity/anndata_similarity.py delete mode 100644 sc_similarity/example_usage.py diff --git a/dataset_server.json b/examples/dataset_server.json similarity index 100% rename from dataset_server.json rename to examples/dataset_server.json diff --git a/get_result_web.py b/examples/get_result_web.py similarity index 97% rename from get_result_web.py rename to examples/get_result_web.py index 64c214c8..ee5d4158 100644 --- a/get_result_web.py +++ b/examples/get_result_web.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path import numpy as np import pandas as pd @@ -15,9 +16,10 @@ wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" -with open("dataset_server.json") as f: +file_root = str(Path(__file__).resolve().parent) +with open(f"{file_root}/dataset_server.json") as f: 
collect_datasets = json.load(f) -file_root = "/home/zyxing/dance/examples/tuning" +file_root = "./tuning" def check_identical_strings(string_list): @@ -181,4 +183,5 @@ def write_ans(): pd.DataFrame(ans).to_csv("temp_ans.csv") -write_ans() +if __name__ == "__main__": + write_ans() diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/sc_similarity_examples/sim_query_atlas.py new file mode 100644 index 00000000..0b6dc8d7 --- /dev/null +++ b/examples/sc_similarity_examples/sim_query_atlas.py @@ -0,0 +1,92 @@ +import pandas as pd + +atlas_datasets = [ + "01209dce-3575-4bed-b1df-129f57fbc031", "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569" +] +import sys + +sys.path.append("..") +import ast + +from get_result_web import get_sweep_url, spilt_web + +from dance.utils import try_import + + +def find_unique_matching_row(df, config_col, input_dict_list): + """Find the single row whose config column matches the given list of dicts. + + :param df: pandas.DataFrame containing the data to search. + :param config_col: str, name of the DataFrame column holding the stringified list of dicts. + :param input_dict_list: list of dicts to match against. + :return: pandas.Series, the matching row. + :raises ValueError: if the number of matching rows is not exactly 1. + + """ + + # Helper that parses the stored string and compares it with the input + def is_match(config_str): + try: + # Safely parse the string into a Python object with ast.literal_eval + config = ast.literal_eval(config_str) + return config == input_dict_list + except (ValueError, SyntaxError): + # If parsing fails, treat it as a non-match + return False + + # Apply the comparison to obtain a boolean Series + matches = df[config_col].apply(is_match) + + # Collect all matching rows + matching_rows = df[matches] + + # Check the number of matches + num_matches = len(matching_rows) + if num_matches == 1: + return matching_rows.iloc[0] + elif num_matches == 0: + raise ValueError("No matching row found.") + else: + raise ValueError(f"Found {num_matches} matching rows; expected exactly one.") + + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +query_datasets = [ + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" +] + + +def get_ans(query_dataset, method): + data = pd.read_csv(f"/home/zyxing/dance/examples/tuning/{method}/{query_dataset}/results/atlas/best_test_acc.csv") + sweep_url = get_sweep_url(data) + _, _, sweep_id = spilt_web(sweep_url) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + ans = pd.DataFrame(index=[method], columns=atlas_datasets) + for i, run_kwarg in enumerate(sweep.config["parameters"]["run_kwargs"]["values"]): + ans.loc[method, atlas_datasets[i]] = find_unique_matching_row(data, "run_kwargs", run_kwarg)["test_acc"] + # ans.append({atlas_datasets[i]:find_unique_matching_row(data,"run_kwargs",run_kwarg)["test_acc"]}) + return ans + + +ans_all = {} +methods = ["cta_actinn", "cta_scdeepsort"] +if __name__ == "__main__": + for query_dataset in query_datasets: + ans = [] + for method in methods: + ans.append(get_ans(query_dataset, method)) + ans = pd.concat(ans) + ans_all[query_dataset] = ans + for k, v in ans_all.items(): + v.to_csv(f"{str(methods)}_{k}_in_atlas.csv") diff 
--git a/sc_similarity/anndata_similarity.py b/sc_similarity/anndata_similarity.py deleted file mode 100644 index eb8d355c..00000000 --- a/sc_similarity/anndata_similarity.py +++ /dev/null @@ -1,180 +0,0 @@ -# anndata_similarity.py -# TODO translate notes -import warnings -from typing import Callable, Dict, List - -import anndata -import numpy as np -import pandas as pd -from scipy.spatial.distance import jaccard -from scipy.stats import pearsonr, wasserstein_distance -from sklearn.metrics.pairwise import cosine_similarity - -# Suppress scipy warnings for constant input in Pearson correlation -warnings.filterwarnings("ignore", message="An input array is constant") - - -class AnnDataSimilarity: - - def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData): - """初始化 AnnDataSimilarity 对象,进行数据预处理。""" - self.adata1 = adata1.copy() - self.adata2 = adata2.copy() - self.preprocess() - self.results = {} - self.results_score = {} - - def preprocess(self): - """预处理数据,包括对数归一化和归一化为概率分布。""" - # 对原始数据进行对数归一化 - self.adata1.obs['celltype'] = self.adata1.obs['celltype'].astype(str) - self.adata2.obs['celltype'] = self.adata2.obs['celltype'].astype(str) - - # 计算每个细胞类型的平均表达 - self.avg_expr1 = self._compute_average_expression(self.adata1) - self.avg_expr2 = self._compute_average_expression(self.adata2) - - # 归一化为概率分布以计算 JS 散度等 - self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) - self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) - - def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: - """计算每种细胞类型的平均基因表达。""" - return adata.to_df().groupby(adata.obs['celltype']).mean() - - def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: - """将基因表达矩阵归一化为概率分布(每个细胞类型的表达总和为1)。""" - return df.div(df.sum(axis=1), axis=0).fillna(0) - - def cosine_sim(self) -> pd.DataFrame: - """计算两个数据集间的余弦相似度。 返回数据框,行和列分别为 adata1 和 adata2 的细胞类型。""" - sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) - return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) - - def pearson_corr(self) -> pd.DataFrame: - """计算两个数据集间的皮尔逊相关系数。 返回数据框,行和列分别为 adata1 和 adata2 的细胞类型。""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - corr_matrix.at[ct1, ct2] = corr - - return corr_matrix.astype(float) - - def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: - """计算两个数据集间的 Jaccard 相似度。 使用基因表达的二值化表示,基于指定阈值。 返回数据框,行和列分别为 adata1 和 adata2 - 的细胞类型。""" - # 二值化表达矩阵 - binary_expr1 = (self.avg_expr1 > threshold).astype(int) - binary_expr2 = (self.avg_expr2 > threshold).astype(int) - - celltypes1 = binary_expr1.index - celltypes2 = binary_expr2.index - sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) - sim_matrix.at[ct1, ct2] = sim - - return sim_matrix.astype(float) - - def js_distance(self) -> pd.DataFrame: - """计算两个数据集间的 Jensen-Shannon 散度。 需要先将表达数据归一化为概率分布。 返回数据框,行和列分别为 adata1 和 adata2 - 的细胞类型。""" - # def jsd(p, q): - # """ - # 计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。 - # """ - # p = p + 1e-12 - # q = q + 1e-12 - # m = 0.5 * (p + q) - # return 0.5 * (entropy(p, m) + entropy(q, m)) - - # from scipy.stats import entropy - - celltypes1 = self.prob_expr1.index - celltypes2 = self.prob_expr2.index - js_matrix 
= pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, - self.prob_expr2.loc[ct2].values) - js_matrix.at[ct1, ct2] = jsd_value - - return js_matrix.astype(float) - - def _jensen_shannon_divergence(self, p, q) -> float: - """计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。""" - from scipy.spatial.distance import jensenshannon - return jensenshannon(p, q) - - def otdd(): - """计算两个数据集间的 OTDD。""" - raise NotImplementedError("OTDD!") - - def wasserstein_dist(self) -> pd.DataFrame: - """计算两个数据集间的 Wasserstein 距离。 返回数据框,行和列分别为 adata1 和 adata2 的细胞类型。""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - wasserstein_matrix.at[ct1, ct2] = wd - - return wasserstein_matrix.astype(float) - - def compute_similarity( - self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance', - 'otdd']) -> Dict[str, pd.DataFrame]: - """计算指定的相似性度量。 参数: - - methods: 要计算的相似性度量方法列表。支持 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' - 返回: - 包含各个相似性矩阵的字典 - - """ - results = {} - for method in methods: - if method == 'cosine': - results['cosine'] = self.cosine_sim() - elif method == 'pearson': - results['pearson'] = self.pearson_corr() - elif method == 'jaccard': - results['jaccard'] = self.jaccard_sim() - elif method == 'js_distance': - results['js_distance'] = self.js_distance() - elif method == 'wasserstein': - results['wasserstein'] = self.wasserstein_dist() - elif method == "otdd": - results['otdd'] = self.otdd() - else: - raise ValueError(f"Unsupported similarity method: {method}") - return results - - def get_similarity_matrix( - self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance']) -> Dict[str, pd.DataFrame]: - """同 compute_similarity,保留方法名一致性。""" - self.results = self.compute_similarity(methods) - return self.results - - def get_max_similarity_A_to_B(self): - if self.results is None: - raise ValueError(f"need results!") - else: - self.results_score = {} - for key in self.results: - self.results_score[key] = self._get_max_similarity(self.results[key]) - return self.results_score - - def _get_max_similarity(self, similarity_matrix: pd.DataFrame): - """最大匹配平均相似性分数.""" - max_similarity = similarity_matrix.max(axis=1) - overall_similarity = max_similarity.mean() - return overall_similarity diff --git a/sc_similarity/example_usage.py b/sc_similarity/example_usage.py deleted file mode 100644 index 3fedb874..00000000 --- a/sc_similarity/example_usage.py +++ /dev/null @@ -1,89 +0,0 @@ -# test_anndata_similarity.py - -import anndata -import numpy as np -import pandas as pd -from anndata_similarity import AnnDataSimilarity - - -def create_test_ann_data(): - # 定义基因和细胞类型 - genes = ['gene1', 'gene2'] - celltypes1 = ['A', 'B'] - celltypes2 = ['A', 'B'] - - # 创建数据集1 - data1 = np.array([ - [10, 0], # 细胞类型 A - [0, 10] # 细胞类型 B - ]) - obs1 = pd.DataFrame({'celltype': celltypes1}, index=['cell1', 'cell2']) - adata1 = anndata.AnnData(X=data1, obs=obs1, var=pd.DataFrame(index=genes)) - - # 创建数据集2 - data2 = np.array([ - [10, 0], # 细胞类型 A - [10, 0] # 细胞类型 B - ]) - obs2 = pd.DataFrame({'celltype': celltypes2}, index=['cell3', 'cell4']) - adata2 = anndata.AnnData(X=data2, obs=obs2, var=pd.DataFrame(index=genes)) - - return adata1, adata2 - - -def 
run_test_case(): - # 创建测试数据 - adata1, adata2 = create_test_ann_data() - - # 初始化相似性计算器 - similarity_calculator = AnnDataSimilarity(adata1, adata2) - - # 计算相似性 - similarity_matrices = similarity_calculator.compute_similarity( - methods=['cosine', 'pearson', 'jaccard', 'js_distance']) - - # 预期结果 - expected_cosine = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) - - expected_pearson = pd.DataFrame([[1.0, 1.0], [-1.0, -1.0]], index=['A', 'B'], columns=['A', 'B']) - - expected_jaccard = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) - - expected_js = pd.DataFrame([[1.0, 1.0], [0.167445, 0.167445]], index=['A', 'B'], columns=['A', 'B']) - - # 打印结果 - print("Computed Cosine Similarity:") - print(similarity_matrices['cosine']) - print("\nExpected Cosine Similarity:") - print(expected_cosine) - - print("\nComputed Pearson Correlation:") - print(similarity_matrices['pearson']) - print("\nExpected Pearson Correlation:") - print(expected_pearson) - - print("\nComputed Jaccard Similarity:") - print(similarity_matrices['jaccard']) - print("\nExpected Jaccard Similarity:") - print(expected_jaccard) - - print("\nComputed Jensen-Shannon distance:") - print(similarity_matrices['js_distance']) - print("\nExpected Jensen-Shannon distance:") - print(expected_js) - - # 验证结果是否与预期一致 - assert similarity_matrices['cosine'].equals(expected_cosine), "Cosine similarity does not match expected values." - assert similarity_matrices['pearson'].equals( - expected_pearson), "Pearson correlation does not match expected values." - assert similarity_matrices['jaccard'].equals(expected_jaccard), "Jaccard similarity does not match expected values." - - # 由于浮点数计算的精度问题,使用近似比较 - assert np.allclose(similarity_matrices['js_distance'], expected_js, - atol=1e-4), "JS distance does not match expected values." 
- - print("\nAll tests passed successfully!") - - -if __name__ == "__main__": - run_test_case() From 18b197f98bf5184f1d46084bb4dce22c56377522 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 13 Nov 2024 15:46:56 +0800 Subject: [PATCH 097/203] minor change --- examples/sc_similarity_examples/sim_query_atlas.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/sc_similarity_examples/sim_query_atlas.py index 0b6dc8d7..de0c2b5b 100644 --- a/examples/sc_similarity_examples/sim_query_atlas.py +++ b/examples/sc_similarity_examples/sim_query_atlas.py @@ -1,3 +1,5 @@ +import argparse + import pandas as pd atlas_datasets = [ @@ -80,7 +82,10 @@ def get_ans(query_dataset, method): ans_all = {} -methods = ["cta_actinn", "cta_scdeepsort"] +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("--methods", default=["cta_actinn", "cta_scdeepsort"], nargs="+") +args = parser.parse_args() +methods = args.methods if __name__ == "__main__": for query_dataset in query_datasets: ans = [] From 864747e81747814c66772e47d21401b24852231d Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 13 Nov 2024 02:58:42 -0500 Subject: [PATCH 098/203] minor change --- examples/sc_similarity_examples/sim_query_atlas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/sc_similarity_examples/sim_query_atlas.py index de0c2b5b..de783f20 100644 --- a/examples/sc_similarity_examples/sim_query_atlas.py +++ b/examples/sc_similarity_examples/sim_query_atlas.py @@ -1,4 +1,5 @@ import argparse +from pathlib import Path import pandas as pd @@ -21,6 +22,8 @@ from dance.utils import try_import +file_root = str(Path(__file__).resolve().parent.parent) + def find_unique_matching_row(df, config_col, input_dict_list): """在 DataFrame 中查找指定列中与输入字典列表匹配的唯一一行。 @@ -70,7 +73,7 @@ def is_match(config_str): def get_ans(query_dataset, method): - data = pd.read_csv(f"/home/zyxing/dance/examples/tuning/{method}/{query_dataset}/results/atlas/best_test_acc.csv") + data = pd.read_csv(f"{file_root}/tuning/{method}/{query_dataset}/results/atlas/best_test_acc.csv") sweep_url = get_sweep_url(data) _, _, sweep_id = spilt_web(sweep_url) sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") From b2db5fa5076d01e75db826b24350e96804945129 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 14 Nov 2024 16:00:23 +0800 Subject: [PATCH 099/203] minor --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c58e5232..11365116 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ h5py==3.11.0 igraph==0.10.8 leidenalg==0.10.1 louvain==0.8.1 -mudata==0.2.3 +mudata==0.2.4 networkx==3.3; python_version >= "3.10" networkx==3.2.1; python_version < "3.10" numba==0.59.0 From 4faaa46c1e1bd4e0f10ce90c349511b76e7b3540 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 15 Nov 2024 16:48:21 +0800 Subject: [PATCH 100/203] minor --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 11365116..1d28d37e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ h5py==3.11.0 igraph==0.10.8 leidenalg==0.10.1 louvain==0.8.1 -mudata==0.2.4 +mudata==0.3.1 networkx==3.3; python_version >= "3.10" networkx==3.2.1; python_version < "3.10" numba==0.59.0 From 53d9ca9488bb64adaf613a4bc3128b871f44a028 Mon Sep 17 00:00:00 2001 From: 
xzy Date: Fri, 15 Nov 2024 17:18:46 +0800 Subject: [PATCH 101/203] minor --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 1d28d37e..a627a45d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ threadpoolctl==3.5.0 tifffile==2024.2.12 torchnmf==0.3.5 tqdm==4.66.2 +anndata==0.10.8 From 1ff7f672196fd33b6f691b3211132ae22a6a7491 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 15 Nov 2024 17:29:42 +0800 Subject: [PATCH 102/203] minor --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a627a45d..b22a69a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ h5py==3.11.0 igraph==0.10.8 leidenalg==0.10.1 louvain==0.8.1 -mudata==0.3.1 +mudata==0.2.3 networkx==3.3; python_version >= "3.10" networkx==3.2.1; python_version < "3.10" numba==0.59.0 From 7f204df1af556b84c596a07b3581318790f09b2e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 15 Nov 2024 20:38:23 +0800 Subject: [PATCH 103/203] see https://github.com/PyCQA/docformatter/pull/287 --- .pre-commit-config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c580386f..194f88fa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,13 +32,13 @@ repos: name: Sort imports args: [--line-width, "120", --profile, black] - - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 - hooks: - - id: docformatter - name: Format docstring - additional_dependencies: [tomli] - args: [--config, pyproject.toml] + # - repo: https://github.com/PyCQA/docformatter + # rev: v1.7.5 + # hooks: + # - id: docformatter + # name: Format docstring + # additional_dependencies: [tomli] + # args: [--config, pyproject.toml] - repo: https://github.com/executablebooks/mdformat rev: 0.7.17 From 5c1d43a6d81ec5884b039286cdc97cc88cddb40c Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 20 Nov 2024 16:54:40 +0800 Subject: [PATCH 104/203] sc_similarity --- dance/sc_similarity/anndata_similarity.py | 357 ++++++++++++++++++++++ dance/sc_similarity/download_data.py | 9 + 2 files changed, 366 insertions(+) create mode 100644 dance/sc_similarity/anndata_similarity.py create mode 100644 dance/sc_similarity/download_data.py diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py new file mode 100644 index 00000000..0287dee8 --- /dev/null +++ b/dance/sc_similarity/anndata_similarity.py @@ -0,0 +1,357 @@ +# anndata_similarity.py +# TODO translate notes +import re +import warnings +from typing import Callable, Dict, List, Optional + +import anndata +import anndata as ad +import numpy as np +import pandas as pd +import scanpy as sc +import yaml +from omegaconf import OmegaConf +from scipy.spatial.distance import jaccard +from scipy.stats import pearsonr, wasserstein_distance +from sklearn.metrics.pairwise import cosine_similarity + +# Suppress scipy warnings for constant input in Pearson correlation +warnings.filterwarnings("ignore", message="An input array is constant") + + +class AnnDataSimilarity: + + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, cell_col: str, + ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, + adata2_name: Optional[str] = None, + methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): + """Initialize the AnnDataSimilarity object and perform data preprocessing.""" + 
self.adata1 = adata1.copy() + self.adata2 = adata2.copy() + self.origin_adata1 = adata1.copy() + self.origin_adata2 = adata2.copy() + self.cell_col = cell_col + self.preprocess() + self.results = {} + self.results_score = {} + self.ground_truth_conf_path = ground_truth_conf_path + self.adata1_name = adata1_name + self.adata2_name = adata2_name + self.methods = methods + self.tissue = tissue + + def filter_gene(self): + sc.pp.highly_variable_genes(self.adata1, n_top_genes=2000, flavor='seurat_v3') + sc.pp.highly_variable_genes(self.adata2, n_top_genes=2000, flavor='seurat_v3') + + common_hvg = self.adata1.var_names[self.adata1.var['highly_variable']].intersection( + self.adata2.var_names[self.adata2.var['highly_variable']]) + + self.adata1 = self.adata1[:, common_hvg].copy() + self.adata2 = self.adata2[:, common_hvg].copy() + self.common_genes = common_hvg + + def preprocess(self): + self.filter_gene() + """Preprocess the data, including log normalization and normalization to probability distribution.""" + self.adata1.obs[self.cell_col] = self.adata1.obs[self.cell_col].astype(str) + self.adata2.obs[self.cell_col] = self.adata2.obs[self.cell_col].astype(str) + self.avg_expr1 = self._compute_average_expression(self.adata1) + self.avg_expr2 = self._compute_average_expression(self.adata2) + self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) + self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) + + def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: + """Calculate the average gene expression for each cell type""" + return adata.to_df().groupby(adata.obs[self.cell_col]).mean() + + def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: + """Normalize the gene expression matrix to a probability distribution (expression sums to 1 for each cell type)""" + return df.div(df.sum(axis=1), axis=0).fillna(0) + + def cosine_sim(self) -> pd.DataFrame: + """Computes the cosine similarity between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" + sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) + return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) + + def pearson_corr(self) -> pd.DataFrame: + """Computes the Pearson correlation coefficient between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + corr_matrix.at[ct1, ct2] = corr + + return corr_matrix.astype(float) + + def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: + """Computes the Jaccard similarity between two datasets. Uses a binary representation of gene expression based on a specified threshold. 
Returns a data frame with rows and columns of cell types in adata1 and adata2 respectively.""" + # Binarized expression matrix + binary_expr1 = (self.avg_expr1 > threshold).astype(int) + binary_expr2 = (self.avg_expr2 > threshold).astype(int) + + celltypes1 = binary_expr1.index + celltypes2 = binary_expr2.index + sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) + sim_matrix.at[ct1, ct2] = sim + + return sim_matrix.astype(float) + + def js_distance(self) -> pd.DataFrame: + """Computes the Jensen-Shannon divergence between two datasets. The expression data must first be normalized to a probability distribution. Returns a data frame with rows and columns containing the cell types of adata1 and adata2, respectively.""" + # def jsd(p, q): + # """ + # 计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。 + # """ + # p = p + 1e-12 + # q = q + 1e-12 + # m = 0.5 * (p + q) + # return 0.5 * (entropy(p, m) + entropy(q, m)) + + # from scipy.stats import entropy + + celltypes1 = self.prob_expr1.index + celltypes2 = self.prob_expr2.index + js_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, + self.prob_expr2.loc[ct2].values) + js_matrix.at[ct1, ct2] = jsd_value + + return js_matrix.astype(float) + + def _jensen_shannon_divergence(self, p, q) -> float: + """Compute the Jensen-Shannon divergence of two probability distributions p and q.""" + from scipy.spatial.distance import jensenshannon + return jensenshannon(p, q) + + def common_genes_num(self): + return len(self.common_genes) + + def otdd(): + """Compute the OTDD between two data sets.""" + raise NotImplementedError("OTDD!") + + def data_company(): + raise NotImplementedError("data company") + + def wasserstein_dist(self) -> pd.DataFrame: + """Compute the Wasserstein distance between two datasets. 
Return a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + wasserstein_matrix.at[ct1, ct2] = wd + + return wasserstein_matrix.astype(float) + + def get_dataset_meta_sim(self): + # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] + con_cols = [ + "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", + "n_counts_var", "var_n_counts_mean", "var_n_counts_var" + ] + dis_cols = ['assay', 'tissue'] + + def get_discrete_sim(col_list1, col_list2): + set1 = set(col_list1) + set2 = set(col_list2) + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + return intersection / union + + def get_con_sim(con_data_1, con_data_2): + return abs(con_data_1 - con_data_2) / max(con_data_1, con_data_2) + + def get_dataset_info(data: ad.AnnData): + con_sim = {} + con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) + con_sim["nnz_var"] = np.var(data.obs["nnz"]) + nnz_values = data.X[data.X.nonzero()] + con_sim["nnz_counts_mean"] = np.mean(nnz_values) + con_sim["nnz_counts_var"] = np.var(nnz_values) + con_sim["n_measured_vars"] = np.mean(data.obs["n_measured_vars"]) + con_sim["cell_num"] = len(data.obs) + con_sim["gene_num"] = len(data.var) + con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) + con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) + con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) + con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) + data.uns["con_sim"] = con_sim + return data + + data_1 = self.adata1.copy() + data_2 = self.adata2.copy() + data_1 = get_dataset_info(data_1) + data_2 = get_dataset_info(data_2) + ans = {} + obs_1 = data_1.obs + obs_2 = data_2.obs + con_sim_1 = data_1.uns["con_sim"] + con_sim_2 = data_2.uns["con_sim"] + for dis_col in dis_cols: + ans[f"{dis_col}_sim"] = get_discrete_sim(obs_1[dis_col].values, obs_2[dis_col].values) + for con_col in con_cols: + ans[f"{con_col}_sim"] = get_con_sim(con_sim_1[con_col], con_sim_2[con_col]) + return np.mean(list(ans.values())) + + def get_ground_truth(self): + assert self.ground_truth_conf_path is not None + assert self.adata1_name is not None + assert self.adata2_name is not None + ground_truth_conf = pd.read_excel(self.ground_truth_conf_path, sheet_name=self.tissue, index_col=0) + + def get_targets(dataset_truth: str): + dataset_truth = OmegaConf.create(fix_yaml_string(dataset_truth)) + targets = [] + for item in dataset_truth: + targets.append(item["target"]) + return targets + + sim_targets = [] + for method in self.methods: + query_dataset_truth = ground_truth_conf.loc[self.adata1_name, f"{method}_method"] + atlas_dataset_truth = ground_truth_conf.loc[self.adata2_name, f"{method}_method"] + query_targets = get_targets(query_dataset_truth) + atlas_targets = get_targets(atlas_dataset_truth) + assert len(query_targets) == len(atlas_targets) + sim_targets.append((sum(a == b for a, b in zip(query_targets, atlas_targets)), len(query_targets))) + sim_targets.append((sum(x for x, y in sim_targets), sum(y for x, y in sim_targets))) + return sim_targets + + def compute_similarity( + self, methods: List[str] = [ + 'cosine', 
'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" + ] + ) -> Dict[str, pd.DataFrame]: + """Computes the specified similarity measure. Parameters: + + methods: List of similarity measures to be computed. Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' + Returns: + Dictionary containing the similarity matrices + + """ + results = {} + for method in methods: + if method == 'cosine': + results['cosine'] = self.cosine_sim() + elif method == 'pearson': + results['pearson'] = self.pearson_corr() + elif method == 'jaccard': + results['jaccard'] = self.jaccard_sim() + elif method == 'js_distance': + results['js_distance'] = self.js_distance() + elif method == 'wasserstein': + results['wasserstein'] = self.wasserstein_dist() + elif method == "common_genes_num": + results["common_genes_num"] = self.common_genes_num() + elif method == "otdd": + results['otdd'] = self.otdd() + elif method == "ground_truth": + results["ground_truth"] = self.get_ground_truth() + elif method == "metadata_sim": + results["metadata_sim"] = self.get_dataset_meta_sim() + else: + raise ValueError(f"Unsupported similarity method: {method}") + return results + + def get_similarity_matrix( + self, methods: List[str] = [ + 'cosine', 'pearson', 'jaccard', 'js_distance', "common_genes_num", "ground_truth", "metadata_sim" + ] + ) -> Dict[str, pd.DataFrame]: + """Same as compute_similarity, keeping method name consistency.""" + self.results = self.compute_similarity(methods) + return self.results + + def get_max_similarity_A_to_B(self): + if self.results is None: + raise ValueError(f"need results!") + else: + self.results_score = {} + for key in self.results: + if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: + self.results_score[key] = self._get_max_similarity(self.results[key]) + else: + self.results_score[key] = self.results[key] + return self.results_score + + def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + """Maximum matching average similarity score.""" + matched_values = [ + similarity_matrix.loc[label, + label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() + for label in similarity_matrix.index + ] # need to ask + overall_similarity = np.mean(matched_values) + return overall_similarity + + +def extract_type_target_params(item_text): + lines = item_text.strip().split('\n') + item_dict = {} + params_dict = {} + current_param_key = None + in_params = False + for line in lines: + stripped_line = line.strip() + if stripped_line.startswith('- type:'): + item_dict['type'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('target:'): + item_dict['target'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('params:'): + params_content = stripped_line.split(':', 1)[1].strip() + if params_content == '{}': + params_dict = {} + in_params = False + else: + params_dict = {} + in_params = True + elif in_params: + if re.match(r'^\w+:$', stripped_line): + current_param_key = stripped_line[:-1].strip() + params_dict[current_param_key] = {} + elif re.match(r'^- ', stripped_line): + list_item = stripped_line[2:].strip() + if current_param_key: + if not isinstance(params_dict[current_param_key], list): + params_dict[current_param_key] = [] + params_dict[current_param_key].append(list_item) + elif ':' in stripped_line: + key, value = map(str.strip, stripped_line.split(':', 1)) + if current_param_key and isinstance(params_dict.get(current_param_key, None), dict): + 
params_dict[current_param_key][key] = yaml.safe_load(value) + else: + params_dict[key] = yaml.safe_load(value) + item_dict['params'] = params_dict + return item_dict + + +def fix_yaml_string(original_str): + #It will be deleted + yaml_str = original_str.replace('\\n', '\n').strip() + items = re.split(r'(?=-\s*type:)', yaml_str) + config_list = [] + for item in items: + if not item.strip(): + continue + if not item.strip().startswith('- type:'): + print(item) + print("警告: 某个项未以 '- type:' 开头,跳过此项.") + continue + item_dict = extract_type_target_params(item) + config_list.append(item_dict) + fixed_yaml = yaml.dump(config_list, sort_keys=False) + return fixed_yaml diff --git a/dance/sc_similarity/download_data.py b/dance/sc_similarity/download_data.py new file mode 100644 index 00000000..83c705fd --- /dev/null +++ b/dance/sc_similarity/download_data.py @@ -0,0 +1,9 @@ +from dance.datasets.singlemodality import CellTypeAnnotationDataset + + +def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = "h5ad", train_dataset=[], + test_dataset=[], valid_dataset=[], data_dir="../temp_data"): + data = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, + valid_dataset=valid_dataset, data_dir=data_dir, tissue=tissue, species=species, + filetype=filetype).load_data() + return data.data From 60d5f21ed9fa30427423139a01c9170fbcded19b Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 21 Nov 2024 21:23:34 +0800 Subject: [PATCH 105/203] minor --- dance/metadata/scdeepsort.csv | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index d8fa18ce..cc41e000 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -88,3 +88,23 @@ human,Blood,1324,train,,,train_human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_d human,Blood,10000,train,,,train_human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad,https://www.dropbox.com/scl/fi/8wq8eaod0xuvgwhsjoapa/human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad?rlkey=b6u3b7335l7baricjlgbwthb3&st=cw7mjmx5&dl=1 human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/7gszhapz281uah6ytc615/human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad?rlkey=28ywz595f00ppjqwg6054tdqj&st=7bxft78n&dl=1 human,Blood,10000,train,,,train_human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad,https://www.dropbox.com/scl/fi/kgay0bhk4er6qjx96okrz/human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad?rlkey=m5ax0vhx3vh7ylo4pc74tx9ky&st=sbhonz18&dl=1 +human,Heart,10000,train,,,train_human_Heart1c739a3e-c3f5-49d5-98e0-73975e751201_data.h5ad,https://www.dropbox.com/scl/fi/ymqp6iki4vu2ur9jw56fp/human_Heart1c739a3e-c3f5-49d5-98e0-73975e751201_data.h5ad?rlkey=o6cladni7kfsfigvni5hk01n7&st=zcxajdyt&dl=1 +human,Heart,10000,train,,,train_human_Heart4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad,https://www.dropbox.com/scl/fi/ml4fvp9v9nufot215m9tl/human_Heart4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad?rlkey=nzfgqj7lb943tvxt2mgtg6o8m&st=4jn8mo66&dl=1 +human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/w2oe3csvt50riz9afrr1g/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=auhxwg79r0xkixttj9f33ho9t&st=0584ksnu&dl=1 
+human,Heart,2839,train,,,train_human_Heart83b5e943-a1d5-4164-b3f2-f7a37f01b524_data.h5ad,https://www.dropbox.com/scl/fi/o8vppxwumzh58her9ifxu/human_Heart83b5e943-a1d5-4164-b3f2-f7a37f01b524_data.h5ad?rlkey=iyr5mrhxeisrj5n6pfs2ogdwd&st=vlkpmn65&dl=1 +human,Heart,1089,train,,,train_human_Heart97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad,https://www.dropbox.com/scl/fi/x84r3fzkgfcnn8nsnvmyz/human_Heart97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad?rlkey=vzv8xknnfawjfzwv635p5cj8l&st=3jjdb83q&dl=1 +human,Heart,10000,train,,,train_human_Heart572f3f3e-d3e4-4d13-8e2b-88215e508481_data.h5ad,https://www.dropbox.com/scl/fi/rjv6hv7f14exz1lbh2n9m/human_Heart572f3f3e-d3e4-4d13-8e2b-88215e508481_data.h5ad?rlkey=fszfqswjpe9aspm30x1js05sb&st=35ij2wom&dl=1 +human,Heart,3961,train,,,train_human_Heart1009f384-b12d-448e-ba9f-1b7d2ecfbb4e_data.h5ad,https://www.dropbox.com/scl/fi/a2mooa1bszno3xh4o6sew/human_Heart1009f384-b12d-448e-ba9f-1b7d2ecfbb4e_data.h5ad?rlkey=k9x23whyta1z0g9org4zdrsmm&st=k1z51tdw&dl=1 +human,Heart,10000,train,,,train_human_Heart1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d_data.h5ad,https://www.dropbox.com/scl/fi/86e8yrjglgo714p02jozm/human_Heart1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d_data.h5ad?rlkey=ff7y5amtc8hovujepsqtvyki6&st=dtsskar4&dl=1 +human,Heart,10000,train,,,train_human_Heart1252c5fb-945f-42d6-b1a8-8a3bd864384b_data.h5ad,https://www.dropbox.com/scl/fi/ulhxkmys6q8cxiucrsc7q/human_Heart1252c5fb-945f-42d6-b1a8-8a3bd864384b_data.h5ad?rlkey=z8lbxq7hevqh6vjozlu9pmfzu&st=wml2yxfx&dl=1 +human,Heart,10000,train,,,train_human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad,https://www.dropbox.com/scl/fi/tus9c1k07fh12ggq7s8v4/human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad?rlkey=l3qetjv2iedp9be2yi7x7060k&st=sasyq4k0&dl=1 +human,Heart,10000,train,,,train_human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/ud57pptc2vucgfzbumhi0/human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=foifku28abrsjmn7fyeo3nzlq&st=2o88drh7&dl=1 +human,Heart,2576,train,,,train_human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad,https://www.dropbox.com/scl/fi/3yyr268to8d5z2v1ybph5/human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad?rlkey=tavfun6ztdeuz7gw93r80u3f2&st=lypmrhf9&dl=1 +human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/4t7rimzdbu614qicv7ac1/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=eih47wtd6txwtchaobnwb13qu&st=tgh2kahb&dl=1 +human,Heart,10000,train,,,train_human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad,https://www.dropbox.com/scl/fi/932xsn8bkxeppy4xnuy81/human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad?rlkey=yhgjmo6vcbwcfbfpsnnbxc4np&st=p5asp1ad&dl=1 +human,Heart,10000,train,,,train_human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad,https://www.dropbox.com/scl/fi/okl5stq86etyx6zhfaex9/human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad?rlkey=eg9u06l5k0ycw9t4hf9rlz9rh&st=r3henhn9&dl=1 +human,Heart,10000,train,,,train_human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad,https://www.dropbox.com/scl/fi/qv88ufwdqb89boz79t4w8/human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad?rlkey=s5u32dymnqzpczp67f2vopdxp&st=8u7airt8&dl=1 +human,Heart,10000,train,,,train_human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_data.h5ad,https://www.dropbox.com/scl/fi/xddt54xc5bujkkvhql5nv/human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_data.h5ad?rlkey=fnlrns7gupn548zfa6g6s11iw&st=42qx454t&dl=1 
+human,Heart,3799,train,,,train_human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad,https://www.dropbox.com/scl/fi/82mmm9drh008r4faduuvd/human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad?rlkey=u4iwlbm6e6laaht8ey7950fqh&st=v8u040to&dl=1 +human,Heart,10000,train,,,train_human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad,https://www.dropbox.com/scl/fi/uiufoyquxt0hea9dgf0l2/human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad?rlkey=7mu54uqnqrbqtuyfxnqdexmc5&st=2k45pou1&dl=1 +human,Heart,10000,train,,,train_human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/kkgg4abyidaylwxkut2bx/human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=77wmk5knn1oenbffwxo9a6zkk&st=bxemfdf6&dl=1 From a44f62517c75652dcc8a08a9d4f0f816072b5476 Mon Sep 17 00:00:00 2001 From: xzy Date: Sun, 24 Nov 2024 16:53:16 +0800 Subject: [PATCH 106/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index cc41e000..dc22dd98 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -100,7 +100,7 @@ human,Heart,10000,train,,,train_human_Heart1252c5fb-945f-42d6-b1a8-8a3bd864384b_ human,Heart,10000,train,,,train_human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad,https://www.dropbox.com/scl/fi/tus9c1k07fh12ggq7s8v4/human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad?rlkey=l3qetjv2iedp9be2yi7x7060k&st=sasyq4k0&dl=1 human,Heart,10000,train,,,train_human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/ud57pptc2vucgfzbumhi0/human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=foifku28abrsjmn7fyeo3nzlq&st=2o88drh7&dl=1 human,Heart,2576,train,,,train_human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad,https://www.dropbox.com/scl/fi/3yyr268to8d5z2v1ybph5/human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad?rlkey=tavfun6ztdeuz7gw93r80u3f2&st=lypmrhf9&dl=1 -human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/4t7rimzdbu614qicv7ac1/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=eih47wtd6txwtchaobnwb13qu&st=tgh2kahb&dl=1 +human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad(Heart)_data.h5ad,https://www.dropbox.com/scl/fi/4t7rimzdbu614qicv7ac1/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad-Heart-_data.h5ad?rlkey=eih47wtd6txwtchaobnwb13qu&st=9cq6ttbe&dl=1 human,Heart,10000,train,,,train_human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad,https://www.dropbox.com/scl/fi/932xsn8bkxeppy4xnuy81/human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad?rlkey=yhgjmo6vcbwcfbfpsnnbxc4np&st=p5asp1ad&dl=1 human,Heart,10000,train,,,train_human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad,https://www.dropbox.com/scl/fi/okl5stq86etyx6zhfaex9/human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad?rlkey=eg9u06l5k0ycw9t4hf9rlz9rh&st=r3henhn9&dl=1 human,Heart,10000,train,,,train_human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad,https://www.dropbox.com/scl/fi/qv88ufwdqb89boz79t4w8/human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad?rlkey=s5u32dymnqzpczp67f2vopdxp&st=8u7airt8&dl=1 From 136995b62ba246ea3403a90920cb4ef0da3cf88d Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 26 Nov 2024 09:26:34 +0800 Subject: [PATCH 107/203] update main --- .../tuning/joint_embedding_scmvae/main.py | 177 ++++++++++-------- 1 file changed, 102 
insertions(+), 75 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index c52f2108..b3070b40 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -71,81 +72,107 @@ def parameter_setting(): def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - - (x_train, y_train), _ = data.get_train_data(return_type="torch") - (x_test, y_test), labels = data.get_test_data(return_type="torch") - - lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) - lib_mean1 = torch.from_numpy(lib_mean1) - lib_var1 = torch.from_numpy(lib_var1) - lib_mean2 = torch.from_numpy(lib_mean2) - lib_var2 = torch.from_numpy(lib_var2) - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - train_size = len(data.get_split_idx("train")) - train = data_utils.TensorDataset(x_train, lib_mean1[:train_size], lib_var1[:train_size], lib_mean2[:train_size], - lib_var2[:train_size], y_train) - - valid = data_utils.TensorDataset(x_test, lib_mean1[train_size:], lib_var1[train_size:], lib_mean2[train_size:], - lib_var2[train_size:], y_test) - - total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - - # embeds = model.predict(x_test, y_test).cpu().numpy() - score = model.score(x_test, y_test, labels) - score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = 
wandb_config["run_kwargs"]
+        else:
+            wandb.log({"skip": 1})
+            wandb.finish()
+            return
+    try:
+        dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding")
+        data = dataset.load_data()
+
+        le = preprocessing.LabelEncoder()
+        labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"])
+        data.mod["mod1"].obsm["labels"] = labels
+
+        # Prepare preprocessing pipeline and apply it to data
+        kwargs = {tune_mode: dict(wandb_config)}
+        preprocessing_pipeline = pipeline_planer.generate(**kwargs)
+        print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}")
+        preprocessing_pipeline(data)
+        train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names]
+        train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name]
+        test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx)))
+
+        # train_size=data.mod["meta1"].shape[0]
+        # test_size=data.mod["mod1"].shape[0]-train_size
+        data.set_split_idx("train",train_idx)
+        data.set_split_idx("test",test_idx)
+        (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch")
+        (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch")
+        # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels)
+        lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()]))
+        lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()]))
+        lib_mean1 = torch.from_numpy(lib_mean1)
+        lib_var1 = torch.from_numpy(lib_var1)
+        lib_mean2 = torch.from_numpy(lib_mean2)
+        lib_var2 = torch.from_numpy(lib_var2)
+
+        Nfeature1 = x_train.shape[1]
+        Nfeature2 = y_train.shape[1]
+        # train_size = len(data.get_split_idx("train"))
+        # train_size=x_train.shape[0]
+        train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx],
+                                         lib_var2[train_idx], y_train)
+
+        valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx],
+                                         lib_var2[test_idx], y_test)
+
+        total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test]))
+
+        total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False)
+
+        x_test = torch.cat([x_train, x_test])
+        y_test = torch.cat([y_train, y_test])
+        labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))  # This is probably problematic; most likely it is the dimensionality-reduction step
+        model = scMVAE(
+            encoder_1=[Nfeature1, 1024, 128, 128],
+            hidden_1=128,
+            Z_DIMS=22,
+            decoder_share=[22, 128, 256],
+            share_hidden=128,
+            decoder_1=[128, 128, 1024],
+            hidden_2=1024,
+            encoder_l=[Nfeature1, 128],
+            hidden3=128,
+            encoder_2=[Nfeature2, 1024, 128, 128],
+            hidden_4=128,
+            encoder_l1=[Nfeature2, 128],
+            hidden3_1=128,
+            decoder_2=[128, 128, 1024],
+            hidden_5=1024,
+            drop_rate=0.1,
+            log_variational=True,
+            Type="ZINB",
+            device=device,
+            n_centroids=22,
+            penality="GMM",
+            model=1,
+        )
+        model.to(device)
+        model.init_gmm_params(total_loader)
+        model.fit(args, train, valid, args.final_rate, args.scale_factor, device)
+
+        # embeds = model.predict(x_test, y_test).cpu().numpy()
+        score = model.score(x_test, y_test, labels)
+        # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems"))
+        score["ARI"] = score["dance_ari"]
+        del score["dance_ari"]
+        wandb.log(score)
+        wandb.finish()
+    finally:
+ locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From 82f16f41f804385fbae594255412d77d658c7596 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Nov 2024 01:31:59 +0000 Subject: [PATCH 108/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index b349fd42..8b091eb7 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -1,6 +1,5 @@ import argparse import gc -import gc import os import pprint import sys @@ -111,7 +110,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - + # train_size=data.mod["meta1"].shape[0] # test_size=data.mod["mod1"].shape[0]-train_size data.set_split_idx("train",train_idx) From e5e2edfe902fef99e10e61b477482de9277ce739 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 29 Nov 2024 10:59:10 +0800 Subject: [PATCH 109/203] minor --- .../result_analysis/get_important_pattern.py | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 examples/result_analysis/get_important_pattern.py diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py new file mode 100644 index 00000000..b03731cc --- /dev/null +++ b/examples/result_analysis/get_important_pattern.py @@ -0,0 +1,212 @@ +# metric_name = "test_acc" +# ascending = False +import argparse +import itertools +import pathlib +from collections import Counter +from itertools import combinations +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scikit_posthocs as sp +import seaborn as sns +from mlxtend.frequent_patterns import apriori +from mlxtend.preprocessing import TransactionEncoder +from networkx import parse_adjlist +from scipy import cluster, stats + + +#TODO need to sync all files or get sweep,not file +#asceding need to think +#负向的pattern,换一下顺序就可以吧 +def get_important_pattern(test_accs, ascending, vis=True, alpha=0.05, title=""): + + if vis: + fig = plt.figure(figsize=(12, 4)) + sns.boxplot(data=test_accs) + plt.xticks(list(range(len(test_accs))), [f"{i}" for i in range(len(test_accs))]) + plt.title(title) + plt.show() + _, p_value = stats.kruskal(*test_accs) + if p_value < alpha: + medians = [np.median(group) for group in test_accs] + data = test_accs + p_values_matrix = sp.posthoc_dunn(a=data, p_adjust="bonferroni") + sorted_indices = np.argsort(np.argsort([-x for x in medians] if ascending else medians)) + ranks = { + index: { + "rank": rank, + "before": None, + "after": [], + "real_rank": rank + } + for index, rank in enumerate(sorted_indices) + } + for (rank1, rank2) in combinations(range(max(sorted_indices) + 1), 2): + for idx1 in 
[index for index, value in ranks.items() if value["rank"] == rank1]: + for idx2 in [index for index, value in ranks.items() if value["rank"] == rank2]: + if p_values_matrix.iloc[idx1, idx2] > alpha: + if ranks[idx2]["before"] is None: + ranks[idx1]["after"].append(idx2) + ranks[idx2]["before"] = idx1 + + def change_real_rank(rank_item, real_rank): + rank_item["real_rank"] = real_rank + for idx in rank_item["after"]: + change_real_rank(ranks[idx], real_rank) + + for rank_item in ranks.values(): + if rank_item["before"] is None: + for idx in rank_item["after"]: + change_real_rank(ranks[idx], rank_item["real_rank"]) + return [v["real_rank"] for k, v in ranks.items()] + else: + if vis: + print("No significant differences found between the groups.") + return [] + + +def replace_nan_in_2d(lst): + return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] + + +def are_all_elements_same_direct(list_2d): + first_element = None + for sublist in list_2d: + for element in sublist: + if first_element is None: + first_element = element + elif element != first_element: + return False + return True if first_element is not None else True + + +def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1): + threshold = int(len(step2_data) * threshold_per) + step2_data.loc[:, metric_name] = step2_data.loc[:, metric_name].astype(float) + df_sorted = step2_data.sort_values(metric_name, ascending=ascending) + top_10_percent = df_sorted.head(threshold) + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) + transactions = top_10_percent[columns].values.tolist() + te = TransactionEncoder() + te_ary = te.fit(transactions).transform(transactions) + df = pd.DataFrame(te_ary, columns=te.columns_) + frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) + # print(frequent_itemsets) + # rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) + return [tuple(a) for a in frequent_itemsets["itemsets"]] + + +# def get_significant_top_n_zscore(data, n=3, threshold=1.0, ascending=False): +# if not data: +# return [] + +# n = max(1, n) + +# mean = np.mean(data) +# std = np.std(data) + +# if std == 0: +# return sorted(data, reverse=not ascending)[:n] + +# z_scores = [(x, (x - mean) / std) for x in data] + +# significant_values = [x for x, z in z_scores if z > threshold] + +# significant_values_sorted = sorted(significant_values, reverse=not ascending) + +# if len(significant_values_sorted) < n: +# remaining = sorted(data, reverse=not ascending)[:n - len(significant_values_sorted)] +# significant_values_sorted.extend(remaining) + +# return significant_values_sorted[:n] + + +def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): + ans_all = [] + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) + test_accs = [] + test_acc_names = [] + for r in range(1, len(columns)): #全流程的单独处理 + for com in itertools.combinations(columns, r): + test_accs_arrays = [] + groups = step2_data.groupby(by=list(com)) + if len(groups) == 1: + continue + for g in groups: + test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) + test_accs += [i[metric_name] for i in test_accs_arrays] + test_acc_names += [i["name"] for i in test_accs_arrays] + # if are_all_elements_same_direct(test_accs): + # continue + test_accs = replace_nan_in_2d(test_accs) + final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title=" ".join(list(com)), vis=vis) + if 
len(final_ranks) > 0: #TODO maybe need to think ascending + max_rank = max(final_ranks) + max_rank_count = final_ranks.count(max_rank) + if max_rank_count < len(final_ranks) / 2: + for index, (test_acc_name, rank) in enumerate(zip(test_acc_names, final_ranks)): + if rank == max_rank: + if vis: + print(f"index={index},name={test_acc_name},rank={rank}") + ans_all.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) + return ans_all + + +def summary_pattern(data_path, metric_name, ascending, alpha=0.05, vis=False): + step2_origin_data = pd.read_csv(data_path) + step2_data = step2_origin_data.dropna() + com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) + apr_ans = get_frequent_itemsets(step2_data, metric_name, ascending) + return list(set(com_ans) & set(apr_ans)) + + +# def list_files(directory,file_name="best_test_acc.csv",save_path="summary_file"): +# ans=[] +# path = Path(directory) +# for file_path in path.rglob('*'): +# if file_path.is_file(): +# if file_path.name==file_name: +# algorithm,dataset=file_path.relative_to(directory).parts[:2] +# ans.append({"algorithm":algorithm,"dataset":dataset,"summary_pattern":summary_pattern(file_path)}) +# pd.DataFrame(ans).to_csv(save_path) +def list_files(directories, metric_name, ascending, file_name="best_test_acc.csv", alpha=0.05, vis=False): + ans_all = [] + for directory in directories: + path = Path(directory) + for file_path in path.rglob('*'): + if file_path.is_file(): + if file_path.name == file_name: + print(file_path) + dataset = file_path.parent + method = file_path.parent.parent + ans = summary_pattern(file_path, metric_name, ascending, alpha=alpha, vis=vis) + with open(Path(file_path.parent.resolve(), "pipeline_summary_pattern.txt"), 'w') as f: + f.write(str(ans)) + ans_all.append({"dataset": dataset, "method": method, "ans": ans}) + return ans_all + + +if __name__ == "__main__": + directories = [] + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("task", default="cluster") + parser.add_argument("metric_name", default="acc") + parser.add_argument("ascending", default=False) + args = parser.parse_args() + task = args.task + metric_name = args.metric_name + ascending = args.ascending + file_root = Path(__file__).resolve().parent.parent / "tuning" + for path in file_root.iterdir(): + if path.is_dir(): + if str(path.name).startswith(task): + directories.append(path) + ans_all = list_files(directories, metric_name, ascending) + df = pd.DataFrame(ans_all) + pivot_df = df.pivot(index="dataset", columns="method", values="ans") + pivot_df.to_csv(f"{task}_pattern.csv") + + # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_actinn/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) From d5390e122de8e351dea33347249c7934548acb0e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 29 Nov 2024 10:59:24 +0800 Subject: [PATCH 110/203] minor --- .../get_important_pattern_sweep.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 examples/result_analysis/get_important_pattern_sweep.py diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py new file mode 100644 index 00000000..7010505d --- /dev/null +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -0,0 +1,98 @@ +import json +import sys +from pathlib import Path +from turtle import pos + +import pandas as pd +import requests +from get_important_pattern 
import get_com_all, get_frequent_itemsets
+
+sys.path.append("..")
+from get_result_web import spilt_web
+
+from dance.pipeline import flatten_dict
+from dance.utils import try_import
+
+entity = "xzy11632"
+project = "dance-dev"
+tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"]
+mertic_names = ["test_acc", "acc", "MRE", "ARI", "MSE"]
+ascendings = [False, False, True, False, True]
+file_root = Path(__file__).resolve().parent
+prefix = f'https://wandb.ai/{entity}/{project}'
+runs_sum = 0
+wandb = try_import("wandb")
+positive = True
+
+
+def get_additional_sweep(sweep_id):
+    # if sweep has prior runs
+    # every run get command, get additional sweep id
+    # or last run command
+    sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}")
+    # last run command
+    run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None)
+    additional_sweep_ids = [sweep_id]
+    if run is None:  # check the number of summary entries; note aznph5wt, the counts may be inconsistent
+        return additional_sweep_ids
+    run_id = run.id
+    web_abs = requests.get(f"https://api.wandb.ai/files/{run.entity}/{run.project}/{run_id}/wandb-metadata.json")
+    args = dict(web_abs.json())["args"]
+    for i in range(len(args)):
+        if args[i] == '--additional_sweep_ids':
+            if i + 1 < len(args):
+                additional_sweep_ids += get_additional_sweep(args[i + 1])
+    return additional_sweep_ids
+
+
+def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=False):
+    # try:
+    step2_data = step2_origin_data.dropna()
+    com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha)
+    apr_ans = get_frequent_itemsets(step2_data, metric_name, ascending)
+    return list(set(com_ans) & set(apr_ans))
+    # except Exception as e:
+    #     print(e)
+    #     return str(e)
+
+
+if __name__ == "__main__":
+    ans_all = []
+    for i, task in enumerate(tasks):
+        data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str)
+        data = data.ffill().set_index(['Methods'])
+        for row_idx in range(data.shape[0]):
+            for col_idx in range(data.shape[1]):
+                method = data.index[row_idx]
+                dataset = data.columns[col_idx]
+                value = data.iloc[row_idx, col_idx]
+                step_name = data.iloc[row_idx]["Unnamed: 1"]
+                if method != "SVM" or dataset != "Dataset 1: GSE67835 Brain":
+                    continue
+                if isinstance(value, str) and value.startswith(prefix) and (
+                        str(step_name).lower() == "step2" or str(step_name).lower() == "step 2"):  #TODO add step3
+                    sweep_url = value
+                else:
+                    continue
+                _, _, sweep_id = spilt_web(sweep_url)
+                sweep_ids = get_additional_sweep(sweep_id)
+                summary_data = []
+                for sweep_id in sweep_ids:
+                    sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}")
+                    for run in sweep.runs:
+                        result = dict(run.summary._json_dict).copy()
+                        result.update(run.config)
+                        result.update({"id": run.id})
+                        summary_data.append(flatten_dict(result))  # get result and config
+                ans = pd.DataFrame(summary_data).set_index(["id"])
+                ans.sort_index(axis=1, inplace=True)
+                print(dataset)
+                print(method)
+                ans_all.append({
+                    "task": task,
+                    "dataset": dataset,
+                    "method": method,
+                    "pattern": summary_pattern(ans, mertic_names[i], ascendings[i])
+                })
+    with open(f"positive:{positive}_pattern.json", "w") as f:
+        json.dump(ans_all, f, indent=2)
From 4a7c5be485e4b424868c1e98f2f3789ec69001d0 Mon Sep 17 00:00:00 2001
From: xzy
Date: Mon, 2 Dec 2024 19:53:16 +0800
Subject: [PATCH 111/203] update scdeepsort
---
 dance/metadata/scdeepsort.csv | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/dance/metadata/scdeepsort.csv
b/dance/metadata/scdeepsort.csv index dc22dd98..c993b2d7 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -108,3 +108,7 @@ human,Heart,10000,train,,,train_human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_ human,Heart,3799,train,,,train_human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad,https://www.dropbox.com/scl/fi/82mmm9drh008r4faduuvd/human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad?rlkey=u4iwlbm6e6laaht8ey7950fqh&st=v8u040to&dl=1 human,Heart,10000,train,,,train_human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad,https://www.dropbox.com/scl/fi/uiufoyquxt0hea9dgf0l2/human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad?rlkey=7mu54uqnqrbqtuyfxnqdexmc5&st=2k45pou1&dl=1 human,Heart,10000,train,,,train_human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/kkgg4abyidaylwxkut2bx/human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=77wmk5knn1oenbffwxo9a6zkk&st=bxemfdf6&dl=1 +human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/nxd9awxshy5y4ps6ctsqr/human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=g7je5vl6kaany80tw8jwsioxr&st=5zq8uimv&dl=1 +human,Heart,10000,train,,,train_human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad,https://www.dropbox.com/scl/fi/uxd1yhbc98ayx4f0ap4h8/human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad?rlkey=aa7f5rylwxzxyc7ue0efvc1kb&st=0rt0d1lr&dl=1 +human,Heart,10000,train,,,train_human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/gb98ycqcu24ewonalqjpo/human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=u8d2ovqkrhptigqmucm7qijde&st=dxhu640r&dl=1 +human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad,https://www.dropbox.com/scl/fi/mz5jq1ig0zp36w7xmv1pe/human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad?rlkey=g6gjodg99l1ba0swr678h8kb0&st=hugh3bpt&dl=1 From 06f85e7767c41baac7127c6b7e1752173276bb43 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 2 Dec 2024 19:58:16 +0800 Subject: [PATCH 112/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index c993b2d7..dd0bcedd 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -108,7 +108,7 @@ human,Heart,10000,train,,,train_human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_ human,Heart,3799,train,,,train_human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad,https://www.dropbox.com/scl/fi/82mmm9drh008r4faduuvd/human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad?rlkey=u4iwlbm6e6laaht8ey7950fqh&st=v8u040to&dl=1 human,Heart,10000,train,,,train_human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad,https://www.dropbox.com/scl/fi/uiufoyquxt0hea9dgf0l2/human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad?rlkey=7mu54uqnqrbqtuyfxnqdexmc5&st=2k45pou1&dl=1 human,Heart,10000,train,,,train_human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/kkgg4abyidaylwxkut2bx/human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=77wmk5knn1oenbffwxo9a6zkk&st=bxemfdf6&dl=1 -human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/nxd9awxshy5y4ps6ctsqr/human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=g7je5vl6kaany80tw8jwsioxr&st=5zq8uimv&dl=1 
+human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)_data.h5ad,https://www.dropbox.com/scl/fi/nxd9awxshy5y4ps6ctsqr/human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Heart-_data.h5ad?rlkey=g7je5vl6kaany80tw8jwsioxr&st=2n66z9yc&dl=1 human,Heart,10000,train,,,train_human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad,https://www.dropbox.com/scl/fi/uxd1yhbc98ayx4f0ap4h8/human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad?rlkey=aa7f5rylwxzxyc7ue0efvc1kb&st=0rt0d1lr&dl=1 human,Heart,10000,train,,,train_human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/gb98ycqcu24ewonalqjpo/human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=u8d2ovqkrhptigqmucm7qijde&st=dxhu640r&dl=1 human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad,https://www.dropbox.com/scl/fi/mz5jq1ig0zp36w7xmv1pe/human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad?rlkey=g6gjodg99l1ba0swr678h8kb0&st=hugh3bpt&dl=1 From adfbfd2bed2bc288ab8f1d3245a546a8b40a2e51 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 4 Dec 2024 09:40:05 +0800 Subject: [PATCH 113/203] minor --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 9fb85885..07ae5afa 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From 95b28835ce129bda7c15493651789adc1b24df4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Dec 2024 01:41:18 +0000 Subject: [PATCH 114/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index d9a7188c..8b091eb7 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From 13a42e04e8950d020f3a54b207018bd2c9566ffc Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 4 Dec 2024 23:37:45 +0800 Subject: [PATCH 115/203] minor --- dance/metadata/scdeepsort.csv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index dd0bcedd..747fff41 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -112,3 +112,6 @@ human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf( 
human,Heart,10000,train,,,train_human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad,https://www.dropbox.com/scl/fi/uxd1yhbc98ayx4f0ap4h8/human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad?rlkey=aa7f5rylwxzxyc7ue0efvc1kb&st=0rt0d1lr&dl=1 human,Heart,10000,train,,,train_human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/gb98ycqcu24ewonalqjpo/human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=u8d2ovqkrhptigqmucm7qijde&st=dxhu640r&dl=1 human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad,https://www.dropbox.com/scl/fi/mz5jq1ig0zp36w7xmv1pe/human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad?rlkey=g6gjodg99l1ba0swr678h8kb0&st=hugh3bpt&dl=1 +human,Brain,10000,train,,,train_human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad,https://www.dropbox.com/scl/fi/hu35a45qk3b4m2ep17poa/human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad?rlkey=zbb9otp1tu6kvlkxsc7absfih&st=p9nwvnjo&dl=1 +human,Brain,10000,train,,,train_human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Brain)_data.h5ad,https://www.dropbox.com/scl/fi/f3l26pxi1d6bmtzzvugxr/human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Brain-_data.h5ad?rlkey=qn3hip8pk0be91uyrfvdf78uh&st=rsr9vb0j&dl=1 +human,Brain,10000,train,,,train_human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad,https://www.dropbox.com/scl/fi/8hw6yprqqc3tk7k2g03nj/human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad?rlkey=urxy3hu4omlt2824l2epdsl70&st=dqmdsxac&dl=1 From c0cd31ba9566bc296221bb840ff16ca17085174a Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 5 Dec 2024 16:45:48 +0800 Subject: [PATCH 116/203] minor --- examples/dataset_server.json | 176 +++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 80 deletions(-) diff --git a/examples/dataset_server.json b/examples/dataset_server.json index 10a279da..f0404928 100644 --- a/examples/dataset_server.json +++ b/examples/dataset_server.json @@ -1,92 +1,108 @@ { "cta_actinn": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + 
"5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ] , "cta_celltypist": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ], "cta_scdeepsort": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + 
"65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ] , "cta_singlecellnet": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ] } From da1bfe7b6fe0a578086036d3bf39d21ea2d4e499 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 5 Dec 2024 19:45:06 +0800 Subject: [PATCH 117/203] minor --- examples/dataset_server.json | 216 ++++++++++++++++++----------------- 1 file changed, 110 insertions(+), 106 deletions(-) diff --git a/examples/dataset_server.json b/examples/dataset_server.json index f0404928..615fd761 100644 --- a/examples/dataset_server.json +++ b/examples/dataset_server.json @@ -1,108 +1,112 @@ { - "cta_actinn": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - 
"83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - "97a17473-e2b1-4f31-a544-44a60773e2dd" - ] - , - "cta_celltypist": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - "83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - "97a17473-e2b1-4f31-a544-44a60773e2dd" - ], - "cta_scdeepsort": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - "83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - "97a17473-e2b1-4f31-a544-44a60773e2dd" - ] - , - "cta_singlecellnet": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - "83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - 
"97a17473-e2b1-4f31-a544-44a60773e2dd" - ] + + "heart":{ + "cta_actinn": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ] + , + "cta_celltypist": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ], + "cta_scdeepsort": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ] + , + "cta_singlecellnet": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + 
"1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ] + } + } From 8a27fc6a37341f4f180caa617d58c37016b307fd Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 5 Dec 2024 19:48:09 +0800 Subject: [PATCH 118/203] minor --- examples/get_result_web.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index ee5d4158..02da6824 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -13,13 +13,6 @@ # get yaml of best method -wandb = try_import("wandb") -entity = "xzy11632" -project = "dance-dev" -file_root = str(Path(__file__).resolve().parent) -with open(f"{file_root}/dataset_server.json") as f: - collect_datasets = json.load(f) -file_root = "./tuning" def check_identical_strings(string_list): @@ -153,8 +146,10 @@ def check_exist(file_path): return False -def write_ans(): +def write_ans(tissue): ans = [] + collect_datasets=all_datasets[tissue] + for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: file_path = f"{file_root}/{method_folder}/{dataset_id}" @@ -180,8 +175,17 @@ def write_ans(): }) # with open('temp_ans.json', 'w') as f: # json.dump(ans, f,indent=4) - pd.DataFrame(ans).to_csv("temp_ans.csv") + pd.DataFrame(ans).to_csv(f"{tissue}_ans.csv") if __name__ == "__main__": - write_ans() + wandb = try_import("wandb") + entity = "xzy11632" + project = "dance-dev" + file_root = str(Path(__file__).resolve().parent) + with open(f"{file_root}/dataset_server.json") as f: + all_datasets = json.load(f) + file_root = "./tuning" + tissues=["heart"] + for tissue in tissues: + write_ans(tissue) From 43d489491a5551f8d71fe3a2eedf5b4756ea124c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:48:45 +0000 Subject: [PATCH 119/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/dataset_server.json | 4 ++-- examples/get_result_web.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/dataset_server.json b/examples/dataset_server.json index 615fd761..8d07f604 100644 --- a/examples/dataset_server.json +++ b/examples/dataset_server.json @@ -1,5 +1,5 @@ { - + "heart":{ "cta_actinn": [ "572f3f3e-d3e4-4d13-8e2b-88215e508481", @@ -108,5 +108,5 @@ "97a17473-e2b1-4f31-a544-44a60773e2dd" ] } - + } diff --git a/examples/get_result_web.py b/examples/get_result_web.py index 02da6824..4e5f9da2 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -14,7 +14,6 @@ # get yaml of best method - def check_identical_strings(string_list): if not string_list: raise ValueError("列表为空") @@ -148,8 +147,8 @@ def check_exist(file_path): def write_ans(tissue): ans = [] - collect_datasets=all_datasets[tissue] - + collect_datasets 
= all_datasets[tissue] + for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: file_path = f"{file_root}/{method_folder}/{dataset_id}" @@ -186,6 +185,6 @@ def write_ans(tissue): with open(f"{file_root}/dataset_server.json") as f: all_datasets = json.load(f) file_root = "./tuning" - tissues=["heart"] + tissues = ["heart"] for tissue in tissues: write_ans(tissue) From def68bec15fa5de893b947787ca6eba3654a02f6 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Dec 2024 21:25:50 +0800 Subject: [PATCH 120/203] minor --- dance/metadata/scdeepsort.csv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 747fff41..6d2f6317 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -115,3 +115,6 @@ human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_ human,Brain,10000,train,,,train_human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad,https://www.dropbox.com/scl/fi/hu35a45qk3b4m2ep17poa/human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad?rlkey=zbb9otp1tu6kvlkxsc7absfih&st=p9nwvnjo&dl=1 human,Brain,10000,train,,,train_human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Brain)_data.h5ad,https://www.dropbox.com/scl/fi/f3l26pxi1d6bmtzzvugxr/human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Brain-_data.h5ad?rlkey=qn3hip8pk0be91uyrfvdf78uh&st=rsr9vb0j&dl=1 human,Brain,10000,train,,,train_human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad,https://www.dropbox.com/scl/fi/8hw6yprqqc3tk7k2g03nj/human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad?rlkey=urxy3hu4omlt2824l2epdsl70&st=dqmdsxac&dl=1 +human,Brain,10000,train,,,train_human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad,https://www.dropbox.com/scl/fi/hth8if16ri6y64yfabe66/human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad?rlkey=eiyf474thqawso5ltjn0pnwko&st=54qn18ky&dl=1 +human,Brain,10000,train,,,train_human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad,https://www.dropbox.com/scl/fi/a3te6jw55cv4mujujq4ir/human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad?rlkey=scj4lcv1y00yh7bk18bztl0fz&st=yarlckox&dl=1 +human,Brain,10000,train,,,train_human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad,https://www.dropbox.com/scl/fi/b9uqvab7lderxnrd8c0e9/human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad?rlkey=heobzdft1rcl1ttn0wvi9ra5y&st=q0xewgnf&dl=1 From ad8ecb78751c83e1bdd62c93cd5e760b975adc9c Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Dec 2024 21:27:07 +0800 Subject: [PATCH 121/203] minor --- examples/get_result_web.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index ee5d4158..c1ca4536 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -65,9 +65,12 @@ def spilt_web(url: str): if match: entity = match.group(1) project = match.group(2) - sweep_id = match.group(3) - - return entity, project, sweep_id + pattern = r'/sweeps/([^/?]+)' # 正则表达式模式 + match = re.search(pattern, url) + if match: + sweep_id = match.group(1) + return entity, project, sweep_id + return None else: print(url) print("No match found") From 8fc8f622b7339a55d1fb482a0f5a7fb2c6d22a38 Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Dec 2024 20:21:57 +0800 Subject: [PATCH 122/203] update data --- dance/metadata/scdeepsort.csv | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 6d2f6317..6d5eb666 
100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -118,3 +118,15 @@ human,Brain,10000,train,,,train_human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_ human,Brain,10000,train,,,train_human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad,https://www.dropbox.com/scl/fi/hth8if16ri6y64yfabe66/human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad?rlkey=eiyf474thqawso5ltjn0pnwko&st=54qn18ky&dl=1 human,Brain,10000,train,,,train_human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad,https://www.dropbox.com/scl/fi/a3te6jw55cv4mujujq4ir/human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad?rlkey=scj4lcv1y00yh7bk18bztl0fz&st=yarlckox&dl=1 human,Brain,10000,train,,,train_human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad,https://www.dropbox.com/scl/fi/b9uqvab7lderxnrd8c0e9/human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad?rlkey=heobzdft1rcl1ttn0wvi9ra5y&st=q0xewgnf&dl=1 +human,Brain,10000,train,,,train_human_Brain07760522-707a-4a1c-8891-dbd1226d6b27_data.h5ad,https://www.dropbox.com/scl/fi/fdg72e3smhtbnen0njt1p/human_Brain07760522-707a-4a1c-8891-dbd1226d6b27_data.h5ad?rlkey=fcus37nuvqvur8qwqhxdayo9r&st=2e00lvxu&dl=1 +human,Brain,10000,train,,,train_human_Brain146216e1-ec30-4fee-a1fb-25defe801e2d_data.h5ad,https://www.dropbox.com/scl/fi/m55bqo60paasb7hbuhvex/human_Brain146216e1-ec30-4fee-a1fb-25defe801e2d_data.h5ad?rlkey=a1zz2xe6lsk20g5nie9c2ft6d&st=100jl011&dl=1 +human,Brain,10000,train,,,train_human_Brain22658f4f-9268-41ad-8828-cc53f4baa9fa_data.h5ad,https://www.dropbox.com/scl/fi/t5vnklfwn8eu2ht5ph4fd/human_Brain22658f4f-9268-41ad-8828-cc53f4baa9fa_data.h5ad?rlkey=latl59ppqnr4bif69xxt92sss&st=yhhuzvrf&dl=1 +human,Brain,10000,train,,,train_human_Brain421e5f54-5de7-425f-b399-34ead0651ce1_data.h5ad,https://www.dropbox.com/scl/fi/wmysnkpfbdjteospe0jjx/human_Brain421e5f54-5de7-425f-b399-34ead0651ce1_data.h5ad?rlkey=xdx6g6cisyi2gfohckjhud46g&st=zxu0jhp7&dl=1 +human,Brain,10000,train,,,train_human_Brain595c9010-99ec-462d-b6a1-2b2fe5407871_data.h5ad,https://www.dropbox.com/scl/fi/598qwju5349fc8z1zuskh/human_Brain595c9010-99ec-462d-b6a1-2b2fe5407871_data.h5ad?rlkey=8848cndanq1bun59yvggbjvnx&st=nnasrgud&dl=1 +human,Brain,10000,train,,,train_human_Brain700aed19-c16e-4ba8-9191-07da098a8626_data.h5ad,https://www.dropbox.com/scl/fi/h4lcjdl04mpgtkuthgc94/human_Brain700aed19-c16e-4ba8-9191-07da098a8626_data.h5ad?rlkey=hi0fk5rxia7shu6m4twky2tay&st=uw6trm86&dl=1 +human,Brain,10000,train,,,train_human_Brain70e4f35b-c98c-45a1-9aa9-2053b07315dd_data.h5ad,https://www.dropbox.com/scl/fi/ubqn57ate29dfuvapx4qi/human_Brain70e4f35b-c98c-45a1-9aa9-2053b07315dd_data.h5ad?rlkey=5p5wnukkq5lb6c8v0s5itcv2j&st=730w6nsz&dl=1 +human,Brain,10000,train,,,train_human_Brain72822932-10f6-466f-baf3-a2c1d89364bc_data.h5ad,https://www.dropbox.com/scl/fi/llmit9a77dkby69p5eki5/human_Brain72822932-10f6-466f-baf3-a2c1d89364bc_data.h5ad?rlkey=o294joyf2yrwi403qll905bak&st=m95dzpp5&dl=1 +human,Brain,10000,train,,,train_human_Brain9372df2d-13d6-4fac-980b-919a5b7eb483_data.h5ad,https://www.dropbox.com/scl/fi/kkgq2ry1nyzqp3aq9mag6/human_Brain9372df2d-13d6-4fac-980b-919a5b7eb483_data.h5ad?rlkey=yek0em9f7cal5bhq4h4xtjnxc&st=x3isnepq&dl=1 +human,Brain,10000,train,,,train_human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad,https://www.dropbox.com/scl/fi/q6vtn80sf8jpkvte55zpm/human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad?rlkey=6ca2gf47w5r53rw5y5h5b53e8&st=3wkp779m&dl=1 
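The appended rows reuse the comma-separated layout of the surrounding entries: organism, tissue, cell count, split, two blank fields, the on-disk h5ad filename, and a direct-download Dropbox URL (dl=1). A minimal loading sketch follows; the column names are assumptions for illustration only, since these hunks do not show a CSV header.

    # Hypothetical loader for one metadata row; column names are assumed.
    import anndata as ad
    import pandas as pd
    import requests

    columns = ["species", "tissue", "n_cells", "split", "extra_1", "extra_2", "filename", "url"]
    meta = pd.read_csv("dance/metadata/scdeepsort.csv", header=None, names=columns)
    row = meta.iloc[-1]  # e.g. the last appended Brain entry

    with open(row["filename"], "wb") as f:
        f.write(requests.get(row["url"], allow_redirects=True).content)
    adata = ad.read_h5ad(row["filename"])  # expression matrix plus cell/gene metadata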
+human,Brain,10000,train,,,train_human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad,https://www.dropbox.com/scl/fi/nl14mhuuwlq9zmjntot7g/human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad?rlkey=o158zcyq781w4rj71pfsw8yf1&st=ds6kabvk&dl=1 +human,Brain,10000,train,,,train_human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad,https://www.dropbox.com/scl/fi/pyphmfixfeyu2wzyr216p/human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad?rlkey=sd6su8cmlc4g4k3hixihg1zo9&st=99diymvk&dl=1 From 02893d5883bf206318bf1a27a32bf87d7bad4244 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 9 Dec 2024 21:24:51 +0800 Subject: [PATCH 123/203] minor --- dance/sc_similarity/anndata_similarity.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 0287dee8..1d421ecf 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import scanpy as sc +import scipy import yaml from omegaconf import OmegaConf from scipy.spatial.distance import jaccard @@ -187,6 +188,13 @@ def get_dataset_info(data: ad.AnnData): con_sim["gene_num"] = len(data.var) con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) + if "n_counts" not in data.var.columns: + if scipy.sparse.issparse(data.X): + gene_counts = np.array(data.X.sum(axis=0)).flatten() + else: + gene_counts = data.X.sum(axis=0) + data.var["n_counts"]=gene_counts + data.var["n_counts"]=data.var["n_counts"].astype(float) con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) data.uns["con_sim"] = con_sim From 75a20b84b323a8b80cd028167c715ea196c25cd3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:27:03 +0000 Subject: [PATCH 124/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/sc_similarity/anndata_similarity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 1d421ecf..fca863db 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -193,8 +193,8 @@ def get_dataset_info(data: ad.AnnData): gene_counts = np.array(data.X.sum(axis=0)).flatten() else: gene_counts = data.X.sum(axis=0) - data.var["n_counts"]=gene_counts - data.var["n_counts"]=data.var["n_counts"].astype(float) + data.var["n_counts"] = gene_counts + data.var["n_counts"] = data.var["n_counts"].astype(float) con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) data.uns["con_sim"] = con_sim From 75c64297ad1d93f3b85a5177a65f5cb313197c6a Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 9 Dec 2024 22:01:24 +0800 Subject: [PATCH 125/203] minor --- dance/sc_similarity/anndata_similarity.py | 436 +++++++++++++++------- 1 file changed, 295 insertions(+), 141 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 0287dee8..0ff697b5 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -7,13 +7,15 @@ import anndata import anndata as ad import numpy as np +import ot import pandas as pd import scanpy as sc import yaml from 
omegaconf import OmegaConf -from scipy.spatial.distance import jaccard -from scipy.stats import pearsonr, wasserstein_distance -from sklearn.metrics.pairwise import cosine_similarity +from scipy.linalg import sqrtm +from scipy.spatial import cKDTree +from scipy.spatial.distance import cdist, directed_hausdorff, jaccard, jensenshannon +from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel # Suppress scipy warnings for constant input in Pearson correlation warnings.filterwarnings("ignore", message="An input array is constant") @@ -21,141 +23,252 @@ class AnnDataSimilarity: - def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, cell_col: str, + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, + init_random_state: Optional[int] = None, n_runs: int = 10, ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, adata2_name: Optional[str] = None, methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): """Initialize the AnnDataSimilarity object and perform data preprocessing.""" - self.adata1 = adata1.copy() - self.adata2 = adata2.copy() self.origin_adata1 = adata1.copy() self.origin_adata2 = adata2.copy() - self.cell_col = cell_col + self.sample_size = sample_size + self.init_random_state = init_random_state self.preprocess() self.results = {} - self.results_score = {} self.ground_truth_conf_path = ground_truth_conf_path self.adata1_name = adata1_name self.adata2_name = adata2_name self.methods = methods self.tissue = tissue + self.n_runs = n_runs - def filter_gene(self): - sc.pp.highly_variable_genes(self.adata1, n_top_genes=2000, flavor='seurat_v3') - sc.pp.highly_variable_genes(self.adata2, n_top_genes=2000, flavor='seurat_v3') + def filter_gene(self, n_top_genes=3000): + sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') + sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') - common_hvg = self.adata1.var_names[self.adata1.var['highly_variable']].intersection( - self.adata2.var_names[self.adata2.var['highly_variable']]) + common_hvg = self.origin_adata1.var_names[self.origin_adata1.var['highly_variable']].intersection( + self.origin_adata2.var_names[self.origin_adata2.var['highly_variable']]) - self.adata1 = self.adata1[:, common_hvg].copy() - self.adata2 = self.adata2[:, common_hvg].copy() + self.origin_adata1 = self.origin_adata1[:, common_hvg].copy() + self.origin_adata2 = self.origin_adata2[:, common_hvg].copy() self.common_genes = common_hvg def preprocess(self): - self.filter_gene() """Preprocess the data, including log normalization and normalization to probability distribution.""" - self.adata1.obs[self.cell_col] = self.adata1.obs[self.cell_col].astype(str) - self.adata2.obs[self.cell_col] = self.adata2.obs[self.cell_col].astype(str) - self.avg_expr1 = self._compute_average_expression(self.adata1) - self.avg_expr2 = self._compute_average_expression(self.adata2) - self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) - self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) - - def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: - """Calculate the average gene expression for each cell type""" - return adata.to_df().groupby(adata.obs[self.cell_col]).mean() - - def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: - """Normalize the gene expression matrix to a probability distribution (expression 
sums to 1 for each cell type)""" - return df.div(df.sum(axis=1), axis=0).fillna(0) - - def cosine_sim(self) -> pd.DataFrame: - """Computes the cosine similarity between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" - sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) - return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) - - def pearson_corr(self) -> pd.DataFrame: - """Computes the Pearson correlation coefficient between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - corr_matrix.at[ct1, ct2] = corr - - return corr_matrix.astype(float) - - def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: - """Computes the Jaccard similarity between two datasets. Uses a binary representation of gene expression based on a specified threshold. Returns a data frame with rows and columns of cell types in adata1 and adata2 respectively.""" - # Binarized expression matrix - binary_expr1 = (self.avg_expr1 > threshold).astype(int) - binary_expr2 = (self.avg_expr2 > threshold).astype(int) - - celltypes1 = binary_expr1.index - celltypes2 = binary_expr2.index - sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) - sim_matrix.at[ct1, ct2] = sim - - return sim_matrix.astype(float) - - def js_distance(self) -> pd.DataFrame: - """Computes the Jensen-Shannon divergence between two datasets. The expression data must first be normalized to a probability distribution. Returns a data frame with rows and columns containing the cell types of adata1 and adata2, respectively.""" - # def jsd(p, q): - # """ - # 计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。 - # """ - # p = p + 1e-12 - # q = q + 1e-12 - # m = 0.5 * (p + q) - # return 0.5 * (entropy(p, m) + entropy(q, m)) - - # from scipy.stats import entropy - - celltypes1 = self.prob_expr1.index - celltypes2 = self.prob_expr2.index - js_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, - self.prob_expr2.loc[ct2].values) - js_matrix.at[ct1, ct2] = jsd_value - - return js_matrix.astype(float) - - def _jensen_shannon_divergence(self, p, q) -> float: - """Compute the Jensen-Shannon divergence of two probability distributions p and q.""" - from scipy.spatial.distance import jensenshannon - return jensenshannon(p, q) + self.filter_gene() + + def sample_cells(self, random_state): + """ + Randomly sample cells from each dataset if sample_size is specified. 
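In isolation, the per-dataset subsampling amounts to the sketch below; it is a simplified stand-in for sample_cells that uses numpy's Generator API instead of the global np.random.seed, and adata stands for either of the two AnnData objects.

    # Standalone subsampling sketch (illustrative only).
    import numpy as np

    def subsample(adata, sample_size: int, random_state: int):
        rng = np.random.default_rng(random_state)
        if adata.n_obs <= sample_size:
            return adata.copy()
        idx = rng.choice(adata.n_obs, size=sample_size, replace=False)
        return adata[idx, :].copy()

Because get_similarity_matrix_A2B repeats the sampling with a different seed on every run and averages the resulting scores, the reported similarities are less sensitive to which cells happen to be drawn.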
+ """ + np.random.seed(random_state) + if self.sample_size is None: + self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think + if self.adata1.n_obs > self.sample_size: + indices1 = np.random.choice(self.adata1.n_obs, size=self.sample_size, replace=False) + self.sampled_adata1 = self.adata1[indices1, :].copy() + else: + self.sampled_adata1 = self.adata1.copy() + if self.adata2.n_obs > self.sample_size: + indices2 = np.random.choice(self.adata2.n_obs, size=self.sample_size, replace=False) + self.sampled_adata2 = self.adata2[indices2, :].copy() + else: + self.sampled_adata2 = self.adata2.copy() + + def normalize_data(self): # I am not sure + """ + Normalize the data by total counts per cell and log-transform. + """ + sc.pp.normalize_total(self.adata1, target_sum=1e4) + sc.pp.log1p(self.adata1) + sc.pp.normalize_total(self.adata2, target_sum=1e4) + sc.pp.log1p(self.adata2) + + def set_prob_data(self, sampled=False): + # Normalize the data to probability distributions + if sampled: + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + else: + prob_adata1 = self.adata1.X / self.adata1.X.sum(axis=1) + prob_adata2 = self.adata2.X / self.adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero + self.X = np.nan_to_num(prob_adata1).toarray() + self.Y = np.nan_to_num(prob_adata2).toarray() + + def cosine_sim_sampled(self) -> pd.DataFrame: + """ + Computes the average cosine similarity between all pairs of cells from the two datasets. + """ + # Compute cosine similarity matrix + sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) + # Return the average similarity + return sim_matrix.mean() + + def pearson_corr_sampled(self) -> pd.DataFrame: + """ + Computes the average Pearson correlation coefficient between all pairs of cells from the two datasets. + """ + # Compute Pearson correlation matrix + corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), + self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, + self.sampled_adata1.n_obs:] + # Return the average correlation + return np.nanmean(corr_matrix) + + def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: + """ + Computes the average Jaccard similarity between all pairs of binarized cells from the two datasets. + """ + # Binarize the data + binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) + binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) + # Compute Jaccard distance matrix + distance_matrix = cdist(binary_adata1.A, binary_adata2.A, metric='jaccard') + # Convert to similarity and compute the average + similarity_matrix = 1 - distance_matrix + return similarity_matrix.mean() + + def js_divergence_sampled(self) -> float: + """ + Computes the average Jensen-Shannon divergence between all pairs of cells from the two datasets. 
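For a single pair of normalized expression vectors, the per-pair computation reduces to the toy example below; note that scipy's jensenshannon returns the Jensen-Shannon distance (the square root of the divergence), and 1 minus that value is what gets averaged as the similarity.

    # Toy illustration with made-up probability vectors.
    import numpy as np
    from scipy.spatial.distance import jensenshannon

    p = np.array([0.7, 0.2, 0.1])
    q = np.array([0.5, 0.3, 0.2])
    similarity = 1.0 - jensenshannon(p, q)  # close to 1 when the profiles are similar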
+ """ + # Normalize the data to probability distributions + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero + prob_adata1 = np.nan_to_num(prob_adata1).toarray() + prob_adata2 = np.nan_to_num(prob_adata2).toarray() + + # Define a function to compute JS divergence for a pair of probability vectors + def jsd(p, q): + return jensenshannon(p, q) + + # Compute JS divergence matrix + jsd_vectorized = np.vectorize(jsd, signature='(n),(n)->()') + divergence_matrix = np.zeros((prob_adata1.shape[0], prob_adata2.shape[0])) + for i in range(prob_adata1.shape[0]): + divergence_matrix[i, :] = jsd_vectorized( + np.repeat(prob_adata1[i, :], prob_adata2.shape[0], axis=0).reshape(-1, prob_adata1.shape[1]), + prob_adata2) + + # Convert divergence to similarity and compute the average + similarity_matrix = 1 - divergence_matrix + return np.nanmean(similarity_matrix) + + def compute_mmd(self) -> float: + X = self.X + Y = self.Y + kernel = "rbf" + gamma = 1.0 + if kernel == 'rbf': + K_X = np.exp(-gamma * cdist(X, X, 'sqeuclidean')) + K_Y = np.exp(-gamma * cdist(Y, Y, 'sqeuclidean')) + K_XY = np.exp(-gamma * cdist(X, Y, 'sqeuclidean')) + elif kernel == 'linear': + K_X = np.dot(X, X.T) + K_Y = np.dot(Y, Y.T) + K_XY = np.dot(X, Y.T) + else: + raise ValueError("Unsupported kernel type") + + m = X.shape[0] + n = Y.shape[0] + + sum_X = (np.sum(K_X) - np.sum(np.diag(K_X))) / (m * (m - 1)) + sum_Y = (np.sum(K_Y) - np.sum(np.diag(K_Y))) / (n * (n - 1)) + sum_XY = np.sum(K_XY) / (m * n) + + mmd_squared = sum_X + sum_Y - 2 * sum_XY + mmd = np.sqrt(max(mmd_squared, 0)) + return 1 / (1 + mmd) def common_genes_num(self): return len(self.common_genes) - def otdd(): + def otdd(self): """Compute the OTDD between two data sets.""" raise NotImplementedError("OTDD!") - def data_company(): + def data_company(self): raise NotImplementedError("data company") - def wasserstein_dist(self) -> pd.DataFrame: - """Compute the Wasserstein distance between two datasets. Return a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - wasserstein_matrix.at[ct1, ct2] = wd - - return wasserstein_matrix.astype(float) + def wasserstein_dist(self) -> float: + """ + Computes the average Wasserstein distance between all pairs of cells from the two datasets. 
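A toy illustration of the optimal-transport step is given below; random matrices stand in for the sampled expression profiles, uniform weights are placed on the cells, and the distance is mapped to a similarity as 1 / (1 + distance), in line with the other distance-based scores in this class.

    # Illustrative only: ot.emd2 returns the exact earth mover's distance.
    import numpy as np
    import ot

    X = np.random.rand(50, 20)             # 50 "cells" x 20 "genes"
    Y = np.random.rand(60, 20)
    a = np.ones(X.shape[0]) / X.shape[0]    # uniform weights over cells
    b = np.ones(Y.shape[0]) / Y.shape[0]
    M = ot.dist(X, Y, metric="euclidean")   # ground-cost matrix
    similarity = 1.0 / (1.0 + ot.emd2(a, b, M))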
+ """ + X = self.X + Y = self.Y + a = np.ones((X.shape[0], )) / X.shape[0] + b = np.ones((Y.shape[0], )) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + wasserstein_dist = ot.emd2(a, b, M) + return 1 / 1 + wasserstein_dist + + def get_Hausdorff(self): + X = self.X + Y = self.Y + forward = directed_hausdorff(X, Y)[0] + backward = directed_hausdorff(X, Y)[0] + hausdorff_distance = max(forward, backward) + normalized_hausdorff = hausdorff_distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_hausdorff + return similarity + + def chamfer_distance(self): + X = self.X + Y = self.Y + tree_A = cKDTree(X) + tree_B = cKDTree(Y) + + distances_A_to_B, _ = tree_A.query(Y) + distances_B_to_A, _ = tree_B.query(X) + + chamfer_A_to_B = np.mean(distances_A_to_B) + chamfer_B_to_A = np.mean(distances_B_to_A) + distance = chamfer_A_to_B + chamfer_B_to_A + normalized_chamfer = distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_chamfer + return similarity + + def energy_distance_metric(self): + X = self.X + Y = self.Y + XX = cdist(X, X, 'euclidean') + YY = cdist(Y, Y, 'euclidean') + XY = cdist(X, Y, 'euclidean') + distance = 2 * np.mean(XY) - np.mean(XX) - np.mean(YY) + return 1 / (1 + distance) + + def get_sinkhorn2(self): + X = self.X + Y = self.Y + a = np.ones(X.shape[0]) / X.shape[0] + b = np.ones(Y.shape[0]) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + reg = 0.1 + sinkhorn_dist = ot.sinkhorn2(a, b, M, reg) + return 1 / (1 + sinkhorn_dist) + + def bures_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + sqrt_C1 = sqrtm(C1) + product = sqrt_C1 @ C2 @ sqrt_C1 + sqrt_product = sqrtm(product) + trace = np.trace(C1) + np.trace(C2) - 2 * np.trace(sqrt_product) + return 1 / (1 + np.sqrt(max(trace, 0))) + + def spectral_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + eig_A = np.linalg.eigvalsh(C1) + eig_B = np.linalg.eigvalsh(C2) + return 1 / (1 + np.linalg.norm(eig_A - eig_B)) def get_dataset_meta_sim(self): # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] @@ -177,7 +290,7 @@ def get_con_sim(con_data_1, con_data_2): def get_dataset_info(data: ad.AnnData): con_sim = {} - con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) + con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) #sample 10000之后这里是应该更新的 con_sim["nnz_var"] = np.var(data.obs["nnz"]) nnz_values = data.X[data.X.nonzero()] con_sim["nnz_counts_mean"] = np.mean(nnz_values) @@ -232,10 +345,10 @@ def get_targets(dataset_truth: str): return sim_targets def compute_similarity( - self, methods: List[str] = [ + self, random_state: int, methods: List[str] = [ 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" ] - ) -> Dict[str, pd.DataFrame]: + ) -> Dict[str, float]: """Computes the specified similarity measure. Parameters: methods: List of similarity measures to be computed. 
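A typical end-to-end call looks like the hypothetical sketch below, assuming adata1 and adata2 are AnnData objects with raw counts; the dataset names, tissue, and metric subset are examples only.

    # Hypothetical usage of the class defined in this patch.
    sim = AnnDataSimilarity(
        adata1, adata2,
        sample_size=1000,            # cells drawn from each dataset per run
        init_random_state=0,         # run i uses seed init_random_state + i
        n_runs=10,                   # all scores except ground_truth are averaged over runs
        adata1_name="query", adata2_name="reference", tissue="heart",
    )
    scores = sim.get_similarity_matrix_A2B(
        methods=["wasserstein", "Hausdorff", "chamfer", "mmd", "common_genes_num"])

The methods argument accepts any of the metric names handled in the dispatch below.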
Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' @@ -243,60 +356,101 @@ def compute_similarity( Dictionary containing the similarity matrices """ + self.adata1 = self.origin_adata1.copy() + self.adata2 = self.origin_adata2.copy() + self.normalize_data() + self.sample_cells(random_state) + self.set_prob_data() + results = {} for method in methods: + print(method) if method == 'cosine': - results['cosine'] = self.cosine_sim() + results['cosine'] = self.cosine_sim_sampled() elif method == 'pearson': - results['pearson'] = self.pearson_corr() + results['pearson'] = self.pearson_corr_sampled() elif method == 'jaccard': - results['jaccard'] = self.jaccard_sim() + results['jaccard'] = self.jaccard_sim_sampled() elif method == 'js_distance': - results['js_distance'] = self.js_distance() + results['js_distance'] = self.js_divergence_sampled() elif method == 'wasserstein': results['wasserstein'] = self.wasserstein_dist() elif method == "common_genes_num": results["common_genes_num"] = self.common_genes_num() + elif method == "Hausdorff": + results["Hausdorff"] = self.get_Hausdorff() + elif method == "chamfer": + results["chamfer"] = self.chamfer_distance() + elif method == "energy": + results["energy"] = self.energy_distance_metric() + elif method == "sinkhorn2": + results["sinkhorn2"] = self.get_sinkhorn2() + elif method == "bures": + results["bures"] = self.bures_distance() + elif method == "spectral": + results["spectral"] = self.spectral_distance() elif method == "otdd": results['otdd'] = self.otdd() elif method == "ground_truth": results["ground_truth"] = self.get_ground_truth() elif method == "metadata_sim": results["metadata_sim"] = self.get_dataset_meta_sim() + elif method == "mmd": + results["mmd"] = self.compute_mmd() else: raise ValueError(f"Unsupported similarity method: {method}") return results - def get_similarity_matrix( + def get_similarity_matrix_A2B( self, methods: List[str] = [ - 'cosine', 'pearson', 'jaccard', 'js_distance', "common_genes_num", "ground_truth", "metadata_sim" + "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", + "ground_truth", "metadata_sim", "mmd" ] - ) -> Dict[str, pd.DataFrame]: + ) -> Dict[str, float]: """Same as compute_similarity, keeping method name consistency.""" - self.results = self.compute_similarity(methods) - return self.results + cumulative_results = {method: 0.0 for method in methods} - def get_max_similarity_A_to_B(self): - if self.results is None: - raise ValueError(f"need results!") - else: - self.results_score = {} - for key in self.results: - if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: - self.results_score[key] = self._get_max_similarity(self.results[key]) + for run in range(self.n_runs): + # Update random state for each run + if self.init_random_state is not None: + current_random_state = self.init_random_state + run + else: + current_random_state = None + run_results = self.compute_similarity(methods=methods, random_state=current_random_state) + for method in methods: + if method in ["ground_truth"]: + cumulative_results[method] = run_results[method] else: - self.results_score[key] = self.results[key] - return self.results_score - - def _get_max_similarity(self, similarity_matrix: pd.DataFrame): - """Maximum matching average similarity score.""" - matched_values = [ - similarity_matrix.loc[label, - label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() - for label in similarity_matrix.index - ] # need to 
ask - overall_similarity = np.mean(matched_values) - return overall_similarity + cumulative_results[method] += run_results[method] + # Average the results over the number of runs + averaged_results = { + method: + cumulative_results[method] if method in ["ground_truth"] else cumulative_results[method] / self.n_runs + for method in methods + } + return averaged_results + + # def get_max_similarity_A_to_B(self): + # if self.results is None: + # raise ValueError(f"need results!") + # else: + # self.results_score = {} + # for key in self.results: + # if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: + # self.results_score[key] = self._get_max_similarity(self.results[key]) + # else: + # self.results_score[key] = self.results[key] + # return self.results_score + + # def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + # """Maximum matching average similarity score.""" + # matched_values = [ + # similarity_matrix.loc[label, + # label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() + # for label in similarity_matrix.index + # ] # need to ask + # overall_similarity = np.mean(matched_values) + # return overall_similarity def extract_type_target_params(item_text): From 9eba40ad9e14ee0c85467c03d14ee2d52fdfb5b3 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Mon, 9 Dec 2024 22:03:14 +0800 Subject: [PATCH 126/203] minor --- dance/metadata/scdeepsort.csv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 6d5eb666..ec72d557 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -130,3 +130,6 @@ human,Brain,10000,train,,,train_human_Brain9372df2d-13d6-4fac-980b-919a5b7eb483_ human,Brain,10000,train,,,train_human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad,https://www.dropbox.com/scl/fi/q6vtn80sf8jpkvte55zpm/human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad?rlkey=6ca2gf47w5r53rw5y5h5b53e8&st=3wkp779m&dl=1 human,Brain,10000,train,,,train_human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad,https://www.dropbox.com/scl/fi/nl14mhuuwlq9zmjntot7g/human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad?rlkey=o158zcyq781w4rj71pfsw8yf1&st=ds6kabvk&dl=1 human,Brain,10000,train,,,train_human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad,https://www.dropbox.com/scl/fi/pyphmfixfeyu2wzyr216p/human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad?rlkey=sd6su8cmlc4g4k3hixihg1zo9&st=99diymvk&dl=1 +human,Brain,10000,train,,,train_human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad,https://www.dropbox.com/scl/fi/0v3auavah96csg1f486f2/human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad?rlkey=72cwgt8wh1421v32l3fsg6mvx&st=3fghbaz3&dl=1 +human,Brain,10000,train,,,train_human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad,https://www.dropbox.com/scl/fi/ne31m4apt1q90942cvfpy/human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad?rlkey=46s37qp2qpf8rqwfeef0gw5p2&st=8npktyyr&dl=1 +human,Brain,10000,train,,,train_human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad,https://www.dropbox.com/scl/fi/g9yrnvpj68nohpq97psoq/human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad?rlkey=tkqnbytv0yl7v0f0gngml83jq&st=a7lynn7i&dl=1 From cdeebd9615186a0852d6991c8eed11cb899c9b42 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 10 Dec 2024 22:27:19 +0800 Subject: [PATCH 127/203] minor --- dance/metadata/scdeepsort.csv | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 
ec72d557..46d46646 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -133,3 +133,7 @@ human,Brain,10000,train,,,train_human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_ human,Brain,10000,train,,,train_human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad,https://www.dropbox.com/scl/fi/0v3auavah96csg1f486f2/human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad?rlkey=72cwgt8wh1421v32l3fsg6mvx&st=3fghbaz3&dl=1 human,Brain,10000,train,,,train_human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad,https://www.dropbox.com/scl/fi/ne31m4apt1q90942cvfpy/human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad?rlkey=46s37qp2qpf8rqwfeef0gw5p2&st=8npktyyr&dl=1 human,Brain,10000,train,,,train_human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad,https://www.dropbox.com/scl/fi/g9yrnvpj68nohpq97psoq/human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad?rlkey=tkqnbytv0yl7v0f0gngml83jq&st=a7lynn7i&dl=1 +human,Blood,6368,train,,,train_human_Bloodfe52003e-1460-4a65-a213-2bb1a508332f_data.h5ad,https://www.dropbox.com/scl/fi/esqgoi7vgwwt9j6apipn3/human_Bloodfe52003e-1460-4a65-a213-2bb1a508332f_data.h5ad?rlkey=i5tw4dsprwonypls9q1sjv6z1&st=rvpv0bx4&dl=1 +human,Blood,10000,train,,,train_human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad,https://www.dropbox.com/scl/fi/2ze4zzjl9ho0yioypet94/human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad?rlkey=4tjvgzj69eqnwqt34y6ykjxx7&st=6eazya61&dl=1 +human,Blood,10000,train,,,train_human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad,https://www.dropbox.com/scl/fi/ppmitw72imo7hoiqk02uj/human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad?rlkey=qkvls3xesyu4wdc4f46ayy9wc&st=8wc95ewv&dl=1 +human,Blood,10000,train,,,train_human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad,https://www.dropbox.com/scl/fi/troppy0ouk9w60xx3gucv/human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad?rlkey=vm77ead52n9fy9e4lp9kpt8y3&st=zjdzi0rs&dl=1 From 225b6558f0a7478811e336f09d5591e74835d4e9 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 11 Dec 2024 15:10:51 +0800 Subject: [PATCH 128/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index beefdeb0..0f89ce64 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -45,11 +45,11 @@ def parameter_setting(): parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") parser.add_argument("-device", "--device", default="cuda") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) - + parser.add_argument("--span", type=float, default=0.3) return parser @@ -65,7 +65,7 @@ def parameter_setting(): args.lr2 = 0.005 args.flr2 = 0.0005 - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection") + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) data = 
dataset.load_data() le = preprocessing.LabelEncoder() @@ -132,7 +132,7 @@ def parameter_setting(): adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, From f70ee3ee39b58b43d84244a5f9f4d5351d354b76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 07:12:09 +0000 Subject: [PATCH 129/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_modality/joint_embedding/dcca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 0f89ce64..1467a737 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -65,7 +65,8 @@ def parameter_setting(): args.lr2 = 0.005 args.flr2 = 0.0005 - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", + span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() From 5645a6add8c80f2eebfd18f013db945189283402 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 09:46:37 +0800 Subject: [PATCH 130/203] update example --- .../multi_modality/joint_embedding/jae.py | 18 +++++++++++++---- .../multi_modality/joint_embedding/scmogcn.py | 20 ++++++++++++++----- .../multi_modality/joint_embedding/scmvae.py | 8 ++++---- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index cca32808..ab136764 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -10,8 +10,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -21,6 +21,7 @@ parser.add_argument("-bs", "--batch_size", default=128, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--span", type=float, default=0.3) args = parser.parse_args() @@ -30,7 +31,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) data = dataset.load_data() data.set_config( @@ -39,6 +40,15 @@ 
feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) + if True: + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_train_data(return_type="torch") (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") @@ -61,7 +71,7 @@ print(embeds) score = model.score(X_test, test_id, labels, metric="clustering") - score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, diff --git a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 0ed73f3f..1e80786c 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -11,8 +11,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -24,7 +24,8 @@ parser.add_argument("-bs", "--batch_size", default=512, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") - + parser.add_argument("--span", type=float, default=0.3) + args = parser.parse_args() device = args.device @@ -33,7 +34,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) data = dataset.load_data() train_size = len(data.get_split_idx("train")) @@ -45,6 +46,15 @@ feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) + if True: + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = 
np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) test_id = np.arange(x_mod1.shape[0]) @@ -68,7 +78,7 @@ embeds = model.predict(test_id).cpu().numpy() print(embeds) score = model.score(test_id, labels, metric="clustering") - score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index 65464c0f..b913c5f3 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -32,11 +32,11 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, help="Epoch per test, must smaller than max iteration.") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex") parser.add_argument("-device", "--device", default="cuda") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) - + parser.add_argument("--span", type=float, default=0.3) return parser @@ -46,7 +46,7 @@ def parameter_setting(): set_seed(args.seed) assert args.max_iteration > args.epoch_per_test - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection") + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() @@ -121,7 +121,7 @@ def parameter_setting(): embeds = model.predict(x_test, y_test).cpu().numpy() print(embeds.shape) score = model.score(x_test, y_test, labels) - score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) + # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, From 06d282daf412141287c2fb20949cd1c055e2dcde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 01:47:03 +0000 Subject: [PATCH 131/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_modality/joint_embedding/jae.py | 10 +++++++--- examples/multi_modality/joint_embedding/scmogcn.py | 12 ++++++++---- examples/multi_modality/joint_embedding/scmvae.py | 3 ++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index ab136764..c726dd8b 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -10,8 +10,11 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", - 
choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) + parser.add_argument( + "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", + "GSE140203_SKIN_atac2gex" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -31,7 +34,8 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + span=args.span) data = dataset.load_data() data.set_config( diff --git a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 1e80786c..1ef52647 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -11,8 +11,11 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) + parser.add_argument( + "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", + "GSE140203_SKIN_atac2gex" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -25,7 +28,7 @@ parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") parser.add_argument("--span", type=float, default=0.3) - + args = parser.parse_args() device = args.device @@ -34,7 +37,8 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + span=args.span) data = dataset.load_data() train_size = len(data.get_split_idx("train")) diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index b913c5f3..461adcd4 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -46,7 +46,8 @@ def parameter_setting(): set_seed(args.seed) assert args.max_iteration > args.epoch_per_test - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", + span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() From cee920432d1d053c78135d90f407806c4e1291da Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Thu, 12 Dec 2024 11:34:12 +0800 Subject: [PATCH 132/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 1467a737..338f67fb 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -50,6 +50,7 @@ def parameter_setting(): parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) parser.add_argument("--span", type=float, default=0.3) + parser.add_argument("--selection_threshold", type=int, default=3000) return parser @@ -66,7 +67,7 @@ def parameter_setting(): args.flr2 = 0.0005 dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", - span=args.span) + span=args.span,selection_threshold=args.selection_threshold) data = dataset.load_data() le = preprocessing.LabelEncoder() From ed0b6b4b3104284bd93190dff210a6e01f6c5a31 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 03:34:36 +0000 Subject: [PATCH 133/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_modality/joint_embedding/dcca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 338f67fb..9c172a07 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -67,7 +67,7 @@ def parameter_setting(): args.flr2 = 0.0005 dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", - span=args.span,selection_threshold=args.selection_threshold) + span=args.span, selection_threshold=args.selection_threshold) data = dataset.load_data() le = preprocessing.LabelEncoder() From a19a0508bdd3f73feef68f29aac072fec2ed5028 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 11:35:23 +0800 Subject: [PATCH 134/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 0f89ce64..c604fac2 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -46,7 +46,7 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") - parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-device", "--device", default="cuda:5") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) parser.add_argument("--span", type=float, default=0.3) @@ -65,7 +65,8 @@ def parameter_setting(): args.lr2 = 0.005 args.flr2 = 0.0005 - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", + span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() From 33eaa23cd888c84ff1e42d4965757c99ef9b0ce0 Mon Sep 17 00:00:00 2001 From: xzy Date: 
Thu, 12 Dec 2024 15:18:32 +0800 Subject: [PATCH 135/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index beefdeb0..47792c40 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -45,8 +45,8 @@ def parameter_setting(): parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") - parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-t", "--subtask", default="GSE140203_BRAIN_atac2gex") + parser.add_argument("-device", "--device", default="cuda:4") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) From b08ce147742e8c226e30d5bf3b100497ab3b955e Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 15:18:40 +0800 Subject: [PATCH 136/203] minor --- examples/multi_modality/joint_embedding/scmvae.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index 65464c0f..dfb5f4a6 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -32,8 +32,8 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, help="Epoch per test, must smaller than max iteration.") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") - parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") + parser.add_argument("-device", "--device", default="cuda:4") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) From 1ad3cecabd7ad7d4e2f2fce145b1fa76be203a79 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Thu, 12 Dec 2024 20:28:45 +0800 Subject: [PATCH 137/203] minor --- dance/metadata/scdeepsort.csv | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 46d46646..04085cfa 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -137,3 +137,10 @@ human,Blood,6368,train,,,train_human_Bloodfe52003e-1460-4a65-a213-2bb1a508332f_d human,Blood,10000,train,,,train_human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad,https://www.dropbox.com/scl/fi/2ze4zzjl9ho0yioypet94/human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad?rlkey=4tjvgzj69eqnwqt34y6ykjxx7&st=6eazya61&dl=1 human,Blood,10000,train,,,train_human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad,https://www.dropbox.com/scl/fi/ppmitw72imo7hoiqk02uj/human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad?rlkey=qkvls3xesyu4wdc4f46ayy9wc&st=8wc95ewv&dl=1 
human,Blood,10000,train,,,train_human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad,https://www.dropbox.com/scl/fi/troppy0ouk9w60xx3gucv/human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad?rlkey=vm77ead52n9fy9e4lp9kpt8y3&st=zjdzi0rs&dl=1 +human,Brain,8077,train,,,train_human_Braind5452b83-7c3d-4d7c-ab7a-c7fece7196c5_data.h5ad,https://www.dropbox.com/scl/fi/yqk7qe9qynbysy2qzuymp/human_Braind5452b83-7c3d-4d7c-ab7a-c7fece7196c5_data.h5ad?rlkey=9zbbwyewq97ff9eaermqx8ers&st=j1ordew8&dl=1 +human,Brain,3581,train,,,train_human_Brain774c18c5-efa1-4dc5-9e5e-2c824bab2e34_data.h5ad,https://www.dropbox.com/scl/fi/6jvi5wnl28u4dw6notnpo/human_Brain774c18c5-efa1-4dc5-9e5e-2c824bab2e34_data.h5ad?rlkey=5cmvurxmnv9u2gmigw5cc250s&st=id2kc01n&dl=1 +human,Brain,1318,train,,,train_human_Brain3d044b52-140a-4528-bf0d-a2dbef9e1f40_data.h5ad,https://www.dropbox.com/scl/fi/ocowzkh5d6jlo7stam48h/human_Brain3d044b52-140a-4528-bf0d-a2dbef9e1f40_data.h5ad?rlkey=rjkcpggc3btgsti8sx0tychif&st=z79g3sv1&dl=1 +human,Brain,6877,train,,,train_human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_data.h5ad,https://www.dropbox.com/scl/fi/jem4d9yaa7ovg4ahhxken/human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_data.h5ad?rlkey=xjzehuoucxjamouyw7tfyo54h&st=0ti1j2kq&dl=1 +human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad,https://www.dropbox.com/scl/fi/xutjy05pxtqlt2nyk35kp/human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad?rlkey=34ahhk9g9cxrugt8canfmfl3u&st=ia53zjuo&dl=1 +human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 +human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 From c26f490d78f231689fc8e7c1902e61d6c4e6a2c8 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 20:33:08 +0800 Subject: [PATCH 138/203] minor --- examples/multi_modality/joint_embedding/scmvae.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index 461adcd4..97354164 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -32,11 +32,12 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, help="Epoch per test, must smaller than max iteration.") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex") + parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") parser.add_argument("-device", "--device", default="cuda") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) parser.add_argument("--span", type=float, default=0.3) + parser.add_argument("--selection_threshold", type=int, default=3000) return parser @@ -47,7 +48,7 @@ def parameter_setting(): assert args.max_iteration > args.epoch_per_test dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", - span=args.span) + span=args.span, selection_threshold=args.selection_threshold) data = 
dataset.load_data() le = preprocessing.LabelEncoder() From 467d4a01a8df5f07041c3781fdf8d5896461311c Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 22:26:58 +0800 Subject: [PATCH 139/203] minor --- dance/metadata/scdeepsort.csv | 61 +++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 04085cfa..381f8efc 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -144,3 +144,64 @@ human,Brain,6877,train,,,train_human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_d human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad,https://www.dropbox.com/scl/fi/xutjy05pxtqlt2nyk35kp/human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad?rlkey=34ahhk9g9cxrugt8canfmfl3u&st=ia53zjuo&dl=1 human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 +human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&dl=1 +human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&dl=1 +human,Intestine,6444,train,,,train_human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad,https://www.dropbox.com/scl/fi/6rd98oo0z68dmqpuhn7ap/human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad?rlkey=v6zz40z2f73h4oivfx67y7cgr&dl=1 +human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad,https://www.dropbox.com/scl/fi/u57hnsh6nv88r8nnrwfnl/human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad?rlkey=suzpw1ikma8kpr3uxkv7nosue&dl=1 +human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&dl=1 +human,Intestine,7443,train,,,train_human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad,https://www.dropbox.com/scl/fi/i9271g8nx3kmxi6rb12iv/human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad?rlkey=2vfdh7k5amnul3d786g2zmr6g&dl=1 +human,Kidney,10000,train,,,train_human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad,https://www.dropbox.com/scl/fi/3zfgu6g7mzv2v7f7qpwvq/human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad?rlkey=euso7cvuwcpauk21o3yme3959&dl=1 +human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&dl=1 
+human,Kidney,10000,train,,,train_human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/krhe97shrnuofdlthnopj/human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=0lft0ebfaz4e0rzo1xpotmt79&dl=1 +human,Kidney,10000,train,,,train_human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad,https://www.dropbox.com/scl/fi/mhr92khp2j4ydit9mjiky/human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad?rlkey=9eg9ota05td575buw4vh6qdjd&dl=1 +human,Kidney,10000,train,,,train_human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad,https://www.dropbox.com/scl/fi/29d15brkbpsx14tr9qt0w/human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad?rlkey=sxzcd0btog8nhm13sf36r0q78&dl=1 +human,Kidney,10000,train,,,train_human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad,https://www.dropbox.com/scl/fi/gkqbvzh99bhegjar2b9q0/human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad?rlkey=emfdlopj8v08yqqp9c82n16je&dl=1 +human,Kidney,10000,train,,,train_human_Kidney9df60c57-fdf3-4e93-828e-fe9303f20438_data.h5ad,https://www.dropbox.com/scl/fi/0uc53tlnh0c38kjtmoa76/human_Kidney9df60c57-fdf3-4e93-828e-fe9303f20438_data.h5ad?rlkey=633zoidj6app2k616pue50d7u&dl=1 +human,Kidney,10000,train,,,train_human_Kidneybe39785b-67cb-4177-be19-a40ee3747e45_data.h5ad,https://www.dropbox.com/scl/fi/uo55loqew4nsh2yntiec6/human_Kidneybe39785b-67cb-4177-be19-a40ee3747e45_data.h5ad?rlkey=z2qc037d8g6nc3rbgkchnsv1v&dl=1 +human,Kidney,10000,train,,,train_human_Kidney32b9bdce-2481-4c85-ba1b-6ad5fcea844c_data.h5ad,https://www.dropbox.com/scl/fi/ugqjbf78tlc5g55dygysz/human_Kidney32b9bdce-2481-4c85-ba1b-6ad5fcea844c_data.h5ad?rlkey=h0d4nxyftqt9ktsh7s1tn623e&dl=1 +human,Kidney,10000,train,,,train_human_Kidney0b4a15a7-4e9e-4555-9733-2423e5c66469_data.h5ad,https://www.dropbox.com/scl/fi/aql02yzq6rosod8071qlu/human_Kidney0b4a15a7-4e9e-4555-9733-2423e5c66469_data.h5ad?rlkey=0oaq4962yw2642wa7mlxpkzab&dl=1 +human,Kidney,9641,train,,,train_human_Kidney53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad,https://www.dropbox.com/scl/fi/bqnrkyjzmbppgnej9l1qq/human_Kidney53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad?rlkey=co1531kyzzbghf4bvcqgg5rjn&dl=1 +human,Kidney,86,train,,,train_human_Kidney6a30bf44-c490-41ac-965b-0bb58432b10a_data.h5ad,https://www.dropbox.com/scl/fi/8vs3bwrk84shbekth3the/human_Kidney6a30bf44-c490-41ac-965b-0bb58432b10a_data.h5ad?rlkey=kbcz4p48v0rhu18hb8r2y47hq&dl=1 +human,Kidney,6044,train,,,train_human_Kidneyf801b7a9-80a6-4d09-9161-71474deb58ae_data.h5ad,https://www.dropbox.com/scl/fi/60uxtvicy2n8srhqfmub3/human_Kidneyf801b7a9-80a6-4d09-9161-71474deb58ae_data.h5ad?rlkey=x0im2udw8litcyzcsywipm49u&dl=1 +human,Kidney,7802,train,,,train_human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad,https://www.dropbox.com/scl/fi/xmzomvt0c8bza3fy8me0p/human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad?rlkey=iqzword5254z5rujjdey1u8hc&dl=1 +human,Kidney,6847,train,,,train_human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad,https://www.dropbox.com/scl/fi/rhngz2alde48jotpy5c5v/human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad?rlkey=u0x4dsnt569wq07l3h1rqjzum&dl=1 +human,Kidney,10000,train,,,train_human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad,https://www.dropbox.com/scl/fi/ybml7y2bth0qjnv3x1ieg/human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad?rlkey=qkjgdqttk3s10ht54109a4cad&dl=1 
+human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/553s0af5q2nibafj4nkux/human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=p85qlsjixsuuutgwnms3w4y30&dl=1 +human,Kidney,10000,train,,,train_human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad,https://www.dropbox.com/scl/fi/feklth6jvnc5qqwvgaydy/human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad?rlkey=28vpy2m90lnri9aekfthrsvr1&dl=1 +human,Kidney,5848,train,,,train_human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad,https://www.dropbox.com/scl/fi/1jq1wrqo1rcl041antcm8/human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad?rlkey=ssgfsiobqfah3pxgqnrsaff6l&dl=1 +human,Kidney,9641,train,,,train_human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad,https://www.dropbox.com/scl/fi/o2cnntkrd5j6coeqehv8b/human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad?rlkey=5tbupfd3cdvqzy2rix6scvwzu&dl=1 +human,Lung,10000,train,,,train_human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/w0n6axa32nej87tw4rk49/human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=8lgoi54y9wtxtfwpmnumpmzex&dl=1 +human,Lung,10000,train,,,train_human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad,https://www.dropbox.com/scl/fi/dqhei15s96dg3q8bdd31b/human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad?rlkey=ykpxbucys97t327fwehflkoa2&dl=1 +human,Lung,10000,train,,,train_human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad,https://www.dropbox.com/scl/fi/pwhyse079mo9radk2xzuw/human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad?rlkey=t60bp7w5mf3k877q1i430oc14&dl=1 +human,Lung,10000,train,,,train_human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad,https://www.dropbox.com/scl/fi/w2r13kqrkzdxecvhizm0i/human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad?rlkey=6s4wbv2ii1d8ged5l8s8lwt6l&dl=1 +human,Lung,10000,train,,,train_human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad,https://www.dropbox.com/scl/fi/ubcw0cyn5uvaq034ysgxl/human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad?rlkey=m57pb8bx4936fnao2yyljqdgz&dl=1 +human,Lung,10000,train,,,train_human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/0vqe7wmb0afoubwnb5srb/human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=7necb5o9afgpnppsj74tga5y2&dl=1 +human,Lung,10000,train,,,train_human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/sbe6h2v5dijlu36qd6nyw/human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=gunxweprd7r8e0xlk9mo2kkv3&dl=1 +human,Lung,10000,train,,,train_human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad,https://www.dropbox.com/scl/fi/mz6umlbnjoxynhqklwyxg/human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad?rlkey=upom03ch71gebjvxq15x59gk9&dl=1 +human,Lung,10000,train,,,train_human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad,https://www.dropbox.com/scl/fi/b2e6gr542wah0t5xtgshh/human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad?rlkey=7pkq1kh7wz6z0qzj4wdj94i79&dl=1 +human,Lung,10000,train,,,train_human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad,https://www.dropbox.com/scl/fi/ymmdfevzihlcyjosugyuq/human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad?rlkey=71rly1fkb21yl8gxy8af42ke7&dl=1 
+human,Lung,4138,train,,,train_human_Lung7b3368a5-c1a0-4973-9e75-d95b4150c7da_data.h5ad,https://www.dropbox.com/scl/fi/kkfqus7bbbc5fyammhvs6/human_Lung7b3368a5-c1a0-4973-9e75-d95b4150c7da_data.h5ad?rlkey=7pi96515hn1wp6vdb2mgx5wke&dl=1 +human,Lung,8657,train,,,train_human_Lunge04daea4-4412-45b5-989e-76a9be070a89_data.h5ad,https://www.dropbox.com/scl/fi/3yi17ckqej50wbvrhrmev/human_Lunge04daea4-4412-45b5-989e-76a9be070a89_data.h5ad?rlkey=dhvki1bayogmxfvwbgdmr63m0&dl=1 +human,Lung,9784,train,,,train_human_Lunge9175006-8978-4417-939f-819855eab80e_data.h5ad,https://www.dropbox.com/scl/fi/1n3ztjbo8v7pksbwocwvt/human_Lunge9175006-8978-4417-939f-819855eab80e_data.h5ad?rlkey=x7mhr3hqee4yon8iiqncjxzlv&dl=1 +human,Lung,6947,train,,,train_human_Lung0ba16f4b-cb87-4fa3-9363-19fc51eec6e7_data.h5ad,https://www.dropbox.com/scl/fi/u1l3vw5jfjzo5j438hns0/human_Lung0ba16f4b-cb87-4fa3-9363-19fc51eec6e7_data.h5ad?rlkey=bdrqcl1rnqj3ckzae2nivdryk&dl=1 +human,Lung,1135,train,,,train_human_Lunga68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/wzajeyqibto1nmg6lcnnu/human_Lunga68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=yaj6fw3lmf4gj2om9myv2xqe2&dl=1 +human,Lung,10000,train,,,train_human_Lung8c42cfd0-0b0a-46d5-910c-fc833d83c45e_data.h5ad,https://www.dropbox.com/scl/fi/0octhu7p45vhm141xpxho/human_Lung8c42cfd0-0b0a-46d5-910c-fc833d83c45e_data.h5ad?rlkey=f2god56i85fy7kbicd3omlcrr&dl=1 +human,Lung,10000,train,,,train_human_Lung2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/h943ko4tvioc5nbf74xh4/human_Lung2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=8tr5zd09f5w6bh81do2plszem&dl=1 +human,Lung,10000,train,,,train_human_Lungd8da613f-e681-4c69-b463-e94f5e66847f_data.h5ad,https://www.dropbox.com/scl/fi/87y1haxwnvw18ip34fuc3/human_Lungd8da613f-e681-4c69-b463-e94f5e66847f_data.h5ad?rlkey=zhxl8bv8ttize8x0iomrln7bu&dl=1 +human,Lung,9096,train,,,train_human_Lung4023a2bc-6325-47db-bfdf-9639e91042c2_data.h5ad,https://www.dropbox.com/scl/fi/5lyw275vtgivn93cvryho/human_Lung4023a2bc-6325-47db-bfdf-9639e91042c2_data.h5ad?rlkey=0bbi53hwtg0bpche9zmsbe3bb&dl=1 +human,Lung,329,train,,,train_human_Lung71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad,https://www.dropbox.com/scl/fi/322l1c9u9yw96m5jcrgx2/human_Lung71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=no8s8y76yfh0h5rj2oycto9e4&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad,https://www.dropbox.com/scl/fi/isf2fm5xy6ymlxmy28j42/human_Pancreas53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad?rlkey=sr9rmdejoqevjl7nfy5eieg1h&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas78f10833-3e61-4fad-96c9-4bbd4f14bdfa_data.h5ad,https://www.dropbox.com/scl/fi/6euec8e6la5lb536eca9v/human_Pancreas78f10833-3e61-4fad-96c9-4bbd4f14bdfa_data.h5ad?rlkey=h048sako37cm7fldz0qyanbbt&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreasf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/13dwwcqh8uxne0fkv1nec/human_Pancreasf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=71velvjv1y9xp433ite74lc4o&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas37b21763-7f0f-41ae-9001-60bad6e2841d_data.h5ad,https://www.dropbox.com/scl/fi/4saarn2fj0p9m1wz31oqm/human_Pancreas37b21763-7f0f-41ae-9001-60bad6e2841d_data.h5ad?rlkey=at8y4ydgoldzxo45dwd8r5q63&dl=1 
+human,Pancreas,10000,train,,,train_human_Pancreas9c4c8515-8f82-4c72-b0c6-f87647b00bbe_data.h5ad,https://www.dropbox.com/scl/fi/xie7wdjl99rijwubmkm8v/human_Pancreas9c4c8515-8f82-4c72-b0c6-f87647b00bbe_data.h5ad?rlkey=ev0t2j0oug6jn5lx3217nfilv&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreasfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/m57p32xedb57iy039t2u7/human_Pancreasfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=biux7hptrp8dkiu4bt5xsibdh&dl=1 +human,Pancreas,2126,train,,,train_human_Pancreasb07e5164-baf6-43d2-bdba-5a249d0da879_data.h5ad,https://www.dropbox.com/scl/fi/eklbb3ecg87j4ioh7j3yj/human_Pancreasb07e5164-baf6-43d2-bdba-5a249d0da879_data.h5ad?rlkey=d4rjld6ngijy8ozavipf62k7j&dl=1 +human,Pancreas,2742,train,,,train_human_Pancreasc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/wkmrwsu12z50p10knvhij/human_Pancreasc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=k4uaaq8dj1glvwjtw7dguhffu&dl=1 +human,Pancreas,440,train,,,train_human_Pancreasa68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/96fmfcse1tkek9fmt3l9i/human_Pancreasa68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=iblncgxaczniac0tdhwluv3hd&dl=1 +human,Pancreas,2100,train,,,train_human_Pancreas5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/l437mlea7j4kndeuqmgfc/human_Pancreas5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=hf9cp43iokqzr3zkt7xhji46o&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas3294d050-6eeb-4a00-b24c-71aacc9b777f_data.h5ad,https://www.dropbox.com/scl/fi/vu8iqocybwv1ntyfvniq0/human_Pancreas3294d050-6eeb-4a00-b24c-71aacc9b777f_data.h5ad?rlkey=t8kg0khgxloxvppl8n5yck5qw&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/1ohvymmbhbnuxztcg073v/human_Pancreas2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=4wqv2ep9tw0962rk3wit5qf1l&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreasff45e623-7f5f-46e3-b47d-56be0341f66b_data.h5ad,https://www.dropbox.com/scl/fi/zg4zyy9g3ana3aozo8wpx/human_Pancreasff45e623-7f5f-46e3-b47d-56be0341f66b_data.h5ad?rlkey=os9h57ravoaw8dg4mav9qahxm&dl=1 +human,Pancreas,2544,train,,,train_human_Pancreas66d15835-5dc8-4e96-b0eb-f48971cb65e8_data.h5ad,https://www.dropbox.com/scl/fi/p2zo6qt4j0hq2xtd8yudn/human_Pancreas66d15835-5dc8-4e96-b0eb-f48971cb65e8_data.h5ad?rlkey=hum53j5mvk3vs3ybqwe0fb2r1&dl=1 +human,Pancreas,8215,train,,,train_human_Pancreas97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad,https://www.dropbox.com/scl/fi/43r5btoo1z1r43xwlg1st/human_Pancreas97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad?rlkey=2zlbl33carcm5xsyp9dbazn84&dl=1 From 30eb4fd46ddba62f7944e352dbe5f6651c8425ee Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Fri, 13 Dec 2024 23:10:10 +0800 Subject: [PATCH 140/203] minor --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 381f8efc..9f9e5ae9 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -144,7 +144,7 @@ human,Brain,6877,train,,,train_human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_d human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad,https://www.dropbox.com/scl/fi/xutjy05pxtqlt2nyk35kp/human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad?rlkey=34ahhk9g9cxrugt8canfmfl3u&st=ia53zjuo&dl=1 
human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 -human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&dl=1 +human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Intestine-_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&st=nay2m851&dl=1 human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&dl=1 human,Intestine,6444,train,,,train_human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad,https://www.dropbox.com/scl/fi/6rd98oo0z68dmqpuhn7ap/human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad?rlkey=v6zz40z2f73h4oivfx67y7cgr&dl=1 human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad,https://www.dropbox.com/scl/fi/u57hnsh6nv88r8nnrwfnl/human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad?rlkey=suzpw1ikma8kpr3uxkv7nosue&dl=1 From 2d4070d81a4944c14ee179b2fa4ff6f7cb7f4039 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 09:28:46 +0800 Subject: [PATCH 141/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 9f9e5ae9..0f537ded 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -145,10 +145,10 @@ human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_d human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Intestine-_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&st=nay2m851&dl=1 -human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&dl=1 
+human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353-Intestine-_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&st=odk1wce6&dl=1 human,Intestine,6444,train,,,train_human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad,https://www.dropbox.com/scl/fi/6rd98oo0z68dmqpuhn7ap/human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad?rlkey=v6zz40z2f73h4oivfx67y7cgr&dl=1 human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad,https://www.dropbox.com/scl/fi/u57hnsh6nv88r8nnrwfnl/human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad?rlkey=suzpw1ikma8kpr3uxkv7nosue&dl=1 -human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&dl=1 +human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67-Intestine-_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&st=u1nnlhbw&dl=1 human,Intestine,7443,train,,,train_human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad,https://www.dropbox.com/scl/fi/i9271g8nx3kmxi6rb12iv/human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad?rlkey=2vfdh7k5amnul3d786g2zmr6g&dl=1 human,Kidney,10000,train,,,train_human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad,https://www.dropbox.com/scl/fi/3zfgu6g7mzv2v7f7qpwvq/human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad?rlkey=euso7cvuwcpauk21o3yme3959&dl=1 human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&dl=1 From da9b8a67d45b4adfc615ce8c82ee44f3b4ee7fb7 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 09:32:33 +0800 Subject: [PATCH 142/203] minor --- examples/tuning/cta_singlecellnet/main.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index cc5406d9..790b2f2f 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,12 +6,12 @@ from typing import get_args import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet -from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.pipeline import Pipeline, PipelinePlaner, get_step3_yaml, run_step3, save_summary_data from dance.typing import LogLevel from dance.utils import set_seed @@ -56,7 +56,15 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - + if "run_kwargs" in pipeline_planer.config and tune_mode == "params": + wandb_config = dict(wandb.config) + config = {'pipeline': wandb_config["run_kwargs"], "type": "preprocessor"} + preprocessing_pipeline = Pipeline(config) + + else: + # Prepare preprocessing pipeline and apply it to data + 
kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) # Initialize model and get model specific preprocessing pipeline model = SingleCellNet(num_trees=args.num_trees) @@ -64,8 +72,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, species=args.species, tissue=args.tissue, valid_dataset=args.valid_dataset, data_dir="../temp_data", filetype=args.filetype).load_data(cache=args.cache) - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) + # kwargs = {tune_mode: dict(wandb.config)} + # preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 3881988492be5ce0f550e951fca9f5c1b757e19d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 01:33:14 +0000 Subject: [PATCH 143/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_singlecellnet/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index 790b2f2f..bfe8bb88 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,8 +6,8 @@ from typing import get_args import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet From 27d9817c0094b3b5c580522abf660cdbb0f6437c Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 09:35:23 +0800 Subject: [PATCH 144/203] ignore result --- .gitignore | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.gitignore b/.gitignore index bf917f79..ffb7f309 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,14 @@ temp_data *.egg* __pycache__ build/ + +#ignore example result +examples/tuning/**/*.log +examples/tuning/**/*.yaml +examples/tuning/**/*.csv +examples/tuning/**/*.h5ad +examples/tuning/**/*.sh +examples/tuning/**/*.h5 +examples/tuning/**/*.tar.gz +examples/tuning/**/*.tif +examples/tuning/**/*.txt From f42e6fcdb128b03355c6020ae0d1d04f32bbb3df Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 14:54:34 +0800 Subject: [PATCH 145/203] minor --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bf917f79..0a74ef7b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ temp_data *.egg* __pycache__ build/ +*.log From 66923ab3643938370c3f1e89da7ff8c3c1077cfb Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 15:01:53 +0800 Subject: [PATCH 146/203] minor --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index aa5948e7..5adb8a91 100644 --- a/.gitignore +++ b/.gitignore @@ -17,10 +17,9 @@ build/ *.log #ignore example result -examples/tuning/**/*.log +examples/**/*.h5ad examples/tuning/**/*.yaml examples/tuning/**/*.csv -examples/tuning/**/*.h5ad examples/tuning/**/*.sh examples/tuning/**/*.h5 examples/tuning/**/*.tar.gz From 5859db7d4c5a2e2572ab2ccdb58fd90d08331991 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 15:52:23 +0800 Subject: [PATCH 147/203] minor --- 
.../tuning/joint_embedding_scmvae/main.py | 100 ++++++------------ 1 file changed, 30 insertions(+), 70 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 8b091eb7..1f778b6a 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE @@ -73,31 +73,18 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) wandb_config = wandb.config - if "run_kwargs" in pipeline_planer.config: - if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): - wandb_config = wandb_config["run_kwargs"] - else: - wandb.log({"skip": 1}) - wandb.finish() - return - try: - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - wandb_config = wandb.config - if "run_kwargs" in pipeline_planer.config: - if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): - wandb_config = wandb_config["run_kwargs"] - else: - wandb.log({"skip": 1}) - wandb.finish() - return try: + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") data = dataset.load_data() - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels @@ -107,19 +94,23 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - - # train_size=data.mod["meta1"].shape[0] - # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train",train_idx) - data.set_split_idx("test",test_idx) - (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch") - (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch") + # train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + # train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + # test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) + # data.set_split_idx("train",train_idx) + # data.set_split_idx("test",test_idx) + + (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, 
y_test_raw), labels = data.get_test_data(return_type="torch") + + train_size = len(x_train) + test_size = len(x_test) + train_idx = np.arange(train_size) + test_idx = np.arange(test_size) + train_size + # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) - lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) + lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) + lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) lib_mean1 = torch.from_numpy(lib_mean1) lib_var1 = torch.from_numpy(lib_var1) lib_mean2 = torch.from_numpy(lib_mean2) @@ -127,13 +118,13 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) Nfeature1 = x_train.shape[1] Nfeature2 = y_train.shape[1] - # train_size = len(data.get_split_idx("train")) - # train_size=x_train.shape[0] + + temp = lib_mean1[train_idx] train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], - lib_var2[train_idx], y_train) + lib_var2[train_idx], y_train) valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], - lib_var2[test_idx], y_test) + lib_var2[test_idx], y_test) total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) @@ -171,37 +162,6 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) model.to(device) model.init_gmm_params(total_loader) model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - # embeds = model.predict(x_test, y_test).cpu().numpy() score = model.score(x_test, y_test, labels) # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) @@ -210,7 +170,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) wandb.log(score) wandb.finish() finally: - locals_keys=list(locals().keys()) + locals_keys = list(locals().keys()) for var in locals_keys: try: exec(f"del {var}") From 00c1b82136f0d5bd052dc2e5f5e0eb6d0f018f6b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 07:54:38 +0000 Subject: [PATCH 148/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 
--- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 1f778b6a..e20be682 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From 7bbef3ea5390265e688dbdd86d3fed00a5adc880 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 10:26:47 +0800 Subject: [PATCH 149/203] update pre-commit --- .pre-commit-config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 194f88fa..3af9d357 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,13 +32,13 @@ repos: name: Sort imports args: [--line-width, "120", --profile, black] - # - repo: https://github.com/PyCQA/docformatter - # rev: v1.7.5 - # hooks: - # - id: docformatter - # name: Format docstring - # additional_dependencies: [tomli] - # args: [--config, pyproject.toml] + - repo: https://github.com/PyCQA/docformatter + rev: eb1df34 + hooks: + - id: docformatter + name: Format docstring + additional_dependencies: [tomli] + args: [--config, pyproject.toml] - repo: https://github.com/executablebooks/mdformat rev: 0.7.17 From 505e5d61d3b31ebc9d7a4b64f122d2c43484e625 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 02:28:03 +0000 Subject: [PATCH 150/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/sc_similarity/anndata_similarity.py | 36 +++++++++-------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 9a84fa4b..5409fdb6 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -55,13 +55,12 @@ def filter_gene(self, n_top_genes=3000): self.common_genes = common_hvg def preprocess(self): - """Preprocess the data, including log normalization and normalization to probability distribution.""" + """Preprocess the data, including log normalization and normalization to + probability distribution.""" self.filter_gene() def sample_cells(self, random_state): - """ - Randomly sample cells from each dataset if sample_size is specified. - """ + """Randomly sample cells from each dataset if sample_size is specified.""" np.random.seed(random_state) if self.sample_size is None: self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think @@ -77,9 +76,7 @@ def sample_cells(self, random_state): self.sampled_adata2 = self.adata2.copy() def normalize_data(self): # I am not sure - """ - Normalize the data by total counts per cell and log-transform. 
- """ + """Normalize the data by total counts per cell and log-transform.""" sc.pp.normalize_total(self.adata1, target_sum=1e4) sc.pp.log1p(self.adata1) sc.pp.normalize_total(self.adata2, target_sum=1e4) @@ -98,18 +95,16 @@ def set_prob_data(self, sampled=False): self.Y = np.nan_to_num(prob_adata2).toarray() def cosine_sim_sampled(self) -> pd.DataFrame: - """ - Computes the average cosine similarity between all pairs of cells from the two datasets. - """ + """Computes the average cosine similarity between all pairs of cells from the + two datasets.""" # Compute cosine similarity matrix sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) # Return the average similarity return sim_matrix.mean() def pearson_corr_sampled(self) -> pd.DataFrame: - """ - Computes the average Pearson correlation coefficient between all pairs of cells from the two datasets. - """ + """Computes the average Pearson correlation coefficient between all pairs of + cells from the two datasets.""" # Compute Pearson correlation matrix corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, @@ -118,9 +113,8 @@ def pearson_corr_sampled(self) -> pd.DataFrame: return np.nanmean(corr_matrix) def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: - """ - Computes the average Jaccard similarity between all pairs of binarized cells from the two datasets. - """ + """Computes the average Jaccard similarity between all pairs of binarized cells + from the two datasets.""" # Binarize the data binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) @@ -131,9 +125,8 @@ def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: return similarity_matrix.mean() def js_divergence_sampled(self) -> float: - """ - Computes the average Jensen-Shannon divergence between all pairs of cells from the two datasets. - """ + """Computes the average Jensen-Shannon divergence between all pairs of cells + from the two datasets.""" # Normalize the data to probability distributions prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) @@ -195,9 +188,8 @@ def data_company(self): raise NotImplementedError("data company") def wasserstein_dist(self) -> float: - """ - Computes the average Wasserstein distance between all pairs of cells from the two datasets. 
- """ + """Computes the average Wasserstein distance between all pairs of cells from the + two datasets.""" X = self.X Y = self.Y a = np.ones((X.shape[0], )) / X.shape[0] From b7683af51bd8824860c9fdcf32c042a293be7582 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 12:59:45 +0800 Subject: [PATCH 151/203] minor --- dance/datasets/multimodality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 14e9c5c6..87c8689c 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -575,7 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) - self.to_array([mod1, mod2, meta1, meta2, test_sol]) + self._to_csr([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -585,7 +585,7 @@ def _raw_to_dance(self, raw_data): return data - def to_array(self, datas): + def _to_csr(self, datas): for data in datas: if scipy.sparse.issparse(data.X): if not isinstance(data.X, scipy.sparse.csr_matrix): From 7026ee0f1db028438f235e82ced2c42451c70504 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 14:34:48 +0800 Subject: [PATCH 152/203] minor --- dance/sc_similarity/anndata_similarity.py | 36 +++++++++-------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 9a84fa4b..5409fdb6 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -55,13 +55,12 @@ def filter_gene(self, n_top_genes=3000): self.common_genes = common_hvg def preprocess(self): - """Preprocess the data, including log normalization and normalization to probability distribution.""" + """Preprocess the data, including log normalization and normalization to + probability distribution.""" self.filter_gene() def sample_cells(self, random_state): - """ - Randomly sample cells from each dataset if sample_size is specified. - """ + """Randomly sample cells from each dataset if sample_size is specified.""" np.random.seed(random_state) if self.sample_size is None: self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think @@ -77,9 +76,7 @@ def sample_cells(self, random_state): self.sampled_adata2 = self.adata2.copy() def normalize_data(self): # I am not sure - """ - Normalize the data by total counts per cell and log-transform. - """ + """Normalize the data by total counts per cell and log-transform.""" sc.pp.normalize_total(self.adata1, target_sum=1e4) sc.pp.log1p(self.adata1) sc.pp.normalize_total(self.adata2, target_sum=1e4) @@ -98,18 +95,16 @@ def set_prob_data(self, sampled=False): self.Y = np.nan_to_num(prob_adata2).toarray() def cosine_sim_sampled(self) -> pd.DataFrame: - """ - Computes the average cosine similarity between all pairs of cells from the two datasets. 
- """ + """Computes the average cosine similarity between all pairs of cells from the + two datasets.""" # Compute cosine similarity matrix sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) # Return the average similarity return sim_matrix.mean() def pearson_corr_sampled(self) -> pd.DataFrame: - """ - Computes the average Pearson correlation coefficient between all pairs of cells from the two datasets. - """ + """Computes the average Pearson correlation coefficient between all pairs of + cells from the two datasets.""" # Compute Pearson correlation matrix corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, @@ -118,9 +113,8 @@ def pearson_corr_sampled(self) -> pd.DataFrame: return np.nanmean(corr_matrix) def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: - """ - Computes the average Jaccard similarity between all pairs of binarized cells from the two datasets. - """ + """Computes the average Jaccard similarity between all pairs of binarized cells + from the two datasets.""" # Binarize the data binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) @@ -131,9 +125,8 @@ def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: return similarity_matrix.mean() def js_divergence_sampled(self) -> float: - """ - Computes the average Jensen-Shannon divergence between all pairs of cells from the two datasets. - """ + """Computes the average Jensen-Shannon divergence between all pairs of cells + from the two datasets.""" # Normalize the data to probability distributions prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) @@ -195,9 +188,8 @@ def data_company(self): raise NotImplementedError("data company") def wasserstein_dist(self) -> float: - """ - Computes the average Wasserstein distance between all pairs of cells from the two datasets. 
- """ + """Computes the average Wasserstein distance between all pairs of cells from the + two datasets.""" X = self.X Y = self.Y a = np.ones((X.shape[0], )) / X.shape[0] From d7c63add4ae8ab3e32661d45ee0d6c0b70bd20c9 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 14:42:02 +0800 Subject: [PATCH 153/203] minor --- examples/tuning/cta_celltypist/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index c625065f..58a10303 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -42,6 +42,8 @@ args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"Running Celltypist with the following parameters:\n{pprint.pformat(vars(args))}") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + # os.environ["WANDB_AGENT_DISABLE_FLAPPING"]="true" file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) @@ -51,7 +53,6 @@ logger.info(f"\n files is saved in {file_root_path}") MAINDIR = Path(__file__).resolve().parent pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") - os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) From 81b83309d842edc18906ce035ef10803dfbd14f4 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 17 Dec 2024 21:46:21 +0800 Subject: [PATCH 154/203] update data --- dance/metadata/scdeepsort.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 0f537ded..44091d47 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -151,8 +151,8 @@ human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67-Intestine-_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&st=u1nnlhbw&dl=1 human,Intestine,7443,train,,,train_human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad,https://www.dropbox.com/scl/fi/i9271g8nx3kmxi6rb12iv/human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad?rlkey=2vfdh7k5amnul3d786g2zmr6g&dl=1 human,Kidney,10000,train,,,train_human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad,https://www.dropbox.com/scl/fi/3zfgu6g7mzv2v7f7qpwvq/human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad?rlkey=euso7cvuwcpauk21o3yme3959&dl=1 -human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&dl=1 -human,Kidney,10000,train,,,train_human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/krhe97shrnuofdlthnopj/human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=0lft0ebfaz4e0rzo1xpotmt79&dl=1 +human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67(Kidney)_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67-Kidney-_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&st=jwhicxrp&dl=1 
+human,Kidney,10000,train,,,train_human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353(Kidney)_data.h5ad,https://www.dropbox.com/scl/fi/krhe97shrnuofdlthnopj/human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353-Kidney-_data.h5ad?rlkey=0lft0ebfaz4e0rzo1xpotmt79&st=7ie3uc6i&dl=1 human,Kidney,10000,train,,,train_human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad,https://www.dropbox.com/scl/fi/mhr92khp2j4ydit9mjiky/human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad?rlkey=9eg9ota05td575buw4vh6qdjd&dl=1 human,Kidney,10000,train,,,train_human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad,https://www.dropbox.com/scl/fi/29d15brkbpsx14tr9qt0w/human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad?rlkey=sxzcd0btog8nhm13sf36r0q78&dl=1 human,Kidney,10000,train,,,train_human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad,https://www.dropbox.com/scl/fi/gkqbvzh99bhegjar2b9q0/human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad?rlkey=emfdlopj8v08yqqp9c82n16je&dl=1 From 7f37be45ec8410ebec7486754c3bdccedf87ff78 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 09:21:09 +0800 Subject: [PATCH 155/203] update notes --- dance/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index 0841eed7..aa4a0c00 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1068,7 +1068,7 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for target, d_p in p1.default_params.items(): if target == p2["target"]: p2["params"] = d_p - #顺序不对,参考_sanitize_pipeline进行修改 TODO + #The order is wrong, refer to _sanitize_pipeline for modification TODO use test to check step2_pipeline = step2_pipeline_planer.config.pipeline # step2_pipeline=sorted(step2_pipeline_planer.config.pipeline,key=lambda x: float(x.split('.')[1])) for p1, p2 in zip(step2_pipeline, pipeline): #need order From 759912422da292a90b1fc5108c9a9cff42181f32 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:18:09 +0800 Subject: [PATCH 156/203] update forest and other minor changes --- .../atlas/sc_similarity/anndata_similarity.py | 520 ++++++++++++++++++ dance/sc_similarity/download_data.py | 9 - .../multi_modality/joint_embedding/jae.py | 4 +- .../multi_modality/joint_embedding/scmogcn.py | 4 +- .../result_analysis/get_important_pattern.py | 156 ++++-- .../get_important_pattern_sweep.py | 77 ++- examples/result_analysis/get_num.py | 30 + 7 files changed, 745 insertions(+), 55 deletions(-) create mode 100644 dance/atlas/sc_similarity/anndata_similarity.py delete mode 100644 dance/sc_similarity/download_data.py create mode 100644 examples/result_analysis/get_num.py diff --git a/dance/atlas/sc_similarity/anndata_similarity.py b/dance/atlas/sc_similarity/anndata_similarity.py new file mode 100644 index 00000000..ab44e6e7 --- /dev/null +++ b/dance/atlas/sc_similarity/anndata_similarity.py @@ -0,0 +1,520 @@ +# anndata_similarity.py +# TODO translate notes +import re +import warnings +from typing import Callable, Dict, List, Optional + +import anndata +import anndata as ad +import numpy as np +import ot +import pandas as pd +import scanpy as sc +import scipy +import yaml +from omegaconf import OmegaConf +from scipy.linalg import sqrtm +from scipy.spatial import cKDTree +from scipy.spatial.distance import cdist, directed_hausdorff, jaccard, jensenshannon +from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel + +# Suppress scipy warnings for constant input in Pearson correlation +warnings.filterwarnings("ignore", message="An 
input array is constant") +from dance.datasets.singlemodality import CellTypeAnnotationDataset + + +def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = "h5ad", train_dataset=[], + test_dataset=[], valid_dataset=[], data_dir="../temp_data"): + data = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, + valid_dataset=valid_dataset, data_dir=data_dir, tissue=tissue, species=species, + filetype=filetype).load_data() + return data.data + + +class AnnDataSimilarity: + + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, + init_random_state: Optional[int] = None, n_runs: int = 10, + ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, + adata2_name: Optional[str] = None, + methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): + """Initialize the AnnDataSimilarity object and perform data preprocessing.""" + self.origin_adata1 = adata1.copy() + self.origin_adata2 = adata2.copy() + self.sample_size = sample_size + self.init_random_state = init_random_state + self.preprocess() + self.results = {} + self.ground_truth_conf_path = ground_truth_conf_path + self.adata1_name = adata1_name + self.adata2_name = adata2_name + self.methods = methods + self.tissue = tissue + self.n_runs = n_runs + + def filter_gene(self, n_top_genes=3000): + sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') + sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') + + common_hvg = self.origin_adata1.var_names[self.origin_adata1.var['highly_variable']].intersection( + self.origin_adata2.var_names[self.origin_adata2.var['highly_variable']]) + + self.origin_adata1 = self.origin_adata1[:, common_hvg].copy() + self.origin_adata2 = self.origin_adata2[:, common_hvg].copy() + self.common_genes = common_hvg + + def preprocess(self): + """Preprocess the data, including log normalization and normalization to + probability distribution.""" + self.filter_gene() + + def sample_cells(self, random_state): + """Randomly sample cells from each dataset if sample_size is specified.""" + np.random.seed(random_state) + if self.sample_size is None: + self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think + if self.adata1.n_obs > self.sample_size: + indices1 = np.random.choice(self.adata1.n_obs, size=self.sample_size, replace=False) + self.sampled_adata1 = self.adata1[indices1, :].copy() + else: + self.sampled_adata1 = self.adata1.copy() + if self.adata2.n_obs > self.sample_size: + indices2 = np.random.choice(self.adata2.n_obs, size=self.sample_size, replace=False) + self.sampled_adata2 = self.adata2[indices2, :].copy() + else: + self.sampled_adata2 = self.adata2.copy() + + def normalize_data(self): # I am not sure + """Normalize the data by total counts per cell and log-transform.""" + sc.pp.normalize_total(self.adata1, target_sum=1e4) + sc.pp.log1p(self.adata1) + sc.pp.normalize_total(self.adata2, target_sum=1e4) + sc.pp.log1p(self.adata2) + + def set_prob_data(self, sampled=False): + # Normalize the data to probability distributions + if sampled: + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + else: + prob_adata1 = self.adata1.X / self.adata1.X.sum(axis=1) + prob_adata2 = self.adata2.X / self.adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero 
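+        # (a cell whose counts sum to zero yields a 0/0 = NaN row in the division
+        # above; np.nan_to_num below maps such rows to all-zero probability vectors)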
+ self.X = np.nan_to_num(prob_adata1).toarray() + self.Y = np.nan_to_num(prob_adata2).toarray() + + def cosine_sim_sampled(self) -> pd.DataFrame: + """Computes the average cosine similarity between all pairs of cells from the + two datasets.""" + # Compute cosine similarity matrix + sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) + # Return the average similarity + return sim_matrix.mean() + + def pearson_corr_sampled(self) -> pd.DataFrame: + """Computes the average Pearson correlation coefficient between all pairs of + cells from the two datasets.""" + # Compute Pearson correlation matrix + corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), + self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, + self.sampled_adata1.n_obs:] + # Return the average correlation + return np.nanmean(corr_matrix) + + def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: + """Computes the average Jaccard similarity between all pairs of binarized cells + from the two datasets.""" + # Binarize the data + binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) + binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) + # Compute Jaccard distance matrix + distance_matrix = cdist(binary_adata1.A, binary_adata2.A, metric='jaccard') + # Convert to similarity and compute the average + similarity_matrix = 1 - distance_matrix + return similarity_matrix.mean() + + def js_divergence_sampled(self) -> float: + """Computes the average Jensen-Shannon divergence between all pairs of cells + from the two datasets.""" + # Normalize the data to probability distributions + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero + prob_adata1 = np.nan_to_num(prob_adata1).toarray() + prob_adata2 = np.nan_to_num(prob_adata2).toarray() + + # Define a function to compute JS divergence for a pair of probability vectors + def jsd(p, q): + return jensenshannon(p, q) + + # Compute JS divergence matrix + jsd_vectorized = np.vectorize(jsd, signature='(n),(n)->()') + divergence_matrix = np.zeros((prob_adata1.shape[0], prob_adata2.shape[0])) + for i in range(prob_adata1.shape[0]): + divergence_matrix[i, :] = jsd_vectorized( + np.repeat(prob_adata1[i, :], prob_adata2.shape[0], axis=0).reshape(-1, prob_adata1.shape[1]), + prob_adata2) + + # Convert divergence to similarity and compute the average + similarity_matrix = 1 - divergence_matrix + return np.nanmean(similarity_matrix) + + def compute_mmd(self) -> float: + X = self.X + Y = self.Y + kernel = "rbf" + gamma = 1.0 + if kernel == 'rbf': + K_X = np.exp(-gamma * cdist(X, X, 'sqeuclidean')) + K_Y = np.exp(-gamma * cdist(Y, Y, 'sqeuclidean')) + K_XY = np.exp(-gamma * cdist(X, Y, 'sqeuclidean')) + elif kernel == 'linear': + K_X = np.dot(X, X.T) + K_Y = np.dot(Y, Y.T) + K_XY = np.dot(X, Y.T) + else: + raise ValueError("Unsupported kernel type") + + m = X.shape[0] + n = Y.shape[0] + + sum_X = (np.sum(K_X) - np.sum(np.diag(K_X))) / (m * (m - 1)) + sum_Y = (np.sum(K_Y) - np.sum(np.diag(K_Y))) / (n * (n - 1)) + sum_XY = np.sum(K_XY) / (m * n) + + mmd_squared = sum_X + sum_Y - 2 * sum_XY + mmd = np.sqrt(max(mmd_squared, 0)) + return 1 / (1 + mmd) + + def common_genes_num(self): + return len(self.common_genes) + + def otdd(self): + """Compute the OTDD between two data sets.""" + raise NotImplementedError("OTDD!") + + def data_company(self): + raise NotImplementedError("data company") 
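+    # The distance-based metrics below fold a non-negative distance into a (0, 1]
+    # similarity score, either as 1 / (1 + distance) (Wasserstein, energy, Sinkhorn,
+    # Bures, spectral, matching the MMD conversion above) or as
+    # 1 - distance / sqrt(n_features) (Hausdorff, Chamfer). For example, a distance
+    # of 0 maps to a similarity of 1.0 and a distance of 3 maps to 0.25 under the
+    # first form; note that the parentheses are required in Python, since
+    # 1 / 1 + d evaluates to 1 + d rather than 1 / (1 + d).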
+ + def wasserstein_dist(self) -> float: + """Computes the average Wasserstein distance between all pairs of cells from the + two datasets.""" + X = self.X + Y = self.Y + a = np.ones((X.shape[0], )) / X.shape[0] + b = np.ones((Y.shape[0], )) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + wasserstein_dist = ot.emd2(a, b, M) + return 1 / 1 + wasserstein_dist + + def get_Hausdorff(self): + X = self.X + Y = self.Y + forward = directed_hausdorff(X, Y)[0] + backward = directed_hausdorff(X, Y)[0] + hausdorff_distance = max(forward, backward) + normalized_hausdorff = hausdorff_distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_hausdorff + return similarity + + def chamfer_distance(self): + X = self.X + Y = self.Y + tree_A = cKDTree(X) + tree_B = cKDTree(Y) + + distances_A_to_B, _ = tree_A.query(Y) + distances_B_to_A, _ = tree_B.query(X) + + chamfer_A_to_B = np.mean(distances_A_to_B) + chamfer_B_to_A = np.mean(distances_B_to_A) + distance = chamfer_A_to_B + chamfer_B_to_A + normalized_chamfer = distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_chamfer + return similarity + + def energy_distance_metric(self): + X = self.X + Y = self.Y + XX = cdist(X, X, 'euclidean') + YY = cdist(Y, Y, 'euclidean') + XY = cdist(X, Y, 'euclidean') + distance = 2 * np.mean(XY) - np.mean(XX) - np.mean(YY) + return 1 / (1 + distance) + + def get_sinkhorn2(self): + X = self.X + Y = self.Y + a = np.ones(X.shape[0]) / X.shape[0] + b = np.ones(Y.shape[0]) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + reg = 0.1 + sinkhorn_dist = ot.sinkhorn2(a, b, M, reg) + return 1 / (1 + sinkhorn_dist) + + def bures_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + sqrt_C1 = sqrtm(C1) + product = sqrt_C1 @ C2 @ sqrt_C1 + sqrt_product = sqrtm(product) + trace = np.trace(C1) + np.trace(C2) - 2 * np.trace(sqrt_product) + return 1 / (1 + np.sqrt(max(trace, 0))) + + def spectral_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + eig_A = np.linalg.eigvalsh(C1) + eig_B = np.linalg.eigvalsh(C2) + return 1 / (1 + np.linalg.norm(eig_A - eig_B)) + + def get_dataset_meta_sim(self): + # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] + con_cols = [ + "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", + "n_counts_var", "var_n_counts_mean", "var_n_counts_var" + ] + dis_cols = ['assay', 'tissue'] + + def get_discrete_sim(col_list1, col_list2): + set1 = set(col_list1) + set2 = set(col_list2) + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + return intersection / union + + def get_con_sim(con_data_1, con_data_2): + return abs(con_data_1 - con_data_2) / max(con_data_1, con_data_2) + + def get_dataset_info(data: ad.AnnData): + con_sim = {} + con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) + con_sim["nnz_var"] = np.var(data.obs["nnz"]) + nnz_values = data.X[data.X.nonzero()] + con_sim["nnz_counts_mean"] = np.mean(nnz_values) + con_sim["nnz_counts_var"] = np.var(nnz_values) + con_sim["n_measured_vars"] = np.mean(data.obs["n_measured_vars"]) + con_sim["cell_num"] = len(data.obs) + con_sim["gene_num"] = len(data.var) + con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) + con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) + if "n_counts" not in data.var.columns: + if scipy.sparse.issparse(data.X): + gene_counts = 
np.array(data.X.sum(axis=0)).flatten() + else: + gene_counts = data.X.sum(axis=0) + data.var["n_counts"] = gene_counts + data.var["n_counts"] = data.var["n_counts"].astype(float) + con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) + con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) + data.uns["con_sim"] = con_sim + return data + + data_1 = self.adata1.copy() + data_2 = self.adata2.copy() + data_1 = get_dataset_info(data_1) + data_2 = get_dataset_info(data_2) + ans = {} + obs_1 = data_1.obs + obs_2 = data_2.obs + con_sim_1 = data_1.uns["con_sim"] + con_sim_2 = data_2.uns["con_sim"] + for dis_col in dis_cols: + ans[f"{dis_col}_sim"] = get_discrete_sim(obs_1[dis_col].values, obs_2[dis_col].values) + for con_col in con_cols: + ans[f"{con_col}_sim"] = get_con_sim(con_sim_1[con_col], con_sim_2[con_col]) + return np.mean(list(ans.values())) + + def get_ground_truth(self): + assert self.ground_truth_conf_path is not None + assert self.adata1_name is not None + assert self.adata2_name is not None + ground_truth_conf = pd.read_excel(self.ground_truth_conf_path, sheet_name=self.tissue, index_col=0) + + def get_targets(dataset_truth: str): + dataset_truth = OmegaConf.create(fix_yaml_string(dataset_truth)) + targets = [] + for item in dataset_truth: + targets.append(item["target"]) + return targets + + sim_targets = [] + for method in self.methods: + query_dataset_truth = ground_truth_conf.loc[self.adata1_name, f"{method}_method"] + atlas_dataset_truth = ground_truth_conf.loc[self.adata2_name, f"{method}_method"] + query_targets = get_targets(query_dataset_truth) + atlas_targets = get_targets(atlas_dataset_truth) + assert len(query_targets) == len(atlas_targets) + sim_targets.append((sum(a == b for a, b in zip(query_targets, atlas_targets)), len(query_targets))) + sim_targets.append((sum(x for x, y in sim_targets), sum(y for x, y in sim_targets))) + return sim_targets + + def compute_similarity( + self, random_state: int, methods: List[str] = [ + 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" + ] + ) -> Dict[str, float]: + """Computes the specified similarity measure. Parameters: + + methods: List of similarity measures to be computed. 
Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' + Returns: + Dictionary containing the similarity matrices + + """ + self.adata1 = self.origin_adata1.copy() + self.adata2 = self.origin_adata2.copy() + self.normalize_data() + self.sample_cells(random_state) + self.set_prob_data() + + results = {} + for method in methods: + print(method) + if method == 'cosine': + results['cosine'] = self.cosine_sim_sampled() + elif method == 'pearson': + results['pearson'] = self.pearson_corr_sampled() + elif method == 'jaccard': + results['jaccard'] = self.jaccard_sim_sampled() + elif method == 'js_distance': + results['js_distance'] = self.js_divergence_sampled() + elif method == 'wasserstein': + results['wasserstein'] = self.wasserstein_dist() + elif method == "common_genes_num": + results["common_genes_num"] = self.common_genes_num() + elif method == "Hausdorff": + results["Hausdorff"] = self.get_Hausdorff() + elif method == "chamfer": + results["chamfer"] = self.chamfer_distance() + elif method == "energy": + results["energy"] = self.energy_distance_metric() + elif method == "sinkhorn2": + results["sinkhorn2"] = self.get_sinkhorn2() + elif method == "bures": + results["bures"] = self.bures_distance() + elif method == "spectral": + results["spectral"] = self.spectral_distance() + elif method == "otdd": + results['otdd'] = self.otdd() + elif method == "ground_truth": + results["ground_truth"] = self.get_ground_truth() + elif method == "metadata_sim": + results["metadata_sim"] = self.get_dataset_meta_sim() + elif method == "mmd": + results["mmd"] = self.compute_mmd() + else: + raise ValueError(f"Unsupported similarity method: {method}") + return results + + def get_similarity_matrix_A2B( + self, methods: List[str] = [ + "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", + "ground_truth", "metadata_sim", "mmd" + ] + ) -> Dict[str, float]: + """Same as compute_similarity, keeping method name consistency.""" + cumulative_results = {method: 0.0 for method in methods} + + for run in range(self.n_runs): + # Update random state for each run + if self.init_random_state is not None: + current_random_state = self.init_random_state + run + else: + current_random_state = None + run_results = self.compute_similarity(methods=methods, random_state=current_random_state) + for method in methods: + if method in ["ground_truth"]: + cumulative_results[method] = run_results[method] + else: + cumulative_results[method] += run_results[method] + # Average the results over the number of runs + averaged_results = { + method: + cumulative_results[method] if method in ["ground_truth"] else cumulative_results[method] / self.n_runs + for method in methods + } + return averaged_results + + # def get_max_similarity_A_to_B(self): + # if self.results is None: + # raise ValueError(f"need results!") + # else: + # self.results_score = {} + # for key in self.results: + # if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: + # self.results_score[key] = self._get_max_similarity(self.results[key]) + # else: + # self.results_score[key] = self.results[key] + # return self.results_score + + # def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + # """Maximum matching average similarity score.""" + # matched_values = [ + # similarity_matrix.loc[label, + # label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() + # for label in similarity_matrix.index + # ] # need to ask + # overall_similarity = 
np.mean(matched_values) + # return overall_similarity + + +def extract_type_target_params(item_text): + lines = item_text.strip().split('\n') + item_dict = {} + params_dict = {} + current_param_key = None + in_params = False + for line in lines: + stripped_line = line.strip() + if stripped_line.startswith('- type:'): + item_dict['type'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('target:'): + item_dict['target'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('params:'): + params_content = stripped_line.split(':', 1)[1].strip() + if params_content == '{}': + params_dict = {} + in_params = False + else: + params_dict = {} + in_params = True + elif in_params: + if re.match(r'^\w+:$', stripped_line): + current_param_key = stripped_line[:-1].strip() + params_dict[current_param_key] = {} + elif re.match(r'^- ', stripped_line): + list_item = stripped_line[2:].strip() + if current_param_key: + if not isinstance(params_dict[current_param_key], list): + params_dict[current_param_key] = [] + params_dict[current_param_key].append(list_item) + elif ':' in stripped_line: + key, value = map(str.strip, stripped_line.split(':', 1)) + if current_param_key and isinstance(params_dict.get(current_param_key, None), dict): + params_dict[current_param_key][key] = yaml.safe_load(value) + else: + params_dict[key] = yaml.safe_load(value) + item_dict['params'] = params_dict + return item_dict + + +def fix_yaml_string(original_str): + #It will be deleted + yaml_str = original_str.replace('\\n', '\n').strip() + items = re.split(r'(?=-\s*type:)', yaml_str) + config_list = [] + for item in items: + if not item.strip(): + continue + if not item.strip().startswith('- type:'): + print(item) + print("Warning: An item does not start with '- type:', skipping this item.") + continue + item_dict = extract_type_target_params(item) + config_list.append(item_dict) + fixed_yaml = yaml.dump(config_list, sort_keys=False) + return fixed_yaml diff --git a/dance/sc_similarity/download_data.py b/dance/sc_similarity/download_data.py deleted file mode 100644 index 83c705fd..00000000 --- a/dance/sc_similarity/download_data.py +++ /dev/null @@ -1,9 +0,0 @@ -from dance.datasets.singlemodality import CellTypeAnnotationDataset - - -def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = "h5ad", train_dataset=[], - test_dataset=[], valid_dataset=[], data_dir="../temp_data"): - data = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, - valid_dataset=valid_dataset, data_dir=data_dir, tissue=tissue, species=species, - filetype=filetype).load_data() - return data.data diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index c726dd8b..0b1d79bb 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -11,9 +11,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "-t", "--subtask", default="openproblems_2022_multi_atac2gex", choices=[ "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", - "GSE140203_SKIN_atac2gex" + "GSE140203_SKIN_atac2gex", "openproblems_2022_multi_atac2gex" ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") diff --git 
a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 1ef52647..44e2a748 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -12,9 +12,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "-t", "--subtask", default="openproblems_2022_multi_atac2gex", choices=[ "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", - "GSE140203_SKIN_atac2gex" + "GSE140203_SKIN_atac2gex", "openproblems_2022_multi_atac2gex" ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py index b03731cc..657fe9b0 100644 --- a/examples/result_analysis/get_important_pattern.py +++ b/examples/result_analysis/get_important_pattern.py @@ -4,7 +4,9 @@ import itertools import pathlib from collections import Counter +from copy import deepcopy from itertools import combinations +from os import X_OK from pathlib import Path import matplotlib.pyplot as plt @@ -12,10 +14,18 @@ import pandas as pd import scikit_posthocs as sp import seaborn as sns +import shapiq from mlxtend.frequent_patterns import apriori from mlxtend.preprocessing import TransactionEncoder from networkx import parse_adjlist from scipy import cluster, stats +from scipy.stats import pointbiserialr +from sklearn.compose import ColumnTransformer +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import GridSearchCV, KFold, LeaveOneOut, cross_val_score +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from typing_extensions import deprecated #TODO need to sync all files or get sweep,not file @@ -68,7 +78,7 @@ def change_real_rank(rank_item, real_rank): return [] -def replace_nan_in_2d(lst): +def replace_nan_in_2d(lst): #nan应该是个极差的值而不是直接删掉 return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] @@ -83,7 +93,9 @@ def are_all_elements_same_direct(list_2d): return True if first_element is not None else True -def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1): +def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1, multi_mod=False): + if multi_mod: + raise NotImplementedError("need multimod") threshold = int(len(step2_data) * threshold_per) step2_data.loc[:, metric_name] = step2_data.loc[:, metric_name].astype(float) df_sorted = step2_data.sort_values(metric_name, ascending=ascending) @@ -93,43 +105,35 @@ def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1) te = TransactionEncoder() te_ary = te.fit(transactions).transform(transactions) df = pd.DataFrame(te_ary, columns=te.columns_) - frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) + frequent_itemsets = apriori(df, use_colnames=True, min_support=0.3) # print(frequent_itemsets) # rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) - return [tuple(a) for a in frequent_itemsets["itemsets"]] + frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: tuple(x)) + return frequent_itemsets.to_dict(orient='records') -# def get_significant_top_n_zscore(data, n=3, threshold=1.0, 
ascending=False): -# if not data: -# return [] - -# n = max(1, n) - -# mean = np.mean(data) -# std = np.std(data) - -# if std == 0: -# return sorted(data, reverse=not ascending)[:n] - -# z_scores = [(x, (x - mean) / std) for x in data] - -# significant_values = [x for x, z in z_scores if z > threshold] - -# significant_values_sorted = sorted(significant_values, reverse=not ascending) - -# if len(significant_values_sorted) < n: -# remaining = sorted(data, reverse=not ascending)[:n - len(significant_values_sorted)] -# significant_values_sorted.extend(remaining) - -# return significant_values_sorted[:n] +def get_significant_top_n_zscore(data, n=3, threshold=1.0, ascending=False): + if not data: + return [] + n = max(1, n) + mean = np.mean(data) + std = np.std(data) + if std == 0: + return sorted(data, reverse=not ascending)[:n] + z_scores = [(x, (x - mean) / std) for x in data] + significant_values = [x for x, z in z_scores if z > threshold] + significant_values_sorted = sorted(significant_values, reverse=not ascending) + if len(significant_values_sorted) < n: + remaining = sorted(data, reverse=not ascending)[:n - len(significant_values_sorted)] + significant_values_sorted.extend(remaining) + return significant_values_sorted[:n] -def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): - ans_all = [] +def get_test_acc_and_names(step2_data, metric_name): columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) test_accs = [] test_acc_names = [] - for r in range(1, len(columns)): #全流程的单独处理 + for r in range(1, len(columns) + 1): for com in itertools.combinations(columns, r): test_accs_arrays = [] groups = step2_data.groupby(by=list(com)) @@ -142,7 +146,14 @@ def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): # if are_all_elements_same_direct(test_accs): # continue test_accs = replace_nan_in_2d(test_accs) - final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title=" ".join(list(com)), vis=vis) + return test_accs, test_acc_names + + +@deprecated("not used") +def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): + ans_all = [] + test_accs, test_acc_names = get_test_acc_and_names(step2_data, metric_name) + final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title="all_pattern", vis=vis) if len(final_ranks) > 0: #TODO maybe need to think ascending max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) @@ -155,6 +166,89 @@ def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): return ans_all +def get_significant_items(data): + abs_values = np.abs(list(data.values())) + percentile = 60 + threshold = np.percentile(abs_values, percentile) + significant_items = {k: v for k, v in data.items() if abs(v) >= threshold} + return significant_items + + +def get_forest_model_pattern(step2_data, metric_name): + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) + X = step2_data.loc[:, columns] + y = step2_data.loc[:, metric_name] + preprocessor = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'), + columns) # drop='first'防止虚拟变量陷阱 + ]) + pipeline = Pipeline(steps=[('preprocessor', preprocessor), + ('regressor', + RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_split=2, + min_samples_leaf=1, random_state=42))]) + + param_grid = { + 'regressor__n_estimators': [10, 50, 100, 200], + 'regressor__max_depth': [3, 5, 7], + 'regressor__min_samples_split': [2, 5], + 
'regressor__min_samples_leaf': [1, 2] + } + loo = LeaveOneOut() + + grid_search = GridSearchCV( + estimator=pipeline, + param_grid=param_grid, + cv=loo, + scoring='neg_mean_squared_error', + n_jobs=-1, + verbose=1, + refit=True # 确保在所有数据上重新训练最佳模型 + ) + grid_search.fit(X, y) + best_pipeline = grid_search.best_estimator_ + model = best_pipeline.named_steps['regressor'] + X_preprocessed = best_pipeline.named_steps['preprocessor'].transform( + X) #TODO best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)是否和X_preprocessed一定是对应的? + explainer = shapiq.TreeExplainer(model=model, index="k-SII", max_order=3) #思考为什么没有负值,因为是绝对值相加,可能是为了正负值不会相互抵消 + list_of_interaction_values = explainer.explain_X(X_preprocessed.toarray(), n_jobs=96, random_state=42) + plt.cla() + ax = shapiq.plot.bar_plot(list_of_interaction_values, + feature_names=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns), + max_display=None, show=False, need_abbreviate=False) + ax.yaxis.get_major_locator().MAXTICKS = 1000000 + plt.show() + rects = ax.containers[0] + yticklabels = ax.get_yticklabels() #label和rect是否重合需要验证 + shap_ans = {} + for rect, label in zip(rects, yticklabels): + xy = rect.get_xy() + height = rect.get_height() + width = rect.get_width() + k = label.get_text() + v = width + if k in shap_ans: + raise RuntimeError("Features should not be repeated") + shap_ans[k] = v + + ans = get_significant_items(shap_ans) #检查一下是不是真的pattern,好像结果不太好,再检验一下 + preprocessed_df = pd.DataFrame(X_preprocessed.toarray(), index=X.index, + columns=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)) + preprocessed_df[metric_name] = step2_data[metric_name] + preprocessed_df_copy = deepcopy(preprocessed_df) + real_ans = {} + for k, v in ans.items(): + feature_name = k.split(' x ') + one_col = f"{','.join(feature_name)}__all__one" + preprocessed_df_copy[one_col] = preprocessed_df_copy[feature_name].eq(1).all(axis=1) + # method='pearson' + # pearson_corr = preprocessed_df_copy.loc[:,one_col].corr(preprocessed_df_copy.loc[:,metric_name], method=method) + r_pb, p_value = pointbiserialr(preprocessed_df_copy.loc[:, one_col].astype('category'), + preprocessed_df_copy.loc[:, metric_name]) + real_ans[k] = {"shapiq": v, "pointbiserialr": {"r_pb": r_pb, "p_value": p_value}} + real_ans["best_params"] = grid_search.best_params_ + real_ans["best_mse"] = -grid_search.best_score_ + return real_ans + + def summary_pattern(data_path, metric_name, ascending, alpha=0.05, vis=False): step2_origin_data = pd.read_csv(data_path) step2_data = step2_origin_data.dropna() diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 7010505d..896a0d0a 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -1,3 +1,4 @@ +import argparse import json import sys from pathlib import Path @@ -5,7 +6,8 @@ import pandas as pd import requests -from get_important_pattern import get_com_all, get_frequent_itemsets +from get_important_pattern import get_com_all, get_forest_model_pattern, get_frequent_itemsets +from numpy import choose sys.path.append("..") from get_result_web import spilt_web @@ -18,11 +20,26 @@ tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"] mertic_names = ["test_acc", "acc", "MRE", "ARI", "MSE"] ascendings = [False, False, True, False, True] + +multi_mod = False +if multi_mod: + raise 
NotImplementedError("multi mod") + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("--positive", action='store_true') +parser.add_argument("--only_apr", action='store_true') +parser.add_argument("--choose_tasks", nargs="+", default=tasks) +args = parser.parse_args() +choose_tasks = args.choose_tasks +positive = args.positive +only_apr = args.only_apr +if not positive: + assert only_apr + ascendings = [not item for item in ascendings] file_root = Path(__file__).resolve().parent prefix = f'https://wandb.ai/{entity}/{project}' runs_sum = 0 wandb = try_import("wandb") -positive = True def get_additional_sweep(sweep_id): @@ -47,28 +64,61 @@ def get_additional_sweep(sweep_id): def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=False): # try: - step2_data = step2_origin_data.dropna() - com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) + columns = sorted([col for col in step2_origin_data.columns if col.startswith("pipeline")]) + step2_data = step2_origin_data.loc[:, columns + [metric_name]] + # com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) + step2_data[metric_name] = step2_data[metric_name].astype(float) + if not ascending: + min_metric = step2_data[metric_name].min() + if pd.isna(min_metric): + return { + "error": + f"All {metric_name} values ​​are NaN and the minimum cannot be calculated. Please check your data." + } + step2_data[metric_name] = step2_data[metric_name].fillna(0) #if ascending=False + else: + max_metric = step2_data[metric_name].max() + if pd.isna(max_metric): + return { + "error": + f"All {metric_name} values ​​are NaN and the maximum cannot be calculated. Please check your data." + } + print(f"\nmax {metric_name}:{max_metric}") + buffer_percentage = 0.2 # 20% + replacement = max_metric * (1 + buffer_percentage) + step2_data[metric_name] = step2_data[metric_name].fillna(replacement) apr_ans = get_frequent_itemsets(step2_data, metric_name, ascending) - return list(set(com_ans) & set(apr_ans)) + if positive and not only_apr: + return {"forest_model": get_forest_model_pattern(step2_data, metric_name), "apr_ans": apr_ans} + else: + return {"apr_ans": apr_ans} # except Exception as e: # print(e) # return str(e) if __name__ == "__main__": + start = True ans_all = [] for i, task in enumerate(tasks): + + if task not in choose_tasks: + continue data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str) data = data.ffill().set_index(['Methods']) for row_idx in range(data.shape[0]): for col_idx in range(data.shape[1]): + method = data.index[row_idx] dataset = data.columns[col_idx] value = data.iloc[row_idx, col_idx] step_name = data.iloc[row_idx]["Unnamed: 1"] - if method != "SVM" or dataset != "Dataset 1: GSE67835 Brain": + # if dataset=="Dataset6:pancreatic_cancer" and method == "Stlearn": + # start=True + if not start: continue + # if method !="ACTINN" : + # continue if isinstance(value, str) and value.startswith(prefix) and ( str(step_name).lower() == "step2" or str(step_name).lower() == "step 2"): #TODO add step3 sweep_url = value @@ -86,13 +136,18 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F summary_data.append(flatten_dict(result)) # get result and config ans = pd.DataFrame(summary_data).set_index(["id"]) ans.sort_index(axis=1, inplace=True) - print(dataset) - print(method) - ans_all.append({ + ans_single = { "task": task, "dataset": dataset, "method": method, "pattern": 
summary_pattern(ans, mertic_names[i], ascendings[i]) - }) - with open(f"positive:{positive}_pattern.json", "w") as f: + } + with open( + f"dance_auto_preprocess/patterns/{'only_apr_' if only_apr else ''}{'neg_' if not positive else ''}{task}_{dataset}_{method}_pattern.json", + "w") as f: + json.dump(ans_single, f, indent=2) + ans_all.append(ans_single) + print(dataset) + print(method) + with open(f"pattern.json", "w") as f: json.dump(ans_all, f, indent=2) diff --git a/examples/result_analysis/get_num.py b/examples/result_analysis/get_num.py new file mode 100644 index 00000000..6573bd70 --- /dev/null +++ b/examples/result_analysis/get_num.py @@ -0,0 +1,30 @@ +import sys +from pathlib import Path + +import pandas as pd + +sys.path.append("..") +import urllib + +from get_result_web import spilt_web + +from dance.utils import try_import + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"] +file_root = Path(__file__).resolve().parent +prefix = 'https://wandb.ai/xzy11632/dance-dev' + +runs_sum = 0 + +for task in tasks: + data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str) + matched_list = data.applymap(lambda x: x if isinstance(x, str) and x.startswith(prefix) else None).stack().tolist() + for sweep_url in matched_list: + _, _, sweep_id = spilt_web(sweep_url) + print(sweep_id) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + runs_sum += (len(sweep.runs)) +print(runs_sum) From 8e1d33f6df65c9145e5572661740d5ef5290cd3a Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:24:47 +0800 Subject: [PATCH 157/203] minor --- dance/transforms/filter.py | 2 +- dance/transforms/misc.py | 2 +- examples/single_modality/clustering/graphsc.py | 2 +- examples/single_modality/imputation/graphsci.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 8ec0ac1f..d34600fc 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -157,7 +157,7 @@ def prepCounts(self, x): @register_preprocessor("filter", "cell") -class FilterCellsScanpy(FilterScanpy): +class FilterCellsScanpy(FilterScanpy): """Scanpy filtering cell transformation with additional options. 
Allow passing gene counts as ratio diff --git a/dance/transforms/misc.py b/dance/transforms/misc.py index 8b47c8b5..877eb6f6 100644 --- a/dance/transforms/misc.py +++ b/dance/transforms/misc.py @@ -169,7 +169,7 @@ def __init__(self, **kwargs): def __call__(self, data: Data) -> Data: mod1, mod2, meta1, meta2, test_sol = data.data.mod.values() meta1 = meta1[:, mod1.var.index] - meta2 = meta2[:, mod2.var.index] + meta2 = meta2[:, mod2.var.index] test_sol = test_sol[:, mod1.var.index] data.data.mod["meta1"] = meta1 data.data.mod["meta2"] = meta2 diff --git a/examples/single_modality/clustering/graphsc.py b/examples/single_modality/clustering/graphsc.py index b1d72cef..af2c7576 100644 --- a/examples/single_modality/clustering/graphsc.py +++ b/examples/single_modality/clustering/graphsc.py @@ -37,7 +37,7 @@ parser.add_argument("-data", "--dataset", default="10X_PBMC", choices=["10X_PBMC", "mouse_bladder_cell", "mouse_ES_cell", "worm_neuron_cell"]) parser.add_argument("--seed", type=int, default=0, help="Initial seed random, offset for each repeatition") - parser.add_argument("--num_runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--num_runs", type=int, default=5, help="Number of repetitions") parser.add_argument("--cache", action="store_true", help="Cache processed data.") args = parser.parse_args() aris = [] diff --git a/examples/single_modality/imputation/graphsci.py b/examples/single_modality/imputation/graphsci.py index c107614d..be3a272b 100644 --- a/examples/single_modality/imputation/graphsci.py +++ b/examples/single_modality/imputation/graphsci.py @@ -32,7 +32,7 @@ parser.add_argument("--cache", action="store_true", help="Cache processed data.") parser.add_argument("--mask", type=bool, default=True, help="Mask data for validation.") parser.add_argument("--seed", type=int, default=0, help="Initial seed random, offset for each repeatition") - parser.add_argument("--num_runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--num_runs", type=int, default=5, help="Number of repetitions") params = parser.parse_args() print(vars(params)) rmses = [] From 1087e3c17a0344e8210efd3e29e9be8a620cce67 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:27:12 +0800 Subject: [PATCH 158/203] minor --- examples/tuning/get_important_pattern.py | 90 +++++++++++++++++++----- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py index 39fcfb2f..c93d7114 100644 --- a/examples/tuning/get_important_pattern.py +++ b/examples/tuning/get_important_pattern.py @@ -10,6 +10,7 @@ import seaborn as sns from mlxtend.frequent_patterns import apriori from mlxtend.preprocessing import TransactionEncoder +from mlxtend.frequent_patterns import association_rules from networkx import parse_adjlist from scipy import stats @@ -17,13 +18,13 @@ ascending = False -def get_important_pattern(test_accs, vis=True, alpha=0.8, title=""): +def get_important_pattern(test_accs, vis=True, alpha=0.8, title="",test_acc_names=None): medians = [np.median(group) for group in test_accs] _, p_value = stats.kruskal(*test_accs) if vis: fig = plt.figure(figsize=(12, 4)) sns.boxplot(data=test_accs) - plt.xticks(list(range(len(test_accs))), [f"{i}" for i in range(len(test_accs))]) + plt.xticks(list(range(len(test_accs))), ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names),rotation=45, fontsize=10) plt.title(title) plt.show() if p_value < alpha: @@ -71,7 +72,7 
@@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) test_accs = [i[metric_name] for i in test_accs_arrays] test_acc_names = [i["name"] for i in test_accs_arrays] - final_ranks = get_important_pattern(test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis) + final_ranks = get_important_pattern(test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis,test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) if len(final_ranks) > 0: max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) @@ -82,9 +83,57 @@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): print(f"index={index},name={test_acc_name},rank={rank}") ans.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) return ans - - -def get_frequent_itemsets(step2_data, threshold_per=0.1): +def draw_graph(rules, rules_to_show): + import networkx as nx + G1 = nx.DiGraph() + + color_map=[] + N = 50 + colors = np.random.rand(N) + strs=['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] + + + for i in range (rules_to_show): + G1.add_nodes_from(["R"+str(i)]) + + + for a in rules.iloc[i]['antecedents']: + + G1.add_nodes_from([a]) + + G1.add_edge(a, "R"+str(i), color=colors[i] , weight = 2) + + for c in rules.iloc[i]['consequents']: + + G1.add_nodes_from([c]) + + G1.add_edge("R"+str(i), c, color=colors[i], weight=2) + + for node in G1: + found_a_string = False + for item in strs: + if node==item: + found_a_string = True + if found_a_string: + color_map.append('yellow') + else: + color_map.append('green') + + + + edges = G1.edges() + colors = [G1[u][v]['color'] for u,v in edges] + weights = [G1[u][v]['weight'] for u,v in edges] + + pos = nx.spring_layout(G1, k=16, scale=1) + nx.draw(G1, pos, node_color = color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) + + for p in pos: # raise text positions + pos[p][1] += 0.07 + nx.draw_networkx_labels(G1, pos) + plt.show() + +def get_frequent_itemsets(step2_data, threshold_per=0.1,vis=False): threshold = int(len(step2_data) * threshold_per) df_sorted = step2_data.sort_values(metric_name, ascending=ascending) top_10_percent = df_sorted.head(threshold) @@ -95,7 +144,16 @@ def get_frequent_itemsets(step2_data, threshold_per=0.1): df = pd.DataFrame(te_ary, columns=te.columns_) frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) # print(frequent_itemsets) - # rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) + rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) + if vis: + # print(frequent_itemsets) + # print(frequent_itemsets) + # draw_graph(rules=rules,rules_to_show=10) + frequent_itemsets_copy=frequent_itemsets.copy() + frequent_itemsets_copy=frequent_itemsets_copy.sort_values(by="support") + frequent_itemsets_copy.plot(x="itemsets",y="support",kind="bar") + plt.xticks(rotation=30, fontsize=7) + # print(type(rules)) return [tuple(a) for a in frequent_itemsets["itemsets"]] @@ -111,7 +169,7 @@ def summary_pattern(data_path, alpha=0.8, vis=False): step2_origin_data = pd.read_csv(data_path) step2_data = step2_origin_data.dropna() com_ans = get_com_all(step2_data, vis=vis, alpha=alpha) - apr_ans = get_frequent_itemsets(step2_data) + apr_ans = get_frequent_itemsets(step2_data,vis=vis) return list(set(com_ans) & set(apr_ans)) @@ -136,11 +194,11 @@ def list_files(directories, file_name="best_test_acc.csv", 
alpha=0.8, vis=False) if __name__ == "__main__": - directories = [] - for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - if path.is_dir(): - if str(path.name).startswith("cluster"): - directories.append(path) - list_files(directories) - - # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_scdeepsort/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) + # directories = [] + # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): + # if path.is_dir(): + # if str(path.name).startswith("cluster"): + # directories.append(path) + # list_files(directories) + + print(summary_pattern("/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv",alpha=0.3,vis=False)) From 1e843d3507e280c4c9b6633481fdcd3296eee14e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:27:58 +0000 Subject: [PATCH 159/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/transforms/filter.py | 2 +- dance/transforms/misc.py | 2 +- examples/tuning/get_important_pattern.py | 119 ++++++++++++----------- 3 files changed, 64 insertions(+), 59 deletions(-) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index d34600fc..8ec0ac1f 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -157,7 +157,7 @@ def prepCounts(self, x): @register_preprocessor("filter", "cell") -class FilterCellsScanpy(FilterScanpy): +class FilterCellsScanpy(FilterScanpy): """Scanpy filtering cell transformation with additional options. Allow passing gene counts as ratio diff --git a/dance/transforms/misc.py b/dance/transforms/misc.py index 877eb6f6..8b47c8b5 100644 --- a/dance/transforms/misc.py +++ b/dance/transforms/misc.py @@ -169,7 +169,7 @@ def __init__(self, **kwargs): def __call__(self, data: Data) -> Data: mod1, mod2, meta1, meta2, test_sol = data.data.mod.values() meta1 = meta1[:, mod1.var.index] - meta2 = meta2[:, mod2.var.index] + meta2 = meta2[:, mod2.var.index] test_sol = test_sol[:, mod1.var.index] data.data.mod["meta1"] = meta1 data.data.mod["meta2"] = meta2 diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py index c93d7114..04c0dc44 100644 --- a/examples/tuning/get_important_pattern.py +++ b/examples/tuning/get_important_pattern.py @@ -8,9 +8,8 @@ import pandas as pd import scikit_posthocs as sp import seaborn as sns -from mlxtend.frequent_patterns import apriori +from mlxtend.frequent_patterns import apriori, association_rules from mlxtend.preprocessing import TransactionEncoder -from mlxtend.frequent_patterns import association_rules from networkx import parse_adjlist from scipy import stats @@ -18,13 +17,15 @@ ascending = False -def get_important_pattern(test_accs, vis=True, alpha=0.8, title="",test_acc_names=None): +def get_important_pattern(test_accs, vis=True, alpha=0.8, title="", test_acc_names=None): medians = [np.median(group) for group in test_accs] _, p_value = stats.kruskal(*test_accs) if vis: fig = plt.figure(figsize=(12, 4)) sns.boxplot(data=test_accs) - plt.xticks(list(range(len(test_accs))), ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names),rotation=45, fontsize=10) + plt.xticks(list(range(len(test_accs))), + ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names), rotation=45, + fontsize=10) plt.title(title) plt.show() if p_value < alpha: 
@@ -72,7 +73,9 @@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) test_accs = [i[metric_name] for i in test_accs_arrays] test_acc_names = [i["name"] for i in test_accs_arrays] - final_ranks = get_important_pattern(test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis,test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) + final_ranks = get_important_pattern( + test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis, + test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) if len(final_ranks) > 0: max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) @@ -83,57 +86,56 @@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): print(f"index={index},name={test_acc_name},rank={rank}") ans.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) return ans + + def draw_graph(rules, rules_to_show): - import networkx as nx - G1 = nx.DiGraph() - - color_map=[] - N = 50 - colors = np.random.rand(N) - strs=['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] - - - for i in range (rules_to_show): - G1.add_nodes_from(["R"+str(i)]) - - - for a in rules.iloc[i]['antecedents']: - - G1.add_nodes_from([a]) - - G1.add_edge(a, "R"+str(i), color=colors[i] , weight = 2) - - for c in rules.iloc[i]['consequents']: - + import networkx as nx + G1 = nx.DiGraph() + + color_map = [] + N = 50 + colors = np.random.rand(N) + strs = ['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] + + for i in range(rules_to_show): + G1.add_nodes_from(["R" + str(i)]) + + for a in rules.iloc[i]['antecedents']: + + G1.add_nodes_from([a]) + + G1.add_edge(a, "R" + str(i), color=colors[i], weight=2) + + for c in rules.iloc[i]['consequents']: + G1.add_nodes_from([c]) - - G1.add_edge("R"+str(i), c, color=colors[i], weight=2) - - for node in G1: - found_a_string = False - for item in strs: - if node==item: + + G1.add_edge("R" + str(i), c, color=colors[i], weight=2) + + for node in G1: + found_a_string = False + for item in strs: + if node == item: found_a_string = True - if found_a_string: + if found_a_string: color_map.append('yellow') - else: - color_map.append('green') - - - - edges = G1.edges() - colors = [G1[u][v]['color'] for u,v in edges] - weights = [G1[u][v]['weight'] for u,v in edges] - - pos = nx.spring_layout(G1, k=16, scale=1) - nx.draw(G1, pos, node_color = color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) - - for p in pos: # raise text positions - pos[p][1] += 0.07 - nx.draw_networkx_labels(G1, pos) - plt.show() - -def get_frequent_itemsets(step2_data, threshold_per=0.1,vis=False): + else: + color_map.append('green') + + edges = G1.edges() + colors = [G1[u][v]['color'] for u, v in edges] + weights = [G1[u][v]['weight'] for u, v in edges] + + pos = nx.spring_layout(G1, k=16, scale=1) + nx.draw(G1, pos, node_color=color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) + + for p in pos: # raise text positions + pos[p][1] += 0.07 + nx.draw_networkx_labels(G1, pos) + plt.show() + + +def get_frequent_itemsets(step2_data, threshold_per=0.1, vis=False): threshold = int(len(step2_data) * threshold_per) df_sorted = step2_data.sort_values(metric_name, ascending=ascending) top_10_percent = df_sorted.head(threshold) @@ -149,9 +151,9 @@ def get_frequent_itemsets(step2_data, threshold_per=0.1,vis=False): # print(frequent_itemsets) # 
print(frequent_itemsets) # draw_graph(rules=rules,rules_to_show=10) - frequent_itemsets_copy=frequent_itemsets.copy() - frequent_itemsets_copy=frequent_itemsets_copy.sort_values(by="support") - frequent_itemsets_copy.plot(x="itemsets",y="support",kind="bar") + frequent_itemsets_copy = frequent_itemsets.copy() + frequent_itemsets_copy = frequent_itemsets_copy.sort_values(by="support") + frequent_itemsets_copy.plot(x="itemsets", y="support", kind="bar") plt.xticks(rotation=30, fontsize=7) # print(type(rules)) return [tuple(a) for a in frequent_itemsets["itemsets"]] @@ -169,7 +171,7 @@ def summary_pattern(data_path, alpha=0.8, vis=False): step2_origin_data = pd.read_csv(data_path) step2_data = step2_origin_data.dropna() com_ans = get_com_all(step2_data, vis=vis, alpha=alpha) - apr_ans = get_frequent_itemsets(step2_data,vis=vis) + apr_ans = get_frequent_itemsets(step2_data, vis=vis) return list(set(com_ans) & set(apr_ans)) @@ -201,4 +203,7 @@ def list_files(directories, file_name="best_test_acc.csv", alpha=0.8, vis=False) # directories.append(path) # list_files(directories) - print(summary_pattern("/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv",alpha=0.3,vis=False)) + print( + summary_pattern( + "/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv", + alpha=0.3, vis=False)) From 46b5cb4a5109dc299383110a94dd2bc58bb6f8b3 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:41:01 +0800 Subject: [PATCH 160/203] minor --- examples/tuning/get_important_pattern.py | 19 +++++++++++-------- examples/tuning/joint_embedding_jae/main.py | 7 ++++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py index 39fcfb2f..25c1b90c 100644 --- a/examples/tuning/get_important_pattern.py +++ b/examples/tuning/get_important_pattern.py @@ -136,11 +136,14 @@ def list_files(directories, file_name="best_test_acc.csv", alpha=0.8, vis=False) if __name__ == "__main__": - directories = [] - for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - if path.is_dir(): - if str(path.name).startswith("cluster"): - directories.append(path) - list_files(directories) - - # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_scdeepsort/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) + # directories = [] + # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): + # if path.is_dir(): + # if str(path.name).startswith("cluster"): + # directories.append(path) + # list_files(directories) + + print( + summary_pattern( + "/home/zyxing/dance/examples/tuning/cta_actinn/1013-1247-598-732-767-768-770-784-845-864_315-340-376-381-390-404-437-490-551-559/results/pipeline/best_test_acc.csv", + alpha=0.3, vis=True)) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 3ba455d0..a45a9b40 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -21,7 +21,7 @@ parser.add_argument( "-t", "--subtask", default="openproblems_bmmc_cite_phase2", choices=[ "GSE140203_BRAIN_atac2gex", "GSE140203_SKIN_atac2gex", "openproblems_bmmc_cite_phase2", - 
"openproblems_bmmc_multiome_phase2" + "openproblems_bmmc_multiome_phase2", "openproblems_2022_multi_atac2gex" ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") @@ -136,7 +136,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", root_path=file_root_path, required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", - "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], + metric="ARI") # need to delete required_funs and required_indexes if args.tune_mode == "pipeline_params": run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) """To reproduce JAE on other samples, please refer to command lines belows: From 10f82995a7ab0547e705681847614340ab636ac4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:45:22 +0000 Subject: [PATCH 161/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index a45a9b40..0c5d283b 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From f6b39897c0c97deece5c364bd44ed6c4f82c52da Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 10:46:18 +0800 Subject: [PATCH 162/203] add scmvae --- .../joint_embedding_scmvae/main.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py diff --git a/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py b/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py new file mode 100644 index 00000000..9fb85885 --- /dev/null +++ b/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py @@ -0,0 +1,208 @@ +import argparse +import gc +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +import torch.utils.data as data_utils +import wandb +from sklearn import preprocessing + +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.transforms.preprocess import calculate_log_library_size +from dance.utils import set_seed + + +def parameter_setting(): + parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") + + parser.add_argument("--workdir", "-wk", type=str, default="./new_test", help="work path") + parser.add_argument("--outdir", "-od", type=str, default="./new_test", help="Output path") + + parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") + parser.add_argument("--weight_decay", type=float, 
default=1e-6, help="weight decay") + parser.add_argument("--eps", type=float, default=0.01, help="eps") + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") + parser.add_argument('-seed', '--seed', type=int, default=1, help='Random seed for repeat results') + parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") + parser.add_argument("--max_epoch", "-me", type=int, default=25, help="Max epoches") + parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") + parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") + parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, + help="Epoch per test, must smaller than max iteration.") + parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("--final_rate", type=float, default=1e-4) + parser.add_argument("--scale_factor", type=float, default=4) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + return parser + + +if __name__ == "__main__": + parser = parameter_setting() + args = parser.parse_args() + assert args.max_iteration > args.epoch_per_test + device = torch.device(args.device) + args.lr = 0.001 + args.anneal_epoch = 200 + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod1"].obsm["labels"] = labels + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb_config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) 
+
+            # train_size=data.mod["meta1"].shape[0]
+            # test_size=data.mod["mod1"].shape[0]-train_size
+            data.set_split_idx("train", train_idx)
+            data.set_split_idx("test", test_idx)
+            (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch")
+            (x_test, y_test, x_test_raw, y_test_raw), labels = data.get_test_data(return_type="torch")
+            # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels)
+            lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()]))
+            lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()]))
+            lib_mean1 = torch.from_numpy(lib_mean1)
+            lib_var1 = torch.from_numpy(lib_var1)
+            lib_mean2 = torch.from_numpy(lib_mean2)
+            lib_var2 = torch.from_numpy(lib_var2)
+
+            Nfeature1 = x_train.shape[1]
+            Nfeature2 = y_train.shape[1]
+            # train_size = len(data.get_split_idx("train"))
+            # train_size=x_train.shape[0]
+            train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx],
+                                             lib_var2[train_idx], y_train)
+
+            valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx],
+                                             lib_var2[test_idx], y_test)
+
+            total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test]))
+
+            total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False)
+
+            x_test = torch.cat([x_train, x_test])
+            y_test = torch.cat([y_train, y_test])
+            labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))  #There is probably a problem here, most likely caused by the dimensionality reduction
+            model = scMVAE(
+                encoder_1=[Nfeature1, 1024, 128, 128],
+                hidden_1=128,
+                Z_DIMS=22,
+                decoder_share=[22, 128, 256],
+                share_hidden=128,
+                decoder_1=[128, 128, 1024],
+                hidden_2=1024,
+                encoder_l=[Nfeature1, 128],
+                hidden3=128,
+                encoder_2=[Nfeature2, 1024, 128, 128],
+                hidden_4=128,
+                encoder_l1=[Nfeature2, 128],
+                hidden3_1=128,
+                decoder_2=[128, 128, 1024],
+                hidden_5=1024,
+                drop_rate=0.1,
+                log_variational=True,
+                Type="ZINB",
+                device=device,
+                n_centroids=22,
+                penality="GMM",
+                model=1,
+            )
+            model.to(device)
+            model.init_gmm_params(total_loader)
+            model.fit(args, train, valid, args.final_rate, args.scale_factor, device)
+
+            # embeds = model.predict(x_test, y_test).cpu().numpy()
+            score = model.score(x_test, y_test, labels)
+            # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems"))
+            score["ARI"] = score["dance_ari"]
+            del score["dance_ari"]
+            wandb.log(score)
+            wandb.finish()
+        finally:
+            locals_keys = list(locals().keys())
+            for var in locals_keys:
+                try:
+                    exec(f"del {var}")
+                    logger.info(f"Deleted '{var}'")
+                except NameError:
+                    logger.info(f"Variable '{var}' does not exist, continuing...")
+            torch.cuda.empty_cache()
+            gc.collect()
+        # score.update({
+        #     'seed': args.seed + k,
+        #     'subtask': args.subtask,
+        #     'method': 'scmvae',
+        # })
+
+        # if res is not None:
+        #     res = res.append(score, ignore_index=True)
+        # else:
+        #     for s in score:
+        #         score[s] = [score[s]]
+        #     res = pd.DataFrame(score)
+
+    entity, project, sweep_id = pipeline_planer.wandb_sweep_agent(
+        evaluate_pipeline, sweep_id=args.sweep_id, count=args.count)  #Score can be recorded for each epoch
+    save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path)
+    if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params":
get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce scMVAE on other samples, please refer to command lines belows: + +GEX-ADT: +$ python scmvae.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python scmvae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From d0c5c07e6a6f5f4ee59c15323b5febd13bf910b7 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 17 Dec 2024 21:54:12 -0500 Subject: [PATCH 163/203] minor --- examples/tuning/cta_celltypist/main.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index c625065f..c8936b90 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,12 +7,12 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist -from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.pipeline import Pipeline, PipelinePlaner, get_step3_yaml, run_step3, save_summary_data from dance.typing import LogLevel from dance.utils import set_seed @@ -56,12 +56,20 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) + if "run_kwargs" in pipeline_planer.config and tune_mode == "params": + wandb_config = dict(wandb.config) + config = {'pipeline': wandb_config["run_kwargs"], "type": "preprocessor"} + preprocessing_pipeline = Pipeline(config) + else: + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) # Initialize model and get model specific preprocessing pipeline model = Celltypist(majority_voting=args.majority_voting) # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) + # kwargs = {tune_mode: dict(wandb.config)} + # preprocessing_pipeline = pipeline_planer.generate(**kwargs) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, From 577919a0fc7809877bcf6a1778fedb1fee9cfc17 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:56:25 +0000 Subject: [PATCH 164/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_celltypist/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index 089e63a2..58870699 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,8 +7,8 @@ import numpy as np import 
torch - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist From c7169ebf6e769a15736a299394cfd0d4aba5e246 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 11:11:00 +0800 Subject: [PATCH 165/203] minor --- dance/atlas/sc_similarity/anndata_similarity.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dance/atlas/sc_similarity/anndata_similarity.py b/dance/atlas/sc_similarity/anndata_similarity.py index ab44e6e7..88784a9d 100644 --- a/dance/atlas/sc_similarity/anndata_similarity.py +++ b/dance/atlas/sc_similarity/anndata_similarity.py @@ -1,5 +1,4 @@ # anndata_similarity.py -# TODO translate notes import re import warnings from typing import Callable, Dict, List, Optional From fa7be0a73b90a81d995557d6f5a5c60b987e956b Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 11:13:04 +0800 Subject: [PATCH 166/203] minor --- dance/sc_similarity/anndata_similarity.py | 511 ---------------------- 1 file changed, 511 deletions(-) delete mode 100644 dance/sc_similarity/anndata_similarity.py diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py deleted file mode 100644 index 5409fdb6..00000000 --- a/dance/sc_similarity/anndata_similarity.py +++ /dev/null @@ -1,511 +0,0 @@ -# anndata_similarity.py -# TODO translate notes -import re -import warnings -from typing import Callable, Dict, List, Optional - -import anndata -import anndata as ad -import numpy as np -import ot -import pandas as pd -import scanpy as sc -import scipy -import yaml -from omegaconf import OmegaConf -from scipy.linalg import sqrtm -from scipy.spatial import cKDTree -from scipy.spatial.distance import cdist, directed_hausdorff, jaccard, jensenshannon -from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel - -# Suppress scipy warnings for constant input in Pearson correlation -warnings.filterwarnings("ignore", message="An input array is constant") - - -class AnnDataSimilarity: - - def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, - init_random_state: Optional[int] = None, n_runs: int = 10, - ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, - adata2_name: Optional[str] = None, - methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): - """Initialize the AnnDataSimilarity object and perform data preprocessing.""" - self.origin_adata1 = adata1.copy() - self.origin_adata2 = adata2.copy() - self.sample_size = sample_size - self.init_random_state = init_random_state - self.preprocess() - self.results = {} - self.ground_truth_conf_path = ground_truth_conf_path - self.adata1_name = adata1_name - self.adata2_name = adata2_name - self.methods = methods - self.tissue = tissue - self.n_runs = n_runs - - def filter_gene(self, n_top_genes=3000): - sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') - sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') - - common_hvg = self.origin_adata1.var_names[self.origin_adata1.var['highly_variable']].intersection( - self.origin_adata2.var_names[self.origin_adata2.var['highly_variable']]) - - self.origin_adata1 = self.origin_adata1[:, common_hvg].copy() - self.origin_adata2 = self.origin_adata2[:, common_hvg].copy() - self.common_genes = common_hvg - - def preprocess(self): - """Preprocess the data, 
including log normalization and normalization to - probability distribution.""" - self.filter_gene() - - def sample_cells(self, random_state): - """Randomly sample cells from each dataset if sample_size is specified.""" - np.random.seed(random_state) - if self.sample_size is None: - self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think - if self.adata1.n_obs > self.sample_size: - indices1 = np.random.choice(self.adata1.n_obs, size=self.sample_size, replace=False) - self.sampled_adata1 = self.adata1[indices1, :].copy() - else: - self.sampled_adata1 = self.adata1.copy() - if self.adata2.n_obs > self.sample_size: - indices2 = np.random.choice(self.adata2.n_obs, size=self.sample_size, replace=False) - self.sampled_adata2 = self.adata2[indices2, :].copy() - else: - self.sampled_adata2 = self.adata2.copy() - - def normalize_data(self): # I am not sure - """Normalize the data by total counts per cell and log-transform.""" - sc.pp.normalize_total(self.adata1, target_sum=1e4) - sc.pp.log1p(self.adata1) - sc.pp.normalize_total(self.adata2, target_sum=1e4) - sc.pp.log1p(self.adata2) - - def set_prob_data(self, sampled=False): - # Normalize the data to probability distributions - if sampled: - prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) - prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) - else: - prob_adata1 = self.adata1.X / self.adata1.X.sum(axis=1) - prob_adata2 = self.adata2.X / self.adata2.X.sum(axis=1) - # Handle any NaN values resulting from division by zero - self.X = np.nan_to_num(prob_adata1).toarray() - self.Y = np.nan_to_num(prob_adata2).toarray() - - def cosine_sim_sampled(self) -> pd.DataFrame: - """Computes the average cosine similarity between all pairs of cells from the - two datasets.""" - # Compute cosine similarity matrix - sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) - # Return the average similarity - return sim_matrix.mean() - - def pearson_corr_sampled(self) -> pd.DataFrame: - """Computes the average Pearson correlation coefficient between all pairs of - cells from the two datasets.""" - # Compute Pearson correlation matrix - corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), - self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, - self.sampled_adata1.n_obs:] - # Return the average correlation - return np.nanmean(corr_matrix) - - def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: - """Computes the average Jaccard similarity between all pairs of binarized cells - from the two datasets.""" - # Binarize the data - binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) - binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) - # Compute Jaccard distance matrix - distance_matrix = cdist(binary_adata1.A, binary_adata2.A, metric='jaccard') - # Convert to similarity and compute the average - similarity_matrix = 1 - distance_matrix - return similarity_matrix.mean() - - def js_divergence_sampled(self) -> float: - """Computes the average Jensen-Shannon divergence between all pairs of cells - from the two datasets.""" - # Normalize the data to probability distributions - prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) - prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) - # Handle any NaN values resulting from division by zero - prob_adata1 = np.nan_to_num(prob_adata1).toarray() - prob_adata2 = np.nan_to_num(prob_adata2).toarray() - - # Define a function to compute JS divergence for a pair 
of probability vectors - def jsd(p, q): - return jensenshannon(p, q) - - # Compute JS divergence matrix - jsd_vectorized = np.vectorize(jsd, signature='(n),(n)->()') - divergence_matrix = np.zeros((prob_adata1.shape[0], prob_adata2.shape[0])) - for i in range(prob_adata1.shape[0]): - divergence_matrix[i, :] = jsd_vectorized( - np.repeat(prob_adata1[i, :], prob_adata2.shape[0], axis=0).reshape(-1, prob_adata1.shape[1]), - prob_adata2) - - # Convert divergence to similarity and compute the average - similarity_matrix = 1 - divergence_matrix - return np.nanmean(similarity_matrix) - - def compute_mmd(self) -> float: - X = self.X - Y = self.Y - kernel = "rbf" - gamma = 1.0 - if kernel == 'rbf': - K_X = np.exp(-gamma * cdist(X, X, 'sqeuclidean')) - K_Y = np.exp(-gamma * cdist(Y, Y, 'sqeuclidean')) - K_XY = np.exp(-gamma * cdist(X, Y, 'sqeuclidean')) - elif kernel == 'linear': - K_X = np.dot(X, X.T) - K_Y = np.dot(Y, Y.T) - K_XY = np.dot(X, Y.T) - else: - raise ValueError("Unsupported kernel type") - - m = X.shape[0] - n = Y.shape[0] - - sum_X = (np.sum(K_X) - np.sum(np.diag(K_X))) / (m * (m - 1)) - sum_Y = (np.sum(K_Y) - np.sum(np.diag(K_Y))) / (n * (n - 1)) - sum_XY = np.sum(K_XY) / (m * n) - - mmd_squared = sum_X + sum_Y - 2 * sum_XY - mmd = np.sqrt(max(mmd_squared, 0)) - return 1 / (1 + mmd) - - def common_genes_num(self): - return len(self.common_genes) - - def otdd(self): - """Compute the OTDD between two data sets.""" - raise NotImplementedError("OTDD!") - - def data_company(self): - raise NotImplementedError("data company") - - def wasserstein_dist(self) -> float: - """Computes the average Wasserstein distance between all pairs of cells from the - two datasets.""" - X = self.X - Y = self.Y - a = np.ones((X.shape[0], )) / X.shape[0] - b = np.ones((Y.shape[0], )) / Y.shape[0] - M = ot.dist(X, Y, metric='euclidean') - wasserstein_dist = ot.emd2(a, b, M) - return 1 / 1 + wasserstein_dist - - def get_Hausdorff(self): - X = self.X - Y = self.Y - forward = directed_hausdorff(X, Y)[0] - backward = directed_hausdorff(X, Y)[0] - hausdorff_distance = max(forward, backward) - normalized_hausdorff = hausdorff_distance / np.sqrt(X.shape[1]) - similarity = 1 - normalized_hausdorff - return similarity - - def chamfer_distance(self): - X = self.X - Y = self.Y - tree_A = cKDTree(X) - tree_B = cKDTree(Y) - - distances_A_to_B, _ = tree_A.query(Y) - distances_B_to_A, _ = tree_B.query(X) - - chamfer_A_to_B = np.mean(distances_A_to_B) - chamfer_B_to_A = np.mean(distances_B_to_A) - distance = chamfer_A_to_B + chamfer_B_to_A - normalized_chamfer = distance / np.sqrt(X.shape[1]) - similarity = 1 - normalized_chamfer - return similarity - - def energy_distance_metric(self): - X = self.X - Y = self.Y - XX = cdist(X, X, 'euclidean') - YY = cdist(Y, Y, 'euclidean') - XY = cdist(X, Y, 'euclidean') - distance = 2 * np.mean(XY) - np.mean(XX) - np.mean(YY) - return 1 / (1 + distance) - - def get_sinkhorn2(self): - X = self.X - Y = self.Y - a = np.ones(X.shape[0]) / X.shape[0] - b = np.ones(Y.shape[0]) / Y.shape[0] - M = ot.dist(X, Y, metric='euclidean') - reg = 0.1 - sinkhorn_dist = ot.sinkhorn2(a, b, M, reg) - return 1 / (1 + sinkhorn_dist) - - def bures_distance(self): - X = self.X - Y = self.Y - C1 = np.cov(X, rowvar=False) - C2 = np.cov(Y, rowvar=False) - sqrt_C1 = sqrtm(C1) - product = sqrt_C1 @ C2 @ sqrt_C1 - sqrt_product = sqrtm(product) - trace = np.trace(C1) + np.trace(C2) - 2 * np.trace(sqrt_product) - return 1 / (1 + np.sqrt(max(trace, 0))) - - def spectral_distance(self): - X = self.X - Y = self.Y - C1 = 
np.cov(X, rowvar=False) - C2 = np.cov(Y, rowvar=False) - eig_A = np.linalg.eigvalsh(C1) - eig_B = np.linalg.eigvalsh(C2) - return 1 / (1 + np.linalg.norm(eig_A - eig_B)) - - def get_dataset_meta_sim(self): - # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] - con_cols = [ - "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", - "n_counts_var", "var_n_counts_mean", "var_n_counts_var" - ] - dis_cols = ['assay', 'tissue'] - - def get_discrete_sim(col_list1, col_list2): - set1 = set(col_list1) - set2 = set(col_list2) - intersection = len(set1.intersection(set2)) - union = len(set1.union(set2)) - return intersection / union - - def get_con_sim(con_data_1, con_data_2): - return abs(con_data_1 - con_data_2) / max(con_data_1, con_data_2) - - def get_dataset_info(data: ad.AnnData): - con_sim = {} - con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) #sample 10000之后这里是应该更新的 - con_sim["nnz_var"] = np.var(data.obs["nnz"]) - nnz_values = data.X[data.X.nonzero()] - con_sim["nnz_counts_mean"] = np.mean(nnz_values) - con_sim["nnz_counts_var"] = np.var(nnz_values) - con_sim["n_measured_vars"] = np.mean(data.obs["n_measured_vars"]) - con_sim["cell_num"] = len(data.obs) - con_sim["gene_num"] = len(data.var) - con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) - con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) - if "n_counts" not in data.var.columns: - if scipy.sparse.issparse(data.X): - gene_counts = np.array(data.X.sum(axis=0)).flatten() - else: - gene_counts = data.X.sum(axis=0) - data.var["n_counts"] = gene_counts - data.var["n_counts"] = data.var["n_counts"].astype(float) - con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) - con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) - data.uns["con_sim"] = con_sim - return data - - data_1 = self.adata1.copy() - data_2 = self.adata2.copy() - data_1 = get_dataset_info(data_1) - data_2 = get_dataset_info(data_2) - ans = {} - obs_1 = data_1.obs - obs_2 = data_2.obs - con_sim_1 = data_1.uns["con_sim"] - con_sim_2 = data_2.uns["con_sim"] - for dis_col in dis_cols: - ans[f"{dis_col}_sim"] = get_discrete_sim(obs_1[dis_col].values, obs_2[dis_col].values) - for con_col in con_cols: - ans[f"{con_col}_sim"] = get_con_sim(con_sim_1[con_col], con_sim_2[con_col]) - return np.mean(list(ans.values())) - - def get_ground_truth(self): - assert self.ground_truth_conf_path is not None - assert self.adata1_name is not None - assert self.adata2_name is not None - ground_truth_conf = pd.read_excel(self.ground_truth_conf_path, sheet_name=self.tissue, index_col=0) - - def get_targets(dataset_truth: str): - dataset_truth = OmegaConf.create(fix_yaml_string(dataset_truth)) - targets = [] - for item in dataset_truth: - targets.append(item["target"]) - return targets - - sim_targets = [] - for method in self.methods: - query_dataset_truth = ground_truth_conf.loc[self.adata1_name, f"{method}_method"] - atlas_dataset_truth = ground_truth_conf.loc[self.adata2_name, f"{method}_method"] - query_targets = get_targets(query_dataset_truth) - atlas_targets = get_targets(atlas_dataset_truth) - assert len(query_targets) == len(atlas_targets) - sim_targets.append((sum(a == b for a, b in zip(query_targets, atlas_targets)), len(query_targets))) - sim_targets.append((sum(x for x, y in sim_targets), sum(y for x, y in sim_targets))) - return sim_targets - - def compute_similarity( - self, random_state: int, methods: 
List[str] = [ - 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" - ] - ) -> Dict[str, float]: - """Computes the specified similarity measure. Parameters: - - methods: List of similarity measures to be computed. Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' - Returns: - Dictionary containing the similarity matrices - - """ - self.adata1 = self.origin_adata1.copy() - self.adata2 = self.origin_adata2.copy() - self.normalize_data() - self.sample_cells(random_state) - self.set_prob_data() - - results = {} - for method in methods: - print(method) - if method == 'cosine': - results['cosine'] = self.cosine_sim_sampled() - elif method == 'pearson': - results['pearson'] = self.pearson_corr_sampled() - elif method == 'jaccard': - results['jaccard'] = self.jaccard_sim_sampled() - elif method == 'js_distance': - results['js_distance'] = self.js_divergence_sampled() - elif method == 'wasserstein': - results['wasserstein'] = self.wasserstein_dist() - elif method == "common_genes_num": - results["common_genes_num"] = self.common_genes_num() - elif method == "Hausdorff": - results["Hausdorff"] = self.get_Hausdorff() - elif method == "chamfer": - results["chamfer"] = self.chamfer_distance() - elif method == "energy": - results["energy"] = self.energy_distance_metric() - elif method == "sinkhorn2": - results["sinkhorn2"] = self.get_sinkhorn2() - elif method == "bures": - results["bures"] = self.bures_distance() - elif method == "spectral": - results["spectral"] = self.spectral_distance() - elif method == "otdd": - results['otdd'] = self.otdd() - elif method == "ground_truth": - results["ground_truth"] = self.get_ground_truth() - elif method == "metadata_sim": - results["metadata_sim"] = self.get_dataset_meta_sim() - elif method == "mmd": - results["mmd"] = self.compute_mmd() - else: - raise ValueError(f"Unsupported similarity method: {method}") - return results - - def get_similarity_matrix_A2B( - self, methods: List[str] = [ - "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", - "ground_truth", "metadata_sim", "mmd" - ] - ) -> Dict[str, float]: - """Same as compute_similarity, keeping method name consistency.""" - cumulative_results = {method: 0.0 for method in methods} - - for run in range(self.n_runs): - # Update random state for each run - if self.init_random_state is not None: - current_random_state = self.init_random_state + run - else: - current_random_state = None - run_results = self.compute_similarity(methods=methods, random_state=current_random_state) - for method in methods: - if method in ["ground_truth"]: - cumulative_results[method] = run_results[method] - else: - cumulative_results[method] += run_results[method] - # Average the results over the number of runs - averaged_results = { - method: - cumulative_results[method] if method in ["ground_truth"] else cumulative_results[method] / self.n_runs - for method in methods - } - return averaged_results - - # def get_max_similarity_A_to_B(self): - # if self.results is None: - # raise ValueError(f"need results!") - # else: - # self.results_score = {} - # for key in self.results: - # if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: - # self.results_score[key] = self._get_max_similarity(self.results[key]) - # else: - # self.results_score[key] = self.results[key] - # return self.results_score - - # def _get_max_similarity(self, similarity_matrix: pd.DataFrame): - # """Maximum matching average 
similarity score.""" - # matched_values = [ - # similarity_matrix.loc[label, - # label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() - # for label in similarity_matrix.index - # ] # need to ask - # overall_similarity = np.mean(matched_values) - # return overall_similarity - - -def extract_type_target_params(item_text): - lines = item_text.strip().split('\n') - item_dict = {} - params_dict = {} - current_param_key = None - in_params = False - for line in lines: - stripped_line = line.strip() - if stripped_line.startswith('- type:'): - item_dict['type'] = stripped_line.split(':', 1)[1].strip() - elif stripped_line.startswith('target:'): - item_dict['target'] = stripped_line.split(':', 1)[1].strip() - elif stripped_line.startswith('params:'): - params_content = stripped_line.split(':', 1)[1].strip() - if params_content == '{}': - params_dict = {} - in_params = False - else: - params_dict = {} - in_params = True - elif in_params: - if re.match(r'^\w+:$', stripped_line): - current_param_key = stripped_line[:-1].strip() - params_dict[current_param_key] = {} - elif re.match(r'^- ', stripped_line): - list_item = stripped_line[2:].strip() - if current_param_key: - if not isinstance(params_dict[current_param_key], list): - params_dict[current_param_key] = [] - params_dict[current_param_key].append(list_item) - elif ':' in stripped_line: - key, value = map(str.strip, stripped_line.split(':', 1)) - if current_param_key and isinstance(params_dict.get(current_param_key, None), dict): - params_dict[current_param_key][key] = yaml.safe_load(value) - else: - params_dict[key] = yaml.safe_load(value) - item_dict['params'] = params_dict - return item_dict - - -def fix_yaml_string(original_str): - #It will be deleted - yaml_str = original_str.replace('\\n', '\n').strip() - items = re.split(r'(?=-\s*type:)', yaml_str) - config_list = [] - for item in items: - if not item.strip(): - continue - if not item.strip().startswith('- type:'): - print(item) - print("警告: 某个项未以 '- type:' 开头,跳过此项.") - continue - item_dict = extract_type_target_params(item) - config_list.append(item_dict) - fixed_yaml = yaml.dump(config_list, sort_keys=False) - return fixed_yaml From a66bf43829b35b4d63386ff1389744d372c5ae86 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 11:31:10 +0800 Subject: [PATCH 167/203] add note --- dance/transforms/filter.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 8ec0ac1f..ef703d0d 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -274,6 +274,20 @@ def __init__( @register_preprocessor("filter", "cell") @add_mod_and_transform class FilterCellsCommonMod(BaseTransform): + """Initialize the FilterCellsCommonMod class. + + Parameters + ---------- + mod1 : str + Name of the first modality in the single-cell dataset. + mod2 : str + Name of the second modality in the single-cell dataset. + sol : Optional[str], default=None + Name of the optional solution dataset containing cell labels or annotations. + **kwargs : dict + Additional keyword arguments passed to the base transformation class. 
+ + """ def __init__(self, mod1: str, mod2: str, sol: Optional[str] = None, **kwargs): super().__init__(**kwargs) From 78493331a45724c09ece9fab37eb0705ed67f176 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 11:49:24 +0800 Subject: [PATCH 168/203] minor --- dance/utils/wrappers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 28ffefe8..74336b9c 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -107,6 +107,16 @@ def new_init(self, *args, **kwargs): @functools.wraps(original_call) def new_call(self, data: Data, *args, **kwargs): + """ + Parameters + ---------- + data : Data + The input data object containing the `mudata` with multiple modalities. + Returns + ------- + Any + The result of the original_call method. + """ if hasattr(self, 'mod') and self.mod is not None: md_data = data.data ad_data = Data(data=transform_mod_to_anndata(md_data, self.mod)) From 1b8dd4769a7778cf0a7f4e2450e57ee7dcdb4b6a Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 11:51:33 +0800 Subject: [PATCH 169/203] minor --- dance/utils/wrappers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 74336b9c..a7f308ee 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -95,6 +95,8 @@ def wrapped_func(*args): def add_mod_and_transform(cls): + """A decorator that modifies a class to add functionality for working with specific + modalities (`mod`) in a `mudata` object.""" original_init = cls.__init__ original_call = cls.__call__ cls.add_mod_and_transform = "add_mod_and_transform" From e420a6d049014762be0e8fb87846ab053c7ec523 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 17:00:36 +0800 Subject: [PATCH 170/203] translate notes --- examples/get_result_web.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index 891c98d9..c4d6a0d8 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -16,22 +16,24 @@ def check_identical_strings(string_list): if not string_list: - raise ValueError("列表为空") + raise ValueError("The list is empty") arr = np.array(string_list) if not np.all(arr == arr[0]): - raise ValueError("发现不同的字符串") + raise ValueError("Different strings found") return string_list[0] # if not string_list: - # raise ValueError("列表为空") + # raise ValueError("The list is empty") # first_string = string_list[0] # for s in string_list[1:]: # if s != first_string: - # raise ValueError(f"发现不同的字符串: '{first_string}' 和 '{s}'") + # raise ValueError(f"Different strings found: '{first_string}' and '{s}'") # return first_string + + def get_sweep_url(step_csv: pd.DataFrame, single=True): ids = step_csv["id"] sweep_urls = [] @@ -57,7 +59,7 @@ def spilt_web(url: str): if match: entity = match.group(1) project = match.group(2) - pattern = r'/sweeps/([^/?]+)' # 正则表达式模式 + pattern = r'/sweeps/([^/?]+)' # Regular expression pattern match = re.search(pattern, url) if match: sweep_id = match.group(1) @@ -161,8 +163,8 @@ def write_ans(tissue): step3_urls = [] for i in range(3): file_csv = f"{file_path}/results/params/{i}_best_test_acc.csv" - if not os.path.exists(file_csv): #no parameter - print(f"文件 {file_csv} 不存在,跳过。") + if not os.path.exists(file_csv): # no parameter + print(f"File {file_csv} does not exist, skipping.") continue step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) step3_str = ",".join(step3_urls) From 
67fb57b97f0a8bde492c22d2fa27043446087938 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:02:17 +0000 Subject: [PATCH 171/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/get_result_web.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index c4d6a0d8..c69e9d4e 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -24,7 +24,6 @@ def check_identical_strings(string_list): return string_list[0] - # if not string_list: # raise ValueError("The list is empty") # first_string = string_list[0] From d4529734e25a5b8c1db01012a8642412728cd2d4 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 20:07:17 +0800 Subject: [PATCH 172/203] add argument preprocess --- examples/multi_modality/joint_embedding/jae.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index 0b1d79bb..7504f2e7 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -25,7 +25,7 @@ parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") parser.add_argument("--span", type=float, default=0.3) - + parser.add_argument("--preprocess", default="aux") args = parser.parse_args() device = args.device @@ -34,7 +34,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess, normalize=True, span=args.span) data = dataset.load_data() @@ -44,7 +44,7 @@ feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) - if True: + if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) From 631ce663eefa9478cb1c4d3fb906e56dbcd50ec1 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 20:10:29 +0800 Subject: [PATCH 173/203] add argument preprocess --- examples/multi_modality/joint_embedding/scmogcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 44e2a748..51e556c2 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -28,7 +28,7 @@ parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") parser.add_argument("--span", type=float, default=0.3) - + parser.add_argument("--preprocess", default="aux") args = parser.parse_args() device = args.device @@ -37,7 +37,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess, normalize=True, span=args.span) data = dataset.load_data() train_size = 
len(data.get_split_idx("train")) @@ -50,7 +50,7 @@ feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) - if True: + if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) From 0f3c2686f7d4a05c8b52f0a058c3689f048cce31 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 21:15:09 +0800 Subject: [PATCH 174/203] translate notes --- .../result_analysis/get_important_pattern.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py index 657fe9b0..e44a871c 100644 --- a/examples/result_analysis/get_important_pattern.py +++ b/examples/result_analysis/get_important_pattern.py @@ -8,6 +8,7 @@ from itertools import combinations from os import X_OK from pathlib import Path +from venv import logger import matplotlib.pyplot as plt import numpy as np @@ -28,9 +29,9 @@ from typing_extensions import deprecated -#TODO need to sync all files or get sweep,not file +#use get_important_pattern_sweep.py #asceding need to think -#负向的pattern,换一下顺序就可以吧 +#Negative pattern, just need to change the order def get_important_pattern(test_accs, ascending, vis=True, alpha=0.05, title=""): if vis: @@ -78,10 +79,6 @@ def change_real_rank(rank_item, real_rank): return [] -def replace_nan_in_2d(lst): #nan应该是个极差的值而不是直接删掉 - return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] - - def are_all_elements_same_direct(list_2d): first_element = None for sublist in list_2d: @@ -130,6 +127,10 @@ def get_significant_top_n_zscore(data, n=3, threshold=1.0, ascending=False): def get_test_acc_and_names(step2_data, metric_name): + + def replace_nan_in_2d(lst): #nan should be an extreme value rather than being directly deleted + return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) test_accs = [] test_acc_names = [] @@ -154,7 +155,7 @@ def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): ans_all = [] test_accs, test_acc_names = get_test_acc_and_names(step2_data, metric_name) final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title="all_pattern", vis=vis) - if len(final_ranks) > 0: #TODO maybe need to think ascending + if len(final_ranks) > 0: max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) if max_rank_count < len(final_ranks) / 2: @@ -179,7 +180,7 @@ def get_forest_model_pattern(step2_data, metric_name): X = step2_data.loc[:, columns] y = step2_data.loc[:, metric_name] preprocessor = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'), - columns) # drop='first'防止虚拟变量陷阱 + columns) # drop='first' to prevent dummy variable trap ]) pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', @@ -201,23 +202,26 @@ def get_forest_model_pattern(step2_data, metric_name): scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, - refit=True # 确保在所有数据上重新训练最佳模型 + refit=True # Ensure the best model is retrained on all data ) grid_search.fit(X, y) best_pipeline = grid_search.best_estimator_ model = best_pipeline.named_steps['regressor'] - X_preprocessed = best_pipeline.named_steps['preprocessor'].transform( - X) #TODO 
best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)是否和X_preprocessed一定是对应的? - explainer = shapiq.TreeExplainer(model=model, index="k-SII", max_order=3) #思考为什么没有负值,因为是绝对值相加,可能是为了正负值不会相互抵消 + X_preprocessed = best_pipeline.named_steps['preprocessor'].transform(X) + feature_names = best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns) + logger.info(f"X_preprocessed.columns={X_preprocessed.columns}") + logger.info(f"feature_names={feature_names}") + explainer = shapiq.TreeExplainer( + model=model, index="k-SII", max_order=3 + ) # Consider why there are no negative values, possibly to prevent cancellation of positive and negative values list_of_interaction_values = explainer.explain_X(X_preprocessed.toarray(), n_jobs=96, random_state=42) plt.cla() - ax = shapiq.plot.bar_plot(list_of_interaction_values, - feature_names=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns), - max_display=None, show=False, need_abbreviate=False) + ax = shapiq.plot.bar_plot(list_of_interaction_values, feature_names=feature_names, max_display=None, show=False, + need_abbreviate=False) ax.yaxis.get_major_locator().MAXTICKS = 1000000 plt.show() rects = ax.containers[0] - yticklabels = ax.get_yticklabels() #label和rect是否重合需要验证 + yticklabels = ax.get_yticklabels() # Need to verify if labels and rectangles overlap shap_ans = {} for rect, label in zip(rects, yticklabels): xy = rect.get_xy() @@ -229,7 +233,7 @@ def get_forest_model_pattern(step2_data, metric_name): raise RuntimeError("Features should not be repeated") shap_ans[k] = v - ans = get_significant_items(shap_ans) #检查一下是不是真的pattern,好像结果不太好,再检验一下 + ans = get_significant_items(shap_ans) # Check if it's really a pattern, the results seem not good, need to verify preprocessed_df = pd.DataFrame(X_preprocessed.toarray(), index=X.index, columns=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)) preprocessed_df[metric_name] = step2_data[metric_name] From 53551304dcc4d2b0252d2f862de9e9f17b4b9ea3 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 21:34:47 +0800 Subject: [PATCH 175/203] translate notes --- examples/result_analysis/get_important_pattern_sweep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 896a0d0a..9fe72ada 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -50,7 +50,7 @@ def get_additional_sweep(sweep_id): #last run command run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None) additional_sweep_ids = [sweep_id] - if run is None: #check summary data num,note aznph5wt,数量可能不一致。 + if run is None: # check summary data count, note aznph5wt, quantities may be inconsistent return additional_sweep_ids run_id = run.id web_abs = requests.get(f"https://api.wandb.ai/files/{run.entity}/{run.project}/{run_id}/wandb-metadata.json") @@ -73,7 +73,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(min_metric): return { "error": - f"All {metric_name} values ​​are NaN and the minimum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the minimum cannot be calculated. Please check your data." 
} step2_data[metric_name] = step2_data[metric_name].fillna(0) #if ascending=False else: @@ -81,7 +81,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(max_metric): return { "error": - f"All {metric_name} values ​​are NaN and the maximum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the maximum cannot be calculated. Please check your data." } print(f"\nmax {metric_name}:{max_metric}") buffer_percentage = 0.2 # 20% From 7c8c6575110847a30121e0df68cd14b43c81b631 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 17:14:16 +0800 Subject: [PATCH 176/203] add atlas --- dance/atlas/data_dropbox_upload.py | 101 ++++++++++++++++++ .../pipeline_params_tuning_config.yaml | 73 +++++++++++++ examples/atlas/config/commands.yaml | 2 + examples/atlas/config/run_config.yaml | 8 ++ examples/atlas/setup_run.py | 71 ++++++++++++ examples/atlas/upload_data.py | 69 ++++++++++++ .../get_important_pattern_sweep.py | 6 +- 7 files changed, 327 insertions(+), 3 deletions(-) create mode 100644 dance/atlas/data_dropbox_upload.py create mode 100644 examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml create mode 100644 examples/atlas/config/commands.yaml create mode 100644 examples/atlas/config/run_config.yaml create mode 100644 examples/atlas/setup_run.py create mode 100644 examples/atlas/upload_data.py diff --git a/dance/atlas/data_dropbox_upload.py b/dance/atlas/data_dropbox_upload.py new file mode 100644 index 00000000..718e07c9 --- /dev/null +++ b/dance/atlas/data_dropbox_upload.py @@ -0,0 +1,101 @@ +import json +import os +import pathlib + +import dropbox +import numpy as np +import pandas as pd +import scanpy as sc +from dropbox.exceptions import ApiError, AuthError + +from dance.utils import logger + + +def upload_file_to_dropbox(dropbox_path, access_token, local_path): + dbx = dropbox.Dropbox(access_token) + + # Verify access token + try: + dbx.users_get_current_account() + except AuthError as err: + print("ERROR: Invalid access token; please check your access token.") + return None + try: + file_upload(dbx=dbx, local_path=local_path, remote_path=dropbox_path) + print("Upload successful.") + except ApiError as err: + print(f"API error: {err}") + return None + + +def file_upload(dbx: dropbox.Dropbox, local_path: pathlib.Path, remote_path: str): + CHUNKSIZE = 100 * 1024 * 1024 + upload_session_start_result = dbx.files_upload_session_start(b'') + cursor = dropbox.files.UploadSessionCursor(session_id=upload_session_start_result.session_id, offset=0) + with local_path.open("rb") as f: + while True: + data = f.read(CHUNKSIZE) + if data == b"": + break + logger.debug("Pushing %d bytes", len(data)) + dbx.files_upload_session_append_v2(data, cursor) + cursor.offset += len(data) + commit = dropbox.files.CommitInfo(path=remote_path) + dbx.files_upload_session_finish(b'', cursor, commit) + + +def create_shared_link(dbx, dropbox_path): + """Create or get existing shared link. 
+ + :param dbx: Dropbox object + :param dropbox_path: File path on Dropbox + :return: Shared link URL + + """ + try: + links = dbx.sharing_list_shared_links(path=dropbox_path, direct_only=True).links + if links: + # If shared link already exists, return the first one + return links[0].url + else: + # Create a new shared link + link = dbx.sharing_create_shared_link_with_settings(dropbox_path) + return link.url + except ApiError as err: + print(f"Error creating shared link: {err}") + return None + + +def get_link(data_fname, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): + DROPBOX_DEST_PATH = DROPBOX_DEST_PATH + "/" + data_fname + + upload_file_to_dropbox(dropbox_path=DROPBOX_DEST_PATH, access_token=ACCESS_TOKEN, local_path=local_path) + + # Create Dropbox object to get shared link + dbx = dropbox.Dropbox(ACCESS_TOKEN) + # Get shared link + shared_link = create_shared_link(dbx, DROPBOX_DEST_PATH) + if shared_link: + # Dropbox shared link defaults to `dl=0` at the end, which means preview in browser. + # change it to `dl=1`. + download_link = shared_link.replace('&dl=0', '&dl=1') + print(f"Download link: {download_link}") + return download_link + else: + print("Unable to get shared link.") + + +def get_ans(data: sc.AnnData, tissue: str, dataset_id: str, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): + # keys=["species","tissue","dataset","split","celltype_fname","celltype_url","data_fname","data_url"] + ans = {} + ans["species"] = "human" + ans["tissue"] = tissue.capitalize() + ans["dataset"] = data.n_obs + ans["split"] = "train" + ans["celltype_fname"] = "" + ans["celltype_url"] = "" + ans["data_fname"] = f"train_human_{tissue.capitalize()}{dataset_id}_data.h5ad" + ans["data_url"] = get_link(data_fname=ans["data_fname"].split("_", 1)[1], local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH) + ans["is_ALL_Integer"] = np.all(np.equal(data.X.data, data.X.data.astype(int))) + return ans diff --git a/examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml b/examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml new file mode 100644 index 00000000..fb607022 --- /dev/null +++ b/examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml @@ -0,0 +1,73 @@ +type: preprocessor +tune_mode: pipeline_params +pipeline_tuning_top_k: 3 +parameter_tuning_freq_n: 20 +pipeline: + - type: filter.gene + include: + - FilterGenesPercentile + - FilterGenesScanpyOrder + - FilterGenesPlaceHolder + default_params: + FilterGenesScanpyOrder: + order: ["min_counts", "min_cells", "max_counts", "max_cells"] + min_counts: 0.01 + max_counts: 0.99 + min_cells: 0.01 + max_cells: 0.99 + - type: normalize + include: + - ScaleFeature + - ScTransform + - Log1P + - NormalizeTotal + - NormalizePlaceHolder + default_params: + ScTransform: + processes_num: 8 + - type: filter.gene + include: + - HighlyVariableGenesLogarithmizedByMeanAndDisp + - HighlyVariableGenesRawCount + - HighlyVariableGenesLogarithmizedByTopGenes + - FilterGenesTopK + - FilterGenesRegression + - FilterGenesNumberPlaceHolder + default_params: + FilterGenesTopK: + num_genes: 3000 + FilterGenesRegression: + num_genes: 3000 + HighlyVariableGenesRawCount: + n_top_genes: 3000 + HighlyVariableGenesLogarithmizedByTopGenes: + n_top_genes: 3000 + - type: feature.cell + include: + - WeightedFeaturePCA + - WeightedFeatureSVD + - CellPCA + - CellSVD + - GaussRandProjFeature # Registered custom preprocessing func + - FeatureCellPlaceHolder + params: + 
out: feature.cell + log_level: INFO + default_params: + WeightedFeaturePCA: + split_name: train + WeightedFeatureSVD: + split_name: train + - type: misc + target: SetConfig + params: + config_dict: + feature_channel: feature.cell + label_channel: cell_type +wandb: + entity: xzy11632 + project: dance-dev + method: grid #try grid to provide a comprehensive search + metric: + name: acc # val/acc + goal: maximize diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml new file mode 100644 index 00000000..fc68d057 --- /dev/null +++ b/examples/atlas/config/commands.yaml @@ -0,0 +1,2 @@ +cta_actinn: + command:"python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" diff --git a/examples/atlas/config/run_config.yaml b/examples/atlas/config/run_config.yaml new file mode 100644 index 00000000..c88f3378 --- /dev/null +++ b/examples/atlas/config/run_config.yaml @@ -0,0 +1,8 @@ +runs: + - algorithm_name: cta_actinn + dataset_id: "0bc7235a-ae5a-479d-a487-510435377e55" + species: human + tissue: Brain + filetype: h5ad + count: 800 + device: cuda:0 diff --git a/examples/atlas/setup_run.py b/examples/atlas/setup_run.py new file mode 100644 index 00000000..d29d9e37 --- /dev/null +++ b/examples/atlas/setup_run.py @@ -0,0 +1,71 @@ +import argparse +import os +import shutil +import sys + +import yaml + +from dance.settings import DANCEDIR + + +def load_commands(config_path): + with open(config_path, encoding='utf-8') as f: + return yaml.safe_load(f) + + +def load_run_configs(run_config_path): + with open(run_config_path, encoding='utf-8') as f: + return yaml.safe_load(f) + + +def main(): + parser = argparse.ArgumentParser(description='Setup run parameters') + parser.add_argument('--config', type=str, default="config/run_config.yaml", help='Run configuration YAML file') + + args = parser.parse_args() + + run_configs = load_run_configs(args.config) + + commands_config = load_commands("commands.yaml") + + for run in run_configs.get("runs", []): + algorithm_name = run.get('algorithm_name') + dataset_id = run.get('dataset_id') + species = run.get('species') + tissue = run.get('tissue') + filetype = run.get('filetype') + count = run.get('count') + device = run.get('device') + + # Define paths + template_path = os.path.join("config/atlas_template_yamls", + f"{algorithm_name}/pipeline_params_tuning_config.yaml") + config_dir = f"{DANCEDIR}/examples/tuning/{algorithm_name}/{dataset_id}" + os.makedirs(config_dir, exist_ok=True) + config_filename = f"pipeline_params_tuning_config.yaml" + config_path = os.path.join(config_dir, config_filename) + + # Copy configuration file + shutil.copy(template_path, config_path) + print(f"Template copied to {config_path}") + + if algorithm_name not in commands_config.get("algorithms", {}): + print(f"Error: Command not found for algorithm '{algorithm_name}'. 
Please check commands.yaml file.") + continue + + command_template = commands_config["algorithms"][algorithm_name]["command"] + run_command = command_template.format(dataset_id=dataset_id, species=species, tissue=tissue, filetype=filetype, + count=count, device=device) + + # Append the run command to run.sh + run_sh_path = f"{DANCEDIR}/examples/tuning/{algorithm_name}/run.sh" + with open(run_sh_path, "a", encoding='utf-8') as run_script: + run_script.write(f"{run_command}\n") + + print(f"Run command appended to {run_sh_path}: {run_command}") + + print("All run configurations have been processed.") + + +if __name__ == "__main__": + main() diff --git a/examples/atlas/upload_data.py b/examples/atlas/upload_data.py new file mode 100644 index 00000000..29b45a18 --- /dev/null +++ b/examples/atlas/upload_data.py @@ -0,0 +1,69 @@ +import argparse +import json +import pathlib + +import pandas as pd +import scanpy as sc + +from dance.atlas.data_dropbox_upload import get_ans, get_data + +if __name__ == "__main__": + args = argparse.ArgumentParser() + args.add_argument("--maindir", type=str) + args.add_argument("--filedir", type=str) + args.add_argument("--tissues", type=str, nargs="+") + args.add_argument("--access_token", type=str) + args.add_argument("--dropbox_dest_path", type=str, + default="/preprocessing_benchmarking/cell_type_annotation/TEMP_Tran_5_Datasets/human") + args = args.parse_args() + MAINDIR = pathlib.Path(args.maindir) + FILEDIR = pathlib.Path(args.filedir) + tissues = args.tissues + # tissues=["kidney","lung","pancreas"] + # Configuration parameters + ACCESS_TOKEN = args.access_token + DROPBOX_DEST_PATH = args.dropbox_dest_path # Destination path on Dropbox + + def get_data(dataset_id, in_atlas=False, large=False): + if large: + if in_atlas: + local_path = MAINDIR / f"sampled-10000/{tissue}/{dataset_id}.h5ad" + else: + local_path = FILEDIR / f"sampled-10000/{tissue}/{dataset_id}.h5ad" + else: + local_path = MAINDIR / f"{tissue}/{dataset_id}.h5ad" + data = sc.read_h5ad(local_path) + return data, local_path + + ans_all = [] + + with open(FILEDIR / "results/atlas_result.json") as f: + result = json.load(f) + with open(FILEDIR / "results/query_result.json") as f: + query_result = json.load(f) + for tissue in tissues: + large_dataset_ids = result[tissue][0] + small_dataset_ids = result[tissue][1] + for large_dataset_id in large_dataset_ids: + data, local_path = get_data(dataset_id=large_dataset_id, in_atlas=True, large=True) + ans_all.append( + get_ans(dataset_id=large_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + for small_dataset_id in small_dataset_ids: + data, local_path = get_data(dataset_id=small_dataset_id, in_atlas=True, large=False) + ans_all.append( + get_ans(dataset_id=small_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + large_query_dataset_ids = query_result[tissue][0] + small_query_dataset_ids = query_result[tissue][1] + for large_query_dataset_id in large_query_dataset_ids: + data, local_path = get_data(dataset_id=large_query_dataset_id, in_atlas=False, large=True) + ans_all.append( + get_ans(dataset_id=large_query_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + for small_query_dataset_id in small_query_dataset_ids: + data, local_path = get_data(dataset_id=small_query_dataset_id, in_atlas=False, large=False) + ans_all.append( + 
get_ans(dataset_id=small_query_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + pd.DataFrame(ans_all).set_index("species").to_csv(",".join(tissues) + "scdeeepsort.csv") diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 896a0d0a..9fe72ada 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -50,7 +50,7 @@ def get_additional_sweep(sweep_id): #last run command run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None) additional_sweep_ids = [sweep_id] - if run is None: #check summary data num,note aznph5wt,数量可能不一致。 + if run is None: # check summary data count, note aznph5wt, quantities may be inconsistent return additional_sweep_ids run_id = run.id web_abs = requests.get(f"https://api.wandb.ai/files/{run.entity}/{run.project}/{run_id}/wandb-metadata.json") @@ -73,7 +73,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(min_metric): return { "error": - f"All {metric_name} values ​​are NaN and the minimum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the minimum cannot be calculated. Please check your data." } step2_data[metric_name] = step2_data[metric_name].fillna(0) #if ascending=False else: @@ -81,7 +81,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(max_metric): return { "error": - f"All {metric_name} values ​​are NaN and the maximum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the maximum cannot be calculated. Please check your data." 
} print(f"\nmax {metric_name}:{max_metric}") buffer_percentage = 0.2 # 20% From 7738d7d0d24ace2b61b9d91d1037336e9a0f3ceb Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 17:27:47 +0800 Subject: [PATCH 177/203] minor --- examples/atlas/upload_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/atlas/upload_data.py b/examples/atlas/upload_data.py index 29b45a18..2a6f3f8b 100644 --- a/examples/atlas/upload_data.py +++ b/examples/atlas/upload_data.py @@ -13,8 +13,7 @@ args.add_argument("--filedir", type=str) args.add_argument("--tissues", type=str, nargs="+") args.add_argument("--access_token", type=str) - args.add_argument("--dropbox_dest_path", type=str, - default="/preprocessing_benchmarking/cell_type_annotation/TEMP_Tran_5_Datasets/human") + args.add_argument("--dropbox_dest_path", type=str) args = args.parse_args() MAINDIR = pathlib.Path(args.maindir) FILEDIR = pathlib.Path(args.filedir) From 35b105c06b813677dbc6738aa08eb3991867594f Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 21:25:04 +0800 Subject: [PATCH 178/203] add atlas --- examples/atlas/config/commands.yaml | 5 ++-- examples/atlas/config/run_config.csv | 9 ++++++++ examples/atlas/config/run_config.yaml | 8 ------- examples/atlas/setup_run.py | 33 +++++++++++++++------------ 4 files changed, 31 insertions(+), 24 deletions(-) create mode 100644 examples/atlas/config/run_config.csv delete mode 100644 examples/atlas/config/run_config.yaml diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index fc68d057..32c66dd0 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -1,2 +1,3 @@ -cta_actinn: - command:"python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" +algorithms: + cta_actinn: + ommand: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" diff --git a/examples/atlas/config/run_config.csv b/examples/atlas/config/run_config.csv new file mode 100644 index 00000000..e08184cc --- /dev/null +++ b/examples/atlas/config/run_config.csv @@ -0,0 +1,9 @@ +algorithm_name,dataset_id,species,tissue,filetype,count,device +cta_actinn,6a30bf44-c490-41ac-965b-0bb58432b10a,human,Kidney,h5ad,800,cuda:0 +cta_actinn,f801b7a9-80a6-4d09-9161-71474deb58ae,human,Kidney,h5ad,800,cuda:1 +cta_actinn,20d87640-4be8-487f-93d4-dce38378d00f,human,Kidney,h5ad,800,cuda:2 +cta_actinn,2d31c0ca-0233-41ce-bd1a-05aa8404b073,human,Kidney,h5ad,800,cuda:3 +cta_actinn,fd072bc3-2dfb-46f8-b4e3-467cb3223182,human,Kidney,h5ad,800,cuda:4 +cta_actinn,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Kidney),human,Kidney,h5ad,800,cuda:5 +cta_actinn,0b75c598-0893-4216-afe8-5414cab7739d,human,Kidney,h5ad,800,cuda:6 +cta_actinn,2aa1c93c-4ef3-4e9a-98e7-0bd37933953c,human,Kidney,h5ad,800,cuda:7 diff --git a/examples/atlas/config/run_config.yaml b/examples/atlas/config/run_config.yaml deleted file mode 100644 index c88f3378..00000000 --- a/examples/atlas/config/run_config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -runs: - - algorithm_name: cta_actinn - dataset_id: "0bc7235a-ae5a-479d-a487-510435377e55" - species: human - tissue: Brain - filetype: h5ad - count: 800 - device: cuda:0 diff --git a/examples/atlas/setup_run.py b/examples/atlas/setup_run.py index d29d9e37..4b3bdf5f 100644 --- a/examples/atlas/setup_run.py +++ b/examples/atlas/setup_run.py @@ 
-3,9 +3,11 @@ import shutil import sys +import pandas as pd import yaml from dance.settings import DANCEDIR +from dance.utils import logger def load_commands(config_path): @@ -14,34 +16,37 @@ def load_commands(config_path): def load_run_configs(run_config_path): - with open(run_config_path, encoding='utf-8') as f: - return yaml.safe_load(f) + return pd.read_csv(run_config_path) def main(): parser = argparse.ArgumentParser(description='Setup run parameters') - parser.add_argument('--config', type=str, default="config/run_config.yaml", help='Run configuration YAML file') + parser.add_argument('--config', type=str, default="config/run_config.csv", help='Run configuration CSV file') args = parser.parse_args() - run_configs = load_run_configs(args.config) + run_configs_df = load_run_configs(args.config) - commands_config = load_commands("commands.yaml") + commands_config = load_commands("config/commands.yaml") - for run in run_configs.get("runs", []): - algorithm_name = run.get('algorithm_name') - dataset_id = run.get('dataset_id') - species = run.get('species') - tissue = run.get('tissue') - filetype = run.get('filetype') - count = run.get('count') - device = run.get('device') + for _, run in run_configs_df.iterrows(): + algorithm_name = run['algorithm_name'] + dataset_id = run['dataset_id'] + species = run['species'] + tissue = run['tissue'] + filetype = run['filetype'] + count = run['count'] + device = run['device'] # Define paths template_path = os.path.join("config/atlas_template_yamls", f"{algorithm_name}/pipeline_params_tuning_config.yaml") config_dir = f"{DANCEDIR}/examples/tuning/{algorithm_name}/{dataset_id}" - os.makedirs(config_dir, exist_ok=True) + try: + os.makedirs(config_dir, exist_ok=False) + except FileExistsError: + logger.warning(f"Error: Directory {config_dir} already exists. 
Please remove it before running again.") + continue config_filename = f"pipeline_params_tuning_config.yaml" config_path = os.path.join(config_dir, config_filename) From 4d3a0d263c3d80dd70fac35677dcf49705951daa Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 21:40:09 +0800 Subject: [PATCH 179/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 44091d47..39e41209 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -166,7 +166,7 @@ human,Kidney,6044,train,,,train_human_Kidneyf801b7a9-80a6-4d09-9161-71474deb58ae human,Kidney,7802,train,,,train_human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad,https://www.dropbox.com/scl/fi/xmzomvt0c8bza3fy8me0p/human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad?rlkey=iqzword5254z5rujjdey1u8hc&dl=1 human,Kidney,6847,train,,,train_human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad,https://www.dropbox.com/scl/fi/rhngz2alde48jotpy5c5v/human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad?rlkey=u0x4dsnt569wq07l3h1rqjzum&dl=1 human,Kidney,10000,train,,,train_human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad,https://www.dropbox.com/scl/fi/ybml7y2bth0qjnv3x1ieg/human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad?rlkey=qkjgdqttk3s10ht54109a4cad&dl=1 -human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/553s0af5q2nibafj4nkux/human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=p85qlsjixsuuutgwnms3w4y30&dl=1 +human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Kidney)_data.h5ad,https://www.dropbox.com/scl/fi/553s0af5q2nibafj4nkux/human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Kidney-_data.h5ad?rlkey=p85qlsjixsuuutgwnms3w4y30&st=igznlz90&dl=1 human,Kidney,10000,train,,,train_human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad,https://www.dropbox.com/scl/fi/feklth6jvnc5qqwvgaydy/human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad?rlkey=28vpy2m90lnri9aekfthrsvr1&dl=1 human,Kidney,5848,train,,,train_human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad,https://www.dropbox.com/scl/fi/1jq1wrqo1rcl041antcm8/human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad?rlkey=ssgfsiobqfah3pxgqnrsaff6l&dl=1 human,Kidney,9641,train,,,train_human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad,https://www.dropbox.com/scl/fi/o2cnntkrd5j6coeqehv8b/human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad?rlkey=5tbupfd3cdvqzy2rix6scvwzu&dl=1 From c7ac320b1e23b38830f53a80002938b3669a9e3d Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 22:01:42 +0800 Subject: [PATCH 180/203] minor --- examples/atlas/config/commands.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index 32c66dd0..41a3f2ef 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -1,3 +1,5 @@ algorithms: cta_actinn: - ommand: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" + command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" + cta_singlecellnet: + command: "python main.py --species 
{species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" From 2cf8e366ec7a46e427f2567ee675498be15e1702 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Thu, 19 Dec 2024 22:21:38 +0800 Subject: [PATCH 181/203] minor --- examples/atlas/config/commands.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index 41a3f2ef..92b66890 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -3,3 +3,5 @@ algorithms: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" cta_singlecellnet: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" + cta_celltypist: + command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" From 5b3d2de3bb164f09381fe4068a76d5f7134cd8ae Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 20:33:04 -0500 Subject: [PATCH 182/203] minor --- examples/atlas/config/commands.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index 92b66890..7d058c51 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -5,3 +5,5 @@ algorithms: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" cta_celltypist: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" + cta_scdeepsort: + command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" From a18f3ae9878fdf836289e8c792f0aa2136c266af Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 09:48:19 +0800 Subject: [PATCH 183/203] add notes --- examples/atlas/setup_run.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/atlas/setup_run.py b/examples/atlas/setup_run.py index 4b3bdf5f..06217714 100644 --- a/examples/atlas/setup_run.py +++ b/examples/atlas/setup_run.py @@ -11,11 +11,14 @@ def load_commands(config_path): + """Load YAML configuration file containing command templates for different + algorithms.""" with open(config_path, encoding='utf-8') as f: return yaml.safe_load(f) def load_run_configs(run_config_path): + """Load CSV file containing run configurations for different experiments.""" return pd.read_csv(run_config_path) @@ -25,11 +28,13 @@ def main(): args = parser.parse_args() + # Load configuration files run_configs_df = load_run_configs(args.config) - commands_config = load_commands("config/commands.yaml") + # Process each run configuration for _, run in run_configs_df.iterrows(): + # Extract parameters for current run algorithm_name = run['algorithm_name'] dataset_id = run['dataset_id'] species = run['species'] @@ -38,15 +43,18 @@ def main(): count = run['count'] device = run['device'] - # Define paths + # Setup directory structure for the algorithm configuration template_path = os.path.join("config/atlas_template_yamls", 
f"{algorithm_name}/pipeline_params_tuning_config.yaml") config_dir = f"{DANCEDIR}/examples/tuning/{algorithm_name}/{dataset_id}" + + # Create configuration directory if it doesn't exist try: os.makedirs(config_dir, exist_ok=False) except FileExistsError: logger.warning(f"Error: Directory {config_dir} already exists. Please remove it before running again.") continue + config_filename = f"pipeline_params_tuning_config.yaml" config_path = os.path.join(config_dir, config_filename) @@ -54,15 +62,17 @@ def main(): shutil.copy(template_path, config_path) print(f"Template copied to {config_path}") + # Validate algorithm exists in commands configuration if algorithm_name not in commands_config.get("algorithms", {}): print(f"Error: Command not found for algorithm '{algorithm_name}'. Please check commands.yaml file.") continue + # Format command template with run parameters command_template = commands_config["algorithms"][algorithm_name]["command"] run_command = command_template.format(dataset_id=dataset_id, species=species, tissue=tissue, filetype=filetype, count=count, device=device) - # Append the run command to run.sh + # Append generated command to run script run_sh_path = f"{DANCEDIR}/examples/tuning/{algorithm_name}/run.sh" with open(run_sh_path, "a", encoding='utf-8') as run_script: run_script.write(f"{run_command}\n") From dbd1fa322ba5b0575627c6b1b0a180a451a43442 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 10:07:48 +0800 Subject: [PATCH 184/203] add notes --- examples/get_result_web.py | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index c69e9d4e..2c0413da 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -15,6 +15,15 @@ def check_identical_strings(string_list): + """ + Check if all strings in the list are identical + Args: + string_list: List of strings to compare + Returns: + The common string if all strings are identical + Raises: + ValueError if list is empty or strings are different + """ if not string_list: raise ValueError("The list is empty") @@ -34,6 +43,14 @@ def check_identical_strings(string_list): def get_sweep_url(step_csv: pd.DataFrame, single=True): + """ + Extract wandb sweep URL from a DataFrame containing run IDs + Args: + step_csv: DataFrame containing run IDs + single: If True, only process the first run + Returns: + The sweep URL + """ ids = step_csv["id"] sweep_urls = [] for run_id in tqdm(reversed(ids), @@ -51,6 +68,13 @@ def get_sweep_url(step_csv: pd.DataFrame, single=True): def spilt_web(url: str): + """ + Parse wandb URL to extract entity, project and sweep ID + Args: + url: wandb sweep URL + Returns: + Tuple of (entity, project, sweep_id) or None if parsing fails + """ pattern = r"https://wandb\.ai/([^/]+)/([^/]+)/sweeps/([^/]+)" match = re.search(pattern, url) @@ -70,6 +94,14 @@ def spilt_web(url: str): def get_best_method(urls, metric_col="test_acc"): + """ + Find the best performing method across multiple sweeps + Args: + urls: List of sweep URLs to compare + metric_col: Metric column name to use for comparison + Returns: + Tuple of (best_step_name, best_run, best_metric_value) + """ all_best_run = None all_best_step_name = None step_names = ["step2", "step3_0", "step3_1", "step3_2"] @@ -105,6 +137,15 @@ def get_metric(run): def get_best_yaml(step_name, best_run, file_path): + """ + Generate YAML configuration for the best performing run + Args: + step_name: Name of the step ('step2' or 'step3_X') + best_run: Best wandb run 
object + file_path: Path to configuration files + Returns: + YAML string containing the best configuration + """ if step_name == "step2": conf = OmegaConf.load(f"{file_path}/pipeline_params_tuning_config.yaml") for i, fun in enumerate(conf["pipeline"]): @@ -141,6 +182,13 @@ def get_best_yaml(step_name, best_run, file_path): def check_exist(file_path): + """ + Check if results directory exists and contains multiple files + Args: + file_path: Path to check + Returns: + Boolean indicating if valid results exist + """ file_path = f"{file_path}/results/params/" if os.path.exists(file_path) and os.path.isdir(file_path): file_num = len(os.listdir(file_path)) @@ -150,6 +198,12 @@ def check_exist(file_path): def write_ans(tissue): + """ + Process results for a specific tissue type and write to CSV + Args: + tissue: Name of the tissue to process + Writes results to a CSV file named '{tissue}_ans.csv' + """ ans = [] collect_datasets = all_datasets[tissue] @@ -182,9 +236,12 @@ def write_ans(tissue): if __name__ == "__main__": + # Initialize wandb and set global configuration wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" + + # Load dataset configuration and process results for tissue file_root = str(Path(__file__).resolve().parent) with open(f"{file_root}/dataset_server.json") as f: all_datasets = json.load(f) From b096c259e2b5db781c2c996ce5d5da9115476de0 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 10:45:05 +0800 Subject: [PATCH 185/203] add note --- .../result_analysis/get_important_pattern.py | 114 ++++++++++++++---- 1 file changed, 92 insertions(+), 22 deletions(-) diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py index e44a871c..af742553 100644 --- a/examples/result_analysis/get_important_pattern.py +++ b/examples/result_analysis/get_important_pattern.py @@ -30,9 +30,33 @@ #use get_important_pattern_sweep.py -#asceding need to think #Negative pattern, just need to change the order def get_important_pattern(test_accs, ascending, vis=True, alpha=0.05, title=""): + """Identify important patterns in test accuracies using statistical tests. + + Given multiple groups of test accuracies, this function performs Kruskal-Wallis test followed by + Dunn's post-hoc test to identify statistically significant differences between groups. The results + are then used to rank the groups based on their relative performance. + + Parameters + ---------- + test_accs + List of test accuracy groups to compare. + ascending + Boolean indicating whether to sort results in ascending order. + vis + Whether to visualize the results using box plots. + alpha + Significance level for statistical tests. + title + Title for the visualization plot. + + Returns + ------- + list + List of ranks indicating the relative importance of each group. + + """ if vis: fig = plt.figure(figsize=(12, 4)) @@ -91,6 +115,30 @@ def are_all_elements_same_direct(list_2d): def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1, multi_mod=False): + """Extract frequent patterns from top performing pipeline configurations. + + Given a DataFrame containing pipeline configurations and their performance metrics, this function + identifies frequent patterns in the top performing configurations using the Apriori algorithm. + + Parameters + ---------- + step2_data + DataFrame containing pipeline configurations and metrics. + metric_name + Name of the performance metric to optimize. 
+ ascending + Boolean indicating whether to sort in ascending order. + threshold_per + Percentage of top configurations to consider. + multi_mod + Whether to use multiple modalities (not implemented). + + Returns + ------- + list + List of dictionaries containing frequent itemsets and their support values. + + """ if multi_mod: raise NotImplementedError("need multimod") threshold = int(len(step2_data) * threshold_per) @@ -176,6 +224,28 @@ def get_significant_items(data): def get_forest_model_pattern(step2_data, metric_name): + """Analyze feature importance using Random Forest and SHAP values. + + Given pipeline configurations and their performance metrics, this function trains a Random Forest model + and uses SHAP values to identify important feature interactions. It also computes point-biserial + correlations to validate the importance of identified patterns. + + Parameters + ---------- + step2_data + DataFrame containing pipeline configurations and metrics. + metric_name + Target metric to predict. + + Returns + ------- + dict + Dictionary containing: + - Important feature interactions and their SHAP values + - Point-biserial correlation statistics + - Best model parameters and MSE + + """ columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) X = step2_data.loc[:, columns] y = step2_data.loc[:, metric_name] @@ -287,24 +357,24 @@ def list_files(directories, metric_name, ascending, file_name="best_test_acc.csv return ans_all -if __name__ == "__main__": - directories = [] - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("task", default="cluster") - parser.add_argument("metric_name", default="acc") - parser.add_argument("ascending", default=False) - args = parser.parse_args() - task = args.task - metric_name = args.metric_name - ascending = args.ascending - file_root = Path(__file__).resolve().parent.parent / "tuning" - for path in file_root.iterdir(): - if path.is_dir(): - if str(path.name).startswith(task): - directories.append(path) - ans_all = list_files(directories, metric_name, ascending) - df = pd.DataFrame(ans_all) - pivot_df = df.pivot(index="dataset", columns="method", values="ans") - pivot_df.to_csv(f"{task}_pattern.csv") - - # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_actinn/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) +# if __name__ == "__main__": +# directories = [] +# parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +# parser.add_argument("task", default="cluster") +# parser.add_argument("metric_name", default="acc") +# parser.add_argument("ascending", default=False) +# args = parser.parse_args() +# task = args.task +# metric_name = args.metric_name +# ascending = args.ascending +# file_root = Path(__file__).resolve().parent.parent / "tuning" +# for path in file_root.iterdir(): +# if path.is_dir(): +# if str(path.name).startswith(task): +# directories.append(path) +# ans_all = list_files(directories, metric_name, ascending) +# df = pd.DataFrame(ans_all) +# pivot_df = df.pivot(index="dataset", columns="method", values="ans") +# pivot_df.to_csv(f"{task}_pattern.csv") + +# # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_actinn/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) From 4fb61450325cf1311ca6c3ef80ac1516ca0842e9 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 10:56:38 +0800 Subject: [PATCH 186/203] add notes --- .../get_important_pattern_sweep.py | 66 
++++++++++++++++--- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 9fe72ada..93f30adc 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -15,10 +15,14 @@ from dance.pipeline import flatten_dict from dance.utils import try_import +# Define basic configuration parameters entity = "xzy11632" project = "dance-dev" +# List of tasks to analyze tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"] +# Corresponding metrics for each task mertic_names = ["test_acc", "acc", "MRE", "ARI", "MSE"] +# Whether higher values are better for each metric ascendings = [False, False, True, False, True] multi_mod = False @@ -43,13 +47,27 @@ def get_additional_sweep(sweep_id): - # if sweep has piror runs - # every run get command , get additional sweep id - # or last run command + """Recursively retrieve all related sweep IDs from a given sweep. + + Given a sweep ID, this function recursively finds all related sweep IDs by examining the command + arguments of the runs within each sweep. It handles cases where sweeps may have prior runs or + additional sweep references. + + Parameters + ---------- + sweep_id : str + The initial sweep ID to start the search from. + + Returns + ------- + list + A list containing all related sweep IDs, including the input sweep_id. + + """ sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + additional_sweep_ids = [sweep_id] #last run command run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None) - additional_sweep_ids = [sweep_id] if run is None: # check summary data count, note aznph5wt, quantities may be inconsistent return additional_sweep_ids run_id = run.id @@ -63,7 +81,34 @@ def get_additional_sweep(sweep_id): def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=False): - # try: + """Analyze patterns in pipeline configurations and their impact on performance + metrics. + + This function examines the relationship between pipeline configurations and their corresponding + performance metrics. It handles missing values differently based on whether higher or lower + metric values are better, and can optionally visualize the results. + + Parameters + ---------- + step2_origin_data : pd.DataFrame + DataFrame containing pipeline configurations and their results. + metric_name : str + Name of the performance metric to analyze. + ascending : bool + Whether higher metric values indicate better performance. + alpha : float, optional + Significance level for statistical tests, by default 0.05. + vis : bool, optional + Whether to generate visualizations, by default False. 
+ + Returns + ------- + dict + A dictionary containing either: + - Error message if all metric values are NaN + - Pattern analysis results including forest model and/or APR analysis + + """ columns = sorted([col for col in step2_origin_data.columns if col.startswith("pipeline")]) step2_data = step2_origin_data.loc[:, columns + [metric_name]] # com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) @@ -92,23 +137,24 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F return {"forest_model": get_forest_model_pattern(step2_data, metric_name), "apr_ans": apr_ans} else: return {"apr_ans": apr_ans} - # except Exception as e: - # print(e) - # return str(e) if __name__ == "__main__": start = True ans_all = [] for i, task in enumerate(tasks): - + # Skip tasks not in choose_tasks list if task not in choose_tasks: continue + + # Read and preprocess results from Excel file data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str) data = data.ffill().set_index(['Methods']) + + # Iterate through each method and dataset combination for row_idx in range(data.shape[0]): for col_idx in range(data.shape[1]): - + # Extract metadata method = data.index[row_idx] dataset = data.columns[col_idx] value = data.iloc[row_idx, col_idx] From 059984e1c9f2f43920482cab0109fc8f9005b8ca Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 11:42:10 +0800 Subject: [PATCH 187/203] add notes --- .../atlas/sc_similarity_examples/cal_w1_w2.py | 94 ++++++++ .../example_usage_anndata.py | 121 ++++++++++ .../sc_similarity_examples/sim_query_atlas.py | 0 examples/atlas/sc_similarity_examples/vis.py | 112 +++++++++ examples/result_analysis/get_num.py | 19 ++ examples/tuning/get_important_pattern.py | 215 ------------------ 6 files changed, 346 insertions(+), 215 deletions(-) create mode 100644 examples/atlas/sc_similarity_examples/cal_w1_w2.py create mode 100644 examples/atlas/sc_similarity_examples/example_usage_anndata.py rename examples/{ => atlas}/sc_similarity_examples/sim_query_atlas.py (100%) create mode 100644 examples/atlas/sc_similarity_examples/vis.py delete mode 100644 examples/tuning/get_important_pattern.py diff --git a/examples/atlas/sc_similarity_examples/cal_w1_w2.py b/examples/atlas/sc_similarity_examples/cal_w1_w2.py new file mode 100644 index 00000000..63b0e3f4 --- /dev/null +++ b/examples/atlas/sc_similarity_examples/cal_w1_w2.py @@ -0,0 +1,94 @@ +import ast +import re +from pathlib import Path + +import numpy as np +import pandas as pd + +from dance.utils import try_import + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +query_datasets = [ + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + # "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" +] +methods = ["cta_actinn", "cta_celltypist", "cta_scdeepsort", "cta_singlecellnet"] +file_root = Path(__file__).resolve().parent +feature_names = ["wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "mmd"] + + +def get_ans(): + ans = {} + for query_dataset in query_datasets: + data = pd.read_excel(file_root / "Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) + ans[query_dataset] = data + return ans + + +def get_rank(): + for query_dataset, data in ans.items(): + for method in methods: + rank_col = 'rank_' + method + data.loc[rank_col, :] = data.loc[method, :].rank(ascending=False, method='min', na_option='bottom') 
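+            # Rank atlas datasets per method across columns: ascending=False gives rank 1 to
+            # the best score, and na_option='bottom' places datasets with missing scores last.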
+            # data.loc[rank_col,:] = data.loc[rank_col,:].fillna(10000)
+
+
+def convert_to_complex(s):
+    if isinstance(s, float) or isinstance(s, int):
+        return float(s)
+    try:
+        return ast.literal_eval(s)
+    except (ValueError, SyntaxError):
+        return np.nan
+
+
+def objective(w1, feature_name):
+    w2 = 1 - w1
+    total_rank = 0
+    for query_dataset, data in ans.items():
+        df_A = data.copy()
+        if feature_name == "bures":
+            df_A.loc[feature_name, :] = df_A.loc[feature_name, :].apply(convert_to_complex)
+            df_A.loc[feature_name, :] = df_A.loc[feature_name, :].apply(lambda x: x.real
+                                                                        if isinstance(x, complex) else np.nan)
+            print(df_A.loc[feature_name, :])
+        df_A.loc['score_similarity', :] = w1 * df_A.loc[feature_name, :].values.astype(float) + w2 * df_A.loc[
+            'metadata_sim', :].values.astype(float)
+        # df_A.loc['score_similarity',:]= df_A.loc['score_similarity',:].fillna(0)
+        max_idx = df_A.loc['score_similarity', :].idxmax()
+        max_B = df_A.loc[:, max_idx]
+        ranks = []
+        for method in methods:
+            ranks.append(max_B.loc['rank_' + method])
+        total_rank += np.sum(ranks)
+    return total_rank
+
+
+ans = get_ans()
+get_rank()
+all_results = []
+for query_dataset, data in ans.items():
+    data.to_csv(f"ranks/{query_dataset}_rank.csv")
+for feature_name in feature_names:
+    w1_values = np.linspace(0, 1, 101)
+    results = []
+    for w1 in w1_values:
+        total_rank = objective(w1, feature_name)
+        results.append({'feature_name': feature_name, 'w1': w1, 'total_rank': total_rank})
+    all_results.extend(results)
+# for w1 in w1_values:
+#     total_rank = objective(w1)
+#     results.append({'w1': w1, 'total_rank': total_rank})
+
+results_df = pd.DataFrame(all_results)
+results_df.to_csv("temp/results_df.csv")
+best_result = results_df.loc[results_df['total_rank'].idxmin()]
+
+print('Best similarity feature:', best_result['feature_name'])
+print('Best w1:', best_result['w1'])
+print('Corresponding total rank:', best_result['total_rank'])
diff --git a/examples/atlas/sc_similarity_examples/example_usage_anndata.py b/examples/atlas/sc_similarity_examples/example_usage_anndata.py
new file mode 100644
index 00000000..8b124f10
--- /dev/null
+++ b/examples/atlas/sc_similarity_examples/example_usage_anndata.py
@@ -0,0 +1,121 @@
+import argparse
+import json
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+import torch
+from anndata import AnnData
+from scipy.sparse import issparse
+from torch.utils.data import TensorDataset
+
+from dance.atlas.sc_similarity import AnnDataSimilarity, get_anndata
+from dance.otdd.pytorch.distance import DatasetDistance
+from dance.utils import set_seed
+
+data_root = "/home/zyxing/dance/examples/tuning/temp_data/train/human"
+
+target_files = [
+    "01209dce-3575-4bed-b1df-129f57fbc031", "055ca631-6ffb-40de-815e-b931e10718c0",
+    "2a498ace-872a-4935-984b-1afa70fd9886", "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf",
+    "3faad104-2ab8-4434-816d-474d8d2641db", "471647b3-04fe-4c76-8372-3264feb950e8",
+    "4c4cd77c-8fee-4836-9145-16562a8782fe", "84230ea4-998d-4aa8-8456-81dd54ce23af",
+    "8a554710-08bc-4005-87cd-da9675bdc2e7", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+    "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+    "d3566d6a-a455-4a15-980f-45eb29114cab", "d9b4bc69-ed90-4f5f-99b2-61b0681ba436",
+    "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569"
+]
+parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument(
+    "--source_files", default=[
+        "71be997d-ff75-41b9-8a9f-1288c865f921", "456e8b9b-f872-488b-871d-94534090a865",
+        
"738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "c7775e88-49bf-4ba2-a03b-93f00447c958" + ]) +parser.add_argument("--data_dir", default="../tuning/temp_data") +args = parser.parse_args() +source_files = args.source_files +data_dir = args.data_dir +file_root = Path(__file__).resolve().parent +set_seed(42) + + +class CustomEncoder(json.JSONEncoder): + + def default(self, obj): + if isinstance(obj, (np.float32, np.float64)): + return float(obj) + if isinstance(obj, (np.int32, np.int64)): + return int(obj) + if isinstance(obj, pd.DataFrame): + return obj.to_dict(orient='records') + return super().default(obj) + + +def dataset_from_anndata(adata: AnnData, label_key: str = 'cell_type', classes=None): + X = adata.X + if issparse(X): + X = X.toarray() + X_tensor = torch.from_numpy(X).float() + Y = adata.obs[label_key].values + if pd.api.types.is_numeric_dtype(Y): + targets = torch.LongTensor(Y) + if classes is None: + classes = sorted(np.unique(Y)) + else: + unique_classes = sorted(np.unique(Y)) + # class_to_idx = {cls: idx for idx, cls in enumerate(unique_classes)} + # Y_encoded = np.array([class_to_idx[cls] for cls in Y]) + targets = torch.LongTensor(Y.codes) + if classes is None: + classes = unique_classes + ds = TensorDataset(X_tensor, targets) + ds.targets = targets + ds.classes = classes + return ds + + +def run_test_otdd(): + for target_file in target_files: + source_data = sc.read_h5ad(f"{data_root}/human_Blood{source_file}_data.h5ad") + target_data = sc.read_h5ad(f"{data_root}/human_Blood{target_file}_data.h5ad") + source_ds = dataset_from_anndata(source_data) + target_ds = dataset_from_anndata(target_data) + dist = DatasetDistance(source_ds, target_ds) + dist.distance() + + +def run_test_case(source_file): + ans = {} + for target_file in target_files: + # source_data=sc.read_h5ad(f"{data_root}/{source_file}.h5ad") + # target_data=sc.read_h5ad(f"{data_root}/{target_file}.h5ad") + source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir) + target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir) + similarity_calculator = AnnDataSimilarity(adata1=source_data, adata2=target_data, sample_size=10, + init_random_state=42, n_runs=1, + ground_truth_conf_path="Cell Type Annotation Atlas.xlsx", + adata1_name=source_file, adata2_name=target_file) + ans[target_file] = similarity_calculator.get_similarity_matrix_A2B(methods=[ + "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", + "ground_truth", "mmd", "metadata_sim" + ]) + # with open(f'sim_{source_file}.json', 'w') as f: + # json.dump(ans, f,indent=4,cls=CustomEncoder) + ans = pd.DataFrame(ans) + ans.to_csv(f'sim_{source_file}.csv') + return ans + + +query_data = os.listdir(file_root / "query_data") +with pd.ExcelWriter(file_root / "Blood_similarity.xlsx", engine='openpyxl') as writer: + for source_file in source_files: + query_ans = [ + pd.read_csv(file_root / "query_data" / element, index_col=0) for element in query_data + if element.split("_")[-3] == source_file + ] + ans = run_test_case(source_file) + merged_df = pd.concat(query_ans + [ans], join='inner') + merged_df.to_excel(writer, sheet_name=source_file[:4], index=True) diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py similarity index 100% rename from examples/sc_similarity_examples/sim_query_atlas.py rename to examples/atlas/sc_similarity_examples/sim_query_atlas.py diff --git 
a/examples/atlas/sc_similarity_examples/vis.py b/examples/atlas/sc_similarity_examples/vis.py new file mode 100644 index 00000000..a547851c --- /dev/null +++ b/examples/atlas/sc_similarity_examples/vis.py @@ -0,0 +1,112 @@ +import re +import sys +from pathlib import Path + +import numpy as np +import pandas as pd + +from dance.utils import try_import + +sys.path.append("..") +import json +from collections import defaultdict + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from get_result_web import spilt_web + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +query_datasets = [ + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" +] +file_root = Path(__file__).resolve().parent +ground_truth_conf = pd.read_excel(file_root / "Cell Type Annotation Atlas.xlsx", sheet_name="blood", index_col=0) +methods = ["cta_actinn", "cta_celltypist", "cta_scdeepsort", "cta_singlecellnet"] +feature_name = "spectral" + + +def get_accs(sweep): + ans = [] + for run in sweep.runs: + if "test_acc" in run.summary: + ans.append(run.summary["test_acc"]) + return ans + + +def get_runs(sweep_record): + step_links = {} + pattern = r'(step\d+):((?:https?://[^|,]+(?:,)?)+)' + matches = re.finditer(pattern, sweep_record) + for match in matches: + step = match.group(1) # e.g., 'step2' + links_str = match.group(2) # e.g., 'https://...y31tzbnv' + links = links_str.split(',') + step_links[step] = links + ans = [] + for step, links in step_links.items(): + for sweep_url in links: + _, _, sweep_id = spilt_web(sweep_url) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + ans += get_accs(sweep) + return ans + + +def get_atlas_ans(query_dataset, method): + data = pd.read_excel("Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) + weight1 = 1.0 + weight2 = 0.0 + weighted_sum = data.loc[feature_name, :] * weight1 + data.loc["metadata_sim", :] * weight2 + atlas_dataset_res = weighted_sum.idxmax() + max_value = weighted_sum.max() + return data.loc[:, atlas_dataset_res][method] + + +def vis(data, target_value, title, ax): + # sns.boxplot(data=data, color='skyblue',ax=ax) + # if target_value is not np.nan: + # ax.axhline(y=target_value, color='red', linestyle='--', linewidth=2, label=f'atlas_value = {target_value}') + # ax.text(0, target_value + (max(data)-min(data))*0.01, f'{target_value}', color='red', ha='center',size=16) + + data = np.array(data) + data_df = pd.DataFrame({'test_acc': data}) + sns.violinplot(y='test_acc', data=data_df, inner=None, color='skyblue', ax=ax) + median = np.median(data) + ax.axhline(median, color='gray', linestyle='--', label=f'Median: {median:.1f}') + if not np.isnan(target_value): + percentile = (np.sum(data < float(target_value)) / len(data)) * 100 + ax.scatter(0, float(target_value), color='red', s=100, zorder=5, + label=f'Specific Value: {target_value}\n({percentile:.1f} percentile)') + ax.set_title(str(title)) + ax.set_ylabel('test_acc') + ax.title.set_size(16) + ax.yaxis.label.set_size(14) + ax.tick_params(axis='both', which='major', labelsize=10) + ax.legend() + + +if __name__ == "__main__": + # ans_all=defaultdict(dict) + # for query_dataset in query_datasets: + # for method in methods: + # sweep_record=ground_truth_conf.loc[query_dataset,method] + # ans_all[query_dataset][method]=get_runs(sweep_record) + # with open("runs.json","w") as f: + # json.dump(ans_all,f) + 
+ with open("runs.json") as f: + runs = json.load(f) + plt.style.use("default") + + for query_dataset in query_datasets: + fig, axes = plt.subplots(2, 2, figsize=(15, 10)) + axes = axes.flatten() + for i, method in enumerate(methods): + vis(runs[query_dataset][method], get_atlas_ans(query_dataset, method), f"{query_dataset}_{method}", axes[i]) + plt.tight_layout() + plt.savefig(f"imgs/{query_dataset}.png", dpi=300) + plt.show() diff --git a/examples/result_analysis/get_num.py b/examples/result_analysis/get_num.py index 6573bd70..2432ae38 100644 --- a/examples/result_analysis/get_num.py +++ b/examples/result_analysis/get_num.py @@ -1,3 +1,22 @@ +"""Count the total number of experiment runs across different tasks in W&B project. + +This script analyzes experiment results stored in a W&B project by: +1. Reading task data from Excel sheets +2. Extracting sweep URLs for each task +3. Querying W&B API to count runs in each sweep +4. Computing the total number of experimental runs + +Parameters +---------- +None + +Returns +------- +int + Total number of runs across all tasks and sweeps + +""" + import sys from pathlib import Path diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py deleted file mode 100644 index e542b068..00000000 --- a/examples/tuning/get_important_pattern.py +++ /dev/null @@ -1,215 +0,0 @@ -import itertools -import pathlib -from itertools import combinations -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import scikit_posthocs as sp -import seaborn as sns -from mlxtend.frequent_patterns import apriori, association_rules -from mlxtend.preprocessing import TransactionEncoder -from networkx import parse_adjlist -from scipy import stats - -metric_name = "acc" -ascending = False - - -def get_important_pattern(test_accs, vis=True, alpha=0.8, title="", test_acc_names=None): - medians = [np.median(group) for group in test_accs] - _, p_value = stats.kruskal(*test_accs) - if vis: - fig = plt.figure(figsize=(12, 4)) - sns.boxplot(data=test_accs) - plt.xticks(list(range(len(test_accs))), - ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names), rotation=45, - fontsize=10) - plt.title(title) - plt.show() - if p_value < alpha: - data = test_accs - p_values_matrix = sp.posthoc_dunn(a=data) - sorted_indices = np.argsort(np.argsort(medians * -1 if ascending else medians)) - ranks = { - index: { - "rank": rank, - "before": None, - "after": [], - "real_rank": rank - } - for index, rank in enumerate(sorted_indices) - } - for (rank1, rank2) in combinations(range(max(sorted_indices) + 1), 2): - for idx1 in [index for index, value in ranks.items() if value["rank"] == rank1]: - for idx2 in [index for index, value in ranks.items() if value["rank"] == rank2]: - if p_values_matrix.iloc[idx1, idx2] > alpha: - if ranks[idx2]["before"] is None: - ranks[idx1]["after"].append(idx2) - ranks[idx2]["before"] = idx1 - - def change_real_rank(rank_item, real_rank): - rank_item["real_rank"] = real_rank - for idx in rank_item["after"]: - change_real_rank(ranks[idx], real_rank) - - for rank_item in ranks.values(): - if rank_item["before"] is None: - for idx in rank_item["after"]: - change_real_rank(ranks[idx], rank_item["real_rank"]) - return [v["real_rank"] for k, v in ranks.items()] - else: - if vis: - print("No significant differences found between the groups.") - return [] - - -def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): - ans = [] - for com in 
itertools.combinations(columns, r): - test_accs_arrays = [] - for g in step2_data.groupby(by=list(com)): - test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) - test_accs = [i[metric_name] for i in test_accs_arrays] - test_acc_names = [i["name"] for i in test_accs_arrays] - final_ranks = get_important_pattern( - test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis, - test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) - if len(final_ranks) > 0: - max_rank = max(final_ranks) - max_rank_count = final_ranks.count(max_rank) - if max_rank_count < len(final_ranks) / 2: - for index, (test_acc_name, rank) in enumerate(zip(test_acc_names, final_ranks)): - if rank == max_rank: - if vis: - print(f"index={index},name={test_acc_name},rank={rank}") - ans.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) - return ans - - -def draw_graph(rules, rules_to_show): - import networkx as nx - G1 = nx.DiGraph() - - color_map = [] - N = 50 - colors = np.random.rand(N) - strs = ['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] - - for i in range(rules_to_show): - G1.add_nodes_from(["R" + str(i)]) - - for a in rules.iloc[i]['antecedents']: - - G1.add_nodes_from([a]) - - G1.add_edge(a, "R" + str(i), color=colors[i], weight=2) - - for c in rules.iloc[i]['consequents']: - - G1.add_nodes_from([c]) - - G1.add_edge("R" + str(i), c, color=colors[i], weight=2) - - for node in G1: - found_a_string = False - for item in strs: - if node == item: - found_a_string = True - if found_a_string: - color_map.append('yellow') - else: - color_map.append('green') - - edges = G1.edges() - colors = [G1[u][v]['color'] for u, v in edges] - weights = [G1[u][v]['weight'] for u, v in edges] - - pos = nx.spring_layout(G1, k=16, scale=1) - nx.draw(G1, pos, node_color=color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) - - for p in pos: # raise text positions - pos[p][1] += 0.07 - nx.draw_networkx_labels(G1, pos) - plt.show() - - -def get_frequent_itemsets(step2_data, threshold_per=0.1, vis=False): - threshold = int(len(step2_data) * threshold_per) - df_sorted = step2_data.sort_values(metric_name, ascending=ascending) - top_10_percent = df_sorted.head(threshold) - columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) - transactions = top_10_percent[columns].values.tolist() - te = TransactionEncoder() - te_ary = te.fit(transactions).transform(transactions) - df = pd.DataFrame(te_ary, columns=te.columns_) - frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) - # print(frequent_itemsets) - rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) - if vis: - # print(frequent_itemsets) - # print(frequent_itemsets) - # draw_graph(rules=rules,rules_to_show=10) - frequent_itemsets_copy = frequent_itemsets.copy() - frequent_itemsets_copy = frequent_itemsets_copy.sort_values(by="support") - frequent_itemsets_copy.plot(x="itemsets", y="support", kind="bar") - plt.xticks(rotation=30, fontsize=7) - # print(type(rules)) - return [tuple(a) for a in frequent_itemsets["itemsets"]] - - -def get_com_all(step2_data, vis=True, alpha=0.8): - ans = [] - columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) - for i in range(1, len(columns)): - ans += get_com(step2_data, i, columns=columns, vis=vis, alpha=alpha) - return ans - - -def summary_pattern(data_path, alpha=0.8, vis=False): - step2_origin_data = pd.read_csv(data_path) - 
step2_data = step2_origin_data.dropna() - com_ans = get_com_all(step2_data, vis=vis, alpha=alpha) - apr_ans = get_frequent_itemsets(step2_data, vis=vis) - return list(set(com_ans) & set(apr_ans)) - - -# def list_files(directory,file_name="best_test_acc.csv",save_path="summary_file"): -# ans=[] -# path = Path(directory) -# for file_path in path.rglob('*'): -# if file_path.is_file(): -# if file_path.name==file_name: -# algorithm,dataset=file_path.relative_to(directory).parts[:2] -# ans.append({"algorithm":algorithm,"dataset":dataset,"summary_pattern":summary_pattern(file_path)}) -# pd.DataFrame(ans).to_csv(save_path) -def list_files(directories, file_name="best_test_acc.csv", alpha=0.8, vis=False): - for directory in directories: - path = Path(directory) - for file_path in path.rglob('*'): - if file_path.is_file(): - if file_path.name == file_name: - print(file_path) - with open(Path(file_path.parent.resolve(), "pipeline_summary_pattern.txt"), 'w') as f: - f.write(str(summary_pattern(file_path, alpha=alpha, vis=vis))) - - -if __name__ == "__main__": - # directories = [] - # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - # if path.is_dir(): - # if str(path.name).startswith("cluster"): - # directories.append(path) - # list_files(directories) - # directories = [] - # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - # if path.is_dir(): - # if str(path.name).startswith("cluster"): - # directories.append(path) - # list_files(directories) - - print( - summary_pattern( - "/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv", - alpha=0.3, vis=False)) From 104934929a897af41f89be5f42d117c8d845fc76 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 11:52:04 +0800 Subject: [PATCH 188/203] add notes --- .../atlas/sc_similarity_examples/cal_w1_w2.py | 66 ++++++++++++++++++- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/cal_w1_w2.py b/examples/atlas/sc_similarity_examples/cal_w1_w2.py index 63b0e3f4..710b5787 100644 --- a/examples/atlas/sc_similarity_examples/cal_w1_w2.py +++ b/examples/atlas/sc_similarity_examples/cal_w1_w2.py @@ -1,3 +1,21 @@ +"""Calculate optimal weights for combining similarity metrics in cell type annotation. + +This script analyzes different similarity metrics (like Wasserstein, Hausdorff, etc.) and metadata similarity +to find optimal weights that minimize the total rank of correct cell type predictions across multiple datasets. + +The script: +1. Loads similarity scores from Excel files +2. Computes rankings for different cell type annotation methods +3. Finds optimal weights (w1, w2) for combining feature-based and metadata-based similarity +4. Outputs the best performing feature and its corresponding weight + +Returns +------- +DataFrame + Results containing feature names, weights, and corresponding total ranks + +""" + import ast import re from pathlib import Path @@ -23,6 +41,14 @@ def get_ans(): + """Load similarity scores from Excel files for each dataset. + + Returns + ------- + dict + Dictionary mapping dataset IDs to their similarity score DataFrames + + """ ans = {} for query_dataset in query_datasets: data = pd.read_excel(file_root / "Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) @@ -31,6 +57,12 @@ def get_ans(): def get_rank(): + """Calculate rankings for each cell type annotation method. + + Updates the input DataFrames with rank columns for each method, where lower ranks + indicate better performance. 
+ + """ for query_dataset, data in ans.items(): for method in methods: rank_col = 'rank_' + method @@ -39,6 +71,19 @@ def get_rank(): def convert_to_complex(s): + """Convert string representations of complex numbers to float values. + + Parameters + ---------- + s : str or float + Input value to convert + + Returns + ------- + float + Real part of complex number or NaN if conversion fails + + """ if isinstance(s, float) or isinstance(s, int): return float(s) try: @@ -48,6 +93,21 @@ def convert_to_complex(s): def objective(w1, feature_name): + """Calculate total rank score for given weights and feature. + + Parameters + ---------- + w1 : float + Weight for the feature-based similarity (0-1) + feature_name : str + Name of the similarity feature to evaluate + + Returns + ------- + float + Total rank score (lower is better) + + """ w2 = 1 - w1 total_rank = 0 for query_dataset, data in ans.items(): @@ -89,6 +149,6 @@ def objective(w1, feature_name): results_df.to_csv("temp/results_df.csv") best_result = results_df.loc[results_df['total_rank'].idxmin()] -print('最佳相似性特征:', best_result['feature_name']) -print('最佳 w1:', best_result['w1']) -print('对应的总排名:', best_result['total_rank']) +print('Best similarity feature:', best_result['feature_name']) +print('Best w1:', best_result['w1']) +print('Corresponding total rank:', best_result['total_rank']) From 609db041b071c1c2f8f379353a4959f6d5b71cc3 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 12:51:50 +0800 Subject: [PATCH 189/203] translate notes --- .../sc_similarity_examples/sim_query_atlas.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py index de783f20..8e78393c 100644 --- a/examples/atlas/sc_similarity_examples/sim_query_atlas.py +++ b/examples/atlas/sc_similarity_examples/sim_query_atlas.py @@ -26,40 +26,42 @@ def find_unique_matching_row(df, config_col, input_dict_list): - """在 DataFrame 中查找指定列中与输入字典列表匹配的唯一一行。 + """Find a unique matching row in DataFrame based on the specified column and input + dictionary list. - :param df: pandas.DataFrame,包含要搜索的数据。 - :param config_col: str,DataFrame 中包含字典列表字符串的列名。 - :param input_dict_list: list of dicts,输入的字典列表,用于匹配。 - :return: pandas.Series,匹配的行。 - :raises ValueError: 如果匹配的行数不等于1。 + :param df: pandas.DataFrame, containing the data to search. + :param config_col: str, name of the DataFrame column containing dictionary list + strings. + :param input_dict_list: list of dicts, input dictionary list for matching. + :return: pandas.Series, the matching row. + :raises ValueError: if the number of matching rows is not equal to 1. 
""" - # 定义一个函数,用于解析字符串并比较 + # Define a function for parsing strings and comparing def is_match(config_str): try: - # 使用 ast.literal_eval 安全地解析字符串为 Python 对象 + # Safely parse string to Python object using ast.literal_eval config = ast.literal_eval(config_str) return config == input_dict_list except (ValueError, SyntaxError): - # 如果解析失败,则不匹配 + # If parsing fails, no match return False - # 应用比较函数,得到一个布尔系列 + # Apply comparison function to get a boolean series matches = df[config_col].apply(is_match) - # 获取所有匹配的行 + # Get all matching rows matching_rows = df[matches] - # 检查匹配的行数 + # Check number of matching rows num_matches = len(matching_rows) if num_matches == 1: return matching_rows.iloc[0] elif num_matches == 0: - raise ValueError("未找到匹配的行。") + raise ValueError("No matching rows found.") else: - raise ValueError(f"找到 {num_matches} 行匹配,预期恰好一行。") + raise ValueError(f"Found {num_matches} matching rows, expected exactly one.") wandb = try_import("wandb") From b26151f987d52cd75f096e34f5befdf5dca3368e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 15:16:42 +0800 Subject: [PATCH 190/203] add notes --- .../example_usage_anndata.py | 38 +++++++++- examples/atlas/sc_similarity_examples/vis.py | 70 ++++++++++++++++++- 2 files changed, 103 insertions(+), 5 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/example_usage_anndata.py b/examples/atlas/sc_similarity_examples/example_usage_anndata.py index 8b124f10..651ab4a1 100644 --- a/examples/atlas/sc_similarity_examples/example_usage_anndata.py +++ b/examples/atlas/sc_similarity_examples/example_usage_anndata.py @@ -55,6 +55,23 @@ def default(self, obj): def dataset_from_anndata(adata: AnnData, label_key: str = 'cell_type', classes=None): + """Convert AnnData object to PyTorch TensorDataset. + + Parameters + ---------- + adata : AnnData + Input AnnData object + label_key : str, default='cell_type' + Column name in adata.obs containing cell type labels + classes : list, optional + Predefined class labels. If None, will be inferred from data + + Returns + ------- + TensorDataset + PyTorch dataset with features and labels + + """ X = adata.X if issparse(X): X = X.toarray() @@ -88,22 +105,39 @@ def run_test_otdd(): def run_test_case(source_file): + """Calculate similarity matrices between source and target datasets. 
+ + Parameters + ---------- + source_file : str + Name of the source dataset file + + Returns + ------- + pandas.DataFrame + Similarity scores for different metrics + + """ ans = {} for target_file in target_files: # source_data=sc.read_h5ad(f"{data_root}/{source_file}.h5ad") # target_data=sc.read_h5ad(f"{data_root}/{target_file}.h5ad") source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir) target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir) + + # Initialize similarity calculator with multiple metrics similarity_calculator = AnnDataSimilarity(adata1=source_data, adata2=target_data, sample_size=10, init_random_state=42, n_runs=1, ground_truth_conf_path="Cell Type Annotation Atlas.xlsx", adata1_name=source_file, adata2_name=target_file) + + # Calculate similarity using multiple methods ans[target_file] = similarity_calculator.get_similarity_matrix_A2B(methods=[ "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", "ground_truth", "mmd", "metadata_sim" ]) - # with open(f'sim_{source_file}.json', 'w') as f: - # json.dump(ans, f,indent=4,cls=CustomEncoder) + + # Convert results to DataFrame and save ans = pd.DataFrame(ans) ans.to_csv(f'sim_{source_file}.csv') return ans diff --git a/examples/atlas/sc_similarity_examples/vis.py b/examples/atlas/sc_similarity_examples/vis.py index a547851c..a9f28ec5 100644 --- a/examples/atlas/sc_similarity_examples/vis.py +++ b/examples/atlas/sc_similarity_examples/vis.py @@ -28,9 +28,29 @@ ground_truth_conf = pd.read_excel(file_root / "Cell Type Annotation Atlas.xlsx", sheet_name="blood", index_col=0) methods = ["cta_actinn", "cta_celltypist", "cta_scdeepsort", "cta_singlecellnet"] feature_name = "spectral" +"""Visualization script for comparing model performance across different datasets and +methods. + +This script loads experiment results from wandb and compares them with atlas-based +predictions, generating violin plots to visualize the distribution of accuracies. + +""" def get_accs(sweep): + """Extract test accuracies from a wandb sweep. + + Parameters + ---------- + sweep : wandb.Sweep + Sweep object containing multiple runs + + Returns + ------- + list + List of test accuracies from all runs + + """ ans = [] for run in sweep.runs: if "test_acc" in run.summary: @@ -39,6 +59,19 @@ def get_accs(sweep): def get_runs(sweep_record): + """Parse sweep URLs and collect all run results. + + Parameters + ---------- + sweep_record : str + String containing sweep URLs for different steps + + Returns + ------- + list + Combined list of test accuracies from all sweeps + + """ step_links = {} pattern = r'(step\d+):((?:https?://[^|,]+(?:,)?)+)' matches = re.finditer(pattern, sweep_record) @@ -57,16 +90,45 @@ def get_runs(sweep_record): def get_atlas_ans(query_dataset, method): + """Calculate atlas-based prediction accuracy for a given dataset and method. 
+ + Parameters + ---------- + query_dataset : str + Dataset identifier + method : str + Method name to evaluate + + Returns + ------- + float + Predicted accuracy based on atlas similarity + + """ data = pd.read_excel("Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) - weight1 = 1.0 - weight2 = 0.0 + weight1 = 1.0 # Weight for feature-based similarity + weight2 = 0.0 # Weight for metadata similarity weighted_sum = data.loc[feature_name, :] * weight1 + data.loc["metadata_sim", :] * weight2 - atlas_dataset_res = weighted_sum.idxmax() + atlas_dataset_res = weighted_sum.idxmax() # Get most similar dataset max_value = weighted_sum.max() return data.loc[:, atlas_dataset_res][method] def vis(data, target_value, title, ax): + """Create violin plot comparing distribution of accuracies with atlas prediction. + + Parameters + ---------- + data : list + List of accuracy values + target_value : float + Atlas-predicted accuracy value + title : str + Plot title + ax : matplotlib.axes.Axes + Axes object to plot on + + """ # sns.boxplot(data=data, color='skyblue',ax=ax) # if target_value is not np.nan: # ax.axhline(y=target_value, color='red', linestyle='--', linewidth=2, label=f'atlas_value = {target_value}') @@ -102,9 +164,11 @@ def vis(data, target_value, title, ax): runs = json.load(f) plt.style.use("default") + # Generate visualization for each dataset for query_dataset in query_datasets: fig, axes = plt.subplots(2, 2, figsize=(15, 10)) axes = axes.flatten() + # Create subplot for each method for i, method in enumerate(methods): vis(runs[query_dataset][method], get_atlas_ans(query_dataset, method), f"{query_dataset}_{method}", axes[i]) plt.tight_layout() From 6878afa65c1ee765fc943e90005c14da6179ba97 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 15:41:25 +0800 Subject: [PATCH 191/203] add notes --- dance/atlas/data_dropbox_upload.py | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/dance/atlas/data_dropbox_upload.py b/dance/atlas/data_dropbox_upload.py index 718e07c9..87b6d78e 100644 --- a/dance/atlas/data_dropbox_upload.py +++ b/dance/atlas/data_dropbox_upload.py @@ -12,6 +12,23 @@ def upload_file_to_dropbox(dropbox_path, access_token, local_path): + """Upload a local file to Dropbox. + + Parameters + ---------- + dropbox_path : str + Destination path in Dropbox + access_token : str + Dropbox API access token + local_path : str or pathlib.Path + Path to local file to upload + + Returns + ------- + None + Returns None if upload fails + + """ dbx = dropbox.Dropbox(access_token) # Verify access token @@ -29,6 +46,18 @@ def upload_file_to_dropbox(dropbox_path, access_token, local_path): def file_upload(dbx: dropbox.Dropbox, local_path: pathlib.Path, remote_path: str): + """Upload large files to Dropbox using chunked upload. + + Parameters + ---------- + dbx : dropbox.Dropbox + Authenticated Dropbox client + local_path : pathlib.Path + Path to local file + remote_path : str + Destination path in Dropbox + + """ CHUNKSIZE = 100 * 1024 * 1024 upload_session_start_result = dbx.files_upload_session_start(b'') cursor = dropbox.files.UploadSessionCursor(session_id=upload_session_start_result.session_id, offset=0) @@ -86,10 +115,35 @@ def get_link(data_fname, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): def get_ans(data: sc.AnnData, tissue: str, dataset_id: str, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): + """Generate metadata dictionary for dataset and upload to Dropbox. 
+ + Parameters + ---------- + data : sc.AnnData + Annotated data matrix + tissue : str + Tissue type + dataset_id : str + Unique identifier for dataset + local_path : str or pathlib.Path + Path to local data file + ACCESS_TOKEN : str + Dropbox API access token + DROPBOX_DEST_PATH : str + Base path in Dropbox for uploads + + Returns + ------- + dict + Metadata dictionary containing dataset information and Dropbox URLs + + """ # keys=["species","tissue","dataset","split","celltype_fname","celltype_url","data_fname","data_url"] + # Create metadata dictionary with dataset info ans = {} ans["species"] = "human" ans["tissue"] = tissue.capitalize() + # Store number of observations (cells) in dataset ans["dataset"] = data.n_obs ans["split"] = "train" ans["celltype_fname"] = "" From 04ab7eb2138754e0f9c656c5e46ca674a9e143a7 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 15:45:28 +0800 Subject: [PATCH 192/203] add notes --- .../atlas/sc_similarity/anndata_similarity.py | 79 +++++++++++++++++-- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/dance/atlas/sc_similarity/anndata_similarity.py b/dance/atlas/sc_similarity/anndata_similarity.py index 88784a9d..b7d392bf 100644 --- a/dance/atlas/sc_similarity/anndata_similarity.py +++ b/dance/atlas/sc_similarity/anndata_similarity.py @@ -31,6 +31,32 @@ def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = " class AnnDataSimilarity: + """A class to compute various similarity metrics between two AnnData objects. + + Parameters + ---------- + adata1 : anndata.AnnData + First AnnData object for comparison + adata2 : anndata.AnnData + Second AnnData object for comparison + sample_size : Optional[int] + Number of cells to sample from each dataset. If None, uses min(adata1.n_obs, adata2.n_obs) + init_random_state : Optional[int] + Random seed for reproducibility + n_runs : int + Number of times to run each similarity computation + ground_truth_conf_path : Optional[str] + Path to ground truth configuration file + adata1_name : Optional[str] + Name identifier for first dataset + adata2_name : Optional[str] + Name identifier for second dataset + methods : List[str] + List of cell type annotation methods to use + tissue : str + Tissue type being analyzed + + """ def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, init_random_state: Optional[int] = None, n_runs: int = 10, @@ -52,6 +78,14 @@ def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size self.n_runs = n_runs def filter_gene(self, n_top_genes=3000): + """Filter genes to keep only highly variable genes common between datasets. + + Parameters + ---------- + n_top_genes : int + Number of top variable genes to select + + """ sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') @@ -159,6 +193,14 @@ def jsd(p, q): return np.nanmean(similarity_matrix) def compute_mmd(self) -> float: + """Compute Maximum Mean Discrepancy between datasets. + + Returns + ------- + float + Normalized MMD similarity score between 0 and 1 + + """ X = self.X Y = self.Y kernel = "rbf" @@ -196,8 +238,14 @@ def data_company(self): raise NotImplementedError("data company") def wasserstein_dist(self) -> float: - """Computes the average Wasserstein distance between all pairs of cells from the - two datasets.""" + """Compute Wasserstein distance between datasets. 
+ + Returns + ------- + float + Normalized Wasserstein similarity score between 0 and 1 + + """ X = self.X Y = self.Y a = np.ones((X.shape[0], )) / X.shape[0] @@ -272,6 +320,15 @@ def spectral_distance(self): return 1 / (1 + np.linalg.norm(eig_A - eig_B)) def get_dataset_meta_sim(self): + """Compute metadata similarity between datasets based on discrete and continuous + features. + + Returns + ------- + float + Average similarity score across all metadata features + + """ # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] con_cols = [ "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", @@ -357,11 +414,19 @@ def compute_similarity( 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" ] ) -> Dict[str, float]: - """Computes the specified similarity measure. Parameters: - - methods: List of similarity measures to be computed. Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' - Returns: - Dictionary containing the similarity matrices + """Compute multiple similarity metrics between datasets. + + Parameters + ---------- + random_state : int + Random seed for cell sampling + methods : List[str] + List of similarity methods to compute + + Returns + ------- + Dict[str, float] + Dictionary mapping method names to similarity scores """ self.adata1 = self.origin_adata1.copy() From 844d8839a7e43f0152e4ea362fe7737162125e23 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:10:16 +0800 Subject: [PATCH 193/203] add notes --- .../sc_similarity_examples/sim_query_atlas.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py index 8e78393c..a424df17 100644 --- a/examples/atlas/sc_similarity_examples/sim_query_atlas.py +++ b/examples/atlas/sc_similarity_examples/sim_query_atlas.py @@ -26,15 +26,26 @@ def find_unique_matching_row(df, config_col, input_dict_list): - """Find a unique matching row in DataFrame based on the specified column and input - dictionary list. - - :param df: pandas.DataFrame, containing the data to search. - :param config_col: str, name of the DataFrame column containing dictionary list - strings. - :param input_dict_list: list of dicts, input dictionary list for matching. - :return: pandas.Series, the matching row. - :raises ValueError: if the number of matching rows is not equal to 1. + """Find a unique matching row in DataFrame based on specified criteria. 
+ + Parameters + ---------- + df : pandas.DataFrame + DataFrame containing the data to search + config_col : str + Name of the DataFrame column containing dictionary list strings + input_dict_list : list of dict + Input dictionary list for matching + + Returns + ------- + pandas.Series + The matching row from the DataFrame + + Raises + ------ + ValueError + If the number of matching rows is not exactly one """ From 1512c01e9931c8ec39b4b688c1ae08ecaf93bc3b Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:20:20 +0800 Subject: [PATCH 194/203] add notes --- examples/atlas/upload_data.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/atlas/upload_data.py b/examples/atlas/upload_data.py index 2a6f3f8b..6713aff8 100644 --- a/examples/atlas/upload_data.py +++ b/examples/atlas/upload_data.py @@ -24,6 +24,25 @@ DROPBOX_DEST_PATH = args.dropbox_dest_path # Destination path on Dropbox def get_data(dataset_id, in_atlas=False, large=False): + """Load h5ad dataset from local path. + + Parameters + ---------- + dataset_id : str + Identifier for the dataset + in_atlas : bool + Whether dataset is from atlas (True) or query (False) + large : bool + Whether dataset is large (>10000 cells) requiring sampling + + Returns + ------- + AnnData + Loaded single cell data + Path + Local path to the data file + + """ if large: if in_atlas: local_path = MAINDIR / f"sampled-10000/{tissue}/{dataset_id}.h5ad" From 7549f1d28fcb57f7f5c9ff3e1c06f12a1be72d71 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:50:23 +0800 Subject: [PATCH 195/203] minor --- examples/tuning/joint_embedding_dcca/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index dd8f9f76..af76f595 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -122,8 +122,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], # feature_channel=["counts", "counts", None, None, "size_factors", # "size_factors"], label_channel="labels") - #TODO 感觉layers中的counts才是raw - #TODO 的确感觉layers中的counts才是raw,不知道反过来影响大不大 + # TODO Feels like counts in layers should be raw + # TODO Indeed feels like counts in layers should be raw, not sure how big the reverse impact would be (x_train, y_train, x_train_raw, y_train_raw, x_train_size, y_train_size), train_labels = data.get_train_data(return_type="torch") (x_test, y_test, x_test_raw, y_test_raw, x_test_size, @@ -201,7 +201,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) logger.info(f"Variable '{var}' does not exist, continuing...") torch.cuda.empty_cache() gc.collect() - #主要是报错时没有执行这些命令导致的,我感觉 + # This is mainly caused by these commands not being executed when errors occur, I think entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( From f0b8db47f24db4985efd7c3ce048711882e071fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:51:32 +0000 
Subject: [PATCH 196/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index af76f595..42465bcb 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 3c3f527c3f52ad61b0ec1e6863b235dcf2f9f825 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:55:00 +0800 Subject: [PATCH 197/203] minor --- examples/tuning/joint_embedding_scmogcn/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 39152202..14161f93 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper @@ -100,7 +100,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) # train_size = len(data.get_split_idx("train")) - #按理说meta1应该包括mod1前半部分的所有内容,可能中途打乱了顺序 + # In theory, meta1 should include all content from the first half of mod1, the order might have been shuffled during processing data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # data.set_config( From 93d96c1759869db34215b73eae9b4fb0e8eebffd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:55:54 +0000 Subject: [PATCH 198/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 14161f93..d860be0e 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper From e2bd54078403f0fea42f128a7613e947ad2d67f9 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:58:11 +0800 Subject: [PATCH 199/203] minor --- examples/tuning/joint_embedding_scmvae/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index e20be682..e4c0f3aa 100644 --- 
a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE @@ -134,7 +134,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + ) # This might be problematic, likely due to dimensionality reduction issues model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From 312ab95f01e9cf5eea66985774373ef6a7ee1951 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:58:28 +0800 Subject: [PATCH 200/203] minor --- examples/tuning/predict_modality_babel/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py index 112317f7..5f47bbbb 100644 --- a/examples/tuning/predict_modality_babel/main.py +++ b/examples/tuning/predict_modality_babel/main.py @@ -6,8 +6,8 @@ import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import ModalityPredictionDataset from dance.modules.multi_modality.predict_modality.babel import BabelWrapper @@ -77,7 +77,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test, y_test = data.get_test_data(return_type="torch") x_train, y_train, x_test, y_test = x_train.float(), y_train.float(), x_test.float(), y_test.float() # Train and evaluate the model - #突然想到,或许有些算法可以降维,而有些算法不能降维,所以还是要依据算法而定 + # Just realized some algorithms can do dimensionality reduction while others cannot, so it depends on the algorithm model = BabelWrapper(args, dim_in=x_train.shape[1], dim_out=y_train.shape[1]) model.fit(x_train, y_train, val_ratio=0.15) wandb.log({'rmse': model.score(x_test, y_test)}) From 5e51a664505f0704fa844eae00b0f074b832920e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:59:11 +0000 Subject: [PATCH 201/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- examples/tuning/predict_modality_babel/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index e4c0f3aa..5c1e264c 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py index 5f47bbbb..97062150 100644 --- a/examples/tuning/predict_modality_babel/main.py +++ 
b/examples/tuning/predict_modality_babel/main.py @@ -6,8 +6,8 @@ import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import ModalityPredictionDataset from dance.modules.multi_modality.predict_modality.babel import BabelWrapper From 1a98457e0b1498391e9f1bcf38c0b4491204755e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 17:03:45 +0800 Subject: [PATCH 202/203] minor --- .../joint_embedding_scmvae/main.py | 208 ------------------ 1 file changed, 208 deletions(-) delete mode 100644 examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py diff --git a/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py b/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py deleted file mode 100644 index 9fb85885..00000000 --- a/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py +++ /dev/null @@ -1,208 +0,0 @@ -import argparse -import gc -import os -import pprint -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -import torch -import torch.utils.data as data_utils -import wandb -from sklearn import preprocessing - -from dance import logger -from dance.datasets.multimodality import JointEmbeddingNIPSDataset -from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE -from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data -from dance.transforms.preprocess import calculate_log_library_size -from dance.utils import set_seed - - -def parameter_setting(): - parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") - - parser.add_argument("--workdir", "-wk", type=str, default="./new_test", help="work path") - parser.add_argument("--outdir", "-od", type=str, default="./new_test", help="Output path") - - parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") - parser.add_argument("--weight_decay", type=float, default=1e-6, help="weight decay") - parser.add_argument("--eps", type=float, default=0.01, help="eps") - parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") - - parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") - parser.add_argument('-seed', '--seed', type=int, default=1, help='Random seed for repeat results') - parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") - parser.add_argument("--max_epoch", "-me", type=int, default=25, help="Max epoches") - parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") - parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") - parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, - help="Epoch per test, must smaller than max iteration.") - parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") - parser.add_argument("-device", "--device", default="cuda") - parser.add_argument("--final_rate", type=float, default=1e-4) - parser.add_argument("--scale_factor", type=float, default=4) - - parser.add_argument("--cache", action="store_true", help="Cache processed data.") - parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) - parser.add_argument("--count", type=int, default=2) - parser.add_argument("--sweep_id", type=str, default=None) - parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", 
type=str) - parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) - - return parser - - -if __name__ == "__main__": - parser = parameter_setting() - args = parser.parse_args() - assert args.max_iteration > args.epoch_per_test - device = torch.device(args.device) - args.lr = 0.001 - args.anneal_epoch = 200 - res = None - logger.info(f"\n{pprint.pformat(vars(args))}") - file_root_path = Path(args.root_path, args.subtask).resolve() - logger.info(f"\n files is saved in {file_root_path}") - pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") - os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" - - def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): - wandb.init(settings=wandb.Settings(start_method='thread')) - set_seed(args.seed) - wandb_config = wandb.config - if "run_kwargs" in pipeline_planer.config: - if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): - wandb_config = wandb_config["run_kwargs"] - else: - wandb.log({"skip": 1}) - wandb.finish() - return - try: - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb_config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) - - # train_size=data.mod["meta1"].shape[0] - # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train", train_idx) - data.set_split_idx("test", test_idx) - (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch") - (x_test, y_test, x_test_raw, y_test_raw), labels = data.get_test_data(return_type="torch") - # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) - lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) - lib_mean1 = torch.from_numpy(lib_mean1) - lib_var1 = torch.from_numpy(lib_var1) - lib_mean2 = torch.from_numpy(lib_mean2) - lib_var2 = torch.from_numpy(lib_var2) - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - # train_size = len(data.get_split_idx("train")) - # train_size=x_train.shape[0] - train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], - lib_var2[train_idx], y_train) - - valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], - lib_var2[test_idx], y_test) - - total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = 
torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - - # embeds = model.predict(x_test, y_test).cpu().numpy() - score = model.score(x_test, y_test, labels) - # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - finally: - locals_keys = list(locals().keys()) - for var in locals_keys: - try: - exec(f"del {var}") - logger.info(f"Deleted '{var}'") - except NameError: - logger.info(f"Variable '{var}' does not exist, continuing...") - torch.cuda.empty_cache() - gc.collect() - # score.update({ - # 'seed': args.seed + k, - # 'subtask': args.subtask, - # 'method': 'scmvae', - # }) - - # if res is not None: - # res = res.append(score, ignore_index=True) - # else: - # for s in score: - # score[s] = [score[s]] - # res = pd.DataFrame(score) - - entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( - evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch - save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) - if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": - get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, - conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", - root_path=file_root_path, - required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", - "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") - if args.tune_mode == "pipeline_params": - run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) -"""To reproduce scMVAE on other samples, please refer to command lines belows: - -GEX-ADT: -$ python scmvae.py --subtask openproblems_bmmc_cite_phase2 --device cuda - -GEX-ATAC: -$ python scmvae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda - -""" From a19aa536eac2683f35292d30d967016259e066b9 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 21:38:00 +0800 Subject: [PATCH 203/203] update data --- dance/metadata/scdeepsort.csv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 39e41209..446cc2f9 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -170,13 +170,13 @@ human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4b human,Kidney,10000,train,,,train_human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad,https://www.dropbox.com/scl/fi/feklth6jvnc5qqwvgaydy/human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad?rlkey=28vpy2m90lnri9aekfthrsvr1&dl=1 
human,Kidney,5848,train,,,train_human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad,https://www.dropbox.com/scl/fi/1jq1wrqo1rcl041antcm8/human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad?rlkey=ssgfsiobqfah3pxgqnrsaff6l&dl=1 human,Kidney,9641,train,,,train_human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad,https://www.dropbox.com/scl/fi/o2cnntkrd5j6coeqehv8b/human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad?rlkey=5tbupfd3cdvqzy2rix6scvwzu&dl=1 -human,Lung,10000,train,,,train_human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/w0n6axa32nej87tw4rk49/human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=8lgoi54y9wtxtfwpmnumpmzex&dl=1 +human,Lung,10000,train,,,train_human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/w0n6axa32nej87tw4rk49/human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data-Lung.h5ad?rlkey=8lgoi54y9wtxtfwpmnumpmzex&st=1ofxszp7&dl=1 human,Lung,10000,train,,,train_human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad,https://www.dropbox.com/scl/fi/dqhei15s96dg3q8bdd31b/human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad?rlkey=ykpxbucys97t327fwehflkoa2&dl=1 human,Lung,10000,train,,,train_human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad,https://www.dropbox.com/scl/fi/pwhyse079mo9radk2xzuw/human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad?rlkey=t60bp7w5mf3k877q1i430oc14&dl=1 human,Lung,10000,train,,,train_human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad,https://www.dropbox.com/scl/fi/w2r13kqrkzdxecvhizm0i/human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad?rlkey=6s4wbv2ii1d8ged5l8s8lwt6l&dl=1 -human,Lung,10000,train,,,train_human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad,https://www.dropbox.com/scl/fi/ubcw0cyn5uvaq034ysgxl/human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad?rlkey=m57pb8bx4936fnao2yyljqdgz&dl=1 -human,Lung,10000,train,,,train_human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/0vqe7wmb0afoubwnb5srb/human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=7necb5o9afgpnppsj74tga5y2&dl=1 -human,Lung,10000,train,,,train_human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/sbe6h2v5dijlu36qd6nyw/human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=gunxweprd7r8e0xlk9mo2kkv3&dl=1 +human,Lung,10000,train,,,train_human_Lung4ed927e9-c099-49af-b8ce-a2652d069333(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/ubcw0cyn5uvaq034ysgxl/human_Lung4ed927e9-c099-49af-b8ce-a2652d069333-Lung-_data.h5ad?rlkey=m57pb8bx4936fnao2yyljqdgz&st=hz86nc4q&dl=1 +human,Lung,10000,train,,,train_human_Lung01209dce-3575-4bed-b1df-129f57fbc031(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/0vqe7wmb0afoubwnb5srb/human_Lung01209dce-3575-4bed-b1df-129f57fbc031-Lung-_data.h5ad?rlkey=7necb5o9afgpnppsj74tga5y2&st=1a1tjz2c&dl=1 +human,Lung,10000,train,,,train_human_Lungc5d88abe-f23a-45fa-a534-788985e93dad(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/sbe6h2v5dijlu36qd6nyw/human_Lungc5d88abe-f23a-45fa-a534-788985e93dad-Lung-_data.h5ad?rlkey=gunxweprd7r8e0xlk9mo2kkv3&st=yn1er34y&dl=1 human,Lung,10000,train,,,train_human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad,https://www.dropbox.com/scl/fi/mz6umlbnjoxynhqklwyxg/human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad?rlkey=upom03ch71gebjvxq15x59gk9&dl=1 
human,Lung,10000,train,,,train_human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad,https://www.dropbox.com/scl/fi/b2e6gr542wah0t5xtgshh/human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad?rlkey=7pkq1kh7wz6z0qzj4wdj94i79&dl=1 human,Lung,10000,train,,,train_human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad,https://www.dropbox.com/scl/fi/ymmdfevzihlcyjosugyuq/human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad?rlkey=71rly1fkb21yl8gxy8af42ke7&dl=1