From 2f09a8565a51bed1c028ef6d83bd717f0fefa5c3 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 11:23:25 +0800 Subject: [PATCH 001/203] minor change --- dance/pipeline.py | 4 ++-- dance/transforms/filter.py | 4 ++-- dance/transforms/graph/dstg_graph.py | 14 +++++++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index d831ab10..8a5d02d8 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1086,9 +1086,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. - step3_start_k = default(step2_pipeline_planer.config.step3_start_k, 0) + step3_start_k = step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids + step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 78e6d83f..1388d86d 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -145,9 +145,9 @@ def prepCounts(self, x): elif self._FILTER_TARGET == "cells": n_counts = np.sum(x, axis=1) if isinstance(self.min_counts, float) and 0 <= self.min_counts <= 1: - min_counts = np.percentile(n_counts, self.min_counts) + min_counts = np.percentile(n_counts, self.min_counts * 100) else: - max_counts = np.percentile(n_counts, self.max_counts) + max_counts = np.percentile(n_counts, self.max_counts * 100) return min_counts, max_counts else: return self.min_counts, self.max_counts diff --git a/dance/transforms/graph/dstg_graph.py b/dance/transforms/graph/dstg_graph.py index b261fede..e51a3393 100644 --- a/dance/transforms/graph/dstg_graph.py +++ b/dance/transforms/graph/dstg_graph.py @@ -1,3 +1,5 @@ +from typing import Sequence + import networkx as nx import numpy as np import pandas as pd @@ -32,17 +34,23 @@ class DSTGraph(BaseTransform): _DISPLAY_ATTRS = ("k_filter", "num_cc", "ref_split", "inf_split") - def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", **kwargs): + def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", + channels: Sequence[str | None] = (None, None), channel_types: Sequence[str | None] = ("obsm", "obsm"), + **kwargs): super().__init__(**kwargs) self.k_filter = k_filter self.num_cc = num_cc self.ref_split = ref_split self.inf_split = inf_split + self.channels = channels + self.channel_types = channel_types def __call__(self, data): - x_ref = data.get_feature(return_type="numpy", split_name=self.ref_split) - x_inf = data.get_feature(return_type="numpy", split_name=self.inf_split) + x_ref = data.get_feature(return_type="numpy", split_name=self.ref_split, channel=self.channels[0], + channel_type=self.channel_types[0]) + x_inf = data.get_feature(return_type="numpy", split_name=self.inf_split, 
channel=self.channels[1], + channel_type=self.channel_types[1]) adj = compute_dstg_adj(x_ref, x_inf, k_filter=self.k_filter, num_cc=self.num_cc) data.data.obsp[self.out] = adj From 15c29dbd0e1b79f6f213b45687d94b09ae9b77d4 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 11:29:06 +0800 Subject: [PATCH 002/203] minor change --- dance/pipeline.py | 4 ++-- dance/transforms/filter.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index d831ab10..542e32a1 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1086,9 +1086,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. - step3_start_k = default(step2_pipeline_planer.config.step3_start_k, 0) + step3_start_k=step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids + step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 78e6d83f..fe420093 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -145,9 +145,9 @@ def prepCounts(self, x): elif self._FILTER_TARGET == "cells": n_counts = np.sum(x, axis=1) if isinstance(self.min_counts, float) and 0 <= self.min_counts <= 1: - min_counts = np.percentile(n_counts, self.min_counts) + min_counts = np.percentile(n_counts, self.min_counts*100) else: - max_counts = np.percentile(n_counts, self.max_counts) + max_counts = np.percentile(n_counts, self.max_counts*100) return min_counts, max_counts else: return self.min_counts, self.max_counts From 675e0bfdfef65bb8e60bce73852fe32defd959cb Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 11:33:53 +0800 Subject: [PATCH 003/203] minor change --- dance/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index 59a7aa28..8a5d02d8 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1086,10 +1086,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. 
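Aside on the FilterScanpy percentile fix in the filter.py hunks above: numpy's percentile expects q on a 0-100 scale, so a fractional threshold has to be multiplied by 100 before the lookup. A minimal sketch with toy counts — the variable names are illustrative, not taken from the codebase:

import numpy as np

n_counts = np.array([120, 300, 80, 950, 410])   # toy per-cell total counts
min_counts = 0.05                                # meant as "bottom 5% of cells"

# np.percentile expects q in [0, 100]; a fraction in [0, 1] must be scaled by 100.
threshold = np.percentile(n_counts, min_counts * 100)
keep = n_counts >= threshold                     # cells passing the filter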
- step3_start_k=step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 + step3_start_k = step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) From 1eb305c43a8c73a39fff64db83c93dc06e35d980 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 14:50:00 +0800 Subject: [PATCH 004/203] minor changes --- dance/datasets/multimodality.py | 11 + .../multi_modality/joint_embedding/dcca.py | 7 +- dance/pipeline.py | 10 +- dance/transforms/cell_feature.py | 18 +- dance/transforms/filter.py | 77 +++++-- dance/transforms/misc.py | 22 ++ dance/transforms/normalize.py | 50 ++++- dance/utils/wrappers.py | 50 +++++ .../multi_modality/joint_embedding/dcca.py | 3 +- examples/tuning/joint_embedding_dcca/main.py | 202 ++++++++++++++++++ .../tuning/predict_modality_babel/main.py | 114 ++++++++++ 11 files changed, 533 insertions(+), 31 deletions(-) create mode 100644 examples/tuning/joint_embedding_dcca/main.py create mode 100644 examples/tuning/predict_modality_babel/main.py diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 477aff94..7df32f7d 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -7,7 +7,10 @@ import mudata as md import numpy as np import scanpy as sc +import scipy import scipy.sparse as sp +import sklearn +from sklearn.utils import issparse from dance import logger from dance.data import Data @@ -572,6 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) + self.to_array([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -581,6 +585,13 @@ def _raw_to_dance(self, raw_data): return data + def to_array(self, datas): + for data in datas: + if scipy.sparse.issparse(data.X): + data.X = np.array(data.X.todense()).astype(float) + if "counts" in data.layers and scipy.sparse.issparse(data.layers["counts"]): + data.layers["counts"] = np.array(data.layers["counts"].todense()).astype(float) + def _maybe_preprocess(self, raw_data): if self.preprocess is None: return raw_data diff --git a/dance/modules/multi_modality/joint_embedding/dcca.py b/dance/modules/multi_modality/joint_embedding/dcca.py index b5356ada..bb6d9a69 100644 --- a/dance/modules/multi_modality/joint_embedding/dcca.py +++ b/dance/modules/multi_modality/joint_embedding/dcca.py @@ -11,6 +11,7 @@ import collections import math import os +import sys import time import warnings from collections import OrderedDict @@ -385,7 +386,7 @@ def fit(self, train_loader, test_loader, total_loader, model_pre, args, criterio train_loss_list = [] reco_epoch_test = 0 - test_like_max = 100000 + test_like_max = sys.maxsize flag_break = 0 patience_epoch = 0 @@ -394,7 +395,7 @@ def fit(self, train_loader, test_loader, total_loader, 
model_pre, args, criterio model_pre.eval() start = time.time() - + best_dict = None for epoch in range(1, args.max_epoch + 1): self.train() @@ -636,7 +637,7 @@ def fit(self, train_loader, test_loader, total_loader, model_pre, args, criterio break duration = time.time() - start - self.load_state_dict(best_dict) + self.load_state_dict(best_dict if best_dict is not None else self.state_dict()) print('Finish training, total time is: ' + str(duration) + 's') self.eval() diff --git a/dance/pipeline.py b/dance/pipeline.py index d831ab10..09595c52 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1056,6 +1056,12 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for target, d_p in p1.default_params.items(): if target == p2["target"]: p2["params"] = d_p + for p1, p2 in zip(step2_pipeline_planer.config.pipeline, pipeline): #need order + if "params" in p1: + for key, value in p1.params.items(): + if "params" not in p2: + p2.params = {} + p2.params[key] = value temp_conf = conf.copy() temp_conf.pipeline = pipeline temp_conf.wandb = step2_pipeline_planer.config.wandb @@ -1086,9 +1092,9 @@ def run_step3(root_path, evaluate_pipeline, step2_pipeline_planer: PipelinePlane step3_k = default(step2_pipeline_planer.config.parameter_tuning_freq_n, DEFAULT_PARAMETER_TUNING_FREQ_N) # Skip some of the already run step3 because in pandas, when you sort columns with exactly the same values, the results are not random. # Instead, pandas preserves the order of the original data. So we can skip it without causing any impact. - step3_start_k = default(step2_pipeline_planer.config.step3_start_k, 0) + step3_start_k = step2_pipeline_planer.config.step3_start_k if "step3_start_k" in step2_pipeline_planer.config else 0 #Some sweep_ids of step3 that have already been run - step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids + step3_sweep_ids = step2_pipeline_planer.config.step3_sweep_ids if "step3_sweep_ids" in step2_pipeline_planer.config else None step3_sweep_ids = [None] * (pipeline_top_k - step3_start_k) if step3_sweep_ids is None else ( step3_sweep_ids + [None] * (pipeline_top_k - step3_start_k - len(step3_sweep_ids))) diff --git a/dance/transforms/cell_feature.py b/dance/transforms/cell_feature.py index 465a12b0..4e90c97d 100644 --- a/dance/transforms/cell_feature.py +++ b/dance/transforms/cell_feature.py @@ -8,9 +8,11 @@ from dance.typing import Optional, Union from dance.utils.matrix import normalize from dance.utils.status import deprecated +from dance.utils.wrappers import add_mod_and_transform @register_preprocessor("feature", "cell") +@add_mod_and_transform class WeightedFeaturePCA(BaseTransform): """Compute the weighted gene PCA as cell features. @@ -66,6 +68,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") +@add_mod_and_transform class WeightedFeatureSVD(BaseTransform): """Compute the weighted gene SVD as cell features. @@ -127,6 +130,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") +@add_mod_and_transform class CellPCA(BaseTransform): """Reduce cell feature matrix with PCA. 
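For context on the cell-feature transforms being decorated above, a CellPCA-style reduction is essentially PCA on the cell-by-gene matrix with the result stored in obsm. A rough standalone sketch — the toy data and the obsm/varm key names are assumptions, not the transform's actual output keys:

import anndata as ad
import numpy as np
from sklearn.decomposition import PCA

adata = ad.AnnData(np.random.rand(100, 2000).astype(np.float32))  # toy cells x genes

pca = PCA(n_components=50)
adata.obsm["cell_pca"] = pca.fit_transform(adata.X)      # per-cell reduced features
adata.varm["gene_loadings"] = pca.components_.T           # per-gene loadings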
@@ -145,10 +149,9 @@ def __init__(self, n_components: Union[float, int] = 400, *, channel: Optional[s self.n_components = n_components self.channel = channel - self.mod = mod def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel) if self.n_components > min(feat.shape): self.logger.warning( f"n_components={self.n_components} must be between 0 and min(n_samples, n_features)={min(feat.shape)} with svd_solver='full'" @@ -167,6 +170,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") +@add_mod_and_transform class CellSVD(BaseTransform): """Reduce cell feature matrix with SVD. @@ -185,10 +189,9 @@ def __init__(self, n_components: Union[float, int] = 400, *, channel: Optional[s self.n_components = n_components self.channel = channel - self.mod = mod def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel) if isinstance(self.n_components, float): n_components = min(feat.shape) - 1 svd = TruncatedSVD(n_components=n_components) @@ -215,7 +218,8 @@ def __call__(self, data): @register_preprocessor("feature", "cell") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FeatureCellPlaceHolder(BaseTransform): """Used as a placeholder to skip the process. @@ -229,13 +233,12 @@ class FeatureCellPlaceHolder(BaseTransform): def __init__(self, n_components: int = 400, *, channel: Optional[str] = None, mod: Optional[str] = None, **kwargs): super().__init__(**kwargs) self.channel = channel - self.mod = mod self.logger.info( "n_components in FeatureCellPlaceHolder is used to make the parameters consistent and will not have any actual effect." 
) def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel) cell_feat = feat gene_feat = feat.T data.data.obsm[self.out] = cell_feat @@ -305,6 +308,7 @@ def __call__(self, data): @register_preprocessor("feature", "cell") # NOTE: register any custom preprocessing function to be used for tuning +@add_mod_and_transform class GaussRandProjFeature(BaseTransform): """Custom preprocessing to extract cell feature via Gaussian random projection.""" diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 78e6d83f..8ec0ac1f 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -3,6 +3,7 @@ from typing import get_args import anndata as ad +import mudata as md import numpy as np import pandas as pd import scanpy as sc @@ -20,6 +21,7 @@ from dance.typing import Dict, GeneSummaryMode, List, Literal, Logger, Optional, Tuple, Union from dance.utils import default from dance.utils.status import deprecated +from dance.utils.wrappers import add_mod_and_transform def get_count(count_or_ratio: Optional[Union[float, int]], total: int) -> Optional[int]: @@ -48,6 +50,7 @@ def get_count(count_or_ratio: Optional[Union[float, int]], total: int) -> Option @register_preprocessor("filter") +@add_mod_and_transform class FilterScanpy(BaseTransform): """Scanpy filtering transformation with additional options.""" @@ -145,9 +148,9 @@ def prepCounts(self, x): elif self._FILTER_TARGET == "cells": n_counts = np.sum(x, axis=1) if isinstance(self.min_counts, float) and 0 <= self.min_counts <= 1: - min_counts = np.percentile(n_counts, self.min_counts) + min_counts = np.percentile(n_counts, self.min_counts * 100) else: - max_counts = np.percentile(n_counts, self.max_counts) + max_counts = np.percentile(n_counts, self.max_counts * 100) return min_counts, max_counts else: return self.min_counts, self.max_counts @@ -268,6 +271,31 @@ def __init__( inplace=inplace, **kwargs) +@register_preprocessor("filter", "cell") +@add_mod_and_transform +class FilterCellsCommonMod(BaseTransform): + + def __init__(self, mod1: str, mod2: str, sol: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + self.mod1 = mod1 + self.mod2 = mod2 + self.sol = sol + + def __call__(self, data: Data): + md_data = data.data + data_mod1 = md_data.mod[self.mod1] + data_mod2 = md_data.mod[self.mod2] + common_cells = list(set(data_mod1.obs.index) & set(data_mod2.obs.index)) + data_mod1 = data_mod1[common_cells, :] + data_mod2 = data_mod2[common_cells, :] + data.data.mod[self.mod1] = data_mod1 + data.data.mod[self.mod2] = data_mod2 + if self.sol is not None: + test_sol = md_data.mod[self.sol] + test_sol = test_sol[common_cells, :] + data.data.mod[self.sol] = test_sol + + @register_preprocessor("filter", "gene") class FilterGenesCommon(BaseTransform): """Filter genes by taking the common genes across batches or splits. @@ -472,6 +500,7 @@ def __call__(self, data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesPercentile(FilterGenes): """Filter genes based on percentiles of the summarized gene expressions. @@ -540,6 +569,7 @@ def _get_preserve_mask(self, gene_summary): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesTopK(FilterGenes): """Select top/bottom genes based on the summarized gene expressions. 
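The new FilterCellsCommonMod above amounts to intersecting cell barcodes across two modalities and subsetting each one to the shared cells; a standalone sketch of that idea with synthetic AnnData objects (all names illustrative):

import anndata as ad
import numpy as np

mod1 = ad.AnnData(np.ones((4, 5), dtype=np.float32))
mod2 = ad.AnnData(np.ones((3, 8), dtype=np.float32))
mod1.obs_names = ["cellA", "cellB", "cellC", "cellD"]
mod2.obs_names = ["cellB", "cellD", "cellE"]

# Keep only cells present in both modalities (sorted here for determinism).
common = sorted(set(mod1.obs_names) & set(mod2.obs_names))
mod1, mod2 = mod1[common, :], mod2[common, :]   # AnnData supports obs-name indexing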
@@ -708,6 +738,7 @@ def __call__(self, data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesRegression(BaseTransform): """Select genes based on regression. @@ -733,18 +764,19 @@ class FilterGenesRegression(BaseTransform): _DISPLAY_ATTRS = ("num_genes", ) def __init__(self, method: str = "enclasc", num_genes: int = 1000, *, channel: Optional[str] = None, - mod: Optional[str] = None, skip_count_check: bool = False, inplace=True, **kwargs): + channel_type: Optional[str] = None, mod: Optional[str] = None, skip_count_check: bool = False, + inplace=True, **kwargs): super().__init__(**kwargs) self.num_genes = num_genes self.channel = channel - self.mod = mod self.method = method self.skip_count_check = skip_count_check self.inplace = inplace + self.channel_type = channel_type def __call__(self, data): - feat = data.get_feature(return_type="numpy", channel=self.channel, mod=self.mod) + feat = data.get_feature(return_type="numpy", channel=self.channel, channel_type=self.channel_type) if not self.skip_count_check and np.mod(feat, 1).sum(): warnings.warn("Expecting count data as input, but the input feature matrix does not appear to be count." @@ -995,6 +1027,7 @@ def gini_func(x, weights=None): @register_preprocessor("filter", "gene") +@add_mod_and_transform class FilterGenesScanpyOrder(BaseTransform): """Scanpy filtering gene transformation with additional options. @@ -1084,6 +1117,7 @@ def __call__(self, data: Data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class HighlyVariableGenesRawCount(AnnDataTransform): """Filter for highly variable genes using raw count matrix. @@ -1120,9 +1154,10 @@ class HighlyVariableGenesRawCount(AnnDataTransform): """ - def __init__(self, layer: Optional[str] = None, n_top_genes: Optional[int] = 1000, span: Optional[float] = 0.3, - subset: bool = True, inplace: bool = True, batch_key: Optional[str] = None, check_values: bool = True, - **kwargs): + def __init__(self, channel: Optional[str] = None, channel_type: Optional[str] = None, + n_top_genes: Optional[int] = 1000, span: Optional[float] = 0.3, subset: bool = True, + inplace: bool = True, batch_key: Optional[str] = None, check_values: bool = True, **kwargs): + layer = channel if channel_type == "layers" else None super().__init__(sc.pp.highly_variable_genes, layer=layer, n_top_genes=n_top_genes, batch_key=batch_key, check_values=check_values, span=span, subset=subset, inplace=inplace, flavor="seurat_v3", **kwargs) @@ -1158,6 +1193,7 @@ def __call__(self, data): @register_preprocessor("filter", "gene") +@add_mod_and_transform class HighlyVariableGenesLogarithmizedByTopGenes(AnnDataTransform): """Filter for highly variable genes based on top genes. 
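The channel/channel_type plumbing added to the highly-variable-gene transforms above only decides which matrix scanpy reads; stripped down, the call looks roughly like this (toy counts, layer name made up; the seurat_v3 flavor additionally needs scikit-misc installed):

import anndata as ad
import numpy as np
import scanpy as sc

adata = ad.AnnData(np.random.poisson(1.0, size=(200, 500)).astype(np.float32))
adata.layers["counts"] = adata.X.copy()

channel, channel_type = "counts", "layers"
layer = channel if channel_type == "layers" else None   # same mapping as in the patch

# seurat_v3 works on raw counts and keeps only the selected genes when subset=True.
sc.pp.highly_variable_genes(adata, layer=layer, n_top_genes=100,
                            flavor="seurat_v3", subset=True)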
@@ -1197,16 +1233,19 @@ class HighlyVariableGenesLogarithmizedByTopGenes(AnnDataTransform): """ - def __init__(self, layer: Optional[str] = None, n_top_genes: Optional[int] = 1000, n_bins: int = 20, - flavor: Literal["seurat", "cell_ranger"] = "seurat", subset: bool = True, inplace: bool = True, - batch_key: Optional[str] = None, **kwargs): + def __init__(self, channel: Optional[str] = None, channel_type: Optional[str] = None, + n_top_genes: Optional[int] = 1000, n_bins: int = 20, flavor: Literal["seurat", + "cell_ranger"] = "seurat", + subset: bool = True, inplace: bool = True, batch_key: Optional[str] = None, **kwargs): + layer = channel if channel_type == "layers" else None super().__init__(sc.pp.highly_variable_genes, layer=layer, n_top_genes=n_top_genes, n_bins=n_bins, flavor=flavor, subset=subset, inplace=inplace, batch_key=batch_key, **kwargs) self.logger.info("Expects logarithmized data") @register_preprocessor("filter", "gene") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FilterGenesPlaceHolder(BaseTransform): """Used as a placeholder to skip the process.""" @@ -1237,10 +1276,11 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("filter", "gene") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FilterGenesNumberPlaceHolder(BaseTransform): - def __init__(self, **kwargs): + def __init__(self, channel=None, channel_type=None, **kwargs): super().__init__(**kwargs) def __call__(self, data: Data) -> Data: @@ -1248,6 +1288,7 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("filter", "gene") +@add_mod_and_transform class HighlyVariableGenesLogarithmizedByMeanAndDisp(AnnDataTransform): """Filter for highly variable genes based on mean and dispersion. @@ -1293,10 +1334,12 @@ class HighlyVariableGenesLogarithmizedByMeanAndDisp(AnnDataTransform): """ - def __init__(self, layer: Optional[str] = None, min_disp: Optional[float] = 0.5, max_disp: Optional[float] = np.inf, + def __init__(self, channel: Optional[str] = None, channel_type: Optional[str] = None, + min_disp: Optional[float] = 0.5, max_disp: Optional[float] = np.inf, min_mean: Optional[float] = 0.0125, max_mean: Optional[float] = 3, n_bins: int = 20, flavor: Literal["seurat", "cell_ranger"] = "seurat", subset: bool = True, inplace: bool = True, batch_key: Optional[str] = None, **kwargs): + layer = channel if channel_type == "layers" else None super().__init__(sc.pp.highly_variable_genes, layer=layer, min_disp=min_disp, max_disp=max_disp, min_mean=min_mean, max_mean=max_mean, n_bins=n_bins, flavor=flavor, subset=subset, inplace=inplace, batch_key=batch_key, **kwargs) @@ -1304,7 +1347,8 @@ def __init__(self, layer: Optional[str] = None, min_disp: Optional[float] = 0.5, @register_preprocessor("filter", "cell") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class FilterCellsPlaceHolder(BaseTransform): """Used as a placeholder to skip the process.""" @@ -1335,6 +1379,7 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("filter", "cell") +@add_mod_and_transform class FilterCellsScanpyOrder(BaseTransform): """Scanpy filtering cell transformation with additional options. 
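For the cell-level filters above, scanpy accepts only one threshold per call, which is why an ordered sequence of criteria turns into one filter_cells call each; the thresholds below are arbitrary toy values:

import anndata as ad
import numpy as np
import scanpy as sc

adata = ad.AnnData(np.random.poisson(1.0, size=(300, 400)).astype(np.float32))

sc.pp.filter_cells(adata, min_counts=200)   # each call applies a single criterion
sc.pp.filter_cells(adata, min_genes=50)
sc.pp.filter_cells(adata, max_counts=10_000)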
diff --git a/dance/transforms/misc.py b/dance/transforms/misc.py index fa765928..8b47c8b5 100644 --- a/dance/transforms/misc.py +++ b/dance/transforms/misc.py @@ -1,6 +1,10 @@ from pprint import pformat +from typing import Optional + +import mudata as md from dance import logger +from dance.data.base import Data from dance.registry import register_preprocessor from dance.transforms.base import BaseTransform from dance.typing import Any, Dict, Tuple @@ -153,3 +157,21 @@ def __init__(self, *, split_name: str, **kwargs): def __call__(self, data): self.logger.info("Popping split: {self.split_name!r}") data.pop(split_name=self.split_name) + + +@register_preprocessor("misc") +class AlignMod(BaseTransform): + """Aligning mods and metadata in multimodal data.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, data: Data) -> Data: + mod1, mod2, meta1, meta2, test_sol = data.data.mod.values() + meta1 = meta1[:, mod1.var.index] + meta2 = meta2[:, mod2.var.index] + test_sol = test_sol[:, mod1.var.index] + data.data.mod["meta1"] = meta1 + data.data.mod["meta2"] = meta2 + data.data.mod["test_sol"] = test_sol + return data diff --git a/dance/transforms/normalize.py b/dance/transforms/normalize.py index f2deedae..b0fb7af6 100644 --- a/dance/transforms/normalize.py +++ b/dance/transforms/normalize.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import scanpy as sc +import scipy import scipy.sparse as sp import statsmodels.discrete.discrete_model import statsmodels.nonparametric.kernel_regression @@ -18,9 +19,11 @@ from dance.typing import Dict, Iterable, List, Literal, LogLevel, NormMode, Number, Optional, Union from dance.utils.matrix import normalize from dance.utils.status import deprecated +from dance.utils.wrappers import add_mod_and_transform @register_preprocessor("normalize") +@add_mod_and_transform class ScaleFeature(BaseTransform): """Scale the feature matrix in the AnnData object. @@ -169,6 +172,37 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("normalize") +@add_mod_and_transform +class tfidfTransform(BaseTransform): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.idf = None + self.fitted = False + + def fit(self, X): + self.idf = X.shape[0] / X.sum(axis=0) + self.fitted = True + + def transform(self, X): + if not self.fitted: + raise RuntimeError('Transformer was not fitted on any data') + if scipy.sparse.issparse(X): + tf = X.multiply(1 / X.sum(axis=1)) + return tf.multiply(self.idf) + else: + tf = X / X.sum(axis=1, keepdims=True) + return tf * self.idf + + def __call__(self, data): + X = data.data.X + self.fit(X) + data.data.X = self.transform(X) + return data + + +@register_preprocessor("normalize") +@add_mod_and_transform class ScTransform(BaseTransform): """ScTransform normalization and variance stabiliation. 
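A compact dense-only version of the tfidfTransform added above (the patched class also handles the sparse case); the toy count matrix is made up:

import numpy as np

X = np.array([[3., 0., 1., 2.],
              [1., 2., 0., 4.],
              [0., 1., 5., 1.]])             # 3 cells x 4 peaks, toy counts

idf = X.shape[0] / X.sum(axis=0)             # inverse document frequency per peak
tf = X / X.sum(axis=1, keepdims=True)        # term frequency within each cell
X_tfidf = tf * idf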
@@ -399,7 +433,8 @@ def __call__(self, data: Data): z[gn[genes_step1]] = 1 w = pd.Series(index=gn, data=np.zeros(gn.size, dtype='int')) - w[gn] = genes_log_gmean + # w[gn] = genes_log_gmean + w[gn] = genes_log_gmean.astype(int) #need to think selected_data.var['genes_step1_sct'] = z selected_data.var['log10_gmean_sct'] = w @@ -453,6 +488,8 @@ def _parallel_init(igenes_bin_regress, iumi_bin, ign, imm, ips): def _parallel_wrapper(j): name = gn[genes_bin_regress[j]] y = umi_bin[:, j].A.flatten() + y[np.isinf(y) | np.isnan(y)] = 0 + mm[np.isinf(mm) | np.isnan(mm)] = 0 pr = statsmodels.discrete.discrete_model.Poisson(y, mm) res = pr.fit(disp=False) mu = res.predict() @@ -490,6 +527,7 @@ def info(n, th, mu, y, w): @register_preprocessor("normalize") +@add_mod_and_transform class Log1P(AnnDataTransform): """Logarithmize the data matrix. @@ -527,6 +565,7 @@ def __init__(self, base: Optional[Number] = None, copy: bool = False, chunked: b @register_preprocessor("normalize") +@add_mod_and_transform class NormalizeTotal(AnnDataTransform): """Normalize counts per cell. @@ -583,9 +622,15 @@ def __init__(self, target_sum: Optional[float] = None, max_fraction: float = 0.0 if max_fraction == 1.0: self.logger.info("max_fraction set to 1.0, this is equivalent to setting exclude_highly_expressed=False.") + def __call__(self, data): + if scipy.sparse.issparse(data.data.X): + data.data.X = np.array(data.data.X.todense()) + return super().__call__(data) + @register_preprocessor("normalize") -@deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") +@add_mod_and_transform +# @deprecated(msg="will be replaced by builtin bypass mechanism in pipeline") class NormalizePlaceHolder(BaseTransform): """Used as a placeholder to skip the process.""" @@ -597,6 +642,7 @@ def __call__(self, data: Data) -> Data: @register_preprocessor("normalize") +@add_mod_and_transform class NormalizeTotalLog1P(BaseTransform): """Normalize total counts followed by log1p transformation. 
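The normalization tweaks above (densifying before normalize_total, and the combined total+log1p transform) map onto plain scanpy calls roughly as below; the toy data is made up, and the densify step is carried over from the patch rather than being a general scanpy requirement:

import anndata as ad
import numpy as np
import scanpy as sc
import scipy.sparse as sp

adata = ad.AnnData(sp.random(100, 50, density=0.2, format="csr", dtype=np.float32))

if sp.issparse(adata.X):                      # mirror of the patched NormalizeTotal.__call__
    adata.X = np.asarray(adata.X.todense())

sc.pp.normalize_total(adata, target_sum=1e4)  # library-size normalization per cell
sc.pp.log1p(adata)                            # followed by log(1 + x)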
diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index eb20eff7..11314694 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -1,11 +1,15 @@ import datetime import functools import time +from typing import Union +import anndata +import mudata import numpy as np import torch from dance import logger +from dance.data.base import Data from dance.typing import Any, Callable @@ -85,3 +89,49 @@ def wrapped_func(*args): return func(*new_args) return wrapped_func + + +import functools + + +def add_mod_and_transform(cls): + original_init = cls.__init__ + original_call = cls.__call__ + cls.add_mod_and_transform = "add_mod_and_transform" + + @functools.wraps(original_init) + def new_init(self, *args, **kwargs): + mod = kwargs.pop('mod', None) + original_init(self, *args, **kwargs) + self.mod = mod + + @functools.wraps(original_call) + def new_call(self, data: Data, *args, **kwargs): + if hasattr(self, 'mod') and self.mod is not None: + md_data = data.data + ad_data = Data(data=transform_mod_to_anndata(md_data, self.mod)) + res = original_call(self, ad_data, *args, **kwargs) + data.data.mod[self.mod] = ad_data.data + else: + return original_call(self, data, *args, **kwargs) + + cls.__init__ = new_init + cls.__call__ = new_call + return cls + + +def transform_mod_to_anndata(mod_data: mudata.MuData, mod_key: str): + return mod_data.mod[mod_key] + + +# 使用装饰器 +@add_mod_and_transform +class MyClass: + + def __init__(self, x, **kwargs): + self.x = x + print("-------") + print(**kwargs) + + def __call__(self, data): + return data diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 73668160..beefdeb0 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -71,7 +71,8 @@ def parameter_setting(): le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense(), 1) / 100 - data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense(), 1) / 100 data.mod["mod1"].obsm["labels"] = labels data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py new file mode 100644 index 00000000..86d6d1f9 --- /dev/null +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -0,0 +1,202 @@ +import argparse +import gc +import os +import pprint +import sys +from pathlib import Path + +import anndata as ad +import numpy as np +import pandas as pd +import scipy +import torch +import torch.utils.data as data_utils +from sklearn import preprocessing + +import dance.utils.metrics as metrics +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.dcca import DCCA +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.utils import set_seed + + +def parameter_setting(): + parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") + + parser.add_argument("--latent_fusion", "-olf1", type=str, default="First_simulate_fusion.csv", + help="fusion latent code file") + parser.add_argument("--latent_1", 
"-ol1", type=str, default="scRNA_latent_combine.csv", + help="first latent code file") + parser.add_argument("--latent_2", "-ol2", type=str, default="scATAC_latent.csv", help="seconde latent code file") + parser.add_argument("--denoised_1", "-od1", type=str, default="scRNA_seq_denoised.csv", + help="outfile for denoised file1") + parser.add_argument("--normalized_1", "-on1", type=str, default="scRNA_seq_normalized_combine.tsv", + help="outfile for normalized file1") + parser.add_argument("--denoised_2", "-od2", type=str, default="scATAC_seq_denoised.csv", + help="outfile for denoised file2") + + parser.add_argument("--workdir", "-wk", type=str, default="./new_test/", help="work path") + parser.add_argument("--outdir", "-od", type=str, default="./new_test/", help="Output path") + + parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") + parser.add_argument("--weight_decay", type=float, default=1e-6, help="weight decay") + parser.add_argument("--eps", type=float, default=0.01, help="eps") + + parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") + + parser.add_argument("--seed", type=int, default=1, help="Random seed for repeat results") + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") + parser.add_argument("--max_epoch", "-me", type=int, default=10, help="Max epoches") + parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") + parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") + parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") + parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("--final_rate", type=float, default=1e-4) + parser.add_argument("--scale_factor", type=float, default=4) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + return parser + + +if __name__ == "__main__": + parser = parameter_setting() + args = parser.parse_args() + + args.sf1 = 5 + args.sf2 = 1 + args.cluster1 = args.cluster2 = 4 + args.lr1 = 0.01 + args.flr1 = 0.001 + args.lr2 = 0.005 + args.flr2 = 0.0005 + + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + # model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=4, layer_d_1=[4, 128], hidden2_1=128, + # layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=4, 
layer_d_2=[4], hidden2_2=4, args=args, + # Type_1="NB", Type_2="Bernoulli", ground_truth1=torch.cat([train_labels, test_labels]), cycle=1, + # attention_loss="Eucli") # yapf: disable + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense() if scipy.sparse.issparse(data.mod["mod2"].X) else data.mod["mod2"].X, 1) / 100 + # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense() if scipy.sparse.issparse(data.mod["mod1"].X) else data.mod["mod1"].X, 1) / 100 + data.mod["mod1"].obsm["labels"] = labels + + # data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", + # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], + # feature_channel=["counts", "counts", None, None, "size_factors", + # "size_factors"], label_channel="labels") + (x_train, y_train, x_train_raw, y_train_raw, x_train_size, + y_train_size), train_labels = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, y_test_raw, x_test_size, + y_test_size), test_labels = data.get_test_data(return_type="torch") + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + + device = torch.device(args.device) + train = data_utils.TensorDataset(x_train.float(), x_train_raw, x_train_size.float(), y_train.float(), y_train_raw, + y_train_size.float()) + + train_loader = data_utils.DataLoader(train, batch_size=args.batch_size, shuffle=True) + + test = data_utils.TensorDataset(x_test.float(), x_test_raw, x_test_size.float(), y_test.float(), y_test_raw, + y_test_size.float()) + + test_loader = data_utils.DataLoader(test, batch_size=args.batch_size, shuffle=False) + + total = data_utils.TensorDataset( + torch.cat([x_train, x_test]).float(), torch.cat([x_train_raw, x_test_raw]), + torch.cat([x_train_size, x_test_size]).float(), + torch.cat([y_train, y_test]).float(), torch.cat([y_train_raw, y_test_raw]), + torch.cat([y_train_size, y_test_size]).float()) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=50, layer_d_1=[50, 128], hidden2_1=128, + layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=50, layer_d_2=[50], hidden2_2=50, + args=args, ground_truth1=torch.cat([train_labels, test_labels]), Type_1="NB", Type_2="Bernoulli", + cycle=1, attention_loss="Eucli").to(device) + model.to(device) + model.fit(train_loader, test_loader, total_loader, "RNA") + + emb1, emb2 = model.predict(total_loader) + embeds = np.concatenate([emb1, emb2], 1) + print(embeds) + + adata = ad.AnnData( + X=embeds, + obs=data.mod["mod1"].obs, + ) + adata_sol = data.mod["test_sol"] + adata = adata[adata_sol.obs_names] + adata_sol.obsm['X_emb'] = adata.X + score = metrics.labeled_clustering_evaluate(adata, adata_sol) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) + score.update({ + # 'seed': args.seed + k, + 'subtask': args.subtask, + 'method': 'dcca', + }) + + # if res is not None: + # 
res = res.append(score, ignore_index=True) + # else: + # for s in score: + # score[s] = [score[s]] + # res = pd.DataFrame(score) + wandb.log({"ARI":score["dance_ari"]}) + wandb.finish() + torch.cuda.empty_cache() + #主要是报错时没有执行这些命令导致的,我感觉 + del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + del labels,le,dataset,score + gc.collect() + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, required_funs=["AlignMod","FilterCellsCommonMod","FilterCellsCommonMod","SetConfig"], + required_indexes=[2,11,14,sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) + +"""To reproduce DCCA on other samples, please refer to command lines belows: + +GEX-ADT: +$ python dcca.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python dcca.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py new file mode 100644 index 00000000..4079608e --- /dev/null +++ b/examples/tuning/predict_modality_babel/main.py @@ -0,0 +1,114 @@ +import argparse +import logging +import os +import sys +from pathlib import Path + +import pandas as pd +import torch + +import wandb +from dance import logger +from dance.datasets.multimodality import ModalityPredictionDataset +from dance.modules.multi_modality.predict_modality.babel import BabelWrapper +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.utils import set_seed + +if __name__ == "__main__": + OPTIMIZER_DICT = { + "adam": torch.optim.Adam, + "rmsprop": torch.optim.RMSprop, + } + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2_rna") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-cpu", "--cpus", default=1, type=int) + parser.add_argument("-seed", "--seed", default=1, type=int) + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("-m", "--model_folder", default="./models") + parser.add_argument("--outdir", "-o", default="./logs", help="Directory to output to") + parser.add_argument("--lossweight", type=float, default=1., help="Relative loss weight") + parser.add_argument("--lr", "-l", type=float, default=0.01, help="Learning rate") + parser.add_argument("--batchsize", "-b", type=int, default=64, help="Batch size") + parser.add_argument("--hidden", type=int, default=64, help="Hidden dimensions") + parser.add_argument("--earlystop", type=int, default=20, help="Early stopping after N epochs") + parser.add_argument("--naive", "-n", action="store_true", help="Use a naive model 
instead of lego model") + parser.add_argument("--resume", action="store_true") + parser.add_argument("--max_epochs", type=int, default=500) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + args = parser.parse_args() + args.resume = True + + torch.set_num_threads(args.cpus) + args.outdir = os.path.abspath(args.outdir) + os.makedirs(args.model_folder, exist_ok=True) + os.makedirs(args.outdir, exist_ok=True) + # Specify output log file + fh = logging.FileHandler(f"{args.outdir}/training_{args.subtask}_{args.seed}.log", "w") + fh.setLevel(logging.INFO) + logger.addHandler(fh) + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + for arg in vars(args): + logger.info(f"Parameter {arg}: {getattr(args, arg)}") + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + rndseed = args.seed + set_seed(rndseed) + dataset = ModalityPredictionDataset(args.subtask, preprocess=None) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + + # Obtain training and testing data + x_train, y_train = data.get_train_data(return_type="torch") + x_test, y_test = data.get_test_data(return_type="torch") + x_train, y_train, x_test, y_test = x_train.float(), y_train.float(), x_test.float(), y_test.float() + # Train and evaluate the model + #突然想到,或许有些算法可以降维,而有些算法不能降维,所以还是要依据算法而定 + model = BabelWrapper(args, dim_in=x_train.shape[1], dim_out=y_train.shape[1]) + model.fit(x_train, y_train, val_ratio=0.15) + wandb.log({'rmse': model.score(x_test, y_test)}) + wandb.finish() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce BABEL on other samples, please refer to command lines belows: + +GEX to ADT (subset): +$ python babel.py --subtask openproblems_bmmc_cite_phase2_rna_subset --device cuda + +GEX to 
ADT: +$ python babel.py --subtask openproblems_bmmc_cite_phase2_rna --device cuda + +ADT to GEX: +$ python babel.py --subtask openproblems_bmmc_cite_phase2_mod2 --device cuda + +GEX to ATAC: +$ python babel.py --subtask openproblems_bmmc_multiome_phase2_rna --device cuda + +ATAC to GEX: +$ python babel.py --subtask openproblems_bmmc_multiome_phase2_mod2 --device cuda + +""" From 44577317c4317e74ed75a23b8f472a136498add6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 08:47:32 +0000 Subject: [PATCH 005/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- examples/tuning/predict_modality_babel/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 86d6d1f9..9f6fe897 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py index 4079608e..112317f7 100644 --- a/examples/tuning/predict_modality_babel/main.py +++ b/examples/tuning/predict_modality_babel/main.py @@ -6,8 +6,8 @@ import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import ModalityPredictionDataset from dance.modules.multi_modality.predict_modality.babel import BabelWrapper From b3e0c1b382a32f7f269c69f31909c91f65b76317 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 19:01:39 +0800 Subject: [PATCH 006/203] minor change --- dance/transforms/graph/dstg_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/transforms/graph/dstg_graph.py b/dance/transforms/graph/dstg_graph.py index e51a3393..f5cefe46 100644 --- a/dance/transforms/graph/dstg_graph.py +++ b/dance/transforms/graph/dstg_graph.py @@ -1,4 +1,4 @@ -from typing import Sequence +from typing import Sequence, Union import networkx as nx import numpy as np @@ -35,7 +35,7 @@ class DSTGraph(BaseTransform): _DISPLAY_ATTRS = ("k_filter", "num_cc", "ref_split", "inf_split") def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", - channels: Sequence[str | None] = (None, None), channel_types: Sequence[str | None] = ("obsm", "obsm"), + channels: Sequence[Union[str , None]] = (None, None), channel_types: Sequence[Union[str , None]] = ("obsm", "obsm"), **kwargs): super().__init__(**kwargs) From 16f3dc1367475772024b6e275037b42deef8109c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:02:49 +0000 Subject: [PATCH 007/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/transforms/graph/dstg_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/transforms/graph/dstg_graph.py b/dance/transforms/graph/dstg_graph.py index f5cefe46..5b6b792f 100644 --- a/dance/transforms/graph/dstg_graph.py 
+++ b/dance/transforms/graph/dstg_graph.py @@ -35,8 +35,8 @@ class DSTGraph(BaseTransform): _DISPLAY_ATTRS = ("k_filter", "num_cc", "ref_split", "inf_split") def __init__(self, k_filter=200, num_cc=30, *, ref_split: str = "train", inf_split: str = "test", - channels: Sequence[Union[str , None]] = (None, None), channel_types: Sequence[Union[str , None]] = ("obsm", "obsm"), - **kwargs): + channels: Sequence[Union[str, None]] = (None, None), + channel_types: Sequence[Union[str, None]] = ("obsm", "obsm"), **kwargs): super().__init__(**kwargs) self.k_filter = k_filter From 83032fc1456ee26e3db6859740e69999d47e3425 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 22 Jul 2024 23:38:55 +0800 Subject: [PATCH 008/203] minor change --- .../tuning/joint_embedding_scmvae/main.py | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 examples/tuning/joint_embedding_scmvae/main.py diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py new file mode 100644 index 00000000..6017eec2 --- /dev/null +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -0,0 +1,177 @@ +import argparse +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +import torch.utils.data as data_utils +from sklearn import preprocessing + +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.transforms.preprocess import calculate_log_library_size +from dance.utils import set_seed + + +def parameter_setting(): + parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") + + parser.add_argument("--workdir", "-wk", type=str, default="./new_test", help="work path") + parser.add_argument("--outdir", "-od", type=str, default="./new_test", help="Output path") + + parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") + parser.add_argument("--weight_decay", type=float, default=1e-6, help="weight decay") + parser.add_argument("--eps", type=float, default=0.01, help="eps") + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") + parser.add_argument('-seed', '--seed', type=int, default=1, help='Random seed for repeat results') + parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") + parser.add_argument("--max_epoch", "-me", type=int, default=25, help="Max epoches") + parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") + parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") + parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, + help="Epoch per test, must smaller than max iteration.") + parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("--final_rate", type=float, default=1e-4) + parser.add_argument("--scale_factor", type=float, default=4) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", 
"params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + return parser + + +if __name__ == "__main__": + parser = parameter_setting() + args = parser.parse_args() + assert args.max_iteration > args.epoch_per_test + device = torch.device(args.device) + args.lr = 0.001 + args.anneal_epoch = 200 + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod1"].obsm["labels"] = labels + data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["layers", "layers"], + feature_channel=["counts", "counts"], label_channel="labels") + + (x_train, y_train), _ = data.get_train_data(return_type="torch") + (x_test, y_test), labels = data.get_test_data(return_type="torch") + + lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) + lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) + lib_mean1 = torch.from_numpy(lib_mean1) + lib_var1 = torch.from_numpy(lib_var1) + lib_mean2 = torch.from_numpy(lib_mean2) + lib_var2 = torch.from_numpy(lib_var2) + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + train_size = len(data.get_split_idx("train")) + train = data_utils.TensorDataset(x_train, lib_mean1[:train_size], lib_var1[:train_size], lib_mean2[:train_size], + lib_var2[:train_size], y_train) + + valid = data_utils.TensorDataset(x_test, lib_mean1[train_size:], lib_var1[train_size:], lib_mean2[train_size:], + lib_var2[train_size:], y_test) + + total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + + x_test = torch.cat([x_train, x_test]) + y_test = torch.cat([y_train, y_test]) + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) + model = scMVAE( + encoder_1=[Nfeature1, 1024, 128, 128], + hidden_1=128, + Z_DIMS=22, + decoder_share=[22, 128, 256], + share_hidden=128, + decoder_1=[128, 128, 1024], + hidden_2=1024, + encoder_l=[Nfeature1, 128], + hidden3=128, + encoder_2=[Nfeature2, 1024, 128, 128], + hidden_4=128, + encoder_l1=[Nfeature2, 128], + hidden3_1=128, + decoder_2=[128, 128, 1024], + hidden_5=1024, + drop_rate=0.1, + log_variational=True, + Type="ZINB", + device=device, + n_centroids=22, + penality="GMM", + model=1, + ) + model.to(device) + model.init_gmm_params(total_loader) + model.fit(args, train, valid, args.final_rate, args.scale_factor, device) + + # embeds = model.predict(x_test, y_test).cpu().numpy() + score = model.score(x_test, y_test, labels) + 
score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + torch.cuda.empty_cache() + # score.update({ + # 'seed': args.seed + k, + # 'subtask': args.subtask, + # 'method': 'scmvae', + # }) + + # if res is not None: + # res = res.append(score, ignore_index=True) + # else: + # for s in score: + # score[s] = [score[s]] + # res = pd.DataFrame(score) + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce scMVAE on other samples, please refer to command lines belows: + +GEX-ADT: +$ python scmvae.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python scmvae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From 4bbe9b71aaab48e0ecd027edfc0cdbdcbad97023 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:39:25 +0000 Subject: [PATCH 009/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 6017eec2..a5d21251 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -8,9 +8,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From d2a4d0b364853b2a2a68be603a65bdf0115028f7 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 19:32:50 +0800 Subject: [PATCH 010/203] minor change --- examples/tuning/joint_embedding_scmvae/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 6017eec2..bdd3f029 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -77,8 +77,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels - data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["layers", "layers"], - feature_channel=["counts", "counts"], label_channel="labels") + 
data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["obsm", "obsm"], + feature_channel=["feature.cell", "feature.cell"], label_channel="labels") (x_train, y_train), _ = data.get_train_data(return_type="torch") (x_test, y_test), labels = data.get_test_data(return_type="torch") From 1a310205a0c8fd4dd20f99a1383b8dc8fedbc769 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 19:39:24 +0800 Subject: [PATCH 011/203] change score --- examples/tuning/joint_embedding_dcca/main.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 9f6fe897..032bf0b2 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -159,12 +159,12 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - # score.update(metrics.integration_openproblems_evaluate(adata_sol)) - score.update({ - # 'seed': args.seed + k, - 'subtask': args.subtask, - 'method': 'dcca', - }) + score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update({ + # 'seed': args.seed + k, + # 'subtask': args.subtask, + # 'method': 'dcca', + # }) # if res is not None: # res = res.append(score, ignore_index=True) @@ -172,7 +172,9 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # for s in score: # score[s] = [score[s]] # res = pd.DataFrame(score) - wandb.log({"ARI":score["dance_ari"]}) + score["ARI"]=score["dance_ari"] + del score["dance_ari"] + wandb.log(score) wandb.finish() torch.cuda.empty_cache() #主要是报错时没有执行这些命令导致的,我感觉 From 2419e244bf238c764ff07ea9972bf17b6e48f8ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:43:31 +0000 Subject: [PATCH 012/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 032bf0b2..0efe7567 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 18187e5e0618e07bc65dd6e68449ee5fd11447f8 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:11:00 +0800 Subject: [PATCH 013/203] minor change --- examples/tuning/joint_embedding_scmvae/main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 
6cbedb88..996f8093 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -77,8 +77,13 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels - data.set_config(feature_mod=["mod1", "mod2"], label_mod="mod1", feature_channel_type=["obsm", "obsm"], - feature_channel=["feature.cell", "feature.cell"], label_channel="labels") + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + (x_train, y_train), _ = data.get_train_data(return_type="torch") (x_test, y_test), labels = data.get_test_data(return_type="torch") @@ -105,7 +110,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))#There is probably a problem here model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From 77ad75ae9c34fa56a2eeb246a6c297d57d58eb20 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:11:29 +0000 Subject: [PATCH 014/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 996f8093..3b2abcd2 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -77,14 +77,13 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels - + # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - (x_train, y_train), _ = data.get_train_data(return_type="torch") (x_test, y_test), labels = data.get_test_data(return_type="torch") @@ -110,7 +109,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))#There is probably a problem here + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #There is probably a problem here model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From ea7dff5bc80f9e0c609897d56be0ab9738b8eacd Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:13:12 +0800 Subject: [PATCH 015/203] minor changes --- examples/tuning/joint_embedding_dcca/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 032bf0b2..96628e68 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ 
b/examples/tuning/joint_embedding_dcca/main.py @@ -114,6 +114,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], # feature_channel=["counts", "counts", None, None, "size_factors", # "size_factors"], label_channel="labels") + #TODO 感觉layers中的counts才是raw (x_train, y_train, x_train_raw, y_train_raw, x_train_size, y_train_size), train_labels = data.get_train_data(return_type="torch") (x_test, y_test, x_test_raw, y_test_raw, x_test_size, From 72c0b71a0878ccb55a0f0d66dd7320374e08d800 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:15:18 +0800 Subject: [PATCH 016/203] minor changes --- examples/tuning/joint_embedding_jae/main.py | 114 ++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 examples/tuning/joint_embedding_jae/main.py diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py new file mode 100644 index 00000000..48059370 --- /dev/null +++ b/examples/tuning/joint_embedding_jae/main.py @@ -0,0 +1,114 @@ +import argparse +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch + +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.utils import set_seed + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") + parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") + parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") + parser.add_argument("-seed", "--seed", default=1, type=int) + parser.add_argument("-cpu", "--cpus", default=1, type=int) + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-bs", "--batch_size", default=128, type=int) + parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + args = parser.parse_args() + + device = args.device + pre_normalize = bool(args.normalize) + torch.set_num_threads(args.cpus) + rndseed = args.seed + set_seed(rndseed) + + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, 
pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + + (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, + G2M_score) = data.get_train_data(return_type="torch") + (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") + X_train = torch.cat([X_mod1_train, X_mod2_train], dim=1) + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + X_test = torch.cat([X_mod1_test, X_mod2_test], dim=1) + X_test = torch.cat([X_train, X_test]).float().to(device) + test_id = np.arange(X_test.shape[0]) + labels = torch.cat([cell_type, cell_type_test]).numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + + model = JAEWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=X_train.shape[1]) + model.fit(X_train, cell_type, batch_label, phase_score, max_epochs=50) + + embeds = model.predict(X_test, test_id).cpu().numpy() + print(embeds) + + score = model.score(X_test, test_id, labels, metric="clustering") + score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'jae', + }) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + torch.cuda.empty_cache() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce JAE on other samples, please refer to command lines belows: + +GEX-ADT: +$ python jae.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python jae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From 4571f5ce7d24b1fce70eb81c9b48a48d69e1f016 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:20:34 +0000 Subject: [PATCH 017/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 48059370..a3108167 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ 
b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 87c7b4e0847f9cec48a0726dd5ad6a18bf633097 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 20:24:18 +0800 Subject: [PATCH 018/203] minor change --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 3b2abcd2..c52f2108 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -109,7 +109,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #There is probably a problem here + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #There is probably a problem here; most likely it is the dimensionality reduction model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From 0f4f027739dd2fa1708108d0071b30a9a4872f60 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 23 Jul 2024 22:38:01 +0800 Subject: [PATCH 019/203] minor change --- .../tuning/joint_embedding_scmogcn/main.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 examples/tuning/joint_embedding_scmogcn/main.py diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py new file mode 100644 index 00000000..d1f681d1 --- /dev/null +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -0,0 +1,121 @@ +import argparse +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch + +import wandb +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.transforms.graph.cell_feature_graph import CellFeatureBipartiteGraph +from dance.utils import set_seed + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") + parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") + parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") + parser.add_argument("-l", "--layers", default=3, type=int, choices=[3, 4, 5, 6, 7]) + parser.add_argument("-dis", "--disable_propagation", default=0, type=int, choices=[0, 1, 2]) + parser.add_argument("-seed", "--seed", default=1, type=int) + parser.add_argument("-cpu", "--cpus", default=1, type=int) + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-bs", "--batch_size", default=512, type=int) + parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + 
parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + args = parser.parse_args() + + device = args.device + pre_normalize = bool(args.normalize) + torch.set_num_threads(args.cpus) + rndseed = args.seed + set_seed(rndseed) + + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + data = dataset.load_data() + train_size = len(data.get_split_idx("train")) + + data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod1")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod2")(data) + # data.set_config( + # feature_mod=["mod1", "mod2"], + # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], + # feature_channel=["X_pca", "X_pca"], + # label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], + # ) + (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + test_id = np.arange(x_mod1.shape[0]) + labels = cell_type.numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) + model.fit( + g_mod1=data.data["mod1"].uns["g"], + g_mod2=data.data["mod2"].uns["g"], + train_size=train_size, + cell_type=cell_type, + batch_label=batch_label, + phase_score=phase_score, + ) + + embeds = model.predict(test_id).cpu().numpy() + score = model.score(test_id, labels, metric="clustering") + score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'scmogcn', + }) + + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + torch.cuda.empty_cache() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( + evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch + save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) + if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": + get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if 
args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce scMoGCN on other samples, please refer to command lines belows: + +GEX-ADT: +$ python scmogcn.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python scmogcn.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From 90c01c6b9b1d58066966d8e3098681aa8ae6c778 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:38:40 +0000 Subject: [PATCH 020/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index d1f681d1..10edd63b 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper From 6072fc318024f63f6cc9bb0582da5ecd86352b58 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 00:11:57 +0800 Subject: [PATCH 021/203] minor change --- examples/tuning/joint_embedding_scmogcn/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 10edd63b..b2b8c1ce 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -19,7 +19,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + choices=["GSE140203_BRAIN_atac2gex","openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -57,12 +57,12 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) data = dataset.load_data() train_size = len(data.get_split_idx("train")) - data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod1")(data) - data = CellFeatureBipartiteGraph(cell_feature_channel="X_pca", mod="mod2")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # data.set_config( # feature_mod=["mod1", "mod2"], # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], From 597071eb0fe5ee26bebdd1f42f2f91712cb36cb4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:12:27 +0000 Subject: [PATCH 
022/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index b2b8c1ce..cbc17295 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -18,8 +18,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex","openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument( + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") From e2532e7b1ee7e76e70650146bd9f650a65e8d5e2 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 00:12:34 +0800 Subject: [PATCH 023/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index a3108167..e4b480c6 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -17,8 +17,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument( + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", + choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") From ca7b029149dfcc86962f43c751e97565ac61d92c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:14:36 +0000 Subject: [PATCH 024/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index e4b480c6..71308475 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 
1aecb98a7fa40e614ffdfaca1570e8a74eabdaac Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 09:31:27 +0800 Subject: [PATCH 025/203] minor changes --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index e4b480c6..0b58c1cb 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -62,7 +62,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - + logger.warning(data) (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_train_data(return_type="torch") (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") From 7d595898d0c808d1d7ae7fdb68150a48e8a1d601 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 11:35:08 +0800 Subject: [PATCH 026/203] minor change --- .../tuning/joint_embedding_scmogcn/main.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index cbc17295..8fdbe10f 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -32,6 +32,7 @@ parser.add_argument("-bs", "--batch_size", default=512, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--preprocess", type=str, default=None) parser.add_argument("--cache", action="store_true", help="Cache processed data.") parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) @@ -58,10 +59,25 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder,preprocess=args.preprocess) data = dataset.load_data() - train_size = len(data.get_split_idx("train")) + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + if args.preprocess!="aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"]=np.zeros(data.data['mod1'].shape[0]) + train_size = len(data.get_split_idx("train")) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # 
data.set_config( From 9d035f744055958cc18f64d425c83b3de877a1f9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 03:36:04 +0000 Subject: [PATCH 027/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 8fdbe10f..cf593088 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -59,25 +59,25 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder,preprocess=args.preprocess) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) data = dataset.load_data() # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - if args.preprocess!="aux": + if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) data.data['mod1'].obsm["cell_type"] = c_labels - data.data["mod1"].obsm["S_scores"]=np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["G2M_scores"]=np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["batch_label"]=np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["phase_labels"]=np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) train_size = len(data.get_split_idx("train")) - + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # data.set_config( From 86d56f7273fae37b9cb243dc917284e98b2bc0ab Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 11:36:31 +0800 Subject: [PATCH 028/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index afde44f6..591b71ba 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -29,6 +29,7 @@ parser.add_argument("-bs", "--batch_size", default=128, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of 
repetitions") + parser.add_argument("--preprocess", type=str, default=None) parser.add_argument("--cache", action="store_true", help="Cache processed data.") parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) @@ -55,14 +56,23 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - logger.warning(data) + if args.preprocess != "aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_train_data(return_type="torch") (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") From 8cfcf15f2cad9bf783e69b92cc168caa11ef8cc0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 03:39:13 +0000 Subject: [PATCH 029/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 591b71ba..77e662e5 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 514465bd7683afebe028d57a741de3558d9d548d Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:29:13 +0800 Subject: [PATCH 030/203] minor change --- examples/tuning/joint_embedding_dcca/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 0efe7567..20924128 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -159,7 +159,7 @@ def 
evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From ac137c3164d678902769a676930eb0d2b4ac67a8 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:29:16 +0800 Subject: [PATCH 031/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 591b71ba..be7b962b 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -92,7 +92,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) print(embeds) score = model.score(X_test, test_id, labels, metric="clustering") - score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'subtask': args.subtask, 'method': 'jae', From 3c717cbc9e9091ce5e38c797f672d7388f4ebe81 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 14:30:51 +0000 Subject: [PATCH 032/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 4a6c3317..b2a9bf8b 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 648b3c154c36039a4d0ae6f72d0568a9a8b2f37e Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:31:00 +0800 Subject: [PATCH 033/203] minor change --- examples/tuning/joint_embedding_dcca/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index da28136e..4a6c3317 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -160,7 +160,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # 
score.update(metrics.integration_openproblems_evaluate(adata_sol)) # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From 7a7f6a165f3f5c127edcaaae0e2d558323bbb5c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 14:31:58 +0000 Subject: [PATCH 034/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 4a6c3317..b2a9bf8b 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -11,10 +11,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 4518d5f56705f1397221ea8110cc59bd2c6a6136 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 24 Jul 2024 22:36:48 +0800 Subject: [PATCH 035/203] minor change --- examples/tuning/joint_embedding_scmogcn/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index cf593088..90ec2e96 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -104,7 +104,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) embeds = model.predict(test_id).cpu().numpy() score = model.score(test_id, labels, metric="clustering") - score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'subtask': args.subtask, 'method': 'scmogcn', From 44e5b47095323ca5334eee467dc4d664f3b75c8a Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 25 Jul 2024 21:47:27 +0800 Subject: [PATCH 036/203] minor change --- examples/tuning/joint_embedding_dcca/main.py | 190 ++++++++++--------- 1 file changed, 101 insertions(+), 89 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index b2a9bf8b..d81b6929 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -3,6 +3,7 @@ import os import pprint import sys +from copy import deepcopy from pathlib import Path import anndata as ad @@ -11,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -94,95 +95,106 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=4, layer_d_2=[4], hidden2_2=4, args=args, # Type_1="NB", Type_2="Bernoulli", ground_truth1=torch.cat([train_labels, test_labels]), cycle=1, # attention_loss="Eucli") # yapf: disable - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - # Prepare 
preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense() if scipy.sparse.issparse(data.mod["mod2"].X) else data.mod["mod2"].X, 1) / 100 - # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] - data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense() if scipy.sparse.issparse(data.mod["mod1"].X) else data.mod["mod1"].X, 1) / 100 - data.mod["mod1"].obsm["labels"] = labels - - # data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", - # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], - # feature_channel=["counts", "counts", None, None, "size_factors", - # "size_factors"], label_channel="labels") - #TODO 感觉layers中的counts才是raw - (x_train, y_train, x_train_raw, y_train_raw, x_train_size, - y_train_size), train_labels = data.get_train_data(return_type="torch") - (x_test, y_test, x_test_raw, y_test_raw, x_test_size, - y_test_size), test_labels = data.get_test_data(return_type="torch") - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - - device = torch.device(args.device) - train = data_utils.TensorDataset(x_train.float(), x_train_raw, x_train_size.float(), y_train.float(), y_train_raw, - y_train_size.float()) - - train_loader = data_utils.DataLoader(train, batch_size=args.batch_size, shuffle=True) - - test = data_utils.TensorDataset(x_test.float(), x_test_raw, x_test_size.float(), y_test.float(), y_test_raw, - y_test_size.float()) - - test_loader = data_utils.DataLoader(test, batch_size=args.batch_size, shuffle=False) - - total = data_utils.TensorDataset( - torch.cat([x_train, x_test]).float(), torch.cat([x_train_raw, x_test_raw]), - torch.cat([x_train_size, x_test_size]).float(), - torch.cat([y_train, y_test]).float(), torch.cat([y_train_raw, y_test_raw]), - torch.cat([y_train_size, y_test_size]).float()) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=50, layer_d_1=[50, 128], hidden2_1=128, - layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=50, layer_d_2=[50], hidden2_2=50, - args=args, ground_truth1=torch.cat([train_labels, test_labels]), Type_1="NB", Type_2="Bernoulli", - cycle=1, attention_loss="Eucli").to(device) - model.to(device) - model.fit(train_loader, test_loader, total_loader, "RNA") - - emb1, emb2 = model.predict(total_loader) - embeds = np.concatenate([emb1, emb2], 1) - print(embeds) - - adata = ad.AnnData( - X=embeds, - obs=data.mod["mod1"].obs, - ) - adata_sol = data.mod["test_sol"] - adata = adata[adata_sol.obs_names] - adata_sol.obsm['X_emb'] = adata.X - score = metrics.labeled_clustering_evaluate(adata, adata_sol) - # score.update(metrics.integration_openproblems_evaluate(adata_sol)) - # score.update({ - # 'seed': args.seed + k, - # 'subtask': args.subtask, - # 'method': 'dcca', - # }) - - # if res is not None: - # res = res.append(score, ignore_index=True) - # else: - # for s in score: - # score[s] = [score[s]] - # res = pd.DataFrame(score) - score["ARI"]=score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + try: + dataset = 
JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod2"].obsm["size_factors"] = np.sum(data.mod["mod2"].X.todense() if scipy.sparse.issparse(data.mod["mod2"].X) else data.mod["mod2"].X, 1) / 100 + # data.mod["mod1"].obsm["size_factors"] = data.mod["mod1"].obs["size_factors"] + data.mod["mod1"].obsm["size_factors"] = np.sum(data.mod["mod1"].X.todense() if scipy.sparse.issparse(data.mod["mod1"].X) else data.mod["mod1"].X, 1) / 100 + data.mod["mod1"].obsm["labels"] = labels + + # data.set_config(feature_mod=["mod1", "mod2", "mod1", "mod2", "mod1", "mod2"], label_mod="mod1", + # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], + # feature_channel=["counts", "counts", None, None, "size_factors", + # "size_factors"], label_channel="labels") + #TODO 感觉layers中的counts才是raw + #TODO 的确感觉layers中的counts才是raw,不知道反过来影响大不大 + (x_train, y_train, x_train_raw, y_train_raw, x_train_size, + y_train_size), train_labels = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, y_test_raw, x_test_size, + y_test_size), test_labels = data.get_test_data(return_type="torch") + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + + device = torch.device(args.device) + train = data_utils.TensorDataset(x_train.float(), x_train_raw, x_train_size.float(), y_train.float(), y_train_raw, + y_train_size.float()) + + train_loader = data_utils.DataLoader(train, batch_size=args.batch_size, shuffle=True) + + test = data_utils.TensorDataset(x_test.float(), x_test_raw, x_test_size.float(), y_test.float(), y_test_raw, + y_test_size.float()) + + test_loader = data_utils.DataLoader(test, batch_size=args.batch_size, shuffle=False) + + total = data_utils.TensorDataset( + torch.cat([x_train, x_test]).float(), torch.cat([x_train_raw, x_test_raw]), + torch.cat([x_train_size, x_test_size]).float(), + torch.cat([y_train, y_test]).float(), torch.cat([y_train_raw, y_test_raw]), + torch.cat([y_train_size, y_test_size]).float()) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + model = DCCA(layer_e_1=[Nfeature1, 128], hidden1_1=128, Zdim_1=50, layer_d_1=[50, 128], hidden2_1=128, + layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=50, layer_d_2=[50], hidden2_2=50, + args=args, ground_truth1=torch.cat([train_labels, test_labels]), Type_1="NB", Type_2="Bernoulli", + cycle=1, attention_loss="Eucli").to(device) + model.to(device) + model.fit(train_loader, test_loader, total_loader, "RNA") + + emb1, emb2 = model.predict(total_loader) + embeds = np.concatenate([emb1, emb2], 1) + print(embeds) + + adata = ad.AnnData( + X=embeds, + obs=data.mod["mod1"].obs, + ) + adata_sol = data.mod["test_sol"] + adata = adata[adata_sol.obs_names] + adata_sol.obsm['X_emb'] = adata.X + score = metrics.labeled_clustering_evaluate(adata, adata_sol) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update({ + # 'seed': args.seed + k, + # 'subtask': args.subtask, + # 'method': 'dcca', + # }) + + # if res is not None: + # res = res.append(score, ignore_index=True) + # else: + # for s in score: + # score[s] = [score[s]] + # res = 
pd.DataFrame(score) + score["ARI"]=score["dance_ari"] + del score["dance_ari"] + wandb.log(score.copy()) + wandb.finish() + finally: + # del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + # del labels,le,dataset,score + # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] + locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() #主要是报错时没有执行这些命令导致的,我感觉 - del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 - del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels - del labels,le,dataset,score - gc.collect() + + entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) From aad76dc41ad9a14bc0724ade398a313ab249829d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 13:50:14 +0000 Subject: [PATCH 037/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index d81b6929..7a1ed2b3 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 4b2c0f6ab86996262ea09ecbb2182331297ad2de Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 25 Jul 2024 21:53:22 +0800 Subject: [PATCH 038/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 109 +++++++++++--------- 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 4a3b70a6..1c115617 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -7,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from 
dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -56,52 +57,66 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) - data = dataset.load_data() - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - if args.preprocess != "aux": - cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() - cell_type_labels_unique = list(np.unique(cell_type_labels)) - c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) - data.data['mod1'].obsm["cell_type"] = c_labels - data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) - (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, - G2M_score) = data.get_train_data(return_type="torch") - (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") - X_train = torch.cat([X_mod1_train, X_mod2_train], dim=1) - phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) - X_test = torch.cat([X_mod1_test, X_mod2_test], dim=1) - X_test = torch.cat([X_train, X_test]).float().to(device) - test_id = np.arange(X_test.shape[0]) - labels = torch.cat([cell_type, cell_type_test]).numpy() - adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] - - model = JAEWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), - num_phases=phase_score.shape[1], num_features=X_train.shape[1]) - model.fit(X_train, cell_type, batch_label, phase_score, max_epochs=50) - - embeds = model.predict(X_test, test_id).cpu().numpy() - print(embeds) - - score = model.score(X_test, test_id, labels, metric="clustering") - # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) - score.update({ - 'subtask': args.subtask, - 'method': 'jae', - }) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + if args.preprocess != "aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + 
data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) + (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, + G2M_score) = data.get_train_data(return_type="torch") + (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") + X_train = torch.cat([X_mod1_train, X_mod2_train], dim=1) + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + X_test = torch.cat([X_mod1_test, X_mod2_test], dim=1) + X_test = torch.cat([X_train, X_test]).float().to(device) + test_id = np.arange(X_test.shape[0]) + labels = torch.cat([cell_type, cell_type_test]).numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + + model = JAEWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=X_train.shape[1]) + model.fit(X_train, cell_type, batch_label, phase_score, max_epochs=50) + + embeds = model.predict(X_test, test_id).cpu().numpy() + print(embeds) + + score = model.score(X_test, test_id, labels, metric="clustering") + # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'jae', + }) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + finally: + # del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + # del labels,le,dataset,score + # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] + locals_keys = list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch From d834e89dbd05a887f5ba779b8b8b0f38ac066764 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 13:54:00 +0000 Subject: [PATCH 039/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 1c115617..1485c6fa 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From ea0e7feebd2be3c72b8573cfdd2b1641c7f7b121 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 29 Jul 2024 17:34:07 +0800 Subject: [PATCH 
040/203] minor change --- dance/datasets/multimodality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 7df32f7d..090509d6 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -575,7 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) - self.to_array([mod1, mod2, meta1, meta2, test_sol]) + # self.to_array([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -755,7 +755,7 @@ def _maybe_preprocess(self, raw_data): if mod1.shape[1] > self.selection_threshold: sc.pp.highly_variable_genes(mod1, layer="counts", flavor="seurat_v3", n_top_genes=self.selection_threshold, span=self.span) - mod1 = mod1[:, mod1.var["highly_variable"]] + mod1 = mod1[:, mod1.var["highly_variable"]] # Equivalent to subset=True and _inplace_subset_var if mod2.shape[1] > self.selection_threshold: sc.pp.highly_variable_genes(mod2, layer="counts", flavor="seurat_v3", n_top_genes=self.selection_threshold, span=self.span) From 50da51ec08fa6ee776702791035305767fa5531c Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 29 Jul 2024 21:05:49 +0800 Subject: [PATCH 041/203] minor changes --- dance/datasets/multimodality.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 090509d6..14e9c5c6 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -575,7 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) - # self.to_array([mod1, mod2, meta1, meta2, test_sol]) + self.to_array([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -588,9 +588,13 @@ def _raw_to_dance(self, raw_data): def to_array(self, datas): for data in datas: if scipy.sparse.issparse(data.X): - data.X = np.array(data.X.todense()).astype(float) + if not isinstance(data.X, scipy.sparse.csr_matrix): + data.X = data.X.tocsr() + # data.X = np.array(data.X.todense()).astype(float) if "counts" in data.layers and scipy.sparse.issparse(data.layers["counts"]): - data.layers["counts"] = np.array(data.layers["counts"].todense()).astype(float) + if not isinstance(data.layers["counts"], scipy.sparse.csr_matrix): + data.layers["counts"] = data.layers["counts"].tocsr() + # data.layers["counts"] = np.array(data.layers["counts"].todense()).astype(float) def _maybe_preprocess(self, raw_data): if self.preprocess is None: From 8fa1959f63fea66b0de3d9e4c3221a89f93aaf83 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 31 Jul 2024 21:10:42 +0800 Subject: [PATCH 042/203] minor change --- dance/transforms/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/transforms/normalize.py b/dance/transforms/normalize.py index b0fb7af6..71ab6622 100644 --- a/dance/transforms/normalize.py +++ b/dance/transforms/normalize.py @@ -189,7 +189,7 @@ def transform(self, X): raise RuntimeError('Transformer was not fitted on any data') if 
scipy.sparse.issparse(X): tf = X.multiply(1 / X.sum(axis=1)) - return tf.multiply(self.idf) + return tf.multiply(self.idf).tocsr() else: tf = X / X.sum(axis=1, keepdims=True) return tf * self.idf From 46ccfbf86a4cf9a56a0646dc03d67f62c32b81e7 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 1 Aug 2024 21:59:45 +0800 Subject: [PATCH 043/203] minor change --- dance/pipeline.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index 09595c52..a1aa2173 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -8,6 +8,7 @@ from operator import mul from pprint import pformat +import omegaconf import pandas as pd from omegaconf import DictConfig, OmegaConf @@ -793,6 +794,14 @@ def _params_search_space(self) -> Dict[str, Dict[str, Optional[Union[str, float] def wandb_sweep_config(self) -> Dict[str, Any]: if self.wandb_config is None: raise ValueError("wandb config not specified in the raw config.") + if "run_kwargs" in self.config: + return { + **self.wandb_config, "parameters": { + "run_kwargs": { + "values": omegaconf.OmegaConf.to_object(self.config.run_kwargs) + } + } + } return {**self.wandb_config, "parameters": self.search_space()} def wandb_sweep(self) -> Tuple[str, str, str]: @@ -807,7 +816,7 @@ def wandb_sweep(self) -> Tuple[str, str, str]: f"'entity' and 'project': {wandb_entity=!r}, {wandb_project=!r}") sweep_config = self.wandb_sweep_config() - logger.info(f"Sweep config:\n{pformat(sweep_config)}") + # logger.info(f"Sweep config:\n{pformat(sweep_config)}") wandb_sweep_id = wandb.sweep(sweep=sweep_config, entity=wandb_entity, project=wandb_project) logger.info(Color("blue")(f"\n\n\t[*] Sweep ID: {wandb_sweep_id}\n")) From ada544b21eaba34c0d99017ddc146abb63a1038a Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 1 Aug 2024 22:03:11 +0800 Subject: [PATCH 044/203] minor change --- examples/tuning/joint_embedding_jae/main.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 1485c6fa..8feedca7 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -19,8 +19,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", choices=[ + "GSE140203_BRAIN_atac2gex", "GSE140203_SKIN_atac2gex", "openproblems_bmmc_cite_phase2", + "openproblems_bmmc_multiome_phase2" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -57,12 +59,19 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in 
pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return try: dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) data = dataset.load_data() # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} + kwargs = {tune_mode: dict(wandb_config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) @@ -113,6 +122,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) try: exec(f"del {var}") logger.info(f"Deleted '{var}'") + except NameError: logger.info(f"Variable '{var}' does not exist, continuing...") torch.cuda.empty_cache() From c0cfd3a11de749cca4d79876cbf1d394cd6cfd6f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 14:03:53 +0000 Subject: [PATCH 045/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 8feedca7..3ba455d0 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From 0507539c00526c92d5654bc320af8532faba409f Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 12 Aug 2024 20:47:59 +0800 Subject: [PATCH 046/203] minor change --- examples/tuning/step3_default_params.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 2a010328..91ba94c8 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -37,6 +37,8 @@ pipeline: base: min: 1.0 max: 10.0 + - type: normalize + target: tfidfTransform - type: normalize target: NormalizeTotal params_to_tune: From 250dfb4793871502995c7c46eb53358eedb1cc77 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:31:43 +0800 Subject: [PATCH 047/203] minor change --- dance/pipeline.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index a1aa2173..0841eed7 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1041,7 +1041,9 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ conf = OmegaConf.load(conf_load_path) pipeline_top_k = default(step2_pipeline_planer.config.pipeline_tuning_top_k, DEFAULT_PIPELINE_TUNING_TOP_K) result = pd.read_csv(result_load_path).sort_values(by=metric, ascending=ascending).head(pipeline_top_k) - columns = sorted([col for col in result.columns if col.startswith("pipeline")]) + columns = sorted( + [col for col in result.columns if (col.startswith("pipeline") or col.startswith("run_kwargs_pipeline"))], + key=lambda x: float(x.split('.')[1])) pipeline_names = result.loc[:, columns].values count = 0 for row in pipeline_names: @@ -1050,11 +1052,12 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for x in row: for k 
in conf.pipeline: if k["target"] == x: - pipeline.append(k) + pipeline.append(deepcopy(k)) for i, f in zip(required_indexes, required_funs): for k in step2_pipeline_planer.config.pipeline: if "target" in k and k["target"] == f: - pipeline.insert(i, k) + pipeline.insert(i, deepcopy(k)) + break for p1 in step2_pipeline_planer.config.pipeline: if "step3_frozen" in p1 and p1["step3_frozen"]: for p2 in pipeline: @@ -1065,12 +1068,16 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for target, d_p in p1.default_params.items(): if target == p2["target"]: p2["params"] = d_p - for p1, p2 in zip(step2_pipeline_planer.config.pipeline, pipeline): #need order + #The order is wrong here; fix it by following _sanitize_pipeline TODO + step2_pipeline = step2_pipeline_planer.config.pipeline + # step2_pipeline=sorted(step2_pipeline_planer.config.pipeline,key=lambda x: float(x.split('.')[1])) + for p1, p2 in zip(step2_pipeline, pipeline): #need order if "params" in p1: - for key, value in p1.params.items(): - if "params" not in p2: - p2.params = {} - p2.params[key] = value + p2.params = p1.params + # for key, value in p1.params.items(): + # if "params" not in p2: + # p2.params = {} + # p2.params[key] = value temp_conf = conf.copy() temp_conf.pipeline = pipeline temp_conf.wandb = step2_pipeline_planer.config.wandb From 1a6f14f4b6c9c5bf729b2609cd3010285d30ca31 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:34:34 +0800 Subject: [PATCH 048/203] minor change --- examples/tuning/step3_default_params.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 91ba94c8..69135b02 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -219,19 +219,21 @@ pipeline: - [max_genes, max_counts, min_counts, min_genes] - [max_genes, max_counts, min_genes, min_counts] min_counts: - min: 3 - max: 1000 + min: 0.0 # Change occurs when joint embedding + max: 0.05 min_genes: min: 0.0 max: 0.05 max_counts: - min: 10000 - max: 100000 + min: 0.95 + max: 1.0 max_genes: min: 0.95 max: 1.0 - type: filter.cell target: FilterCellsPlaceHolder + - type: filter.cell + target: FilterCellsCommonMod - type: feature.cell target: CellPCA params: From 0776e07ee43e2318481b9330a639511113cca1da Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:35:54 +0800 Subject: [PATCH 049/203] minor change --- examples/tuning/step3_default_params.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 2a010328..70da888c 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -54,6 +54,8 @@ pipeline: values: [null, 1e3, 1e4, 1e5, 1e6] max_fraction: values: [0.01, 0.05, 0.5, 0.7, 1.0] + - type: normalize + target: tfidfTransform - type: normalize target: NormalizePlaceHolder - type: filter.gene From e2eba29013404914eda7c45a641f13fa667d3e59 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 13 Aug 2024 19:37:51 +0800 Subject: [PATCH 050/203] minor change --- examples/tuning/step3_default_params.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index a711c951..70da888c 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -37,8 +37,6 @@ pipeline: base: min: 1.0 max: 10.0 - - type: normalize - target:
tfidfTransform - type: normalize target: NormalizeTotal params_to_tune: From a35cbb001e852603659636a9c1f43da5158c6e39 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 14 Aug 2024 10:50:30 +0800 Subject: [PATCH 051/203] minor change --- dance/utils/wrappers.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 11314694..7dd756ba 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -124,14 +124,4 @@ def transform_mod_to_anndata(mod_data: mudata.MuData, mod_key: str): return mod_data.mod[mod_key] -# 使用装饰器 -@add_mod_and_transform -class MyClass: - def __init__(self, x, **kwargs): - self.x = x - print("-------") - print(**kwargs) - - def __call__(self, data): - return data From eae8bd225e1bb9949b6cc953f838aced011e95f9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 02:51:19 +0000 Subject: [PATCH 052/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/utils/wrappers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 7dd756ba..28ffefe8 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -122,6 +122,3 @@ def new_call(self, data: Data, *args, **kwargs): def transform_mod_to_anndata(mod_data: mudata.MuData, mod_key: str): return mod_data.mod[mod_key] - - - From 5e49b3f00277310a4fd29d1022ac118a0e5be2bf Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 14 Aug 2024 11:01:34 +0800 Subject: [PATCH 053/203] minor change --- dance/transforms/cell_feature.py | 40 +++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/dance/transforms/cell_feature.py b/dance/transforms/cell_feature.py index 4e90c97d..cb7b2ed9 100644 --- a/dance/transforms/cell_feature.py +++ b/dance/transforms/cell_feature.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.decomposition import PCA, TruncatedSVD,SparsePCA from sklearn.random_projection import GaussianRandomProjection from dance.registry import register_preprocessor @@ -168,6 +168,44 @@ def __call__(self, data): return data +@register_preprocessor("feature", "cell") +@add_mod_and_transform +class SparsePCA(BaseTransform): + """Reduce cell feature matrix with SparsePCA. + + Parameters + ---------- + n_components + Number of SparsePCA components to use. 
+ + """ + + _DISPLAY_ATTRS = ("n_components", ) + + def __init__(self, n_components: Union[float, int] = 400, *, channel: Optional[str] = None, + mod: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + + self.n_components = n_components + self.channel = channel + + def __call__(self, data): + feat = data.get_feature(return_type="numpy", channel=self.channel) + # if self.n_components > min(feat.shape): + # self.logger.warning( + # f"n_components={self.n_components} must be between 0 and min(n_samples, n_features)={min(feat.shape)} with svd_solver='full'" + # ) + # self.n_components = min(feat.shape) + pca = SparsePCA(n_components=self.n_components) + cell_feat = pca.fit_transform(feat) + self.logger.info(f"Generating cell SparsePCA features {feat.shape} (k={pca.n_components_})") + # evr = pca.explained_variance_ratio_ + # self.logger.info(f"Top 10 explained variances: {evr[:10]}") + # self.logger.info(f"Total explained variance: {evr.sum():.2%}") + data.data.obsm[self.out] = cell_feat + + return data + @register_preprocessor("feature", "cell") @add_mod_and_transform From 685582f4e86c890b42d0ccbf57094ed8ef88b4f9 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 14 Aug 2024 11:02:25 +0800 Subject: [PATCH 054/203] minor change --- examples/tuning/step3_default_params.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/tuning/step3_default_params.yaml b/examples/tuning/step3_default_params.yaml index 4db51aa8..aaca441b 100644 --- a/examples/tuning/step3_default_params.yaml +++ b/examples/tuning/step3_default_params.yaml @@ -250,6 +250,14 @@ pipeline: n_components: min: 100 max: 1000 + - type: feature.cell + target: SparsePCA + params: + out: feature.cell + params_to_tune: + n_components: + min: 100 + max: 1000 - type: feature.cell target: WeightedFeaturePCA params: From 250fd999c3c5e7221e9741bd4e75f0c786cd7f76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 03:02:28 +0000 Subject: [PATCH 055/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/transforms/cell_feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dance/transforms/cell_feature.py b/dance/transforms/cell_feature.py index cb7b2ed9..2af1c3a6 100644 --- a/dance/transforms/cell_feature.py +++ b/dance/transforms/cell_feature.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from sklearn.decomposition import PCA, TruncatedSVD,SparsePCA +from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD from sklearn.random_projection import GaussianRandomProjection from dance.registry import register_preprocessor @@ -168,6 +168,7 @@ def __call__(self, data): return data + @register_preprocessor("feature", "cell") @add_mod_and_transform class SparsePCA(BaseTransform): From b6a034f60066553d42d594ab182fa3572c0f99b7 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 16:06:18 +0800 Subject: [PATCH 056/203] minor change --- dance/datasets/singlemodality.py | 80 +++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index 60998be7..5be20d10 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -53,7 +53,7 @@ class CellTypeAnnotationDataset(BaseDataset): def __init__(self, full_download=False, train_dataset=None, test_dataset=None, species=None, tissue=None, valid_dataset=None, train_dir="train", 
test_dir="test", valid_dir="valid", map_path="map", - data_dir="./", train_as_valid=False, val_size=0.2): + data_dir="./", train_as_valid=False, val_size=0.2,test_size=None,filetype: str = "csv"): super().__init__(data_dir, full_download) self.data_dir = data_dir @@ -73,7 +73,8 @@ def __init__(self, full_download=False, train_dataset=None, test_dataset=None, s self.valid_dataset = train_dataset self.train2valid() self.val_size = val_size - + self.test_size=test_size + self.filetype=filetype def train2valid(self): logger.info("Copy train_dataset and use it as valid_dataset") temp_ava_data = self.available_data.copy() @@ -109,12 +110,12 @@ def download_all(self): pass os.rename(download_path, move_path) - def get_all_filenames(self, filetype: str = "csv", feat_suffix: str = "data", label_suffix: str = "celltype"): + def get_all_filenames(self, feat_suffix: str = "data", label_suffix: str = "celltype"): filenames = [] - for id in self.train_dataset + self.test_dataset + (self.valid_dataset + for id in self.train_dataset + (self.test_dataset if self.test_dataset is not None else []) + (self.valid_dataset if self.valid_dataset is not None else []): - filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{filetype}") - filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{filetype}") + filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{self.filetype}") + filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{self.filetype}") return filenames def download(self, download_map=True): @@ -175,6 +176,8 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se species = self.species tissue = self.tissue valid_feat = None + if self.test_dataset is None: + return self._load_raw_data_single_h5ad() if self.valid_dataset is not None: train_dataset_ids = self.train_dataset test_dataset_ids = self.test_dataset @@ -270,6 +273,71 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se return adata, labels, idx_to_label, train_size, 0 + def _load_raw_data_single_h5ad(self, ct_col: str = "cell_type") -> Tuple[ad.AnnData, List[Set[str]], List[str], int]: + species = self.species + tissue = self.tissue + valid_feat = None + data_dir = self.data_dir + train_dir = osp.join(data_dir, self.train_dir) + data_path=osp.join(train_dir, species, f"{species}_{tissue}{self.train_dataset[0]}_data.h5ad") + adata=sc.read_h5ad(data_path) + map_path = osp.join(data_dir, self.map_path, self.species) + X_train_temp, X_test = train_test_split(adata, test_size=0.2) + X_train, X_val = train_test_split(X_train_temp, test_size=0.25) + train_feat,valid_feat,test_feat=X_train.X,X_val.X,X_test.X + train_label,valid_label,test_label=X_train.obs,X_val.obs,X_test.obs + if valid_feat is not None: + # Combine features (only use features that are present in the training data) + train_size = train_feat.shape[0] + valid_size = valid_feat.shape[0] + # Convert cell type labels and map test cell type names to train + cell_types = set(train_label[ct_col].unique()) + idx_to_label = sorted(cell_types) + cell_type_mappings: Dict[str, Set[str]] = self.get_map_dict(map_path, tissue) + train_labels, valid_labels, test_labels = train_label[ct_col].tolist(), [], [] + for i in valid_label[ct_col]: + valid_labels.append(i if i in cell_types else cell_type_mappings.get(i)) + for i in test_label[ct_col]: + test_labels.append(i if i in cell_types else cell_type_mappings.get(i)) + labels: List[Set[str]] = train_labels + valid_labels + test_labels + + 
logger.debug("Mapped valid cell-types:") + for i, j, k in zip(valid_label.index, valid_label[ct_col], valid_labels): + logger.debug(f"{i}:{j}\t-> {k}") + + logger.debug("Mapped test cell-types:") + for i, j, k in zip(test_label.index, test_label[ct_col], test_labels): + logger.debug(f"{i}:{j}\t-> {k}") + + logger.info(f"Loaded expression data: {adata}") + logger.info(f"Number of training samples: {train_feat.shape[0]:,}") + logger.info(f"Number of valid samples: {valid_feat.shape[0]:,}") + logger.info(f"Number of testing samples: {test_feat.shape[0]:,}") + logger.info(f"Cell-types (n={len(idx_to_label)}):\n{pprint.pformat(idx_to_label)}") + + return adata, labels, idx_to_label, train_size, valid_size + else: + # Combine features (only use features that are present in the training data) + train_size = train_feat.shape[0] + cell_types = set(train_label[ct_col].unique()) + idx_to_label = sorted(cell_types) + cell_type_mappings: Dict[str, Set[str]] = self.get_map_dict(map_path, tissue) + train_labels, test_labels = train_label[ct_col].tolist(), [] + for i in test_label[ct_col]: + test_labels.append(i if i in cell_types else cell_type_mappings.get(i)) + labels: List[Set[str]] = train_labels + test_labels + + logger.debug("Mapped test cell-types:") + for i, j, k in zip(test_label.index, test_label[ct_col], test_labels): + logger.debug(f"{i}:{j}\t-> {k}") + + logger.info(f"Loaded expression data: {adata}") + logger.info(f"Number of training samples: {train_feat.shape[0]:,}") + logger.info(f"Number of testing samples: {test_feat.shape[0]:,}") + logger.info(f"Cell-types (n={len(idx_to_label)}):\n{pprint.pformat(idx_to_label)}") + + return adata, labels, idx_to_label, train_size, 0 + def _raw_to_dance(self, raw_data): adata, cell_labels, idx_to_label, train_size, valid_size = raw_data adata.obsm["cell_type"] = cell_label_to_df(cell_labels, idx_to_label, index=adata.obs.index) From 455aaff4409e31278f23d6f26fa2baae5d61f081 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 08:06:51 +0000 Subject: [PATCH 057/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/datasets/singlemodality.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index 5be20d10..e515d466 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -53,7 +53,7 @@ class CellTypeAnnotationDataset(BaseDataset): def __init__(self, full_download=False, train_dataset=None, test_dataset=None, species=None, tissue=None, valid_dataset=None, train_dir="train", test_dir="test", valid_dir="valid", map_path="map", - data_dir="./", train_as_valid=False, val_size=0.2,test_size=None,filetype: str = "csv"): + data_dir="./", train_as_valid=False, val_size=0.2, test_size=None, filetype: str = "csv"): super().__init__(data_dir, full_download) self.data_dir = data_dir @@ -73,8 +73,9 @@ def __init__(self, full_download=False, train_dataset=None, test_dataset=None, s self.valid_dataset = train_dataset self.train2valid() self.val_size = val_size - self.test_size=test_size - self.filetype=filetype + self.test_size = test_size + self.filetype = filetype + def train2valid(self): logger.info("Copy train_dataset and use it as valid_dataset") temp_ava_data = self.available_data.copy() @@ -112,8 +113,8 @@ def download_all(self): def get_all_filenames(self, feat_suffix: str = "data", 
label_suffix: str = "celltype"): filenames = [] - for id in self.train_dataset + (self.test_dataset if self.test_dataset is not None else []) + (self.valid_dataset - if self.valid_dataset is not None else []): + for id in self.train_dataset + (self.test_dataset if self.test_dataset is not None else + []) + (self.valid_dataset if self.valid_dataset is not None else []): filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{self.filetype}") filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{self.filetype}") return filenames @@ -273,19 +274,20 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se return adata, labels, idx_to_label, train_size, 0 - def _load_raw_data_single_h5ad(self, ct_col: str = "cell_type") -> Tuple[ad.AnnData, List[Set[str]], List[str], int]: + def _load_raw_data_single_h5ad(self, + ct_col: str = "cell_type") -> Tuple[ad.AnnData, List[Set[str]], List[str], int]: species = self.species tissue = self.tissue valid_feat = None data_dir = self.data_dir train_dir = osp.join(data_dir, self.train_dir) - data_path=osp.join(train_dir, species, f"{species}_{tissue}{self.train_dataset[0]}_data.h5ad") - adata=sc.read_h5ad(data_path) + data_path = osp.join(train_dir, species, f"{species}_{tissue}{self.train_dataset[0]}_data.h5ad") + adata = sc.read_h5ad(data_path) map_path = osp.join(data_dir, self.map_path, self.species) X_train_temp, X_test = train_test_split(adata, test_size=0.2) X_train, X_val = train_test_split(X_train_temp, test_size=0.25) - train_feat,valid_feat,test_feat=X_train.X,X_val.X,X_test.X - train_label,valid_label,test_label=X_train.obs,X_val.obs,X_test.obs + train_feat, valid_feat, test_feat = X_train.X, X_val.X, X_test.X + train_label, valid_label, test_label = X_train.obs, X_val.obs, X_test.obs if valid_feat is not None: # Combine features (only use features that are present in the training data) train_size = train_feat.shape[0] From c58de0a9c60eb3413ad56b8f00b120b9903cf3dd Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 16:14:48 +0800 Subject: [PATCH 058/203] update metadata --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 79dfc5c1..473c37af 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,3 +66,4 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 +human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 \ No newline at end of file From 03a1109e6c4cb96d517c93a213a3318902c57bee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 08:16:14 +0000 Subject: [PATCH 059/203] [pre-commit.ci] auto fixes from pre-commit.com hooks 
for more information, see https://pre-commit.ci --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 473c37af..326229fb 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,4 +66,4 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 -human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 \ No newline at end of file +human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 From 57d643a7b882d6be7dccd0c17b871369bd53fff2 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 22:43:17 +0800 Subject: [PATCH 060/203] minor change --- examples/tuning/cta_actinn/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 506f71b9..77b963cc 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -30,7 +30,7 @@ parser.add_argument("--num_epochs", type=int, default=50, help="Number of epochs") parser.add_argument("--print_cost", action="store_true", help="Print cost when training") parser.add_argument("--species", default="mouse") - parser.add_argument("--test_dataset", nargs="+", default=[1759], help="List of testing dataset ids.") + parser.add_argument("--test_dataset", nargs="+", default=[], help="List of testing dataset ids.") parser.add_argument("--tissue", default="Spleen") parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") @@ -41,13 +41,14 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"\n{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if (dataset is not None and dataset !=[]) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") 
@@ -68,7 +69,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, valid_dataset=args.valid_dataset, data_dir="./temp_data", tissue=args.tissue, - species=args.species).load_data() + species=args.species,filetype=args.filetype).load_data() print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 730ae275bb4af43b6f3c5d2ba02b1784e3f255f0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:45:26 +0000 Subject: [PATCH 061/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_actinn/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 77b963cc..616c0d10 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -48,7 +48,8 @@ file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if (dataset is not None and dataset !=[]) + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") @@ -69,7 +70,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, valid_dataset=args.valid_dataset, data_dir="./temp_data", tissue=args.tissue, - species=args.species,filetype=args.filetype).load_data() + species=args.species, filetype=args.filetype).load_data() print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 8983aec81547915a693a6c5334b4e1c5b04ada2a Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 26 Aug 2024 22:49:47 +0800 Subject: [PATCH 062/203] minor change --- dance/datasets/singlemodality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index e515d466..c85a293b 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -53,7 +53,7 @@ class CellTypeAnnotationDataset(BaseDataset): def __init__(self, full_download=False, train_dataset=None, test_dataset=None, species=None, tissue=None, valid_dataset=None, train_dir="train", test_dir="test", valid_dir="valid", map_path="map", - data_dir="./", train_as_valid=False, val_size=0.2, test_size=None, filetype: str = "csv"): + data_dir="./", train_as_valid=False, val_size=0.2, test_size=0.2, filetype: str = "csv"): super().__init__(data_dir, full_download) self.data_dir = data_dir @@ -177,7 +177,7 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se species = self.species tissue = self.tissue valid_feat = None - if self.test_dataset is None: + if self.test_dataset is None or self.test_dataset==[]: return self._load_raw_data_single_h5ad() if self.valid_dataset is not None: train_dataset_ids = self.train_dataset From 2432c2c58197532826e66012ef4cae00565ec79a Mon 
Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:51:19 +0000 Subject: [PATCH 063/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/datasets/singlemodality.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index c85a293b..302b44b8 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -177,7 +177,7 @@ def _load_raw_data(self, ct_col: str = "Cell_type") -> Tuple[ad.AnnData, List[Se species = self.species tissue = self.tissue valid_feat = None - if self.test_dataset is None or self.test_dataset==[]: + if self.test_dataset is None or self.test_dataset == []: return self._load_raw_data_single_h5ad() if self.valid_dataset is not None: train_dataset_ids = self.train_dataset From 52b9df597e0af067545652cc1440594ac0a91e77 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Sep 2024 22:49:55 +0800 Subject: [PATCH 064/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 326229fb..dda0c5b4 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,4 +66,4 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 -human,Brain,5500,train,,,train_human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad,https://www.dropbox.com/scl/fi/di32tltpqj49jhd5qfsta/human_Brain517e0281-6720-4a2a-b667-84923e22f38b_data.h5ad?rlkey=atp3emdggops3fcjvkki55tki&st=deevu404&dl=1 +human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 From 24434eb70411a7055d172b689d3d8aaaa775f57e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:50:51 +0000 Subject: [PATCH 065/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index bec823f5..5d1fcfb2 100644 --- a/README.md +++ b/README.md @@ -193,14 +193,14 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | ------------------- | ------------ | ------------------------------------------------------------------------------------------------------------ | ---- | ------- | -| GNN | GraphSCI | Imputing Single-cell RNA-seq data by combining Graph Convolution and Autoencoder Neural Networks | 2021 | ✅ | +| GNN | GraphSCI | Imputing Single-cell RNA-seq data by combining Graph Convolution and Autoencoder Neural Networks | 2021 | ✅ | | GNN | scGNN (2020) | SCGNN: scRNA-seq Dropout Imputation via Induced Hierarchical Cell Similarity Graph | 2020 | P1 | -| GNN | scGNN (2021) | scGNN is a novel graph neural network framework for single-cell RNA-Seq analyses | 2021 | ✅ | +| GNN | scGNN (2021) | scGNN is a novel graph neural network framework for single-cell RNA-Seq analyses | 2021 | ✅ | | GNN | GNNImpute | An efficient scRNA-seq dropout imputation method using graph attention network | 2021 | P1 | | Graph Diffusion | MAGIC | MAGIC: A diffusion-based imputation method reveals gene-gene interactions in single-cell RNA-sequencing data | 2018 | P1 | | Probabilistic Model | scImpute | An accurate and robust imputation method scImpute for single-cell RNA-seq data | 2018 | P1 | | GAN | scGAIN | scGAIN: Single Cell RNA-seq Data Imputation using Generative Adversarial Networks | 2019 | P1 | -| NN | DeepImpute | DeepImpute: an accurate, fast, and scalable deep neural network method to impute single-cell RNA-seq data | 2019 | ✅ | +| NN | DeepImpute | DeepImpute: an accurate, fast, and scalable deep neural network method to impute single-cell RNA-seq data | 2019 | ✅ | | NN + TF | Saver-X | Transfer learning in single-cell transcriptomics improves data denoising and pattern discovery | 2019 | P1 | | Model | Evaluation Metric | Mouse Brain (current/reported) | Mouse Embryo (current/reported) | PBMC (current/reported) | @@ -215,12 +215,12 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | ----------------------- | ------------- | ------------------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScDeepsort | Single-cell transcriptomics with weighted GNN | 2021 | ✅ | -| Logistic Regression | Celltypist | Cross-tissue immune cell analysis reveals tissue-specific features in humans. | 2021 | ✅ | -| Random Forest | singleCellNet | SingleCellNet: a computational tool to classify single cell RNA-Seq data across platforms and across species. | 2019 | ✅ | -| Neural Network | ACTINN | ACTINN: automated identification of cell types in single cell RNA sequencing. | 2020 | ✅ | +| GNN | ScDeepsort | Single-cell transcriptomics with weighted GNN | 2021 | ✅ | +| Logistic Regression | Celltypist | Cross-tissue immune cell analysis reveals tissue-specific features in humans. | 2021 | ✅ | +| Random Forest | singleCellNet | SingleCellNet: a computational tool to classify single cell RNA-Seq data across platforms and across species. | 2019 | ✅ | +| Neural Network | ACTINN | ACTINN: automated identification of cell types in single cell RNA sequencing. | 2020 | ✅ | | Hierarchical Clustering | SingleR | Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage. | 2019 | P1 | -| SVM | SVM | A comparison of automatic cell identification methods for single-cell RNA sequencing data. | 2018 | ✅ | +| SVM | SVM | A comparison of automatic cell identification methods for single-cell RNA sequencing data. 
| 2018 | ✅ | | Model | Evaluation Metric | Mouse Brain 2695 (current/reported) | Mouse Spleen 1759 (current/reported) | Mouse Kidney 203 (current/reported) | | ------------- | ----------------- | ----------------------------------- | ------------------------------------ | ----------------------------------- | @@ -234,12 +234,12 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | ----------- | ------------- | ------------------------------------------------------------------------------------------------------------ | ---- | ------- | -| GNN | graph-sc | GNN-based embedding for clustering scRNA-seq data | 2022 | ✅ | -| GNN | scTAG | ZINB-based Graph Embedding Autoencoder for Single-cell RNA-seq Interpretations | 2022 | ✅ | -| GNN | scDSC | Deep structural clustering for single-cell RNA-seq data jointly through autoencoder and graph neural network | 2022 | ✅ | +| GNN | graph-sc | GNN-based embedding for clustering scRNA-seq data | 2022 | ✅ | +| GNN | scTAG | ZINB-based Graph Embedding Autoencoder for Single-cell RNA-seq Interpretations | 2022 | ✅ | +| GNN | scDSC | Deep structural clustering for single-cell RNA-seq data jointly through autoencoder and graph neural network | 2022 | ✅ | | GNN | scGAC | scGAC: a graph attentional architecture for clustering single-cell RNA-seq data | 2022 | P1 | -| AutoEncoder | scDeepCluster | Clustering single-cell RNA-seq data with a model-based deep learning approach | 2019 | ✅ | -| AutoEncoder | scDCC | Model-based deep embedding for constrained clustering analysis of single cell RNA-seq data | 2021 | ✅ | +| AutoEncoder | scDeepCluster | Clustering single-cell RNA-seq data with a model-based deep learning approach | 2019 | ✅ | +| AutoEncoder | scDCC | Model-based deep embedding for constrained clustering analysis of single cell RNA-seq data | 2021 | ✅ | | AutoEncoder | scziDesk | Deep soft K-means clustering with self-training for single-cell RNA sequence data | 2020 | P1 | | Model | Evaluation Metric | 10x PBMC (current/reported) | Mouse ES (current/reported) | Worm Neuron (current/reported) | Mouse Bladder (current/reported) | @@ -256,12 +256,12 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | ---------------- | ------------------------ | -------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | +| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | | GNN | ScMoLP | Link Prediction Variant of ScMoGCN | 2022 | P1 | | GNN | GRAPE | Handling Missing Data with Graph Representation Learning | 2020 | P1 | -| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | -| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | -| Auto-encoder | BABEL | BABEL enables cross-modality translation between multiomic profiles at single-cell resolution | 2021 | ✅ | +| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | +| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | +| Auto-encoder | BABEL | BABEL enables cross-modality translation between multiomic profiles at single-cell resolution | 2021 | ✅ | | Model | Evaluation Metric | GEX2ADT (current/reported) | ADT2GEX (current/reported) | GEX2ATAC (current/reported) | ATAC2GEX (current/reported) | | ------------------------ | ----------------- | -------------------------- | -------------------------- | --------------------------- | --------------------------- | @@ -274,10 +274,10 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | ---------------- | ------------------------ | -------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | +| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | | GNN/Auto-ecnoder | GLUE | Multi-omics single-cell data integration and regulatory inference with graph-linked embedding | 2021 | P1 | -| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | -| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | +| Generative Model | SCMM | SCMM: MIXTURE-OF-EXPERTS MULTIMODAL DEEP GENERATIVE MODEL FOR SINGLE-CELL MULTIOMICS DATA ANALYSIS | 2021 | ✅ | +| Auto-encoder | Cross-modal autoencoders | Multi-domain translation between single-cell imaging and sequencing data using autoencoders | 2021 | ✅ | | Model | Evaluation Metric | GEX2ADT (current/reported) | GEX2ATAC (current/reported) | | ------------------------ | ----------------- | -------------------------- | --------------------------- | @@ -289,11 +289,11 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | ---------------- | ------- | ----------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | -| Auto-encoder | scMVAE | Deep-joint-learning analysis model of single cell transcriptome and open chromatin accessibility data | 2020 | ✅ | -| Auto-encoder | scDEC | Simultaneous deep generative modelling and clustering of single-cell genomic data | 2021 | ✅ | +| GNN | ScMoGCN | Graph Neural Networks for Multimodal Single-Cell Data Integration | 2022 | ✅ | +| Auto-encoder | scMVAE | Deep-joint-learning analysis model of single cell transcriptome and open chromatin accessibility data | 2020 | ✅ | +| Auto-encoder | scDEC | Simultaneous deep generative modelling and clustering of single-cell genomic data | 2021 | ✅ | | GNN/Auto-ecnoder | GLUE | Multi-omics single-cell data integration and regulatory inference with graph-linked embedding | 2021 | P1 | -| Auto-encoder | DCCA | Deep cross-omics cycle attention model for joint analysis of single-cell multi-omics data | 2021 | ✅ | +| Auto-encoder | DCCA | Deep cross-omics cycle attention model for joint analysis of single-cell multi-omics data | 2021 | ✅ | | Model | Evaluation Metric | GEX2ADT (current/reported) | GEX2ATAC (current/reported) | | ---------- | ----------------- | -------------------------- | --------------------------- | @@ -329,11 +329,11 @@ pip install -e . | BackBone | Model | Algorithm | Year | CheckIn | | -------------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | SpaGCN | SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network | 2021 | ✅ | -| GNN | STAGATE | Deciphering spatial domains from spatially resolved transcriptomics with adaptive graph attention auto-encoder | 2021 | ✅ | +| GNN | SpaGCN | SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network | 2021 | ✅ | +| GNN | STAGATE | Deciphering spatial domains from spatially resolved transcriptomics with adaptive graph attention auto-encoder | 2021 | ✅ | | Bayesian | BayesSpace | Spatial transcriptomics at subspot resolution with BayesSpace | 2021 | P1 | -| Pseudo-space-time (PST) Distance | stLearn | stLearn: integrating spatial location, tissue morphology and gene expression to find cell types, cell-cell interactions and spatial trajectories within undissociated tissues | 2020 | ✅ | -| Heuristic | Louvain | Fast unfolding of community hierarchies in large networks | 2008 | ✅ | +| Pseudo-space-time (PST) Distance | stLearn | stLearn: integrating spatial location, tissue morphology and gene expression to find cell types, cell-cell interactions and spatial trajectories within undissociated tissues | 2020 | ✅ | +| Heuristic | Louvain | Fast unfolding of community hierarchies in large networks | 2008 | ✅ | | Model | Evaluation Metric | 151673 (current/reported) | 151676 (current/reported) | 151507 (current/reported) | | ------- | ----------------- | ------------------------- | ------------------------- | ------------------------- | @@ -346,10 +346,10 @@ pip install -e . 
| BackBone | Model | Algorithm | Year | CheckIn | | -------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------- | ---- | ------- | -| GNN | DSTG | DSTG: deconvoluting spatial transcriptomics data through graph-based artificial intelligence | 2021 | ✅ | -| logNormReg | SpatialDecon | Advances in mixed cell deconvolution enable quantification of cell types in spatial transcriptomic data | 2022 | ✅ | -| NNMFreg | SPOTlight | SPOTlight: seeded NMF regression to deconvolute spatial transcriptomics spots with single-cell transcriptomes | 2021 | ✅ | -| NN Linear + CAR assumption | CARD | Spatially informed cell-type deconvolution for spatial transcriptomics | 2022 | ✅ | +| GNN | DSTG | DSTG: deconvoluting spatial transcriptomics data through graph-based artificial intelligence | 2021 | ✅ | +| logNormReg | SpatialDecon | Advances in mixed cell deconvolution enable quantification of cell types in spatial transcriptomic data | 2022 | ✅ | +| NNMFreg | SPOTlight | SPOTlight: seeded NMF regression to deconvolute spatial transcriptomics spots with single-cell transcriptomes | 2021 | ✅ | +| NN Linear + CAR assumption | CARD | Spatially informed cell-type deconvolution for spatial transcriptomics | 2022 | ✅ | | Model | Evaluation Metric | GSE174746 (current/reported) | CARD Synthetic (current/reported) | SPOTlight Synthetic (current/reported) | | ------------ | ----------------- | ---------------------------- | --------------------------------- | -------------------------------------- | From a80eecb172527d3ba13f55f4bde6a798f1a3108b Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Sep 2024 23:24:06 +0800 Subject: [PATCH 066/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index dda0c5b4..4d25e1db 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -67,3 +67,4 @@ mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/ mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 +human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 From 87bedf66a438b4b535c41b41b1969f49d1326563 Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 16:38:59 +0800 Subject: [PATCH 067/203] minor change --- dance/metadata/scdeepsort.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 4d25e1db..6464ba33 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -68,3 +68,5 @@ mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox. 
mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 +human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 +human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 From d44b4bfedf78ea25a7975c004124fbdba389977e Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 20:56:30 +0800 Subject: [PATCH 068/203] minor change --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 6464ba33..e5d04770 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -70,3 +70,4 @@ human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_d human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 +human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 From 27b2e9314a756423b9129e022c367265f75e5454 Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 21:23:37 +0800 Subject: [PATCH 069/203] minor change --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index e5d04770..f2fa3b4a 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -71,3 +71,4 @@ human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_d human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 
human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 +human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 From b71b4d1cf26daa9c7fc55c56042c76e4f9791bbf Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Sep 2024 23:02:22 +0800 Subject: [PATCH 070/203] minor change --- dance/metadata/scdeepsort.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index e5d04770..952e43fc 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -66,8 +66,9 @@ mouse,Brain,3285,train,train_mouse_Brain3285_celltype.csv,https://www.dropbox.co mouse,Brain,753,train,train_mouse_Brain753_celltype.csv,https://www.dropbox.com/s/x2katwk93z06sgw?dl=1,train_mouse_Brain753_data.csv,https://www.dropbox.com/s/3f3wbplgo3xa4ww?dl=1 mouse,Kidney,4682,train,train_mouse_Kidney4682_celltype.csv,https://www.dropbox.com/s/3plrve7g9v428ec?dl=1,train_mouse_Kidney4682_data.csv,https://www.dropbox.com/s/olf5nirtieu1ikq?dl=1 mouse,Spleen,1970,train,train_mouse_Spleen1970_celltype.csv,https://www.dropbox.com/s/3ea64vk546fjxvr?dl=1,train_mouse_Spleen1970_data.csv,https://www.dropbox.com/s/c4te0fr1qicqki8?dl=1 -human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=pvfbzpnz&dl=1 +human,Blood,5749,train,,,train_human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad,https://www.dropbox.com/scl/fi/sjcrmdpjrfhcd7642igc0/human_Blood471647b3-04fe-4c76-8372-3264feb950e8_data.h5ad?rlkey=8gn9esy6osdikumzbq0tk3stv&st=656m2nhu&dl=1 human,Blood,5608,train,,,train_human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad,https://www.dropbox.com/scl/fi/h58s7qzb897p7iytkfkaa/human_Blood84230ea4-998d-4aa8-8456-81dd54ce23af_data.h5ad?rlkey=cfgq0o9vyjrez24162u3z0yjb&st=6h4dcda2&dl=1 human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad,https://www.dropbox.com/scl/fi/3msp96ja6jfh5xlmw7a9x/human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_data.h5ad?rlkey=jpwwi9qs67zsnx1zu854an8q3&st=6c2xelpk&dl=1 human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 
+human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=z5djqf31&dl=1 From 2a92cf7523feb92c6443d8d686714686a28c6a75 Mon Sep 17 00:00:00 2001 From: xzy Date: Sun, 8 Sep 2024 11:07:38 +0800 Subject: [PATCH 071/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 1f90cc6c..417aabb7 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -72,3 +72,4 @@ human,Blood,4232,train,,,train_human_Blood8a554710-08bc-4005-87cd-da9675bdc2e7_d human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad,https://www.dropbox.com/scl/fi/rkihpmulg6mxg0a1o223b/human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_data.h5ad?rlkey=ez8p7a99a265u1g997m4e0xje&st=eenwcwfp&dl=1 human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 +human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/k8cgud4rd93lm8cjwbqhy/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=6jztgtxdd2u28cyoeikt4glo8&st=wihnoizk&dl=1 From a05b1a0d3ff28eeb9c20e9367ac0844a2cbf9712 Mon Sep 17 00:00:00 2001 From: xzy Date: Sun, 8 Sep 2024 20:14:05 +0800 Subject: [PATCH 072/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 417aabb7..e5b239db 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -73,3 +73,5 @@ human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_d human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/k8cgud4rd93lm8cjwbqhy/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=6jztgtxdd2u28cyoeikt4glo8&st=wihnoizk&dl=1 +human,Blood,10000,train,,,train_human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/2vavg60b8kcs63idtley7/human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=zcphpx6fgip8iuqrdzeyq7r29&st=3og2rjo0&dl=1 +human,Blood,10000,train,,,train_human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad,https://www.dropbox.com/scl/fi/gice80zbl1ljei80la4g4/human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad?rlkey=ha3xj7w79u6ogo0djcuklapd5&st=bk1zjffw&dl=1 From 
ccd4168da532817bce665868d2af5ca5266ff8bf Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 9 Sep 2024 11:11:49 +0800 Subject: [PATCH 073/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index e5b239db..8b1c2c44 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -73,5 +73,13 @@ human,Blood,7142,train,,,train_human_Bloodd3566d6a-a455-4a15-980f-45eb29114cab_d human,Blood,9337,train,,,train_human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad,https://www.dropbox.com/scl/fi/s9scv6cscspqifttksy6i/human_Bloodeeacb0c1-2217-4cf6-b8ce-1f0fedf1b569_data.h5ad?rlkey=yuyu1u2qhm21y9on5i4b8o9rq&st=g19couz6&dl=1 human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/xfs40azio1v07rs7f6uuy/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=1qnbtnlb0n9p38csc8un6voef&st=4oqen4p6&dl=1 human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/k8cgud4rd93lm8cjwbqhy/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=6jztgtxdd2u28cyoeikt4glo8&st=wihnoizk&dl=1 -human,Blood,10000,train,,,train_human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/2vavg60b8kcs63idtley7/human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=zcphpx6fgip8iuqrdzeyq7r29&st=3og2rjo0&dl=1 -human,Blood,10000,train,,,train_human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad,https://www.dropbox.com/scl/fi/gice80zbl1ljei80la4g4/human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad?rlkey=ha3xj7w79u6ogo0djcuklapd5&st=bk1zjffw&dl=1 +human,Blood,10000,train,,,train_human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/wqfbtld761iu33xydjih1/human_Blood01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=6fpf8eu9lm6orczc4lzlhu22f&st=joo51ad1&dl=1 +human,Blood,10000,train,,,train_human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad,https://www.dropbox.com/scl/fi/d60o68h6gr42di9x1ogv7/human_Blood055ca631-6ffb-40de-815e-b931e10718c0_data.h5ad?rlkey=hpofxqr2gdue3avafqa09fyw7&st=ykswto0h&dl=1 +human,Blood,10000,train,,,train_human_Blood2a498ace-872a-4935-984b-1afa70fd9886_data.h5ad,https://www.dropbox.com/scl/fi/krlqfjj15wrw8qvz5ra0s/human_Blood2a498ace-872a-4935-984b-1afa70fd9886_data.h5ad?rlkey=4jxlruy78g3hp9yr608pmvs9s&st=org7vf5t&dl=1 +human,Blood,10000,train,,,train_human_Blood2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/xfw2youc9jb2ob9oun26c/human_Blood2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=imvru6vfwile7rrwqjedevhnm&st=khi36f85&dl=1 +human,Blood,10000,train,,,train_human_Blood3faad104-2ab8-4434-816d-474d8d2641db_data.h5ad,https://www.dropbox.com/scl/fi/gygnlfkfdxd0av7vrvub3/human_Blood3faad104-2ab8-4434-816d-474d8d2641db_data.h5ad?rlkey=s3uobbze9qhbrrd2u3qkqbtw6&st=2ignoyi8&dl=1 +human,Blood,10000,train,,,train_human_Blood4c4cd77c-8fee-4836-9145-16562a8782fe_data.h5ad,https://www.dropbox.com/scl/fi/lgqrq59631rxrwrajkii5/human_Blood4c4cd77c-8fee-4836-9145-16562a8782fe_data.h5ad?rlkey=3a1176le2rorzd82rf4mmswyz&st=2bpcrwsq&dl=1 +human,Blood,10000,train,,,train_human_Bloodae29ebd0-1973-40a4-a6af-d15a5f77a80f_data.h5ad,https://www.dropbox.com/scl/fi/ax97ls2ojm3x5asoip2ji/human_Bloodae29ebd0-1973-40a4-a6af-d15a5f77a80f_data.h5ad?rlkey=k51mhhavmzfy4xq8gx52tjtq8&st=14tfycvk&dl=1 
+human,Blood,10000,train,,,train_human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad,https://www.dropbox.com/scl/fi/md5mqb2dh0w9655v0c281/human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad?rlkey=afdyzlpcmd44lo7tl5gnzw8u3&st=gt2fdipz&dl=1 +human,Blood,10000,train,,,train_human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad,https://www.dropbox.com/scl/fi/1cih4y8h03dboijqieheg/human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad?rlkey=yupm1kblpt9a8qlmksz1u3xob&st=9jurnfn4&dl=1 +human,Blood,10000,train,,,train_human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad,https://www.dropbox.com/scl/fi/b2hvwk1xmbc6ifhouz4kv/human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad?rlkey=82vzr0qcii75tm4sjsn2xw89g&st=6pwjmxnk&dl=1 From 1c19f369719d7124a4e408242afa971f61b5ad91 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 10 Sep 2024 15:44:21 +0800 Subject: [PATCH 074/203] update main --- examples/tuning/cta_celltypist/main.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index d3539816..c7a049ec 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,8 +7,8 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist @@ -25,7 +25,7 @@ help="Whether to refine the predicted labels via majority voting after over-clustering.") parser.add_argument("--n_jobs", type=int, help="Number of jobs", default=10) parser.add_argument("--species", default="mouse", type=str) - parser.add_argument("--test_dataset", nargs="+", default=[1759], help="List of testing dataset ids.") + parser.add_argument("--test_dataset", nargs="+", default=[], help="List of testing dataset ids.") parser.add_argument("--tissue", default="Spleen", type=str) parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") @@ -38,13 +38,15 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"Running Celltypist with the following parameters:\n{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") MAINDIR = Path(__file__).resolve().parent @@ -64,7 +66,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, species=args.species, tissue=args.tissue, valid_dataset=args.valid_dataset, - data_dir="../temp_data").load_data() + data_dir="../temp_data", filetype=args.filetype).load_data() print(f"Pipeline 
config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 6d1f47874336706a9e6f06021fad189298c1b5a7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:45:13 +0000 Subject: [PATCH 075/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_celltypist/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index c7a049ec..c625065f 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,8 +7,8 @@ import numpy as np import torch - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist From 1c86b2bec947261212279546aa775901e9f76db4 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 10 Sep 2024 15:49:26 +0800 Subject: [PATCH 076/203] update main --- examples/tuning/cta_actinn/main.py | 4 ++-- examples/tuning/cta_scdeepsort/main.py | 13 +++++++------ examples/tuning/cta_singlecellnet/main.py | 13 +++++++------ examples/tuning/imputation_deepimpute/main.py | 2 +- examples/tuning/imputation_graphsci/main.py | 2 +- examples/tuning/imputation_scgnn2/main.py | 2 +- examples/tuning/joint_embedding_dcca/main.py | 15 ++++++++++++--- 7 files changed, 31 insertions(+), 20 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 616c0d10..84c30745 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -5,8 +5,8 @@ from typing import get_args import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN @@ -55,7 +55,7 @@ pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") logger.setLevel(args.log_level) - logger.info(f"Running SVM with the following parameters:\n{pprint.pformat(vars(args))}") + logger.info(f"Running ACTINN with the following parameters:\n{pprint.pformat(vars(args))}") def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) diff --git a/examples/tuning/cta_scdeepsort/main.py b/examples/tuning/cta_scdeepsort/main.py index 86cc5d52..1d928be6 100644 --- a/examples/tuning/cta_scdeepsort/main.py +++ b/examples/tuning/cta_scdeepsort/main.py @@ -7,8 +7,8 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.scdeepsort import ScDeepSort @@ -29,10 +29,10 @@ parser.add_argument("--n_epochs", type=int, default=100, help="number of training epochs") parser.add_argument("--n_layers", type=int, default=1, help="number of hidden gcn layers") parser.add_argument("--species", default="mouse", type=str) - parser.add_argument("--test_dataset", nargs="+", type=int, default=[1759], help="Testing dataset IDs") + parser.add_argument("--test_dataset", nargs="+", type=int, default=[], help="Testing dataset IDs") parser.add_argument("--test_rate", type=float, default=0.2) parser.add_argument("--tissue", default="Spleen", type=str) - 
parser.add_argument("--train_dataset", nargs="+", type=int, default=[1970], help="List of training dataset ids.") + parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") parser.add_argument("--weight_decay", type=float, default=5e-4, help="Weight for L2 loss") parser.add_argument("--seed", type=int, default=42) @@ -42,14 +42,15 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) - + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"Running ScDeepSort with the following parameters:\n{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") @@ -61,7 +62,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(species=args.species, tissue=args.tissue, test_dataset=args.test_dataset, train_dataset=args.train_dataset, valid_dataset=args.valid_dataset, - data_dir="../temp_data").load_data() + data_dir="../temp_data", filetype=args.filetype).load_data() # Prepare preprocessing pipeline and apply it to data kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index 892c6633..b1b87d5a 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,8 +6,8 @@ from typing import get_args import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet @@ -28,10 +28,9 @@ parser.add_argument("--num_trees", type=int, default=1000) parser.add_argument("--species", default="mouse", type=str) parser.add_argument("--stratify", type=bool, default=True) - parser.add_argument("--test_dataset", type=int, nargs="+", default=[1759], - help="List testing training dataset ids.") + parser.add_argument("--test_dataset", nargs="+", default=[], help="List testing training dataset ids.") parser.add_argument("--tissue", default="Spleen", type=str) - parser.add_argument("--train_dataset", type=int, nargs="+", default=[1970], help="List of training dataset ids.") + parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.") parser.add_argument("--valid_dataset", nargs="+", default=None, help="List of valid dataset ids.") parser.add_argument("--seed", type=int, default=10) @@ -40,13 +39,15 @@ parser.add_argument("--sweep_id", type=str, default=None) parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) 
parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + parser.add_argument("--filetype", default="csv") args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"{pprint.pformat(vars(args))}") file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) - for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] if dataset is not None + for dataset in [args.train_dataset, args.valid_dataset, args.test_dataset] + if (dataset is not None and dataset != []) ])).resolve() logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") @@ -62,7 +63,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, species=args.species, tissue=args.tissue, valid_dataset=args.valid_dataset, - data_dir="../temp_data").load_data(cache=args.cache) + data_dir="../temp_data", filetype=args.filetype).load_data(cache=args.cache) kwargs = {tune_mode: dict(wandb.config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") diff --git a/examples/tuning/imputation_deepimpute/main.py b/examples/tuning/imputation_deepimpute/main.py index 35989205..041ee8b9 100644 --- a/examples/tuning/imputation_deepimpute/main.py +++ b/examples/tuning/imputation_deepimpute/main.py @@ -5,8 +5,8 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.deepimpute import DeepImpute diff --git a/examples/tuning/imputation_graphsci/main.py b/examples/tuning/imputation_graphsci/main.py index 59705b7a..353cb246 100644 --- a/examples/tuning/imputation_graphsci/main.py +++ b/examples/tuning/imputation_graphsci/main.py @@ -7,9 +7,9 @@ import anndata as ad import numpy as np import torch -import wandb import dance.transforms.normalize as NormFuncs +import wandb from dance import logger from dance.data import Data from dance.datasets.singlemodality import ImputationDataset diff --git a/examples/tuning/imputation_scgnn2/main.py b/examples/tuning/imputation_scgnn2/main.py index aa5f31f9..ee75b386 100644 --- a/examples/tuning/imputation_scgnn2/main.py +++ b/examples/tuning/imputation_scgnn2/main.py @@ -5,8 +5,8 @@ from pprint import pformat import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.scgnn2 import ScGNN2 diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index 7a1ed2b3..a17d10f6 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -95,11 +95,19 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # layer_e_2=[Nfeature2, 1500, 128], hidden1_2=128, Zdim_2=4, layer_d_2=[4], 
hidden2_2=4, args=args, # Type_1="NB", Type_2="Bernoulli", ground_truth1=torch.cat([train_labels, test_labels]), cycle=1, # attention_loss="Eucli") # yapf: disable + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return try: dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") data = dataset.load_data() # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} + kwargs = {tune_mode: dict(wandb_config)} preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) @@ -120,7 +128,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) y_train_size), train_labels = data.get_train_data(return_type="torch") (x_test, y_test, x_test_raw, y_test_raw, x_test_size, y_test_size), test_labels = data.get_test_data(return_type="torch") - + train_idx=data.get_split_idx("train") + test_idx=data.get_split_idx("test") Nfeature1 = x_train.shape[1] Nfeature2 = y_train.shape[1] From 9cc76e433ae978788acc8e0fa69d9bc56e47631d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:50:43 +0000 Subject: [PATCH 077/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_actinn/main.py | 2 +- examples/tuning/cta_scdeepsort/main.py | 2 +- examples/tuning/cta_singlecellnet/main.py | 2 +- examples/tuning/imputation_deepimpute/main.py | 2 +- examples/tuning/imputation_graphsci/main.py | 2 +- examples/tuning/imputation_scgnn2/main.py | 2 +- examples/tuning/joint_embedding_dcca/main.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/tuning/cta_actinn/main.py b/examples/tuning/cta_actinn/main.py index 84c30745..230b3295 100644 --- a/examples/tuning/cta_actinn/main.py +++ b/examples/tuning/cta_actinn/main.py @@ -5,8 +5,8 @@ from typing import get_args import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN diff --git a/examples/tuning/cta_scdeepsort/main.py b/examples/tuning/cta_scdeepsort/main.py index 1d928be6..40e1e7af 100644 --- a/examples/tuning/cta_scdeepsort/main.py +++ b/examples/tuning/cta_scdeepsort/main.py @@ -7,8 +7,8 @@ import numpy as np import torch - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.scdeepsort import ScDeepSort diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index b1b87d5a..cc5406d9 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,8 +6,8 @@ from typing import get_args import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet diff --git a/examples/tuning/imputation_deepimpute/main.py b/examples/tuning/imputation_deepimpute/main.py index 041ee8b9..35989205 100644 --- 
a/examples/tuning/imputation_deepimpute/main.py +++ b/examples/tuning/imputation_deepimpute/main.py @@ -5,8 +5,8 @@ import numpy as np import torch - import wandb + from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.deepimpute import DeepImpute diff --git a/examples/tuning/imputation_graphsci/main.py b/examples/tuning/imputation_graphsci/main.py index 353cb246..59705b7a 100644 --- a/examples/tuning/imputation_graphsci/main.py +++ b/examples/tuning/imputation_graphsci/main.py @@ -7,9 +7,9 @@ import anndata as ad import numpy as np import torch +import wandb import dance.transforms.normalize as NormFuncs -import wandb from dance import logger from dance.data import Data from dance.datasets.singlemodality import ImputationDataset diff --git a/examples/tuning/imputation_scgnn2/main.py b/examples/tuning/imputation_scgnn2/main.py index ee75b386..aa5f31f9 100644 --- a/examples/tuning/imputation_scgnn2/main.py +++ b/examples/tuning/imputation_scgnn2/main.py @@ -5,8 +5,8 @@ from pprint import pformat import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import ImputationDataset from dance.modules.single_modality.imputation.scgnn2 import ScGNN2 diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index a17d10f6..dd8f9f76 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From e3ba672e679ee47ff2ff1846786e27db3d344456 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 13 Sep 2024 23:29:36 +0800 Subject: [PATCH 078/203] update scn --- dance/transforms/scn_feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/transforms/scn_feature.py b/dance/transforms/scn_feature.py index da86e5d1..9a959a3a 100644 --- a/dance/transforms/scn_feature.py +++ b/dance/transforms/scn_feature.py @@ -49,7 +49,7 @@ def __call__(self, data): # sc.pp.scale(adata, max_value=10) # Filtering shouldn't be here norm_exp_df = adata.to_df() - cell_type_df = cell_type_df.loc[adata.obs_names] # not necessary, but kept here in case we subsample cells + # cell_type_df = cell_type_df.loc[adata.obs_names] # not necessary, but kept here in case we subsample cells # Get differentially expressed genes and gene pairs cell_type_array = cell_type_df.columns.values[cell_type_df.values.argmax(1)] From 04a3a4bac72c9b4f5e1fbb9bcbe394f6559c48c1 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Sep 2024 17:19:26 +0800 Subject: [PATCH 079/203] minor change --- get_result_web.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 get_result_web.py diff --git a/get_result_web.py b/get_result_web.py new file mode 100644 index 00000000..8d79d857 --- /dev/null +++ b/get_result_web.py @@ -0,0 +1,85 @@ +import json +import os + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from dance.utils import try_import + +# os.environ["http_proxy"]="http://121.250.209.147:7890" +# os.environ["https_proxy"]="http://121.250.209.147:7890" +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" 
+collect_datasets = {
+    "cta_actinn": [
+        "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab",
+        "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+        "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+        "d9b4bc69-ed90-4f5f-99b2-61b0681ba436"
+    ],
+    "cta_celltypist": [
+        "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+        "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+        "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031",
+        "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886",
+        "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db"
+    ],
+    "cta_singlecellnet": [
+        "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+        "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+        "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031",
+        "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886",
+        "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db"
+    ]
+}
+file_root = "/home/zyxing/dance/examples/tuning"
+
+
+def check_identical_strings(string_list):
+    if not string_list:
+        raise ValueError("The list is empty")
+
+    arr = np.array(string_list)
+    if not np.all(arr == arr[0]):
+        raise ValueError("Found differing strings")
+
+    return string_list[0]
+
+
+    # if not string_list:
+    #     raise ValueError("The list is empty")
+    #     first_string = string_list[0]
+    #     for s in string_list[1:]:
+    #         if s != first_string:
+    #             raise ValueError(f"Found differing strings: '{first_string}' and '{s}'")
+    #     return first_string
+def get_sweep_url(step_csv: pd.DataFrame):
+    ids = step_csv["id"]
+    sweep_urls = []
+    for run_id in tqdm(ids, leave=False):
+        api = wandb.Api()
+        run = api.run(f"/{entity}/{project}/runs/{run_id}")
+        sweep_urls.append(run.sweep.url)
+    sweep_url = check_identical_strings(sweep_urls)
+    return sweep_url
+
+
+def write_ans():
+    ans = []
+    for method_folder in tqdm(collect_datasets):
+        for dataset_id in collect_datasets[method_folder]:
+            file_path = f"{file_root}/{method_folder}/{dataset_id}/results"
+            step2_url = get_sweep_url(pd.read_csv(f"{file_path}/pipeline/best_test_acc.csv"))
+            step3_urls = []
+            for i in range(3):
+                step3_urls.append(get_sweep_url(pd.read_csv(f"{file_path}/params/{i}_best_test_acc.csv")))
+            step3_str = ",".join(step3_urls)
+            step_str = f"step2:{step2_url}|step3:{step3_str}"
+            ans.append({"Dataset_id": dataset_id, method_folder: step_str})
+    with open('temp_ans.json', 'w') as f:
+        json.dump(ans, f)
+
+
+write_ans()

From ab7e193de5833450e0db3df9dde010c7ced8fc21 Mon Sep 17 00:00:00 2001
From: xzy
Date: Thu, 19 Sep 2024 23:36:13 +0800
Subject: [PATCH 080/203] update_get_result_web

---
 get_result_web.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/get_result_web.py b/get_result_web.py
index 8d79d857..cb5018ef 100644
--- a/get_result_web.py
+++ b/get_result_web.py
@@ -7,8 +7,8 @@
 
 from dance.utils import try_import
 
-# os.environ["http_proxy"]="http://121.250.209.147:7890"
-# os.environ["https_proxy"]="http://121.250.209.147:7890"
+os.environ["http_proxy"] = "http://121.250.209.147:7890"
+os.environ["https_proxy"] = "http://121.250.209.147:7890"
 wandb = try_import("wandb")
 entity = "xzy11632"
 project = "dance-dev"
@@ -55,13 +55,16 @@ def check_identical_strings(string_list):
     #         if s != first_string:
     #             raise ValueError(f"Found differing strings: '{first_string}' and '{s}'")
     #     return first_string
-def get_sweep_url(step_csv: pd.DataFrame):
+def get_sweep_url(step_csv: pd.DataFrame, single=True):
     ids = step_csv["id"]
     sweep_urls = []
-    for run_id in tqdm(ids, leave=False):
+    for run_id in tqdm(reversed(ids),
+                       leave=False):  #The reversal of order is related to additional_sweep_ids.append(sweep_id)
         api = wandb.Api()
         run = api.run(f"/{entity}/{project}/runs/{run_id}")
         sweep_urls.append(run.sweep.url)
+        if single:
+            break
     sweep_url = check_identical_strings(sweep_urls)
     return sweep_url
 
@@ -74,7 +77,11 @@ def write_ans():
             step2_url = get_sweep_url(pd.read_csv(f"{file_path}/pipeline/best_test_acc.csv"))
             step3_urls = []
             for i in range(3):
-                step3_urls.append(get_sweep_url(pd.read_csv(f"{file_path}/params/{i}_best_test_acc.csv")))
+                file_csv = f"{file_path}/params/{i}_best_test_acc.csv"
+                if not os.path.exists(file_csv):
+                    print(f"File {file_csv} does not exist, skipping.")
+                    continue
+                step3_urls.append(get_sweep_url(pd.read_csv(file_csv)))
             step3_str = ",".join(step3_urls)
             step_str = f"step2:{step2_url}|step3:{step3_str}"
             ans.append({"Dataset_id": dataset_id, method_folder: step_str})

From f1e85b07225d2a8b74936dc4f4f2f4b9e8445b86 Mon Sep 17 00:00:00 2001
From: xzy
Date: Sun, 29 Sep 2024 09:21:22 +0800
Subject: [PATCH 081/203] update 159

---
 .../multi_modality/joint_embedding/scmogcn.py |   2 +-
 .../multi_modality/joint_embedding/scmvae.py  |   5 +-
 .../tuning/joint_embedding_scmogcn/main.py    | 149 +++++++++------
 .../tuning/joint_embedding_scmvae/main.py     | 177 ++++++++++--------
 4 files changed, 197 insertions(+), 136 deletions(-)

diff --git a/dance/modules/multi_modality/joint_embedding/scmogcn.py b/dance/modules/multi_modality/joint_embedding/scmogcn.py
index f0567293..a5fd4bd0 100644
--- a/dance/modules/multi_modality/joint_embedding/scmogcn.py
+++ b/dance/modules/multi_modality/joint_embedding/scmogcn.py
@@ -116,7 +116,7 @@ def fit(self, g_mod1, g_mod2, train_size, cell_type, batch_label, phase_score):
             Bipartite expression feature graph for modality 1.
         g_mod2 : dgl.DGLGraph
             Bipartite expression feature graph for modality 2.
-        train_size : int
+        train_size : int or array_like
            Number of training samples.
         labels : torch.Tensor
            Labels for training samples.
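Note on the docstring change above: train_size is widened from int to "int or array_like",
which matches the updated example script later in this patch where the call passes
train_size=train_idx (an explicit list of training indices) instead of a sample count.
The snippet below is an illustrative sketch only, not part of the patch; the helper name
resolve_train_indices is hypothetical and simply shows how an int-or-index-array argument
might be normalized into an index array before building a training mask.

    import numpy as np

    def resolve_train_indices(train_size, num_samples):
        # Hypothetical helper: accept an int (use the first train_size samples)
        # or an array-like of explicit sample indices, and return an index array.
        if np.isscalar(train_size):
            assert int(train_size) <= num_samples, "train_size exceeds the number of samples"
            return np.arange(int(train_size))
        idx = np.asarray(train_size, dtype=int)
        assert idx.size == 0 or idx.max() < num_samples, "training index out of range"
        return idx

    print(resolve_train_indices(3, 10))          # [0 1 2]
    print(resolve_train_indices([0, 4, 7], 10))  # [0 4 7]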
diff --git a/dance/modules/multi_modality/joint_embedding/scmvae.py b/dance/modules/multi_modality/joint_embedding/scmvae.py index 3905f2fe..36837053 100644 --- a/dance/modules/multi_modality/joint_embedding/scmvae.py +++ b/dance/modules/multi_modality/joint_embedding/scmvae.py @@ -369,7 +369,7 @@ def _inference(self, X1=None, X2=None): if X1 is not None: if self.log_variational: - X1_ = torch.log(X1_ + 1) + X1_ = torch.log(torch.clamp(X1_,min=1e-7)+ 1) mean_l, logvar_l, library = self.X1_encoder_l(X1_) @@ -380,7 +380,8 @@ def _inference(self, X1=None, X2=None): if self.Type == 'ZINB': if self.log_variational: - X2_ = torch.log(X2_ + 1) + # X2_ = torch.log(X2_ + 1) + X2_ = torch.log(torch.clamp(X2_,min=1e-7)+ 1) mean_l2, logvar_l2, library2 = self.X2_encoder_l(X2_) means, logvar = self._encode_modalities(X1_, X2_) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 90ec2e96..bbb6d890 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -20,7 +21,7 @@ parser = argparse.ArgumentParser() parser.add_argument( "-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_SKIN_atac2gex","openproblems_2022_multi_atac2gex"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -55,66 +56,98 @@ logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" - + os.environ["CUDA_LAUNCH_BLOCKING"]="1" + os.environ["WANDB_AGENT_DISABLE_FLAPPING"] = "True" def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) - data = dataset.load_data() - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - if args.preprocess != "aux": - cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() - cell_type_labels_unique = list(np.unique(cell_type_labels)) - c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) - data.data['mod1'].obsm["cell_type"] = c_labels - data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) - data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) - - train_size = len(data.get_split_idx("train")) - - data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) - data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) - # 
data.set_config( - # feature_mod=["mod1", "mod2"], - # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], - # feature_channel=["X_pca", "X_pca"], - # label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], - # ) - (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") - phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) - test_id = np.arange(x_mod1.shape[0]) - labels = cell_type.numpy() - adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] - model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), - num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) - model.fit( - g_mod1=data.data["mod1"].uns["g"], - g_mod2=data.data["mod2"].uns["g"], - train_size=train_size, - cell_type=cell_type, - batch_label=batch_label, - phase_score=phase_score, - ) - - embeds = model.predict(test_id).cpu().numpy() - score = model.score(test_id, labels, metric="clustering") - # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) - score.update({ - 'subtask': args.subtask, - 'method': 'scmogcn', - }) - - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess) + data = dataset.load_data() + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb_config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + # train_idx=list(set(data.mod["meta1"].obs_names) & set(data.mod["mod1"].obs_names)) + train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) + + # train_size=data.mod["meta1"].shape[0] + # test_size=data.mod["mod1"].shape[0]-train_size + data.set_split_idx("train",train_idx) + data.set_split_idx("test",test_idx) + if args.preprocess != "aux": + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) + + # train_size = len(data.get_split_idx("train")) + #按理说meta1应该包括mod1前半部分的所有内容,可能中途打乱了顺序 + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) + data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) + # data.set_config( + # feature_mod=["mod1", "mod2"], + # label_mod=["mod1", "mod1", "mod1", "mod1", "mod1"], + # 
feature_channel=["X_pca", "X_pca"], + # label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], + # ) + (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") + phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) + test_id = np.arange(x_mod1.shape[0]) + labels = cell_type.numpy() + adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] + model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), + num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) + model.fit( + g_mod1=data.data["mod1"].uns["g"], + g_mod2=data.data["mod2"].uns["g"], + train_size=train_idx, + cell_type=cell_type, + batch_label=batch_label, + phase_score=phase_score, + ) + + embeds = model.predict(test_id).cpu().numpy() + score = model.score(test_id, labels, metric="clustering") + # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + score.update({ + 'subtask': args.subtask, + 'method': 'scmogcn', + }) + + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + finally: + # del data,model,adata_sol,adata,embeds,emb1, emb2,total_loader,total,test_loader,test,train_loader,train,Nfeature2,Nfeature1 + # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels + # del labels,le,dataset,score + # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] + locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index c52f2108..b3070b40 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -71,81 +72,107 @@ def parameter_setting(): def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - - (x_train, y_train), _ = data.get_train_data(return_type="torch") - (x_test, y_test), labels = data.get_test_data(return_type="torch") - - lib_mean1, lib_var1 = 
calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) - lib_mean1 = torch.from_numpy(lib_mean1) - lib_var1 = torch.from_numpy(lib_var1) - lib_mean2 = torch.from_numpy(lib_mean2) - lib_var2 = torch.from_numpy(lib_var2) - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - train_size = len(data.get_split_idx("train")) - train = data_utils.TensorDataset(x_train, lib_mean1[:train_size], lib_var1[:train_size], lib_mean2[:train_size], - lib_var2[:train_size], y_train) - - valid = data_utils.TensorDataset(x_test, lib_mean1[train_size:], lib_var1[train_size:], lib_mean2[train_size:], - lib_var2[train_size:], y_test) - - total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - - # embeds = model.predict(x_test, y_test).cpu().numpy() - score = model.score(x_test, y_test, labels) - score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod1"].obsm["labels"] = labels + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb_config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) + + # train_size=data.mod["meta1"].shape[0] + # test_size=data.mod["mod1"].shape[0]-train_size + data.set_split_idx("train",train_idx) + data.set_split_idx("test",test_idx) + (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch") + (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch") + # 
x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) + lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) + lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) + lib_mean1 = torch.from_numpy(lib_mean1) + lib_var1 = torch.from_numpy(lib_var1) + lib_mean2 = torch.from_numpy(lib_mean2) + lib_var2 = torch.from_numpy(lib_var2) + + Nfeature1 = x_train.shape[1] + Nfeature2 = y_train.shape[1] + # train_size = len(data.get_split_idx("train")) + # train_size=x_train.shape[0] + train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], + lib_var2[train_idx], y_train) + + valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], + lib_var2[test_idx], y_test) + + total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) + + total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) + + x_test = torch.cat([x_train, x_test]) + y_test = torch.cat([y_train, y_test]) + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 + model = scMVAE( + encoder_1=[Nfeature1, 1024, 128, 128], + hidden_1=128, + Z_DIMS=22, + decoder_share=[22, 128, 256], + share_hidden=128, + decoder_1=[128, 128, 1024], + hidden_2=1024, + encoder_l=[Nfeature1, 128], + hidden3=128, + encoder_2=[Nfeature2, 1024, 128, 128], + hidden_4=128, + encoder_l1=[Nfeature2, 128], + hidden3_1=128, + decoder_2=[128, 128, 1024], + hidden_5=1024, + drop_rate=0.1, + log_variational=True, + Type="ZINB", + device=device, + n_centroids=22, + penality="GMM", + model=1, + ) + model.to(device) + model.init_gmm_params(total_loader) + model.fit(args, train, valid, args.final_rate, args.scale_factor, device) + + # embeds = model.predict(x_test, y_test).cpu().numpy() + score = model.score(x_test, y_test, labels) + # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) + score["ARI"] = score["dance_ari"] + del score["dance_ari"] + wandb.log(score) + wandb.finish() + finally: + locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From ab0b5dfd5793be26c71bf10f800801312770c864 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Sep 2024 01:21:54 +0000 Subject: [PATCH 082/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../multi_modality/joint_embedding/scmvae.py | 4 +-- .../tuning/joint_embedding_scmogcn/main.py | 31 +++++++++++-------- .../tuning/joint_embedding_scmvae/main.py | 22 ++++++------- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/dance/modules/multi_modality/joint_embedding/scmvae.py b/dance/modules/multi_modality/joint_embedding/scmvae.py index 36837053..0173ea39 100644 --- a/dance/modules/multi_modality/joint_embedding/scmvae.py +++ b/dance/modules/multi_modality/joint_embedding/scmvae.py @@ -369,7 +369,7 @@ def _inference(self, X1=None, X2=None): if X1 is not None: if 
self.log_variational: - X1_ = torch.log(torch.clamp(X1_,min=1e-7)+ 1) + X1_ = torch.log(torch.clamp(X1_, min=1e-7) + 1) mean_l, logvar_l, library = self.X1_encoder_l(X1_) @@ -381,7 +381,7 @@ def _inference(self, X1=None, X2=None): if self.Type == 'ZINB': if self.log_variational: # X2_ = torch.log(X2_ + 1) - X2_ = torch.log(torch.clamp(X2_,min=1e-7)+ 1) + X2_ = torch.log(torch.clamp(X2_, min=1e-7) + 1) mean_l2, logvar_l2, library2 = self.X2_encoder_l(X2_) means, logvar = self._encode_modalities(X1_, X2_) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index bbb6d890..39152202 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -20,8 +20,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_SKIN_atac2gex","openproblems_2022_multi_atac2gex"]) + "-t", "--subtask", default="openproblems_bmmc_cite_phase2", choices=[ + "GSE140203_BRAIN_atac2gex", "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", + "GSE140203_SKIN_atac2gex", "openproblems_2022_multi_atac2gex" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -56,8 +58,9 @@ logger.info(f"\n files is saved in {file_root_path}") pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" - os.environ["CUDA_LAUNCH_BLOCKING"]="1" + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" os.environ["WANDB_AGENT_DISABLE_FLAPPING"] = "True" + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) @@ -78,14 +81,14 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) # train_idx=list(set(data.mod["meta1"].obs_names) & set(data.mod["mod1"].obs_names)) - train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - + train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) + # train_size=data.mod["meta1"].shape[0] # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train",train_idx) - data.set_split_idx("test",test_idx) + data.set_split_idx("train", train_idx) + data.set_split_idx("test", test_idx) if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) @@ -106,13 +109,15 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # feature_channel=["X_pca", "X_pca"], # label_channel=["cell_type", "batch_label", "phase_labels", 
"S_scores", "G2M_scores"], # ) - (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") + (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, + G2M_score) = data.get_data(return_type="torch") phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) test_id = np.arange(x_mod1.shape[0]) labels = cell_type.numpy() adata_sol = data.data['test_sol'] # [data._split_idx_dict['test']] - model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), num_batches=int(batch_label.max() + 1), - num_phases=phase_score.shape[1], num_features=x_mod1.shape[1] + x_mod2.shape[1]) + model = ScMoGCNWrapper(args, num_celL_types=int(cell_type.max() + 1), + num_batches=int(batch_label.max() + 1), num_phases=phase_score.shape[1], + num_features=x_mod1.shape[1] + x_mod2.shape[1]) model.fit( g_mod1=data.data["mod1"].uns["g"], g_mod2=data.data["mod2"].uns["g"], @@ -139,7 +144,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # del x_train, y_train, x_train_raw, y_train_raw, x_train_size,y_train_size,train_labels,x_test, y_test, x_test_raw, y_test_raw, x_test_size,y_test_size, test_labels # del labels,le,dataset,score # variables_to_delete=["data","model","adata_sol","adata","embeds","emb1", "emb2","total_loader","total,test_loader","test,train_loader","train","Nfeature2","Nfeature1","x_train", "y_train", "x_train_raw", "y_train_raw", "x_train_size","y_train_size","train_labels","x_test", "y_test"," x_test_raw", y_test_raw, x_test_size,y_test_size, test_labels,labels,le,dataset,score] - locals_keys=list(locals().keys()) + locals_keys = list(locals().keys()) for var in locals_keys: try: exec(f"del {var}") diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index b3070b40..9fb85885 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -93,16 +93,16 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - + train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) + # train_size=data.mod["meta1"].shape[0] # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train",train_idx) - data.set_split_idx("test",test_idx) - (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch") - (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch") + data.set_split_idx("train", train_idx) + data.set_split_idx("test", test_idx) + (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, y_test_raw), labels = data.get_test_data(return_type="torch") # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) 
lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) @@ -116,10 +116,10 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # train_size = len(data.get_split_idx("train")) # train_size=x_train.shape[0] train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], - lib_var2[train_idx], y_train) + lib_var2[train_idx], y_train) valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], - lib_var2[test_idx], y_test) + lib_var2[test_idx], y_test) total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) @@ -164,7 +164,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) wandb.log(score) wandb.finish() finally: - locals_keys=list(locals().keys()) + locals_keys = list(locals().keys()) for var in locals_keys: try: exec(f"del {var}") From cbc4469aa3b7a923fb658107c2809a562511d319 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 3 Oct 2024 00:09:55 +0800 Subject: [PATCH 083/203] update metadata --- dance/metadata/scdeepsort.csv | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 8b1c2c44..fc732dad 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -83,3 +83,8 @@ human,Blood,10000,train,,,train_human_Bloodae29ebd0-1973-40a4-a6af-d15a5f77a80f_ human,Blood,10000,train,,,train_human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad,https://www.dropbox.com/scl/fi/md5mqb2dh0w9655v0c281/human_Bloodbc260987-8ee5-4b6e-8773-72805166b3f7_data.h5ad?rlkey=afdyzlpcmd44lo7tl5gnzw8u3&st=gt2fdipz&dl=1 human,Blood,10000,train,,,train_human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad,https://www.dropbox.com/scl/fi/1cih4y8h03dboijqieheg/human_Bloodbc2a7b3d-f04e-477e-96c9-9d5367d5425c_data.h5ad?rlkey=yupm1kblpt9a8qlmksz1u3xob&st=9jurnfn4&dl=1 human,Blood,10000,train,,,train_human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad,https://www.dropbox.com/scl/fi/b2hvwk1xmbc6ifhouz4kv/human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_data.h5ad?rlkey=82vzr0qcii75tm4sjsn2xw89g&st=6pwjmxnk&dl=1 +human,Blood,549,train,,,train_human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=af7wxs06&dl=1 +human,Blood,1324,train,,,train_human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad,https://www.dropbox.com/scl/fi/kbuvlttd8dfmvx1v94fr4/human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad?rlkey=v1ne1dg2gl8b4j6qj3j3ry6fy&st=gy9vb5q6&dl=1 +human,Blood,10000,train,,,train_human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad,https://www.dropbox.com/scl/fi/8wq8eaod0xuvgwhsjoapa/human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad?rlkey=b6u3b7335l7baricjlgbwthb3&st=cw7mjmx5&dl=1 +human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=cfgc3m7s&dl=1 
+human,Blood,10000,train,,,train_human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad,https://www.dropbox.com/scl/fi/kgay0bhk4er6qjx96okrz/human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad?rlkey=m5ax0vhx3vh7ylo4pc74tx9ky&st=sbhonz18&dl=1 From 26c73bafacc4f4a5bad4ce06c1ce3bda73d64344 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 3 Oct 2024 00:11:19 +0800 Subject: [PATCH 084/203] minor change --- get_result_web.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 8d79d857..726e4426 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -7,31 +7,32 @@ from dance.utils import try_import -# os.environ["http_proxy"]="http://121.250.209.147:7890" -# os.environ["https_proxy"]="http://121.250.209.147:7890" +os.environ["http_proxy"] = "http://121.250.209.147:7890" +os.environ["https_proxy"] = "http://121.250.209.147:7890" wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" collect_datasets = { "cta_actinn": [ - "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab", - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436" - ], - "cta_celltypist": [ - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", + "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" ], + "cta_celltypist": [ + "471647b3-04fe-4c76-8372-3264feb950e8", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + ], + "cta_scdeepsort": [ + "471647b3-04fe-4c76-8372-3264feb950e8", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + ], "cta_singlecellnet": [ - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" + "471647b3-04fe-4c76-8372-3264feb950e8", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", ] } file_root = "/home/zyxing/dance/examples/tuning" @@ -55,13 +56,16 @@ def check_identical_strings(string_list): # if s != first_string: # raise ValueError(f"发现不同的字符串: '{first_string}' 和 '{s}'") # return first_string -def get_sweep_url(step_csv: pd.DataFrame): +def get_sweep_url(step_csv: pd.DataFrame, single=True): ids = step_csv["id"] sweep_urls = [] - for run_id in tqdm(ids, leave=False): + for run_id in tqdm(reversed(ids), + leave=False): #The reversal of order is related to additional_sweep_ids.append(sweep_id) api = wandb.Api() run = api.run(f"/{entity}/{project}/runs/{run_id}") sweep_urls.append(run.sweep.url) + if single: + break sweep_url = check_identical_strings(sweep_urls) return sweep_url From 
4f45b1753deb614b1416007cb660ecbdaad3c1b9 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 2 Oct 2024 12:22:56 -0400 Subject: [PATCH 085/203] minor change --- get_result_web.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 8d79d857..2a5df8b8 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -13,13 +13,12 @@ entity = "xzy11632" project = "dance-dev" collect_datasets = { - "cta_actinn": [ - "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab", - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436" - ], "cta_celltypist": [ + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "d3566d6a-a455-4a15-980f-45eb29114cab", + ], + "cta_scdeepsort": [ + "84230ea4-998d-4aa8-8456-81dd54ce23af", "d3566d6a-a455-4a15-980f-45eb29114cab", "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", @@ -27,14 +26,11 @@ "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" ], "cta_singlecellnet": [ - "4c4cd77c-8fee-4836-9145-16562a8782fe", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "d3566d6a-a455-4a15-980f-45eb29114cab", ] } -file_root = "/home/zyxing/dance/examples/tuning" +file_root = "/egr/research-dselab/dingjia5/zhongyu/dance/examples/tuning" def check_identical_strings(string_list): @@ -58,7 +54,8 @@ def check_identical_strings(string_list): def get_sweep_url(step_csv: pd.DataFrame): ids = step_csv["id"] sweep_urls = [] - for run_id in tqdm(ids, leave=False): + for run_id in tqdm(reversed(ids), + leave=False): #The reversal of order is related to additional_sweep_ids.append(sweep_id) api = wandb.Api() run = api.run(f"/{entity}/{project}/runs/{run_id}") sweep_urls.append(run.sweep.url) From 436c7e51f8565a4246d0c9d4b065336d521d5c47 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 7 Oct 2024 21:05:48 +0800 Subject: [PATCH 086/203] update get_result_web --- get_result_web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/get_result_web.py b/get_result_web.py index cb5018ef..4232fdd9 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -78,7 +78,7 @@ def write_ans(): step3_urls = [] for i in range(3): file_csv = f"{file_path}/params/{i}_best_test_acc.csv" - if not os.path.exists(file_csv): + if not os.path.exists(file_csv): #no parameter print(f"文件 {file_csv} 不存在,跳过。") continue step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) From b1e205d898564cca733379d77255524d864771d5 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 23 Oct 2024 15:49:27 +0800 Subject: [PATCH 087/203] sc_sim --- sc_similarity/anndata_similarity.py | 180 ++++++++++++++++++++++++++++ sc_similarity/example_usage.py | 89 ++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 sc_similarity/anndata_similarity.py create mode 100644 sc_similarity/example_usage.py diff --git 
a/sc_similarity/anndata_similarity.py b/sc_similarity/anndata_similarity.py new file mode 100644 index 00000000..eb8d355c --- /dev/null +++ b/sc_similarity/anndata_similarity.py @@ -0,0 +1,180 @@ +# anndata_similarity.py +# TODO translate notes +import warnings +from typing import Callable, Dict, List + +import anndata +import numpy as np +import pandas as pd +from scipy.spatial.distance import jaccard +from scipy.stats import pearsonr, wasserstein_distance +from sklearn.metrics.pairwise import cosine_similarity + +# Suppress scipy warnings for constant input in Pearson correlation +warnings.filterwarnings("ignore", message="An input array is constant") + + +class AnnDataSimilarity: + + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData): + """Initialize the AnnDataSimilarity object and preprocess the data.""" + self.adata1 = adata1.copy() + self.adata2 = adata2.copy() + self.preprocess() + self.results = {} + self.results_score = {} + + def preprocess(self): + """Preprocess the data: compute the per-cell-type average expression and normalize it to probability distributions.""" + # Ensure cell type labels are strings + self.adata1.obs['celltype'] = self.adata1.obs['celltype'].astype(str) + self.adata2.obs['celltype'] = self.adata2.obs['celltype'].astype(str) + + # Average expression per cell type + self.avg_expr1 = self._compute_average_expression(self.adata1) + self.avg_expr2 = self._compute_average_expression(self.adata2) + + # Normalize to probability distributions for JS divergence etc. + self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) + self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) + + def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: + """Compute the average gene expression of each cell type.""" + return adata.to_df().groupby(adata.obs['celltype']).mean() + + def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: + """Normalize the expression matrix to a probability distribution (each cell type sums to 1).""" + return df.div(df.sum(axis=1), axis=0).fillna(0) + + def cosine_sim(self) -> pd.DataFrame: + """Cosine similarity between the two datasets. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) + return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) + + def pearson_corr(self) -> pd.DataFrame: + """Pearson correlation between the two datasets. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + corr_matrix.at[ct1, ct2] = corr + + return corr_matrix.astype(float) + + def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: + """Jaccard similarity between the two datasets, computed on expression binarized at the given threshold. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + # Binarize the expression matrices + binary_expr1 = (self.avg_expr1 > threshold).astype(int) + binary_expr2 = (self.avg_expr2 > threshold).astype(int) + + celltypes1 = binary_expr1.index + celltypes2 = binary_expr2.index + sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) + sim_matrix.at[ct1, ct2] = sim + + return sim_matrix.astype(float) + + def js_distance(self) -> pd.DataFrame: + """Jensen-Shannon similarity (1 - JS distance) between the two datasets, computed on the probability-normalized expression. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + # def jsd(p, q): + # """ + # Jensen-Shannon divergence of the probability distributions p and q. + # """ + # p = p + 1e-12 + # q = q + 1e-12 + # m = 0.5 * (p + q) + # return 0.5 * (entropy(p, m) + entropy(q, m)) + + # from scipy.stats import entropy + + celltypes1 = self.prob_expr1.index + celltypes2 = self.prob_expr2.index + js_matrix = 
pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, + self.prob_expr2.loc[ct2].values) + js_matrix.at[ct1, ct2] = jsd_value + + return js_matrix.astype(float) + + def _jensen_shannon_divergence(self, p, q) -> float: + """Jensen-Shannon distance between the probability distributions p and q (scipy's jensenshannon).""" + from scipy.spatial.distance import jensenshannon + return jensenshannon(p, q) + + def otdd(self): + """Optimal transport dataset distance (OTDD) between the two datasets.""" + raise NotImplementedError("OTDD!") + + def wasserstein_dist(self) -> pd.DataFrame: + """Wasserstein distance between the two datasets. Returns a DataFrame whose rows and columns are the cell types of adata1 and adata2, respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + wasserstein_matrix.at[ct1, ct2] = wd + + return wasserstein_matrix.astype(float) + + def compute_similarity( + self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance', + 'otdd']) -> Dict[str, pd.DataFrame]: + """Compute the requested similarity measures. Args: + + methods: list of similarity measures to compute. Supported: 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein', 'otdd' + Returns: + a dict mapping each method name to its similarity matrix + + """ + results = {} + for method in methods: + if method == 'cosine': + results['cosine'] = self.cosine_sim() + elif method == 'pearson': + results['pearson'] = self.pearson_corr() + elif method == 'jaccard': + results['jaccard'] = self.jaccard_sim() + elif method == 'js_distance': + results['js_distance'] = self.js_distance() + elif method == 'wasserstein': + results['wasserstein'] = self.wasserstein_dist() + elif method == "otdd": + results['otdd'] = self.otdd() + else: + raise ValueError(f"Unsupported similarity method: {method}") + return results + + def get_similarity_matrix( + self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance']) -> Dict[str, pd.DataFrame]: + """Same as compute_similarity; kept for naming consistency.""" + self.results = self.compute_similarity(methods) + return self.results + + def get_max_similarity_A_to_B(self): + if not self.results: + raise ValueError("No similarity results yet; call get_similarity_matrix first.") + else: + self.results_score = {} + for key in self.results: + self.results_score[key] = self._get_max_similarity(self.results[key]) + return self.results_score + + def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + """Best-match average similarity: the mean of the row-wise maximum similarities.""" + max_similarity = similarity_matrix.max(axis=1) + overall_similarity = max_similarity.mean() + return overall_similarity diff --git a/sc_similarity/example_usage.py b/sc_similarity/example_usage.py new file mode 100644 index 00000000..3fedb874 --- /dev/null +++ b/sc_similarity/example_usage.py @@ -0,0 +1,89 @@ +# test_anndata_similarity.py + +import anndata +import numpy as np +import pandas as pd +from anndata_similarity import AnnDataSimilarity + + +def create_test_ann_data(): + # Define genes and cell types + genes = ['gene1', 'gene2'] + celltypes1 = ['A', 'B'] + celltypes2 = ['A', 'B'] + + # Create dataset 1 + data1 = np.array([ + [10, 0], # cell type A + [0, 10] # cell type B + ]) + obs1 = pd.DataFrame({'celltype': celltypes1}, index=['cell1', 'cell2']) + adata1 = anndata.AnnData(X=data1, obs=obs1, var=pd.DataFrame(index=genes)) + + # Create dataset 2 + data2 = np.array([ + [10, 0], # cell type A + [10, 0] # cell type B + ]) + obs2 = pd.DataFrame({'celltype': celltypes2}, index=['cell3', 'cell4']) + adata2 = anndata.AnnData(X=data2, obs=obs2, var=pd.DataFrame(index=genes)) + + return adata1, adata2 + + +def 
run_test_case(): + # Create the test data + adata1, adata2 = create_test_ann_data() + + # Initialize the similarity calculator + similarity_calculator = AnnDataSimilarity(adata1, adata2) + + # Compute the similarities + similarity_matrices = similarity_calculator.compute_similarity( + methods=['cosine', 'pearson', 'jaccard', 'js_distance']) + + # Expected results + expected_cosine = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) + + expected_pearson = pd.DataFrame([[1.0, 1.0], [-1.0, -1.0]], index=['A', 'B'], columns=['A', 'B']) + + expected_jaccard = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) + + expected_js = pd.DataFrame([[1.0, 1.0], [0.167445, 0.167445]], index=['A', 'B'], columns=['A', 'B']) + + # Print the results + print("Computed Cosine Similarity:") + print(similarity_matrices['cosine']) + print("\nExpected Cosine Similarity:") + print(expected_cosine) + + print("\nComputed Pearson Correlation:") + print(similarity_matrices['pearson']) + print("\nExpected Pearson Correlation:") + print(expected_pearson) + + print("\nComputed Jaccard Similarity:") + print(similarity_matrices['jaccard']) + print("\nExpected Jaccard Similarity:") + print(expected_jaccard) + + print("\nComputed Jensen-Shannon distance:") + print(similarity_matrices['js_distance']) + print("\nExpected Jensen-Shannon distance:") + print(expected_js) + + # Verify that the results match the expected values + assert similarity_matrices['cosine'].equals(expected_cosine), "Cosine similarity does not match expected values." + assert similarity_matrices['pearson'].equals( + expected_pearson), "Pearson correlation does not match expected values." + assert similarity_matrices['jaccard'].equals(expected_jaccard), "Jaccard similarity does not match expected values." + + # Use an approximate comparison because of floating point precision + assert np.allclose(similarity_matrices['js_distance'], expected_js, + atol=1e-4), "JS distance does not match expected values." 
+ + print("\nAll tests passed successfully!") + + +if __name__ == "__main__": + run_test_case() From ea5bb9b096ab222f47b4533cd746f7aaa724c7c3 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 20:18:04 +0800 Subject: [PATCH 088/203] update get_result_web --- get_result_web.py | 133 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 104 insertions(+), 29 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 37e694ec..e9fce0fc 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -3,38 +3,20 @@ import numpy as np import pandas as pd +from omegaconf import OmegaConf +from sympy import im from tqdm import tqdm from dance.utils import try_import +# get yaml of best method os.environ["http_proxy"] = "http://121.250.209.147:7890" os.environ["https_proxy"] = "http://121.250.209.147:7890" wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" -collect_datasets = { - "cta_actinn": [ - "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" - ], - "cta_celltypist": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_scdeepsort": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_singlecellnet": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ] -} +with open("dataset_server.json") as f: + collect_datasets = json.load(f) file_root = "/home/zyxing/dance/examples/tuning" @@ -70,24 +52,117 @@ def get_sweep_url(step_csv: pd.DataFrame, single=True): return sweep_url +import re + + +def spilt_web(url: str): + pattern = r"https://wandb\.ai/([^/]+)/([^/]+)/sweeps/([^/]+)" + + match = re.search(pattern, url) + + if match: + entity = match.group(1) + project = match.group(2) + sweep_id = match.group(3) + + return entity, project, sweep_id + else: + print(url) + print("No match found") + + +def get_best_method(urls, metric_col="test_acc"): + all_best_run = None + all_best_step_name = None + step_names = ["step2", "step3_0", "step3_1", "step3_2"] + + def get_metric(run): + if metric_col not in run.summary: + return float('-inf') + else: + return run.summary[metric_col] + + for step_name, url in zip(step_names, urls): + _, _, sweep_id = spilt_web(url) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + goal = sweep.config["metric"]["goal"] + if goal == "maximize": + best_run = max(sweep.runs, key=get_metric) + elif goal == "minimize": + best_run = min(sweep.runs, key=get_metric) + else: + raise RuntimeError("choose goal in ['minimize','maximize']") + if metric_col not in best_run.summary: + continue + if all_best_run is None: + all_best_run = best_run + all_best_step_name = step_name + elif all_best_run.summary[metric_col] < best_run.summary[metric_col] and goal == "maximize": + all_best_run = best_run + all_best_step_name = step_name + elif all_best_run.summary[metric_col] > best_run.summary[metric_col] and goal == "minimize": + all_best_run = best_run + all_best_step_name = step_name + return all_best_step_name, all_best_run + + +def get_best_yaml(step_name, best_run, file_path): + if step_name == "step2": + conf = 
OmegaConf.load(f"{file_path}/pipeline_params_tuning_config.yaml") + for i, fun in enumerate(conf["pipeline"]): + if "include" not in fun: + continue + type_fun = fun["type"] + prefix = f"pipeline.{i}.{type_fun}" + # filtered_dict = {k: v for k, v in b_run.config.items() if k==prefix}.items()[0] + fun_name = best_run.config[prefix] + fun['target'] = fun_name + if 'params' not in fun: + fun['params'] = {} + if "default_params" in fun and fun_name in fun["default_params"]: + fun['params'].update(fun["default_params"][fun_name]) + del fun["include"] + del fun["default_params"] + else: + step3_number = step_name.split("_")[1] + conf = OmegaConf.load(f"{file_path}/config_yamls/params/{step3_number}_test_acc_params_tuning_config.yaml") + for i, fun in enumerate(conf['pipeline']): + if 'params_to_tune' not in fun: + continue + target = fun["target"] + prefix = f"params.{i}.{target}" + filtered_dict = {k: v for k, v in best_run.config.items() if k.startswith(prefix)} + for k, v in filtered_dict.items(): + param_name = k.split(".")[-1] + fun['params_to_tune'][param_name] = v + if "params" not in fun: + fun["params"] = {} + fun["params"].update(fun['params_to_tune']) + del fun["params_to_tune"] + return OmegaConf.to_yaml(conf["pipeline"]) + + def write_ans(): ans = [] for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: - file_path = f"{file_root}/{method_folder}/{dataset_id}/results" - step2_url = get_sweep_url(pd.read_csv(f"{file_path}/pipeline/best_test_acc.csv")) + file_path = f"{file_root}/{method_folder}/{dataset_id}" + step2_url = get_sweep_url(pd.read_csv(f"{file_path}/results/pipeline/best_test_acc.csv")) step3_urls = [] for i in range(3): - file_csv = f"{file_path}/params/{i}_best_test_acc.csv" + file_csv = f"{file_path}/results/params/{i}_best_test_acc.csv" if not os.path.exists(file_csv): #no parameter print(f"文件 {file_csv} 不存在,跳过。") continue step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) step3_str = ",".join(step3_urls) step_str = f"step2:{step2_url}|step3:{step3_str}" - ans.append({"Dataset_id": dataset_id, method_folder: step_str}) - with open('temp_ans.json', 'w') as f: - json.dump(ans, f) + step_name, best_run = get_best_method([step2_url] + step3_urls) + best_yaml = get_best_yaml(step_name, best_run, file_path) + ans.append({"Dataset_id": dataset_id, method_folder: step_str, "best_yaml": best_yaml}) + # with open('temp_ans.json', 'w') as f: + # json.dump(ans, f,indent=4) + pd.DataFrame(ans).to_csv("temp_ans.csv") write_ans() From ef68af1f8163136f75cc96c38a7bee6e4faacf0a Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 20:52:14 +0800 Subject: [PATCH 089/203] minor change --- get_result_web.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/get_result_web.py b/get_result_web.py index e9fce0fc..97717245 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from natsort import os_sort_key from omegaconf import OmegaConf from sympy import im from tqdm import tqdm @@ -142,11 +143,22 @@ def get_best_yaml(step_name, best_run, file_path): return OmegaConf.to_yaml(conf["pipeline"]) +def check_exist(file_path): + file_path = f"{file_path}/results/params/" + if os.path.exists(file_path) and os.path.isdir(file_path): + file_num = len(os.listdir(file_path)) + return file_num > 1 + else: + return False + + def write_ans(): ans = [] for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: file_path = 
f"{file_root}/{method_folder}/{dataset_id}" + if not check_exist(file_path): + continue step2_url = get_sweep_url(pd.read_csv(f"{file_path}/results/pipeline/best_test_acc.csv")) step3_urls = [] for i in range(3): From 2b918df2513f5cef6303040db2ef1bba4514828d Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 20:52:44 +0800 Subject: [PATCH 090/203] minor change --- dataset_server.json | 92 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 dataset_server.json diff --git a/dataset_server.json b/dataset_server.json new file mode 100644 index 00000000..10a279da --- /dev/null +++ b/dataset_server.json @@ -0,0 +1,92 @@ +{ + "cta_actinn": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ] + , + "cta_celltypist": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ], + "cta_scdeepsort": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ] + , + "cta_singlecellnet": [ + "01209dce-3575-4bed-b1df-129f57fbc031", + "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", + "471647b3-04fe-4c76-8372-3264feb950e8", 
+ "4c4cd77c-8fee-4836-9145-16562a8782fe", + "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", + "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", + "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", + "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" + ] +} From 86d000df330499468a9530aa669fa06aea99cefb Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 21:07:50 +0800 Subject: [PATCH 091/203] minor_change --- get_result_web.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 37e694ec..ca1dc1f2 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -13,26 +13,26 @@ entity = "xzy11632" project = "dance-dev" collect_datasets = { - "cta_actinn": [ - "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" - ], - "cta_celltypist": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_scdeepsort": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], + # "cta_actinn": [ + # "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", + # "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", + # "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", + # "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" + # ], + # "cta_celltypist": [ + # "471647b3-04fe-4c76-8372-3264feb950e8", + # "8a554710-08bc-4005-87cd-da9675bdc2e7", + # "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + # ], + # "cta_scdeepsort": [ + # "471647b3-04fe-4c76-8372-3264feb950e8", + # "8a554710-08bc-4005-87cd-da9675bdc2e7", + # "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + # ], "cta_singlecellnet": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" ] } file_root = "/home/zyxing/dance/examples/tuning" @@ -87,7 +87,7 @@ def write_ans(): step_str = f"step2:{step2_url}|step3:{step3_str}" ans.append({"Dataset_id": dataset_id, method_folder: step_str}) with open('temp_ans.json', 'w') as f: - json.dump(ans, f) + json.dump(ans, f, indent=4) write_ans() From 5d32d2eacca45fc82811e45ad7e1aeebf21728f3 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 08:11:55 -0500 Subject: [PATCH 092/203] minor --- get_result_web.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 7d3ed259..fe3c54ec 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -7,32 +7,14 @@ from dance.utils import try_import -os.environ["http_proxy"] = 
"http://121.250.209.147:7890" -os.environ["https_proxy"] = "http://121.250.209.147:7890" wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" collect_datasets = { - "cta_actinn": [ - "471647b3-04fe-4c76-8372-3264feb950e8", "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", "3faad104-2ab8-4434-816d-474d8d2641db" - ], "cta_celltypist": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_scdeepsort": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - ], - "cta_singlecellnet": [ - "471647b3-04fe-4c76-8372-3264feb950e8", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" ] } file_root = "/egr/research-dselab/dingjia5/zhongyu/dance/examples/tuning" @@ -87,7 +69,7 @@ def write_ans(): step_str = f"step2:{step2_url}|step3:{step3_str}" ans.append({"Dataset_id": dataset_id, method_folder: step_str}) with open('temp_ans.json', 'w') as f: - json.dump(ans, f) + json.dump(ans, f, indent=4) write_ans() From 2408e5428d563214528a7fd37dbf415cd2c4de5d Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 21:15:14 +0800 Subject: [PATCH 093/203] minor --- get_result_web.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 97717245..bed6753b 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -11,8 +11,7 @@ from dance.utils import try_import # get yaml of best method -os.environ["http_proxy"] = "http://121.250.209.147:7890" -os.environ["https_proxy"] = "http://121.250.209.147:7890" + wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" From 7d30ad76bcf8a2410ff17655f9b807af5cbe6124 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 8 Nov 2024 21:21:42 +0800 Subject: [PATCH 094/203] minor change --- get_result_web.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/get_result_web.py b/get_result_web.py index 97717245..18fc3708 100644 --- a/get_result_web.py +++ b/get_result_web.py @@ -104,7 +104,7 @@ def get_metric(run): elif all_best_run.summary[metric_col] > best_run.summary[metric_col] and goal == "minimize": all_best_run = best_run all_best_step_name = step_name - return all_best_step_name, all_best_run + return all_best_step_name, all_best_run, all_best_run.summary[metric_col] def get_best_yaml(step_name, best_run, file_path): @@ -169,9 +169,14 @@ def write_ans(): step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) step3_str = ",".join(step3_urls) step_str = f"step2:{step2_url}|step3:{step3_str}" - step_name, best_run = get_best_method([step2_url] + step3_urls) + step_name, best_run, best_res = get_best_method([step2_url] + step3_urls) best_yaml = get_best_yaml(step_name, best_run, file_path) - ans.append({"Dataset_id": dataset_id, method_folder: step_str, "best_yaml": best_yaml}) + ans.append({ + "Dataset_id": dataset_id, + method_folder: step_str, + f"{method_folder}_best_yaml": best_yaml, + f"{method_folder}_best_res": best_res + }) # with open('temp_ans.json', 'w') as f: # 
json.dump(ans, f,indent=4) pd.DataFrame(ans).to_csv("temp_ans.csv") From 05d995dc809849f5b99e6075db910116277c1c9b Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 9 Nov 2024 10:58:21 +0800 Subject: [PATCH 095/203] minor --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index fc732dad..d8fa18ce 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -86,5 +86,5 @@ human,Blood,10000,train,,,train_human_Bloodd9b4bc69-ed90-4f5f-99b2-61b0681ba436_ human,Blood,549,train,,,train_human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=af7wxs06&dl=1 human,Blood,1324,train,,,train_human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad,https://www.dropbox.com/scl/fi/kbuvlttd8dfmvx1v94fr4/human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_data.h5ad?rlkey=v1ne1dg2gl8b4j6qj3j3ry6fy&st=gy9vb5q6&dl=1 human,Blood,10000,train,,,train_human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad,https://www.dropbox.com/scl/fi/8wq8eaod0xuvgwhsjoapa/human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad?rlkey=b6u3b7335l7baricjlgbwthb3&st=cw7mjmx5&dl=1 -human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/26c6t2yk44kxqmc54djfz/human_Blood71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=kfv9p7kvx5vgdiav9ew9nj2me&st=cfgc3m7s&dl=1 +human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/7gszhapz281uah6ytc615/human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad?rlkey=28ywz595f00ppjqwg6054tdqj&st=7bxft78n&dl=1 human,Blood,10000,train,,,train_human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad,https://www.dropbox.com/scl/fi/kgay0bhk4er6qjx96okrz/human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad?rlkey=m5ax0vhx3vh7ylo4pc74tx9ky&st=sbhonz18&dl=1 From fb148f3599dcd6bebe77bdb081223d3ac55cb066 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 13 Nov 2024 15:42:38 +0800 Subject: [PATCH 096/203] minor change --- .../dataset_server.json | 0 .../get_result_web.py | 9 +- .../sc_similarity_examples/sim_query_atlas.py | 92 +++++++++ sc_similarity/anndata_similarity.py | 180 ------------------ sc_similarity/example_usage.py | 89 --------- 5 files changed, 98 insertions(+), 272 deletions(-) rename dataset_server.json => examples/dataset_server.json (100%) rename get_result_web.py => examples/get_result_web.py (97%) create mode 100644 examples/sc_similarity_examples/sim_query_atlas.py delete mode 100644 sc_similarity/anndata_similarity.py delete mode 100644 sc_similarity/example_usage.py diff --git a/dataset_server.json b/examples/dataset_server.json similarity index 100% rename from dataset_server.json rename to examples/dataset_server.json diff --git a/get_result_web.py b/examples/get_result_web.py similarity index 97% rename from get_result_web.py rename to examples/get_result_web.py index 64c214c8..ee5d4158 100644 --- a/get_result_web.py +++ b/examples/get_result_web.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path import numpy as np import pandas as pd @@ -15,9 +16,10 @@ wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" -with open("dataset_server.json") as f: +file_root = str(Path(__file__).resolve().parent) +with open(f"{file_root}/dataset_server.json") as f: 
collect_datasets = json.load(f) -file_root = "/home/zyxing/dance/examples/tuning" +file_root = "./tuning" def check_identical_strings(string_list): @@ -181,4 +183,5 @@ def write_ans(): pd.DataFrame(ans).to_csv("temp_ans.csv") -write_ans() +if __name__ == "__main__": + write_ans() diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/sc_similarity_examples/sim_query_atlas.py new file mode 100644 index 00000000..0b6dc8d7 --- /dev/null +++ b/examples/sc_similarity_examples/sim_query_atlas.py @@ -0,0 +1,92 @@ +import pandas as pd + +atlas_datasets = [ + "01209dce-3575-4bed-b1df-129f57fbc031", "055ca631-6ffb-40de-815e-b931e10718c0", + "2a498ace-872a-4935-984b-1afa70fd9886", "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", + "3faad104-2ab8-4434-816d-474d8d2641db", "471647b3-04fe-4c76-8372-3264feb950e8", + "4c4cd77c-8fee-4836-9145-16562a8782fe", "84230ea4-998d-4aa8-8456-81dd54ce23af", + "8a554710-08bc-4005-87cd-da9675bdc2e7", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", + "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", + "d3566d6a-a455-4a15-980f-45eb29114cab", "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", + "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569" +] +import sys + +sys.path.append("..") +import ast + +from get_result_web import get_sweep_url, spilt_web + +from dance.utils import try_import + + +def find_unique_matching_row(df, config_col, input_dict_list): + """Find the single row whose config column matches the given list of dicts. + + :param df: pandas.DataFrame containing the data to search. + :param config_col: str, name of the DataFrame column holding the stringified list of dicts. + :param input_dict_list: list of dicts to match against. + :return: pandas.Series, the matching row. + :raises ValueError: if the number of matching rows is not exactly 1. + + """ + + # Helper that parses the stored string and compares it with the input + def is_match(config_str): + try: + # Safely parse the string into a Python object with ast.literal_eval + config = ast.literal_eval(config_str) + return config == input_dict_list + except (ValueError, SyntaxError): + # If parsing fails, treat it as a non-match + return False + + # Apply the comparison to obtain a boolean Series + matches = df[config_col].apply(is_match) + + # Collect all matching rows + matching_rows = df[matches] + + # Check the number of matches + num_matches = len(matching_rows) + if num_matches == 1: + return matching_rows.iloc[0] + elif num_matches == 0: + raise ValueError("No matching row found.") + else: + raise ValueError(f"Found {num_matches} matching rows; expected exactly one.") + + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +query_datasets = [ + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" +] + + +def get_ans(query_dataset, method): + data = pd.read_csv(f"/home/zyxing/dance/examples/tuning/{method}/{query_dataset}/results/atlas/best_test_acc.csv") + sweep_url = get_sweep_url(data) + _, _, sweep_id = spilt_web(sweep_url) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + ans = pd.DataFrame(index=[method], columns=atlas_datasets) + for i, run_kwarg in enumerate(sweep.config["parameters"]["run_kwargs"]["values"]): + ans.loc[method, atlas_datasets[i]] = find_unique_matching_row(data, "run_kwargs", run_kwarg)["test_acc"] + # ans.append({atlas_datasets[i]:find_unique_matching_row(data,"run_kwargs",run_kwarg)["test_acc"]}) + return ans + + +ans_all = {} +methods = ["cta_actinn", "cta_scdeepsort"] +if __name__ == "__main__": + for query_dataset in query_datasets: + ans = [] + for method in methods: + ans.append(get_ans(query_dataset, method)) + ans = pd.concat(ans) + ans_all[query_dataset] = ans + for k, v in ans_all.items(): + v.to_csv(f"{str(methods)}_{k}_in_atlas.csv") diff 
--git a/sc_similarity/anndata_similarity.py b/sc_similarity/anndata_similarity.py deleted file mode 100644 index eb8d355c..00000000 --- a/sc_similarity/anndata_similarity.py +++ /dev/null @@ -1,180 +0,0 @@ -# anndata_similarity.py -# TODO translate notes -import warnings -from typing import Callable, Dict, List - -import anndata -import numpy as np -import pandas as pd -from scipy.spatial.distance import jaccard -from scipy.stats import pearsonr, wasserstein_distance -from sklearn.metrics.pairwise import cosine_similarity - -# Suppress scipy warnings for constant input in Pearson correlation -warnings.filterwarnings("ignore", message="An input array is constant") - - -class AnnDataSimilarity: - - def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData): - """初始化 AnnDataSimilarity 对象,进行数据预处理。""" - self.adata1 = adata1.copy() - self.adata2 = adata2.copy() - self.preprocess() - self.results = {} - self.results_score = {} - - def preprocess(self): - """预处理数据,包括对数归一化和归一化为概率分布。""" - # 对原始数据进行对数归一化 - self.adata1.obs['celltype'] = self.adata1.obs['celltype'].astype(str) - self.adata2.obs['celltype'] = self.adata2.obs['celltype'].astype(str) - - # 计算每个细胞类型的平均表达 - self.avg_expr1 = self._compute_average_expression(self.adata1) - self.avg_expr2 = self._compute_average_expression(self.adata2) - - # 归一化为概率分布以计算 JS 散度等 - self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) - self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) - - def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: - """计算每种细胞类型的平均基因表达。""" - return adata.to_df().groupby(adata.obs['celltype']).mean() - - def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: - """将基因表达矩阵归一化为概率分布(每个细胞类型的表达总和为1)。""" - return df.div(df.sum(axis=1), axis=0).fillna(0) - - def cosine_sim(self) -> pd.DataFrame: - """计算两个数据集间的余弦相似度。 返回数据框,行和列分别为 adata1 和 adata2 的细胞类型。""" - sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) - return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) - - def pearson_corr(self) -> pd.DataFrame: - """计算两个数据集间的皮尔逊相关系数。 返回数据框,行和列分别为 adata1 和 adata2 的细胞类型。""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - corr_matrix.at[ct1, ct2] = corr - - return corr_matrix.astype(float) - - def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: - """计算两个数据集间的 Jaccard 相似度。 使用基因表达的二值化表示,基于指定阈值。 返回数据框,行和列分别为 adata1 和 adata2 - 的细胞类型。""" - # 二值化表达矩阵 - binary_expr1 = (self.avg_expr1 > threshold).astype(int) - binary_expr2 = (self.avg_expr2 > threshold).astype(int) - - celltypes1 = binary_expr1.index - celltypes2 = binary_expr2.index - sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) - sim_matrix.at[ct1, ct2] = sim - - return sim_matrix.astype(float) - - def js_distance(self) -> pd.DataFrame: - """计算两个数据集间的 Jensen-Shannon 散度。 需要先将表达数据归一化为概率分布。 返回数据框,行和列分别为 adata1 和 adata2 - 的细胞类型。""" - # def jsd(p, q): - # """ - # 计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。 - # """ - # p = p + 1e-12 - # q = q + 1e-12 - # m = 0.5 * (p + q) - # return 0.5 * (entropy(p, m) + entropy(q, m)) - - # from scipy.stats import entropy - - celltypes1 = self.prob_expr1.index - celltypes2 = self.prob_expr2.index - js_matrix 
= pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, - self.prob_expr2.loc[ct2].values) - js_matrix.at[ct1, ct2] = jsd_value - - return js_matrix.astype(float) - - def _jensen_shannon_divergence(self, p, q) -> float: - """计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。""" - from scipy.spatial.distance import jensenshannon - return jensenshannon(p, q) - - def otdd(): - """计算两个数据集间的 OTDD。""" - raise NotImplementedError("OTDD!") - - def wasserstein_dist(self) -> pd.DataFrame: - """计算两个数据集间的 Wasserstein 距离。 返回数据框,行和列分别为 adata1 和 adata2 的细胞类型。""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - wasserstein_matrix.at[ct1, ct2] = wd - - return wasserstein_matrix.astype(float) - - def compute_similarity( - self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance', - 'otdd']) -> Dict[str, pd.DataFrame]: - """计算指定的相似性度量。 参数: - - methods: 要计算的相似性度量方法列表。支持 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' - 返回: - 包含各个相似性矩阵的字典 - - """ - results = {} - for method in methods: - if method == 'cosine': - results['cosine'] = self.cosine_sim() - elif method == 'pearson': - results['pearson'] = self.pearson_corr() - elif method == 'jaccard': - results['jaccard'] = self.jaccard_sim() - elif method == 'js_distance': - results['js_distance'] = self.js_distance() - elif method == 'wasserstein': - results['wasserstein'] = self.wasserstein_dist() - elif method == "otdd": - results['otdd'] = self.otdd() - else: - raise ValueError(f"Unsupported similarity method: {method}") - return results - - def get_similarity_matrix( - self, methods: List[str] = ['cosine', 'pearson', 'jaccard', 'js_distance']) -> Dict[str, pd.DataFrame]: - """同 compute_similarity,保留方法名一致性。""" - self.results = self.compute_similarity(methods) - return self.results - - def get_max_similarity_A_to_B(self): - if self.results is None: - raise ValueError(f"need results!") - else: - self.results_score = {} - for key in self.results: - self.results_score[key] = self._get_max_similarity(self.results[key]) - return self.results_score - - def _get_max_similarity(self, similarity_matrix: pd.DataFrame): - """最大匹配平均相似性分数.""" - max_similarity = similarity_matrix.max(axis=1) - overall_similarity = max_similarity.mean() - return overall_similarity diff --git a/sc_similarity/example_usage.py b/sc_similarity/example_usage.py deleted file mode 100644 index 3fedb874..00000000 --- a/sc_similarity/example_usage.py +++ /dev/null @@ -1,89 +0,0 @@ -# test_anndata_similarity.py - -import anndata -import numpy as np -import pandas as pd -from anndata_similarity import AnnDataSimilarity - - -def create_test_ann_data(): - # 定义基因和细胞类型 - genes = ['gene1', 'gene2'] - celltypes1 = ['A', 'B'] - celltypes2 = ['A', 'B'] - - # 创建数据集1 - data1 = np.array([ - [10, 0], # 细胞类型 A - [0, 10] # 细胞类型 B - ]) - obs1 = pd.DataFrame({'celltype': celltypes1}, index=['cell1', 'cell2']) - adata1 = anndata.AnnData(X=data1, obs=obs1, var=pd.DataFrame(index=genes)) - - # 创建数据集2 - data2 = np.array([ - [10, 0], # 细胞类型 A - [10, 0] # 细胞类型 B - ]) - obs2 = pd.DataFrame({'celltype': celltypes2}, index=['cell3', 'cell4']) - adata2 = anndata.AnnData(X=data2, obs=obs2, var=pd.DataFrame(index=genes)) - - return adata1, adata2 - - -def 
run_test_case(): - # 创建测试数据 - adata1, adata2 = create_test_ann_data() - - # 初始化相似性计算器 - similarity_calculator = AnnDataSimilarity(adata1, adata2) - - # 计算相似性 - similarity_matrices = similarity_calculator.compute_similarity( - methods=['cosine', 'pearson', 'jaccard', 'js_distance']) - - # 预期结果 - expected_cosine = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) - - expected_pearson = pd.DataFrame([[1.0, 1.0], [-1.0, -1.0]], index=['A', 'B'], columns=['A', 'B']) - - expected_jaccard = pd.DataFrame([[1.0, 1.0], [0.0, 0.0]], index=['A', 'B'], columns=['A', 'B']) - - expected_js = pd.DataFrame([[1.0, 1.0], [0.167445, 0.167445]], index=['A', 'B'], columns=['A', 'B']) - - # 打印结果 - print("Computed Cosine Similarity:") - print(similarity_matrices['cosine']) - print("\nExpected Cosine Similarity:") - print(expected_cosine) - - print("\nComputed Pearson Correlation:") - print(similarity_matrices['pearson']) - print("\nExpected Pearson Correlation:") - print(expected_pearson) - - print("\nComputed Jaccard Similarity:") - print(similarity_matrices['jaccard']) - print("\nExpected Jaccard Similarity:") - print(expected_jaccard) - - print("\nComputed Jensen-Shannon distance:") - print(similarity_matrices['js_distance']) - print("\nExpected Jensen-Shannon distance:") - print(expected_js) - - # 验证结果是否与预期一致 - assert similarity_matrices['cosine'].equals(expected_cosine), "Cosine similarity does not match expected values." - assert similarity_matrices['pearson'].equals( - expected_pearson), "Pearson correlation does not match expected values." - assert similarity_matrices['jaccard'].equals(expected_jaccard), "Jaccard similarity does not match expected values." - - # 由于浮点数计算的精度问题,使用近似比较 - assert np.allclose(similarity_matrices['js_distance'], expected_js, - atol=1e-4), "JS distance does not match expected values." 
- - print("\nAll tests passed successfully!") - - -if __name__ == "__main__": - run_test_case() From 18b197f98bf5184f1d46084bb4dce22c56377522 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 13 Nov 2024 15:46:56 +0800 Subject: [PATCH 097/203] minor change --- examples/sc_similarity_examples/sim_query_atlas.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/sc_similarity_examples/sim_query_atlas.py index 0b6dc8d7..de0c2b5b 100644 --- a/examples/sc_similarity_examples/sim_query_atlas.py +++ b/examples/sc_similarity_examples/sim_query_atlas.py @@ -1,3 +1,5 @@ +import argparse + import pandas as pd atlas_datasets = [ @@ -80,7 +82,10 @@ def get_ans(query_dataset, method): ans_all = {} -methods = ["cta_actinn", "cta_scdeepsort"] +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("--methods", default=["cta_actinn", "cta_scdeepsort"], nargs="+") +args = parser.parse_args() +methods = args.methods if __name__ == "__main__": for query_dataset in query_datasets: ans = [] From 864747e81747814c66772e47d21401b24852231d Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 13 Nov 2024 02:58:42 -0500 Subject: [PATCH 098/203] minor change --- examples/sc_similarity_examples/sim_query_atlas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/sc_similarity_examples/sim_query_atlas.py index de0c2b5b..de783f20 100644 --- a/examples/sc_similarity_examples/sim_query_atlas.py +++ b/examples/sc_similarity_examples/sim_query_atlas.py @@ -1,4 +1,5 @@ import argparse +from pathlib import Path import pandas as pd @@ -21,6 +22,8 @@ from dance.utils import try_import +file_root = str(Path(__file__).resolve().parent.parent) + def find_unique_matching_row(df, config_col, input_dict_list): """在 DataFrame 中查找指定列中与输入字典列表匹配的唯一一行。 @@ -70,7 +73,7 @@ def is_match(config_str): def get_ans(query_dataset, method): - data = pd.read_csv(f"/home/zyxing/dance/examples/tuning/{method}/{query_dataset}/results/atlas/best_test_acc.csv") + data = pd.read_csv(f"{file_root}/tuning/{method}/{query_dataset}/results/atlas/best_test_acc.csv") sweep_url = get_sweep_url(data) _, _, sweep_id = spilt_web(sweep_url) sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") From b2db5fa5076d01e75db826b24350e96804945129 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 14 Nov 2024 16:00:23 +0800 Subject: [PATCH 099/203] minor --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c58e5232..11365116 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ h5py==3.11.0 igraph==0.10.8 leidenalg==0.10.1 louvain==0.8.1 -mudata==0.2.3 +mudata==0.2.4 networkx==3.3; python_version >= "3.10" networkx==3.2.1; python_version < "3.10" numba==0.59.0 From 4faaa46c1e1bd4e0f10ce90c349511b76e7b3540 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 15 Nov 2024 16:48:21 +0800 Subject: [PATCH 100/203] minor --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 11365116..1d28d37e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ h5py==3.11.0 igraph==0.10.8 leidenalg==0.10.1 louvain==0.8.1 -mudata==0.2.4 +mudata==0.3.1 networkx==3.3; python_version >= "3.10" networkx==3.2.1; python_version < "3.10" numba==0.59.0 From 53d9ca9488bb64adaf613a4bc3128b871f44a028 Mon Sep 17 00:00:00 2001 From: 
xzy Date: Fri, 15 Nov 2024 17:18:46 +0800 Subject: [PATCH 101/203] minor --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 1d28d37e..a627a45d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ threadpoolctl==3.5.0 tifffile==2024.2.12 torchnmf==0.3.5 tqdm==4.66.2 +anndata==0.10.8 From 1ff7f672196fd33b6f691b3211132ae22a6a7491 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 15 Nov 2024 17:29:42 +0800 Subject: [PATCH 102/203] minor --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a627a45d..b22a69a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ h5py==3.11.0 igraph==0.10.8 leidenalg==0.10.1 louvain==0.8.1 -mudata==0.3.1 +mudata==0.2.3 networkx==3.3; python_version >= "3.10" networkx==3.2.1; python_version < "3.10" numba==0.59.0 From 7f204df1af556b84c596a07b3581318790f09b2e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 15 Nov 2024 20:38:23 +0800 Subject: [PATCH 103/203] see https://github.com/PyCQA/docformatter/pull/287 --- .pre-commit-config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c580386f..194f88fa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,13 +32,13 @@ repos: name: Sort imports args: [--line-width, "120", --profile, black] - - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 - hooks: - - id: docformatter - name: Format docstring - additional_dependencies: [tomli] - args: [--config, pyproject.toml] + # - repo: https://github.com/PyCQA/docformatter + # rev: v1.7.5 + # hooks: + # - id: docformatter + # name: Format docstring + # additional_dependencies: [tomli] + # args: [--config, pyproject.toml] - repo: https://github.com/executablebooks/mdformat rev: 0.7.17 From 5c1d43a6d81ec5884b039286cdc97cc88cddb40c Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 20 Nov 2024 16:54:40 +0800 Subject: [PATCH 104/203] sc_similarity --- dance/sc_similarity/anndata_similarity.py | 357 ++++++++++++++++++++++ dance/sc_similarity/download_data.py | 9 + 2 files changed, 366 insertions(+) create mode 100644 dance/sc_similarity/anndata_similarity.py create mode 100644 dance/sc_similarity/download_data.py diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py new file mode 100644 index 00000000..0287dee8 --- /dev/null +++ b/dance/sc_similarity/anndata_similarity.py @@ -0,0 +1,357 @@ +# anndata_similarity.py +# TODO translate notes +import re +import warnings +from typing import Callable, Dict, List, Optional + +import anndata +import anndata as ad +import numpy as np +import pandas as pd +import scanpy as sc +import yaml +from omegaconf import OmegaConf +from scipy.spatial.distance import jaccard +from scipy.stats import pearsonr, wasserstein_distance +from sklearn.metrics.pairwise import cosine_similarity + +# Suppress scipy warnings for constant input in Pearson correlation +warnings.filterwarnings("ignore", message="An input array is constant") + + +class AnnDataSimilarity: + + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, cell_col: str, + ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, + adata2_name: Optional[str] = None, + methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): + """Initialize the AnnDataSimilarity object and perform data preprocessing.""" + 
self.adata1 = adata1.copy() + self.adata2 = adata2.copy() + self.origin_adata1 = adata1.copy() + self.origin_adata2 = adata2.copy() + self.cell_col = cell_col + self.preprocess() + self.results = {} + self.results_score = {} + self.ground_truth_conf_path = ground_truth_conf_path + self.adata1_name = adata1_name + self.adata2_name = adata2_name + self.methods = methods + self.tissue = tissue + + def filter_gene(self): + sc.pp.highly_variable_genes(self.adata1, n_top_genes=2000, flavor='seurat_v3') + sc.pp.highly_variable_genes(self.adata2, n_top_genes=2000, flavor='seurat_v3') + + common_hvg = self.adata1.var_names[self.adata1.var['highly_variable']].intersection( + self.adata2.var_names[self.adata2.var['highly_variable']]) + + self.adata1 = self.adata1[:, common_hvg].copy() + self.adata2 = self.adata2[:, common_hvg].copy() + self.common_genes = common_hvg + + def preprocess(self): + self.filter_gene() + """Preprocess the data, including log normalization and normalization to probability distribution.""" + self.adata1.obs[self.cell_col] = self.adata1.obs[self.cell_col].astype(str) + self.adata2.obs[self.cell_col] = self.adata2.obs[self.cell_col].astype(str) + self.avg_expr1 = self._compute_average_expression(self.adata1) + self.avg_expr2 = self._compute_average_expression(self.adata2) + self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) + self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) + + def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: + """Calculate the average gene expression for each cell type""" + return adata.to_df().groupby(adata.obs[self.cell_col]).mean() + + def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: + """Normalize the gene expression matrix to a probability distribution (expression sums to 1 for each cell type)""" + return df.div(df.sum(axis=1), axis=0).fillna(0) + + def cosine_sim(self) -> pd.DataFrame: + """Computes the cosine similarity between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" + sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) + return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) + + def pearson_corr(self) -> pd.DataFrame: + """Computes the Pearson correlation coefficient between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + corr_matrix.at[ct1, ct2] = corr + + return corr_matrix.astype(float) + + def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: + """Computes the Jaccard similarity between two datasets. Uses a binary representation of gene expression based on a specified threshold. 
Returns a data frame with rows and columns of cell types in adata1 and adata2 respectively.""" + # Binarized expression matrix + binary_expr1 = (self.avg_expr1 > threshold).astype(int) + binary_expr2 = (self.avg_expr2 > threshold).astype(int) + + celltypes1 = binary_expr1.index + celltypes2 = binary_expr2.index + sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) + sim_matrix.at[ct1, ct2] = sim + + return sim_matrix.astype(float) + + def js_distance(self) -> pd.DataFrame: + """Computes the Jensen-Shannon divergence between two datasets. The expression data must first be normalized to a probability distribution. Returns a data frame with rows and columns containing the cell types of adata1 and adata2, respectively.""" + # def jsd(p, q): + # """ + # 计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。 + # """ + # p = p + 1e-12 + # q = q + 1e-12 + # m = 0.5 * (p + q) + # return 0.5 * (entropy(p, m) + entropy(q, m)) + + # from scipy.stats import entropy + + celltypes1 = self.prob_expr1.index + celltypes2 = self.prob_expr2.index + js_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, + self.prob_expr2.loc[ct2].values) + js_matrix.at[ct1, ct2] = jsd_value + + return js_matrix.astype(float) + + def _jensen_shannon_divergence(self, p, q) -> float: + """Compute the Jensen-Shannon divergence of two probability distributions p and q.""" + from scipy.spatial.distance import jensenshannon + return jensenshannon(p, q) + + def common_genes_num(self): + return len(self.common_genes) + + def otdd(): + """Compute the OTDD between two data sets.""" + raise NotImplementedError("OTDD!") + + def data_company(): + raise NotImplementedError("data company") + + def wasserstein_dist(self) -> pd.DataFrame: + """Compute the Wasserstein distance between two datasets. 
Return a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" + celltypes1 = self.avg_expr1.index + celltypes2 = self.avg_expr2.index + wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) + + for ct1 in celltypes1: + for ct2 in celltypes2: + wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) + wasserstein_matrix.at[ct1, ct2] = wd + + return wasserstein_matrix.astype(float) + + def get_dataset_meta_sim(self): + # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] + con_cols = [ + "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", + "n_counts_var", "var_n_counts_mean", "var_n_counts_var" + ] + dis_cols = ['assay', 'tissue'] + + def get_discrete_sim(col_list1, col_list2): + set1 = set(col_list1) + set2 = set(col_list2) + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + return intersection / union + + def get_con_sim(con_data_1, con_data_2): + return abs(con_data_1 - con_data_2) / max(con_data_1, con_data_2) + + def get_dataset_info(data: ad.AnnData): + con_sim = {} + con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) + con_sim["nnz_var"] = np.var(data.obs["nnz"]) + nnz_values = data.X[data.X.nonzero()] + con_sim["nnz_counts_mean"] = np.mean(nnz_values) + con_sim["nnz_counts_var"] = np.var(nnz_values) + con_sim["n_measured_vars"] = np.mean(data.obs["n_measured_vars"]) + con_sim["cell_num"] = len(data.obs) + con_sim["gene_num"] = len(data.var) + con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) + con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) + con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) + con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) + data.uns["con_sim"] = con_sim + return data + + data_1 = self.adata1.copy() + data_2 = self.adata2.copy() + data_1 = get_dataset_info(data_1) + data_2 = get_dataset_info(data_2) + ans = {} + obs_1 = data_1.obs + obs_2 = data_2.obs + con_sim_1 = data_1.uns["con_sim"] + con_sim_2 = data_2.uns["con_sim"] + for dis_col in dis_cols: + ans[f"{dis_col}_sim"] = get_discrete_sim(obs_1[dis_col].values, obs_2[dis_col].values) + for con_col in con_cols: + ans[f"{con_col}_sim"] = get_con_sim(con_sim_1[con_col], con_sim_2[con_col]) + return np.mean(list(ans.values())) + + def get_ground_truth(self): + assert self.ground_truth_conf_path is not None + assert self.adata1_name is not None + assert self.adata2_name is not None + ground_truth_conf = pd.read_excel(self.ground_truth_conf_path, sheet_name=self.tissue, index_col=0) + + def get_targets(dataset_truth: str): + dataset_truth = OmegaConf.create(fix_yaml_string(dataset_truth)) + targets = [] + for item in dataset_truth: + targets.append(item["target"]) + return targets + + sim_targets = [] + for method in self.methods: + query_dataset_truth = ground_truth_conf.loc[self.adata1_name, f"{method}_method"] + atlas_dataset_truth = ground_truth_conf.loc[self.adata2_name, f"{method}_method"] + query_targets = get_targets(query_dataset_truth) + atlas_targets = get_targets(atlas_dataset_truth) + assert len(query_targets) == len(atlas_targets) + sim_targets.append((sum(a == b for a, b in zip(query_targets, atlas_targets)), len(query_targets))) + sim_targets.append((sum(x for x, y in sim_targets), sum(y for x, y in sim_targets))) + return sim_targets + + def compute_similarity( + self, methods: List[str] = [ + 'cosine', 
'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" + ] + ) -> Dict[str, pd.DataFrame]: + """Computes the specified similarity measure. Parameters: + + methods: List of similarity measures to be computed. Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' + Returns: + Dictionary containing the similarity matrices + + """ + results = {} + for method in methods: + if method == 'cosine': + results['cosine'] = self.cosine_sim() + elif method == 'pearson': + results['pearson'] = self.pearson_corr() + elif method == 'jaccard': + results['jaccard'] = self.jaccard_sim() + elif method == 'js_distance': + results['js_distance'] = self.js_distance() + elif method == 'wasserstein': + results['wasserstein'] = self.wasserstein_dist() + elif method == "common_genes_num": + results["common_genes_num"] = self.common_genes_num() + elif method == "otdd": + results['otdd'] = self.otdd() + elif method == "ground_truth": + results["ground_truth"] = self.get_ground_truth() + elif method == "metadata_sim": + results["metadata_sim"] = self.get_dataset_meta_sim() + else: + raise ValueError(f"Unsupported similarity method: {method}") + return results + + def get_similarity_matrix( + self, methods: List[str] = [ + 'cosine', 'pearson', 'jaccard', 'js_distance', "common_genes_num", "ground_truth", "metadata_sim" + ] + ) -> Dict[str, pd.DataFrame]: + """Same as compute_similarity, keeping method name consistency.""" + self.results = self.compute_similarity(methods) + return self.results + + def get_max_similarity_A_to_B(self): + if self.results is None: + raise ValueError(f"need results!") + else: + self.results_score = {} + for key in self.results: + if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: + self.results_score[key] = self._get_max_similarity(self.results[key]) + else: + self.results_score[key] = self.results[key] + return self.results_score + + def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + """Maximum matching average similarity score.""" + matched_values = [ + similarity_matrix.loc[label, + label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() + for label in similarity_matrix.index + ] # need to ask + overall_similarity = np.mean(matched_values) + return overall_similarity + + +def extract_type_target_params(item_text): + lines = item_text.strip().split('\n') + item_dict = {} + params_dict = {} + current_param_key = None + in_params = False + for line in lines: + stripped_line = line.strip() + if stripped_line.startswith('- type:'): + item_dict['type'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('target:'): + item_dict['target'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('params:'): + params_content = stripped_line.split(':', 1)[1].strip() + if params_content == '{}': + params_dict = {} + in_params = False + else: + params_dict = {} + in_params = True + elif in_params: + if re.match(r'^\w+:$', stripped_line): + current_param_key = stripped_line[:-1].strip() + params_dict[current_param_key] = {} + elif re.match(r'^- ', stripped_line): + list_item = stripped_line[2:].strip() + if current_param_key: + if not isinstance(params_dict[current_param_key], list): + params_dict[current_param_key] = [] + params_dict[current_param_key].append(list_item) + elif ':' in stripped_line: + key, value = map(str.strip, stripped_line.split(':', 1)) + if current_param_key and isinstance(params_dict.get(current_param_key, None), dict): + 
params_dict[current_param_key][key] = yaml.safe_load(value) + else: + params_dict[key] = yaml.safe_load(value) + item_dict['params'] = params_dict + return item_dict + + +def fix_yaml_string(original_str): + #It will be deleted + yaml_str = original_str.replace('\\n', '\n').strip() + items = re.split(r'(?=-\s*type:)', yaml_str) + config_list = [] + for item in items: + if not item.strip(): + continue + if not item.strip().startswith('- type:'): + print(item) + print("警告: 某个项未以 '- type:' 开头,跳过此项.") + continue + item_dict = extract_type_target_params(item) + config_list.append(item_dict) + fixed_yaml = yaml.dump(config_list, sort_keys=False) + return fixed_yaml diff --git a/dance/sc_similarity/download_data.py b/dance/sc_similarity/download_data.py new file mode 100644 index 00000000..83c705fd --- /dev/null +++ b/dance/sc_similarity/download_data.py @@ -0,0 +1,9 @@ +from dance.datasets.singlemodality import CellTypeAnnotationDataset + + +def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = "h5ad", train_dataset=[], + test_dataset=[], valid_dataset=[], data_dir="../temp_data"): + data = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, + valid_dataset=valid_dataset, data_dir=data_dir, tissue=tissue, species=species, + filetype=filetype).load_data() + return data.data From 60d5f21ed9fa30427423139a01c9170fbcded19b Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 21 Nov 2024 21:23:34 +0800 Subject: [PATCH 105/203] minor --- dance/metadata/scdeepsort.csv | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index d8fa18ce..cc41e000 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -88,3 +88,23 @@ human,Blood,1324,train,,,train_human_Blooda5d95a42-0137-496f-8a60-101e17f263c8_d human,Blood,10000,train,,,train_human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad,https://www.dropbox.com/scl/fi/8wq8eaod0xuvgwhsjoapa/human_Bloodc7775e88-49bf-4ba2-a03b-93f00447c958_data.h5ad?rlkey=b6u3b7335l7baricjlgbwthb3&st=cw7mjmx5&dl=1 human,Blood,10000,train,,,train_human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad,https://www.dropbox.com/scl/fi/7gszhapz281uah6ytc615/human_Blood456e8b9b-f872-488b-871d-94534090a865_data.h5ad?rlkey=28ywz595f00ppjqwg6054tdqj&st=7bxft78n&dl=1 human,Blood,10000,train,,,train_human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad,https://www.dropbox.com/scl/fi/kgay0bhk4er6qjx96okrz/human_Blood738942eb-ac72-44ff-a64b-8943b5ecd8d9_data.h5ad?rlkey=m5ax0vhx3vh7ylo4pc74tx9ky&st=sbhonz18&dl=1 +human,Heart,10000,train,,,train_human_Heart1c739a3e-c3f5-49d5-98e0-73975e751201_data.h5ad,https://www.dropbox.com/scl/fi/ymqp6iki4vu2ur9jw56fp/human_Heart1c739a3e-c3f5-49d5-98e0-73975e751201_data.h5ad?rlkey=o6cladni7kfsfigvni5hk01n7&st=zcxajdyt&dl=1 +human,Heart,10000,train,,,train_human_Heart4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad,https://www.dropbox.com/scl/fi/ml4fvp9v9nufot215m9tl/human_Heart4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad?rlkey=nzfgqj7lb943tvxt2mgtg6o8m&st=4jn8mo66&dl=1 +human,Heart,2834,train,,,train_human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/w2oe3csvt50riz9afrr1g/human_Heart5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=auhxwg79r0xkixttj9f33ho9t&st=0584ksnu&dl=1 
+human,Heart,2839,train,,,train_human_Heart83b5e943-a1d5-4164-b3f2-f7a37f01b524_data.h5ad,https://www.dropbox.com/scl/fi/o8vppxwumzh58her9ifxu/human_Heart83b5e943-a1d5-4164-b3f2-f7a37f01b524_data.h5ad?rlkey=iyr5mrhxeisrj5n6pfs2ogdwd&st=vlkpmn65&dl=1 +human,Heart,1089,train,,,train_human_Heart97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad,https://www.dropbox.com/scl/fi/x84r3fzkgfcnn8nsnvmyz/human_Heart97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad?rlkey=vzv8xknnfawjfzwv635p5cj8l&st=3jjdb83q&dl=1 +human,Heart,10000,train,,,train_human_Heart572f3f3e-d3e4-4d13-8e2b-88215e508481_data.h5ad,https://www.dropbox.com/scl/fi/rjv6hv7f14exz1lbh2n9m/human_Heart572f3f3e-d3e4-4d13-8e2b-88215e508481_data.h5ad?rlkey=fszfqswjpe9aspm30x1js05sb&st=35ij2wom&dl=1 +human,Heart,3961,train,,,train_human_Heart1009f384-b12d-448e-ba9f-1b7d2ecfbb4e_data.h5ad,https://www.dropbox.com/scl/fi/a2mooa1bszno3xh4o6sew/human_Heart1009f384-b12d-448e-ba9f-1b7d2ecfbb4e_data.h5ad?rlkey=k9x23whyta1z0g9org4zdrsmm&st=k1z51tdw&dl=1 +human,Heart,10000,train,,,train_human_Heart1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d_data.h5ad,https://www.dropbox.com/scl/fi/86e8yrjglgo714p02jozm/human_Heart1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d_data.h5ad?rlkey=ff7y5amtc8hovujepsqtvyki6&st=dtsskar4&dl=1 +human,Heart,10000,train,,,train_human_Heart1252c5fb-945f-42d6-b1a8-8a3bd864384b_data.h5ad,https://www.dropbox.com/scl/fi/ulhxkmys6q8cxiucrsc7q/human_Heart1252c5fb-945f-42d6-b1a8-8a3bd864384b_data.h5ad?rlkey=z8lbxq7hevqh6vjozlu9pmfzu&st=wml2yxfx&dl=1 +human,Heart,10000,train,,,train_human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad,https://www.dropbox.com/scl/fi/tus9c1k07fh12ggq7s8v4/human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad?rlkey=l3qetjv2iedp9be2yi7x7060k&st=sasyq4k0&dl=1 +human,Heart,10000,train,,,train_human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/ud57pptc2vucgfzbumhi0/human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=foifku28abrsjmn7fyeo3nzlq&st=2o88drh7&dl=1 +human,Heart,2576,train,,,train_human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad,https://www.dropbox.com/scl/fi/3yyr268to8d5z2v1ybph5/human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad?rlkey=tavfun6ztdeuz7gw93r80u3f2&st=lypmrhf9&dl=1 +human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/4t7rimzdbu614qicv7ac1/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=eih47wtd6txwtchaobnwb13qu&st=tgh2kahb&dl=1 +human,Heart,10000,train,,,train_human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad,https://www.dropbox.com/scl/fi/932xsn8bkxeppy4xnuy81/human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad?rlkey=yhgjmo6vcbwcfbfpsnnbxc4np&st=p5asp1ad&dl=1 +human,Heart,10000,train,,,train_human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad,https://www.dropbox.com/scl/fi/okl5stq86etyx6zhfaex9/human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad?rlkey=eg9u06l5k0ycw9t4hf9rlz9rh&st=r3henhn9&dl=1 +human,Heart,10000,train,,,train_human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad,https://www.dropbox.com/scl/fi/qv88ufwdqb89boz79t4w8/human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad?rlkey=s5u32dymnqzpczp67f2vopdxp&st=8u7airt8&dl=1 +human,Heart,10000,train,,,train_human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_data.h5ad,https://www.dropbox.com/scl/fi/xddt54xc5bujkkvhql5nv/human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_data.h5ad?rlkey=fnlrns7gupn548zfa6g6s11iw&st=42qx454t&dl=1 
+human,Heart,3799,train,,,train_human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad,https://www.dropbox.com/scl/fi/82mmm9drh008r4faduuvd/human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad?rlkey=u4iwlbm6e6laaht8ey7950fqh&st=v8u040to&dl=1 +human,Heart,10000,train,,,train_human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad,https://www.dropbox.com/scl/fi/uiufoyquxt0hea9dgf0l2/human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad?rlkey=7mu54uqnqrbqtuyfxnqdexmc5&st=2k45pou1&dl=1 +human,Heart,10000,train,,,train_human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/kkgg4abyidaylwxkut2bx/human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=77wmk5knn1oenbffwxo9a6zkk&st=bxemfdf6&dl=1 From a44f62517c75652dcc8a08a9d4f0f816072b5476 Mon Sep 17 00:00:00 2001 From: xzy Date: Sun, 24 Nov 2024 16:53:16 +0800 Subject: [PATCH 106/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index cc41e000..dc22dd98 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -100,7 +100,7 @@ human,Heart,10000,train,,,train_human_Heart1252c5fb-945f-42d6-b1a8-8a3bd864384b_ human,Heart,10000,train,,,train_human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad,https://www.dropbox.com/scl/fi/tus9c1k07fh12ggq7s8v4/human_Heart9434b020-de42-43eb-bcc4-542b2be69015_data.h5ad?rlkey=l3qetjv2iedp9be2yi7x7060k&st=sasyq4k0&dl=1 human,Heart,10000,train,,,train_human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/ud57pptc2vucgfzbumhi0/human_Hearta68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=foifku28abrsjmn7fyeo3nzlq&st=2o88drh7&dl=1 human,Heart,2576,train,,,train_human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad,https://www.dropbox.com/scl/fi/3yyr268to8d5z2v1ybph5/human_Heartbdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7_data.h5ad?rlkey=tavfun6ztdeuz7gw93r80u3f2&st=lypmrhf9&dl=1 -human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/4t7rimzdbu614qicv7ac1/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=eih47wtd6txwtchaobnwb13qu&st=tgh2kahb&dl=1 +human,Heart,1067,train,,,train_human_Heartc5d88abe-f23a-45fa-a534-788985e93dad(Heart)_data.h5ad,https://www.dropbox.com/scl/fi/4t7rimzdbu614qicv7ac1/human_Heartc5d88abe-f23a-45fa-a534-788985e93dad-Heart-_data.h5ad?rlkey=eih47wtd6txwtchaobnwb13qu&st=9cq6ttbe&dl=1 human,Heart,10000,train,,,train_human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad,https://www.dropbox.com/scl/fi/932xsn8bkxeppy4xnuy81/human_Heartd4e69e01-3ba2-4d6b-a15d-e7048f78f22e_data.h5ad?rlkey=yhgjmo6vcbwcfbfpsnnbxc4np&st=p5asp1ad&dl=1 human,Heart,10000,train,,,train_human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad,https://www.dropbox.com/scl/fi/okl5stq86etyx6zhfaex9/human_Heartd567b692-c374-4628-a508-8008f6778f22_data.h5ad?rlkey=eg9u06l5k0ycw9t4hf9rlz9rh&st=r3henhn9&dl=1 human,Heart,10000,train,,,train_human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad,https://www.dropbox.com/scl/fi/qv88ufwdqb89boz79t4w8/human_Hearte6a11140-2545-46bc-929e-da243eed2cae_data.h5ad?rlkey=s5u32dymnqzpczp67f2vopdxp&st=8u7airt8&dl=1 From 136995b62ba246ea3403a90920cb4ef0da3cf88d Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 26 Nov 2024 09:26:34 +0800 Subject: [PATCH 107/203] update main --- .../tuning/joint_embedding_scmvae/main.py | 177 ++++++++++-------- 1 file changed, 102 
insertions(+), 75 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index c52f2108..b3070b40 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -1,4 +1,5 @@ import argparse +import gc import os import pprint import sys @@ -71,81 +72,107 @@ def parameter_setting(): def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - - (x_train, y_train), _ = data.get_train_data(return_type="torch") - (x_test, y_test), labels = data.get_test_data(return_type="torch") - - lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) - lib_mean1 = torch.from_numpy(lib_mean1) - lib_var1 = torch.from_numpy(lib_var1) - lib_mean2 = torch.from_numpy(lib_mean2) - lib_var2 = torch.from_numpy(lib_var2) - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - train_size = len(data.get_split_idx("train")) - train = data_utils.TensorDataset(x_train, lib_mean1[:train_size], lib_var1[:train_size], lib_mean2[:train_size], - lib_var2[:train_size], y_train) - - valid = data_utils.TensorDataset(x_test, lib_mean1[train_size:], lib_var1[train_size:], lib_mean2[train_size:], - lib_var2[train_size:], y_test) - - total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - - # embeds = model.predict(x_test, y_test).cpu().numpy() - score = model.score(x_test, y_test, labels) - score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - torch.cuda.empty_cache() + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = 
wandb_config["run_kwargs"]
+        else:
+            wandb.log({"skip": 1})
+            wandb.finish()
+            return
+    try:
+        dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding")
+        data = dataset.load_data()
+
+        le = preprocessing.LabelEncoder()
+        labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"])
+        data.mod["mod1"].obsm["labels"] = labels
+
+        # Prepare preprocessing pipeline and apply it to data
+        kwargs = {tune_mode: dict(wandb_config)}
+        preprocessing_pipeline = pipeline_planer.generate(**kwargs)
+        print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}")
+        preprocessing_pipeline(data)
+        train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names]
+        train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name]
+        test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx)))
+
+        # train_size=data.mod["meta1"].shape[0]
+        # test_size=data.mod["mod1"].shape[0]-train_size
+        data.set_split_idx("train",train_idx)
+        data.set_split_idx("test",test_idx)
+        (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch")
+        (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch")
+        # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels)
+        lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()]))
+        lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()]))
+        lib_mean1 = torch.from_numpy(lib_mean1)
+        lib_var1 = torch.from_numpy(lib_var1)
+        lib_mean2 = torch.from_numpy(lib_mean2)
+        lib_var2 = torch.from_numpy(lib_var2)
+
+        Nfeature1 = x_train.shape[1]
+        Nfeature2 = y_train.shape[1]
+        # train_size = len(data.get_split_idx("train"))
+        # train_size=x_train.shape[0]
+        train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx],
+                                         lib_var2[train_idx], y_train)
+
+        valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx],
+                                         lib_var2[test_idx], y_test)
+
+        total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test]))
+
+        total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False)
+
+        x_test = torch.cat([x_train, x_test])
+        y_test = torch.cat([y_train, y_test])
+        labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))  # This is probably problematic; most likely it is the dimensionality-reduction step
+        model = scMVAE(
+            encoder_1=[Nfeature1, 1024, 128, 128],
+            hidden_1=128,
+            Z_DIMS=22,
+            decoder_share=[22, 128, 256],
+            share_hidden=128,
+            decoder_1=[128, 128, 1024],
+            hidden_2=1024,
+            encoder_l=[Nfeature1, 128],
+            hidden3=128,
+            encoder_2=[Nfeature2, 1024, 128, 128],
+            hidden_4=128,
+            encoder_l1=[Nfeature2, 128],
+            hidden3_1=128,
+            decoder_2=[128, 128, 1024],
+            hidden_5=1024,
+            drop_rate=0.1,
+            log_variational=True,
+            Type="ZINB",
+            device=device,
+            n_centroids=22,
+            penality="GMM",
+            model=1,
+        )
+        model.to(device)
+        model.init_gmm_params(total_loader)
+        model.fit(args, train, valid, args.final_rate, args.scale_factor, device)
+
+        # embeds = model.predict(x_test, y_test).cpu().numpy()
+        score = model.score(x_test, y_test, labels)
+        # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems"))
+        score["ARI"] = score["dance_ari"]
+        del score["dance_ari"]
+        wandb.log(score)
+        wandb.finish()
+    finally:
+ locals_keys=list(locals().keys()) + for var in locals_keys: + try: + exec(f"del {var}") + logger.info(f"Deleted '{var}'") + except NameError: + logger.info(f"Variable '{var}' does not exist, continuing...") + torch.cuda.empty_cache() + gc.collect() # score.update({ # 'seed': args.seed + k, # 'subtask': args.subtask, From 82f16f41f804385fbae594255412d77d658c7596 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Nov 2024 01:31:59 +0000 Subject: [PATCH 108/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index b349fd42..8b091eb7 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -1,6 +1,5 @@ import argparse import gc -import gc import os import pprint import sys @@ -111,7 +110,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - + # train_size=data.mod["meta1"].shape[0] # test_size=data.mod["mod1"].shape[0]-train_size data.set_split_idx("train",train_idx) From e5e2edfe902fef99e10e61b477482de9277ce739 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 29 Nov 2024 10:59:10 +0800 Subject: [PATCH 109/203] minor --- .../result_analysis/get_important_pattern.py | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 examples/result_analysis/get_important_pattern.py diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py new file mode 100644 index 00000000..b03731cc --- /dev/null +++ b/examples/result_analysis/get_important_pattern.py @@ -0,0 +1,212 @@ +# metric_name = "test_acc" +# ascending = False +import argparse +import itertools +import pathlib +from collections import Counter +from itertools import combinations +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scikit_posthocs as sp +import seaborn as sns +from mlxtend.frequent_patterns import apriori +from mlxtend.preprocessing import TransactionEncoder +from networkx import parse_adjlist +from scipy import cluster, stats + + +#TODO need to sync all files or get sweep,not file +#asceding need to think +#负向的pattern,换一下顺序就可以吧 +def get_important_pattern(test_accs, ascending, vis=True, alpha=0.05, title=""): + + if vis: + fig = plt.figure(figsize=(12, 4)) + sns.boxplot(data=test_accs) + plt.xticks(list(range(len(test_accs))), [f"{i}" for i in range(len(test_accs))]) + plt.title(title) + plt.show() + _, p_value = stats.kruskal(*test_accs) + if p_value < alpha: + medians = [np.median(group) for group in test_accs] + data = test_accs + p_values_matrix = sp.posthoc_dunn(a=data, p_adjust="bonferroni") + sorted_indices = np.argsort(np.argsort([-x for x in medians] if ascending else medians)) + ranks = { + index: { + "rank": rank, + "before": None, + "after": [], + "real_rank": rank + } + for index, rank in enumerate(sorted_indices) + } + for (rank1, rank2) in combinations(range(max(sorted_indices) + 1), 2): + for idx1 in 
[index for index, value in ranks.items() if value["rank"] == rank1]: + for idx2 in [index for index, value in ranks.items() if value["rank"] == rank2]: + if p_values_matrix.iloc[idx1, idx2] > alpha: + if ranks[idx2]["before"] is None: + ranks[idx1]["after"].append(idx2) + ranks[idx2]["before"] = idx1 + + def change_real_rank(rank_item, real_rank): + rank_item["real_rank"] = real_rank + for idx in rank_item["after"]: + change_real_rank(ranks[idx], real_rank) + + for rank_item in ranks.values(): + if rank_item["before"] is None: + for idx in rank_item["after"]: + change_real_rank(ranks[idx], rank_item["real_rank"]) + return [v["real_rank"] for k, v in ranks.items()] + else: + if vis: + print("No significant differences found between the groups.") + return [] + + +def replace_nan_in_2d(lst): + return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] + + +def are_all_elements_same_direct(list_2d): + first_element = None + for sublist in list_2d: + for element in sublist: + if first_element is None: + first_element = element + elif element != first_element: + return False + return True if first_element is not None else True + + +def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1): + threshold = int(len(step2_data) * threshold_per) + step2_data.loc[:, metric_name] = step2_data.loc[:, metric_name].astype(float) + df_sorted = step2_data.sort_values(metric_name, ascending=ascending) + top_10_percent = df_sorted.head(threshold) + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) + transactions = top_10_percent[columns].values.tolist() + te = TransactionEncoder() + te_ary = te.fit(transactions).transform(transactions) + df = pd.DataFrame(te_ary, columns=te.columns_) + frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) + # print(frequent_itemsets) + # rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) + return [tuple(a) for a in frequent_itemsets["itemsets"]] + + +# def get_significant_top_n_zscore(data, n=3, threshold=1.0, ascending=False): +# if not data: +# return [] + +# n = max(1, n) + +# mean = np.mean(data) +# std = np.std(data) + +# if std == 0: +# return sorted(data, reverse=not ascending)[:n] + +# z_scores = [(x, (x - mean) / std) for x in data] + +# significant_values = [x for x, z in z_scores if z > threshold] + +# significant_values_sorted = sorted(significant_values, reverse=not ascending) + +# if len(significant_values_sorted) < n: +# remaining = sorted(data, reverse=not ascending)[:n - len(significant_values_sorted)] +# significant_values_sorted.extend(remaining) + +# return significant_values_sorted[:n] + + +def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): + ans_all = [] + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) + test_accs = [] + test_acc_names = [] + for r in range(1, len(columns)): #全流程的单独处理 + for com in itertools.combinations(columns, r): + test_accs_arrays = [] + groups = step2_data.groupby(by=list(com)) + if len(groups) == 1: + continue + for g in groups: + test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) + test_accs += [i[metric_name] for i in test_accs_arrays] + test_acc_names += [i["name"] for i in test_accs_arrays] + # if are_all_elements_same_direct(test_accs): + # continue + test_accs = replace_nan_in_2d(test_accs) + final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title=" ".join(list(com)), vis=vis) + if 
len(final_ranks) > 0: #TODO maybe need to think ascending + max_rank = max(final_ranks) + max_rank_count = final_ranks.count(max_rank) + if max_rank_count < len(final_ranks) / 2: + for index, (test_acc_name, rank) in enumerate(zip(test_acc_names, final_ranks)): + if rank == max_rank: + if vis: + print(f"index={index},name={test_acc_name},rank={rank}") + ans_all.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) + return ans_all + + +def summary_pattern(data_path, metric_name, ascending, alpha=0.05, vis=False): + step2_origin_data = pd.read_csv(data_path) + step2_data = step2_origin_data.dropna() + com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) + apr_ans = get_frequent_itemsets(step2_data, metric_name, ascending) + return list(set(com_ans) & set(apr_ans)) + + +# def list_files(directory,file_name="best_test_acc.csv",save_path="summary_file"): +# ans=[] +# path = Path(directory) +# for file_path in path.rglob('*'): +# if file_path.is_file(): +# if file_path.name==file_name: +# algorithm,dataset=file_path.relative_to(directory).parts[:2] +# ans.append({"algorithm":algorithm,"dataset":dataset,"summary_pattern":summary_pattern(file_path)}) +# pd.DataFrame(ans).to_csv(save_path) +def list_files(directories, metric_name, ascending, file_name="best_test_acc.csv", alpha=0.05, vis=False): + ans_all = [] + for directory in directories: + path = Path(directory) + for file_path in path.rglob('*'): + if file_path.is_file(): + if file_path.name == file_name: + print(file_path) + dataset = file_path.parent + method = file_path.parent.parent + ans = summary_pattern(file_path, metric_name, ascending, alpha=alpha, vis=vis) + with open(Path(file_path.parent.resolve(), "pipeline_summary_pattern.txt"), 'w') as f: + f.write(str(ans)) + ans_all.append({"dataset": dataset, "method": method, "ans": ans}) + return ans_all + + +if __name__ == "__main__": + directories = [] + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("task", default="cluster") + parser.add_argument("metric_name", default="acc") + parser.add_argument("ascending", default=False) + args = parser.parse_args() + task = args.task + metric_name = args.metric_name + ascending = args.ascending + file_root = Path(__file__).resolve().parent.parent / "tuning" + for path in file_root.iterdir(): + if path.is_dir(): + if str(path.name).startswith(task): + directories.append(path) + ans_all = list_files(directories, metric_name, ascending) + df = pd.DataFrame(ans_all) + pivot_df = df.pivot(index="dataset", columns="method", values="ans") + pivot_df.to_csv(f"{task}_pattern.csv") + + # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_actinn/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) From d5390e122de8e351dea33347249c7934548acb0e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 29 Nov 2024 10:59:24 +0800 Subject: [PATCH 110/203] minor --- .../get_important_pattern_sweep.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 examples/result_analysis/get_important_pattern_sweep.py diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py new file mode 100644 index 00000000..7010505d --- /dev/null +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -0,0 +1,98 @@ +import json +import sys +from pathlib import Path +from turtle import pos + +import pandas as pd +import requests +from get_important_pattern 
import get_com_all, get_frequent_itemsets
+
+sys.path.append("..")
+from get_result_web import spilt_web
+
+from dance.pipeline import flatten_dict
+from dance.utils import try_import
+
+entity = "xzy11632"
+project = "dance-dev"
+tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"]
+mertic_names = ["test_acc", "acc", "MRE", "ARI", "MSE"]
+ascendings = [False, False, True, False, True]
+file_root = Path(__file__).resolve().parent
+prefix = f'https://wandb.ai/{entity}/{project}'
+runs_sum = 0
+wandb = try_import("wandb")
+positive = True
+
+
+def get_additional_sweep(sweep_id):
+    # if sweep has prior runs
+    # every run get command, get additional sweep id
+    # or last run command
+    sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}")
+    # last run command
+    run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None)
+    additional_sweep_ids = [sweep_id]
+    if run is None:  # check the number of summary entries; note aznph5wt, the counts may be inconsistent
+        return additional_sweep_ids
+    run_id = run.id
+    web_abs = requests.get(f"https://api.wandb.ai/files/{run.entity}/{run.project}/{run_id}/wandb-metadata.json")
+    args = dict(web_abs.json())["args"]
+    for i in range(len(args)):
+        if args[i] == '--additional_sweep_ids':
+            if i + 1 < len(args):
+                additional_sweep_ids += get_additional_sweep(args[i + 1])
+    return additional_sweep_ids
+
+
+def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=False):
+    # try:
+    step2_data = step2_origin_data.dropna()
+    com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha)
+    apr_ans = get_frequent_itemsets(step2_data, metric_name, ascending)
+    return list(set(com_ans) & set(apr_ans))
+    # except Exception as e:
+    #     print(e)
+    #     return str(e)
+
+
+if __name__ == "__main__":
+    ans_all = []
+    for i, task in enumerate(tasks):
+        data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str)
+        data = data.ffill().set_index(['Methods'])
+        for row_idx in range(data.shape[0]):
+            for col_idx in range(data.shape[1]):
+                method = data.index[row_idx]
+                dataset = data.columns[col_idx]
+                value = data.iloc[row_idx, col_idx]
+                step_name = data.iloc[row_idx]["Unnamed: 1"]
+                if method != "SVM" or dataset != "Dataset 1: GSE67835 Brain":
+                    continue
+                if isinstance(value, str) and value.startswith(prefix) and (
+                        str(step_name).lower() == "step2" or str(step_name).lower() == "step 2"):  #TODO add step3
+                    sweep_url = value
+                else:
+                    continue
+                _, _, sweep_id = spilt_web(sweep_url)
+                sweep_ids = get_additional_sweep(sweep_id)
+                summary_data = []
+                for sweep_id in sweep_ids:
+                    sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}")
+                    for run in sweep.runs:
+                        result = dict(run.summary._json_dict).copy()
+                        result.update(run.config)
+                        result.update({"id": run.id})
+                        summary_data.append(flatten_dict(result))  # get result and config
+                ans = pd.DataFrame(summary_data).set_index(["id"])
+                ans.sort_index(axis=1, inplace=True)
+                print(dataset)
+                print(method)
+                ans_all.append({
+                    "task": task,
+                    "dataset": dataset,
+                    "method": method,
+                    "pattern": summary_pattern(ans, mertic_names[i], ascendings[i])
+                })
+    with open(f"positive:{positive}_pattern.json", "w") as f:
+        json.dump(ans_all, f, indent=2)
From 4a7c5be485e4b424868c1e98f2f3789ec69001d0 Mon Sep 17 00:00:00 2001
From: xzy
Date: Mon, 2 Dec 2024 19:53:16 +0800
Subject: [PATCH 111/203] update scdeepsort
---
 dance/metadata/scdeepsort.csv | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/dance/metadata/scdeepsort.csv
b/dance/metadata/scdeepsort.csv index dc22dd98..c993b2d7 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -108,3 +108,7 @@ human,Heart,10000,train,,,train_human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_ human,Heart,3799,train,,,train_human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad,https://www.dropbox.com/scl/fi/82mmm9drh008r4faduuvd/human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad?rlkey=u4iwlbm6e6laaht8ey7950fqh&st=v8u040to&dl=1 human,Heart,10000,train,,,train_human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad,https://www.dropbox.com/scl/fi/uiufoyquxt0hea9dgf0l2/human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad?rlkey=7mu54uqnqrbqtuyfxnqdexmc5&st=2k45pou1&dl=1 human,Heart,10000,train,,,train_human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/kkgg4abyidaylwxkut2bx/human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=77wmk5knn1oenbffwxo9a6zkk&st=bxemfdf6&dl=1 +human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/nxd9awxshy5y4ps6ctsqr/human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=g7je5vl6kaany80tw8jwsioxr&st=5zq8uimv&dl=1 +human,Heart,10000,train,,,train_human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad,https://www.dropbox.com/scl/fi/uxd1yhbc98ayx4f0ap4h8/human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad?rlkey=aa7f5rylwxzxyc7ue0efvc1kb&st=0rt0d1lr&dl=1 +human,Heart,10000,train,,,train_human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/gb98ycqcu24ewonalqjpo/human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=u8d2ovqkrhptigqmucm7qijde&st=dxhu640r&dl=1 +human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad,https://www.dropbox.com/scl/fi/mz5jq1ig0zp36w7xmv1pe/human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad?rlkey=g6gjodg99l1ba0swr678h8kb0&st=hugh3bpt&dl=1 From 06f85e7767c41baac7127c6b7e1752173276bb43 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 2 Dec 2024 19:58:16 +0800 Subject: [PATCH 112/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index c993b2d7..dd0bcedd 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -108,7 +108,7 @@ human,Heart,10000,train,,,train_human_Heartf15e263b-6544-46cb-a46e-e33ab7ce8347_ human,Heart,3799,train,,,train_human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad,https://www.dropbox.com/scl/fi/82mmm9drh008r4faduuvd/human_Heartf75f2ff4-2884-4c2d-b375-70de37a34507_data.h5ad?rlkey=u4iwlbm6e6laaht8ey7950fqh&st=v8u040to&dl=1 human,Heart,10000,train,,,train_human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad,https://www.dropbox.com/scl/fi/uiufoyquxt0hea9dgf0l2/human_Heartf7995301-7551-4e1d-8396-ffe3c9497ace_data.h5ad?rlkey=7mu54uqnqrbqtuyfxnqdexmc5&st=2k45pou1&dl=1 human,Heart,10000,train,,,train_human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/kkgg4abyidaylwxkut2bx/human_Heartfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=77wmk5knn1oenbffwxo9a6zkk&st=bxemfdf6&dl=1 -human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/nxd9awxshy5y4ps6ctsqr/human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=g7je5vl6kaany80tw8jwsioxr&st=5zq8uimv&dl=1 
+human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)_data.h5ad,https://www.dropbox.com/scl/fi/nxd9awxshy5y4ps6ctsqr/human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Heart-_data.h5ad?rlkey=g7je5vl6kaany80tw8jwsioxr&st=2n66z9yc&dl=1 human,Heart,10000,train,,,train_human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad,https://www.dropbox.com/scl/fi/uxd1yhbc98ayx4f0ap4h8/human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad?rlkey=aa7f5rylwxzxyc7ue0efvc1kb&st=0rt0d1lr&dl=1 human,Heart,10000,train,,,train_human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/gb98ycqcu24ewonalqjpo/human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=u8d2ovqkrhptigqmucm7qijde&st=dxhu640r&dl=1 human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad,https://www.dropbox.com/scl/fi/mz5jq1ig0zp36w7xmv1pe/human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad?rlkey=g6gjodg99l1ba0swr678h8kb0&st=hugh3bpt&dl=1 From adfbfd2bed2bc288ab8f1d3245a546a8b40a2e51 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 4 Dec 2024 09:40:05 +0800 Subject: [PATCH 113/203] minor --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 9fb85885..07ae5afa 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From 95b28835ce129bda7c15493651789adc1b24df4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Dec 2024 01:41:18 +0000 Subject: [PATCH 114/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index d9a7188c..8b091eb7 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From 13a42e04e8950d020f3a54b207018bd2c9566ffc Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 4 Dec 2024 23:37:45 +0800 Subject: [PATCH 115/203] minor --- dance/metadata/scdeepsort.csv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index dd0bcedd..747fff41 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -112,3 +112,6 @@ human,Heart,10000,train,,,train_human_Heart2adb1f8a-a6b1-4909-8ee8-484814e2d4bf( 
human,Heart,10000,train,,,train_human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad,https://www.dropbox.com/scl/fi/uxd1yhbc98ayx4f0ap4h8/human_Heart65badd7a-9262-4fd1-9ce2-eb5dc0ca8039_data.h5ad?rlkey=aa7f5rylwxzxyc7ue0efvc1kb&st=0rt0d1lr&dl=1 human,Heart,10000,train,,,train_human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/gb98ycqcu24ewonalqjpo/human_Heartf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=u8d2ovqkrhptigqmucm7qijde&st=dxhu640r&dl=1 human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad,https://www.dropbox.com/scl/fi/mz5jq1ig0zp36w7xmv1pe/human_Hearted852810-a003-4386-9846-1638362cee39_data.h5ad?rlkey=g6gjodg99l1ba0swr678h8kb0&st=hugh3bpt&dl=1 +human,Brain,10000,train,,,train_human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad,https://www.dropbox.com/scl/fi/hu35a45qk3b4m2ep17poa/human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad?rlkey=zbb9otp1tu6kvlkxsc7absfih&st=p9nwvnjo&dl=1 +human,Brain,10000,train,,,train_human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Brain)_data.h5ad,https://www.dropbox.com/scl/fi/f3l26pxi1d6bmtzzvugxr/human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Brain-_data.h5ad?rlkey=qn3hip8pk0be91uyrfvdf78uh&st=rsr9vb0j&dl=1 +human,Brain,10000,train,,,train_human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad,https://www.dropbox.com/scl/fi/8hw6yprqqc3tk7k2g03nj/human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad?rlkey=urxy3hu4omlt2824l2epdsl70&st=dqmdsxac&dl=1 From c0cd31ba9566bc296221bb840ff16ca17085174a Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 5 Dec 2024 16:45:48 +0800 Subject: [PATCH 116/203] minor --- examples/dataset_server.json | 176 +++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 80 deletions(-) diff --git a/examples/dataset_server.json b/examples/dataset_server.json index 10a279da..f0404928 100644 --- a/examples/dataset_server.json +++ b/examples/dataset_server.json @@ -1,92 +1,108 @@ { "cta_actinn": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + 
"5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ] , "cta_celltypist": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ], "cta_scdeepsort": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + 
"65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ] , "cta_singlecellnet": [ - "01209dce-3575-4bed-b1df-129f57fbc031", - "055ca631-6ffb-40de-815e-b931e10718c0", - "2a498ace-872a-4935-984b-1afa70fd9886", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf", - "3faad104-2ab8-4434-816d-474d8d2641db", - "471647b3-04fe-4c76-8372-3264feb950e8", - "4c4cd77c-8fee-4836-9145-16562a8782fe", - "84230ea4-998d-4aa8-8456-81dd54ce23af", - "8a554710-08bc-4005-87cd-da9675bdc2e7", - "ae29ebd0-1973-40a4-a6af-d15a5f77a80f", - "bc260987-8ee5-4b6e-8773-72805166b3f7", - "bc2a7b3d-f04e-477e-96c9-9d5367d5425c", - "d3566d6a-a455-4a15-980f-45eb29114cab", - "d9b4bc69-ed90-4f5f-99b2-61b0681ba436", - "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569", - "c7775e88-49bf-4ba2-a03b-93f00447c958", - "456e8b9b-f872-488b-871d-94534090a865", - "738942eb-ac72-44ff-a64b-8943b5ecd8d9", - "a5d95a42-0137-496f-8a60-101e17f263c8", - "71be997d-ff75-41b9-8a9f-1288c865f921" + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" ] } From da1bfe7b6fe0a578086036d3bf39d21ea2d4e499 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 5 Dec 2024 19:45:06 +0800 Subject: [PATCH 117/203] minor --- examples/dataset_server.json | 216 ++++++++++++++++++----------------- 1 file changed, 110 insertions(+), 106 deletions(-) diff --git a/examples/dataset_server.json b/examples/dataset_server.json index f0404928..615fd761 100644 --- a/examples/dataset_server.json +++ b/examples/dataset_server.json @@ -1,108 +1,112 @@ { - "cta_actinn": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - 
"83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - "97a17473-e2b1-4f31-a544-44a60773e2dd" - ] - , - "cta_celltypist": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - "83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - "97a17473-e2b1-4f31-a544-44a60773e2dd" - ], - "cta_scdeepsort": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - "83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - "97a17473-e2b1-4f31-a544-44a60773e2dd" - ] - , - "cta_singlecellnet": [ - "572f3f3e-d3e4-4d13-8e2b-88215e508481", - "fa27492b-82ff-4ab7-ac61-0e2b184eee67", - "f15e263b-6544-46cb-a46e-e33ab7ce8347", - "f7995301-7551-4e1d-8396-ffe3c9497ace", - "e6a11140-2545-46bc-929e-da243eed2cae", - "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", - "1c739a3e-c3f5-49d5-98e0-73975e751201", - "1252c5fb-945f-42d6-b1a8-8a3bd864384b", - "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", - "d567b692-c374-4628-a508-8008f6778f22", - "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", - "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", - "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", - "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", - "83b5e943-a1d5-4164-b3f2-f7a37f01b524", - "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", - "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", - "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", - "9434b020-de42-43eb-bcc4-542b2be69015", - "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", - "4ed927e9-c099-49af-b8ce-a2652d069333", - "ed852810-a003-4386-9846-1638362cee39", - "f75f2ff4-2884-4c2d-b375-70de37a34507", - 
"97a17473-e2b1-4f31-a544-44a60773e2dd" - ] + + "heart":{ + "cta_actinn": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ] + , + "cta_celltypist": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ], + "cta_scdeepsort": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + "1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ] + , + "cta_singlecellnet": [ + "572f3f3e-d3e4-4d13-8e2b-88215e508481", + "fa27492b-82ff-4ab7-ac61-0e2b184eee67", + "f15e263b-6544-46cb-a46e-e33ab7ce8347", + "f7995301-7551-4e1d-8396-ffe3c9497ace", + "e6a11140-2545-46bc-929e-da243eed2cae", + "1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d", + "1c739a3e-c3f5-49d5-98e0-73975e751201", + 
"1252c5fb-945f-42d6-b1a8-8a3bd864384b", + "a68b64d8-aee3-4947-81b7-36b8fe5a44d2", + "d567b692-c374-4628-a508-8008f6778f22", + "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Heart)", + "65badd7a-9262-4fd1-9ce2-eb5dc0ca8039", + "f7c1c579-2dc0-47e2-ba19-8165c5a0e353", + "c5d88abe-f23a-45fa-a534-788985e93dad(Heart)", + "83b5e943-a1d5-4164-b3f2-f7a37f01b524", + "bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7", + "5a11f879-d1ef-458a-910c-9b0bdfca5ebf", + "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e", + "9434b020-de42-43eb-bcc4-542b2be69015", + "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e", + "4ed927e9-c099-49af-b8ce-a2652d069333", + "ed852810-a003-4386-9846-1638362cee39", + "f75f2ff4-2884-4c2d-b375-70de37a34507", + "97a17473-e2b1-4f31-a544-44a60773e2dd" + ] + } + } From 8a27fc6a37341f4f180caa617d58c37016b307fd Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 5 Dec 2024 19:48:09 +0800 Subject: [PATCH 118/203] minor --- examples/get_result_web.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index ee5d4158..02da6824 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -13,13 +13,6 @@ # get yaml of best method -wandb = try_import("wandb") -entity = "xzy11632" -project = "dance-dev" -file_root = str(Path(__file__).resolve().parent) -with open(f"{file_root}/dataset_server.json") as f: - collect_datasets = json.load(f) -file_root = "./tuning" def check_identical_strings(string_list): @@ -153,8 +146,10 @@ def check_exist(file_path): return False -def write_ans(): +def write_ans(tissue): ans = [] + collect_datasets=all_datasets[tissue] + for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: file_path = f"{file_root}/{method_folder}/{dataset_id}" @@ -180,8 +175,17 @@ def write_ans(): }) # with open('temp_ans.json', 'w') as f: # json.dump(ans, f,indent=4) - pd.DataFrame(ans).to_csv("temp_ans.csv") + pd.DataFrame(ans).to_csv(f"{tissue}_ans.csv") if __name__ == "__main__": - write_ans() + wandb = try_import("wandb") + entity = "xzy11632" + project = "dance-dev" + file_root = str(Path(__file__).resolve().parent) + with open(f"{file_root}/dataset_server.json") as f: + all_datasets = json.load(f) + file_root = "./tuning" + tissues=["heart"] + for tissue in tissues: + write_ans(tissue) From 43d489491a5551f8d71fe3a2eedf5b4756ea124c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:48:45 +0000 Subject: [PATCH 119/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/dataset_server.json | 4 ++-- examples/get_result_web.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/dataset_server.json b/examples/dataset_server.json index 615fd761..8d07f604 100644 --- a/examples/dataset_server.json +++ b/examples/dataset_server.json @@ -1,5 +1,5 @@ { - + "heart":{ "cta_actinn": [ "572f3f3e-d3e4-4d13-8e2b-88215e508481", @@ -108,5 +108,5 @@ "97a17473-e2b1-4f31-a544-44a60773e2dd" ] } - + } diff --git a/examples/get_result_web.py b/examples/get_result_web.py index 02da6824..4e5f9da2 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -14,7 +14,6 @@ # get yaml of best method - def check_identical_strings(string_list): if not string_list: raise ValueError("列表为空") @@ -148,8 +147,8 @@ def check_exist(file_path): def write_ans(tissue): ans = [] - collect_datasets=all_datasets[tissue] - + collect_datasets 
= all_datasets[tissue] + for method_folder in tqdm(collect_datasets): for dataset_id in collect_datasets[method_folder]: file_path = f"{file_root}/{method_folder}/{dataset_id}" @@ -186,6 +185,6 @@ def write_ans(tissue): with open(f"{file_root}/dataset_server.json") as f: all_datasets = json.load(f) file_root = "./tuning" - tissues=["heart"] + tissues = ["heart"] for tissue in tissues: write_ans(tissue) From def68bec15fa5de893b947787ca6eba3654a02f6 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Dec 2024 21:25:50 +0800 Subject: [PATCH 120/203] minor --- dance/metadata/scdeepsort.csv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 747fff41..6d2f6317 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -115,3 +115,6 @@ human,Heart,10000,train,,,train_human_Hearted852810-a003-4386-9846-1638362cee39_ human,Brain,10000,train,,,train_human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad,https://www.dropbox.com/scl/fi/hu35a45qk3b4m2ep17poa/human_Brain0bc7235a-ae5a-479d-a487-510435377e55_data.h5ad?rlkey=zbb9otp1tu6kvlkxsc7absfih&st=p9nwvnjo&dl=1 human,Brain,10000,train,,,train_human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Brain)_data.h5ad,https://www.dropbox.com/scl/fi/f3l26pxi1d6bmtzzvugxr/human_Brain2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Brain-_data.h5ad?rlkey=qn3hip8pk0be91uyrfvdf78uh&st=rsr9vb0j&dl=1 human,Brain,10000,train,,,train_human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad,https://www.dropbox.com/scl/fi/8hw6yprqqc3tk7k2g03nj/human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_data.h5ad?rlkey=urxy3hu4omlt2824l2epdsl70&st=dqmdsxac&dl=1 +human,Brain,10000,train,,,train_human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad,https://www.dropbox.com/scl/fi/hth8if16ri6y64yfabe66/human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad?rlkey=eiyf474thqawso5ltjn0pnwko&st=54qn18ky&dl=1 +human,Brain,10000,train,,,train_human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad,https://www.dropbox.com/scl/fi/a3te6jw55cv4mujujq4ir/human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad?rlkey=scj4lcv1y00yh7bk18bztl0fz&st=yarlckox&dl=1 +human,Brain,10000,train,,,train_human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad,https://www.dropbox.com/scl/fi/b9uqvab7lderxnrd8c0e9/human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad?rlkey=heobzdft1rcl1ttn0wvi9ra5y&st=q0xewgnf&dl=1 From ad8ecb78751c83e1bdd62c93cd5e760b975adc9c Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 6 Dec 2024 21:27:07 +0800 Subject: [PATCH 121/203] minor --- examples/get_result_web.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index ee5d4158..c1ca4536 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -65,9 +65,12 @@ def spilt_web(url: str): if match: entity = match.group(1) project = match.group(2) - sweep_id = match.group(3) - - return entity, project, sweep_id + pattern = r'/sweeps/([^/?]+)' # 正则表达式模式 + match = re.search(pattern, url) + if match: + sweep_id = match.group(1) + return entity, project, sweep_id + return None else: print(url) print("No match found") From 8fc8f622b7339a55d1fb482a0f5a7fb2c6d22a38 Mon Sep 17 00:00:00 2001 From: xzy Date: Sat, 7 Dec 2024 20:21:57 +0800 Subject: [PATCH 122/203] update data --- dance/metadata/scdeepsort.csv | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 6d2f6317..6d5eb666 
100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -118,3 +118,15 @@ human,Brain,10000,train,,,train_human_Brain9c63201d-bfd9-41a8-bbbc-18d947556f3d_ human,Brain,10000,train,,,train_human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad,https://www.dropbox.com/scl/fi/hth8if16ri6y64yfabe66/human_Brain52f18bc3-52d9-487b-bf8f-f0b7aa684b09_data.h5ad?rlkey=eiyf474thqawso5ltjn0pnwko&st=54qn18ky&dl=1 human,Brain,10000,train,,,train_human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad,https://www.dropbox.com/scl/fi/a3te6jw55cv4mujujq4ir/human_Brain56c4912d-2bae-4b64-98f2-af8a84389208_data.h5ad?rlkey=scj4lcv1y00yh7bk18bztl0fz&st=yarlckox&dl=1 human,Brain,10000,train,,,train_human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad,https://www.dropbox.com/scl/fi/b9uqvab7lderxnrd8c0e9/human_Brain43b7e156-65b3-4a7b-8c7a-08528e4b21d0_data.h5ad?rlkey=heobzdft1rcl1ttn0wvi9ra5y&st=q0xewgnf&dl=1 +human,Brain,10000,train,,,train_human_Brain07760522-707a-4a1c-8891-dbd1226d6b27_data.h5ad,https://www.dropbox.com/scl/fi/fdg72e3smhtbnen0njt1p/human_Brain07760522-707a-4a1c-8891-dbd1226d6b27_data.h5ad?rlkey=fcus37nuvqvur8qwqhxdayo9r&st=2e00lvxu&dl=1 +human,Brain,10000,train,,,train_human_Brain146216e1-ec30-4fee-a1fb-25defe801e2d_data.h5ad,https://www.dropbox.com/scl/fi/m55bqo60paasb7hbuhvex/human_Brain146216e1-ec30-4fee-a1fb-25defe801e2d_data.h5ad?rlkey=a1zz2xe6lsk20g5nie9c2ft6d&st=100jl011&dl=1 +human,Brain,10000,train,,,train_human_Brain22658f4f-9268-41ad-8828-cc53f4baa9fa_data.h5ad,https://www.dropbox.com/scl/fi/t5vnklfwn8eu2ht5ph4fd/human_Brain22658f4f-9268-41ad-8828-cc53f4baa9fa_data.h5ad?rlkey=latl59ppqnr4bif69xxt92sss&st=yhhuzvrf&dl=1 +human,Brain,10000,train,,,train_human_Brain421e5f54-5de7-425f-b399-34ead0651ce1_data.h5ad,https://www.dropbox.com/scl/fi/wmysnkpfbdjteospe0jjx/human_Brain421e5f54-5de7-425f-b399-34ead0651ce1_data.h5ad?rlkey=xdx6g6cisyi2gfohckjhud46g&st=zxu0jhp7&dl=1 +human,Brain,10000,train,,,train_human_Brain595c9010-99ec-462d-b6a1-2b2fe5407871_data.h5ad,https://www.dropbox.com/scl/fi/598qwju5349fc8z1zuskh/human_Brain595c9010-99ec-462d-b6a1-2b2fe5407871_data.h5ad?rlkey=8848cndanq1bun59yvggbjvnx&st=nnasrgud&dl=1 +human,Brain,10000,train,,,train_human_Brain700aed19-c16e-4ba8-9191-07da098a8626_data.h5ad,https://www.dropbox.com/scl/fi/h4lcjdl04mpgtkuthgc94/human_Brain700aed19-c16e-4ba8-9191-07da098a8626_data.h5ad?rlkey=hi0fk5rxia7shu6m4twky2tay&st=uw6trm86&dl=1 +human,Brain,10000,train,,,train_human_Brain70e4f35b-c98c-45a1-9aa9-2053b07315dd_data.h5ad,https://www.dropbox.com/scl/fi/ubqn57ate29dfuvapx4qi/human_Brain70e4f35b-c98c-45a1-9aa9-2053b07315dd_data.h5ad?rlkey=5p5wnukkq5lb6c8v0s5itcv2j&st=730w6nsz&dl=1 +human,Brain,10000,train,,,train_human_Brain72822932-10f6-466f-baf3-a2c1d89364bc_data.h5ad,https://www.dropbox.com/scl/fi/llmit9a77dkby69p5eki5/human_Brain72822932-10f6-466f-baf3-a2c1d89364bc_data.h5ad?rlkey=o294joyf2yrwi403qll905bak&st=m95dzpp5&dl=1 +human,Brain,10000,train,,,train_human_Brain9372df2d-13d6-4fac-980b-919a5b7eb483_data.h5ad,https://www.dropbox.com/scl/fi/kkgq2ry1nyzqp3aq9mag6/human_Brain9372df2d-13d6-4fac-980b-919a5b7eb483_data.h5ad?rlkey=yek0em9f7cal5bhq4h4xtjnxc&st=x3isnepq&dl=1 +human,Brain,10000,train,,,train_human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad,https://www.dropbox.com/scl/fi/q6vtn80sf8jpkvte55zpm/human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad?rlkey=6ca2gf47w5r53rw5y5h5b53e8&st=3wkp779m&dl=1 
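The appended rows reuse the comma-separated layout of the surrounding entries: organism, tissue, cell count, split, two blank fields, the on-disk h5ad filename, and a direct-download Dropbox URL (dl=1). A minimal loading sketch follows; the column names are assumptions for illustration only, since these hunks do not show a CSV header.

    # Hypothetical loader for one metadata row; column names are assumed.
    import anndata as ad
    import pandas as pd
    import requests

    columns = ["species", "tissue", "n_cells", "split", "extra_1", "extra_2", "filename", "url"]
    meta = pd.read_csv("dance/metadata/scdeepsort.csv", header=None, names=columns)
    row = meta.iloc[-1]  # e.g. the last appended Brain entry

    with open(row["filename"], "wb") as f:
        f.write(requests.get(row["url"], allow_redirects=True).content)
    adata = ad.read_h5ad(row["filename"])  # expression matrix plus cell/gene metadata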
+human,Brain,10000,train,,,train_human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad,https://www.dropbox.com/scl/fi/nl14mhuuwlq9zmjntot7g/human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad?rlkey=o158zcyq781w4rj71pfsw8yf1&st=ds6kabvk&dl=1 +human,Brain,10000,train,,,train_human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad,https://www.dropbox.com/scl/fi/pyphmfixfeyu2wzyr216p/human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad?rlkey=sd6su8cmlc4g4k3hixihg1zo9&st=99diymvk&dl=1 From 02893d5883bf206318bf1a27a32bf87d7bad4244 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 9 Dec 2024 21:24:51 +0800 Subject: [PATCH 123/203] minor --- dance/sc_similarity/anndata_similarity.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 0287dee8..1d421ecf 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import scanpy as sc +import scipy import yaml from omegaconf import OmegaConf from scipy.spatial.distance import jaccard @@ -187,6 +188,13 @@ def get_dataset_info(data: ad.AnnData): con_sim["gene_num"] = len(data.var) con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) + if "n_counts" not in data.var.columns: + if scipy.sparse.issparse(data.X): + gene_counts = np.array(data.X.sum(axis=0)).flatten() + else: + gene_counts = data.X.sum(axis=0) + data.var["n_counts"]=gene_counts + data.var["n_counts"]=data.var["n_counts"].astype(float) con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) data.uns["con_sim"] = con_sim From 75a20b84b323a8b80cd028167c715ea196c25cd3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:27:03 +0000 Subject: [PATCH 124/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/sc_similarity/anndata_similarity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 1d421ecf..fca863db 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -193,8 +193,8 @@ def get_dataset_info(data: ad.AnnData): gene_counts = np.array(data.X.sum(axis=0)).flatten() else: gene_counts = data.X.sum(axis=0) - data.var["n_counts"]=gene_counts - data.var["n_counts"]=data.var["n_counts"].astype(float) + data.var["n_counts"] = gene_counts + data.var["n_counts"] = data.var["n_counts"].astype(float) con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) data.uns["con_sim"] = con_sim From 75c64297ad1d93f3b85a5177a65f5cb313197c6a Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 9 Dec 2024 22:01:24 +0800 Subject: [PATCH 125/203] minor --- dance/sc_similarity/anndata_similarity.py | 436 +++++++++++++++------- 1 file changed, 295 insertions(+), 141 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 0287dee8..0ff697b5 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -7,13 +7,15 @@ import anndata import anndata as ad import numpy as np +import ot import pandas as pd import scanpy as sc import yaml from 
omegaconf import OmegaConf -from scipy.spatial.distance import jaccard -from scipy.stats import pearsonr, wasserstein_distance -from sklearn.metrics.pairwise import cosine_similarity +from scipy.linalg import sqrtm +from scipy.spatial import cKDTree +from scipy.spatial.distance import cdist, directed_hausdorff, jaccard, jensenshannon +from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel # Suppress scipy warnings for constant input in Pearson correlation warnings.filterwarnings("ignore", message="An input array is constant") @@ -21,141 +23,252 @@ class AnnDataSimilarity: - def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, cell_col: str, + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, + init_random_state: Optional[int] = None, n_runs: int = 10, ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, adata2_name: Optional[str] = None, methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): """Initialize the AnnDataSimilarity object and perform data preprocessing.""" - self.adata1 = adata1.copy() - self.adata2 = adata2.copy() self.origin_adata1 = adata1.copy() self.origin_adata2 = adata2.copy() - self.cell_col = cell_col + self.sample_size = sample_size + self.init_random_state = init_random_state self.preprocess() self.results = {} - self.results_score = {} self.ground_truth_conf_path = ground_truth_conf_path self.adata1_name = adata1_name self.adata2_name = adata2_name self.methods = methods self.tissue = tissue + self.n_runs = n_runs - def filter_gene(self): - sc.pp.highly_variable_genes(self.adata1, n_top_genes=2000, flavor='seurat_v3') - sc.pp.highly_variable_genes(self.adata2, n_top_genes=2000, flavor='seurat_v3') + def filter_gene(self, n_top_genes=3000): + sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') + sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') - common_hvg = self.adata1.var_names[self.adata1.var['highly_variable']].intersection( - self.adata2.var_names[self.adata2.var['highly_variable']]) + common_hvg = self.origin_adata1.var_names[self.origin_adata1.var['highly_variable']].intersection( + self.origin_adata2.var_names[self.origin_adata2.var['highly_variable']]) - self.adata1 = self.adata1[:, common_hvg].copy() - self.adata2 = self.adata2[:, common_hvg].copy() + self.origin_adata1 = self.origin_adata1[:, common_hvg].copy() + self.origin_adata2 = self.origin_adata2[:, common_hvg].copy() self.common_genes = common_hvg def preprocess(self): - self.filter_gene() """Preprocess the data, including log normalization and normalization to probability distribution.""" - self.adata1.obs[self.cell_col] = self.adata1.obs[self.cell_col].astype(str) - self.adata2.obs[self.cell_col] = self.adata2.obs[self.cell_col].astype(str) - self.avg_expr1 = self._compute_average_expression(self.adata1) - self.avg_expr2 = self._compute_average_expression(self.adata2) - self.prob_expr1 = self._normalize_to_probability(self.avg_expr1) - self.prob_expr2 = self._normalize_to_probability(self.avg_expr2) - - def _compute_average_expression(self, adata: anndata.AnnData) -> pd.DataFrame: - """Calculate the average gene expression for each cell type""" - return adata.to_df().groupby(adata.obs[self.cell_col]).mean() - - def _normalize_to_probability(self, df: pd.DataFrame) -> pd.DataFrame: - """Normalize the gene expression matrix to a probability distribution (expression 
sums to 1 for each cell type)""" - return df.div(df.sum(axis=1), axis=0).fillna(0) - - def cosine_sim(self) -> pd.DataFrame: - """Computes the cosine similarity between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" - sim_matrix = cosine_similarity(self.avg_expr1, self.avg_expr2) - return pd.DataFrame(sim_matrix, index=self.avg_expr1.index, columns=self.avg_expr2.index) - - def pearson_corr(self) -> pd.DataFrame: - """Computes the Pearson correlation coefficient between two datasets. Returns a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - corr_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - corr, _ = pearsonr(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - corr_matrix.at[ct1, ct2] = corr - - return corr_matrix.astype(float) - - def jaccard_sim(self, threshold: float = 0.5) -> pd.DataFrame: - """Computes the Jaccard similarity between two datasets. Uses a binary representation of gene expression based on a specified threshold. Returns a data frame with rows and columns of cell types in adata1 and adata2 respectively.""" - # Binarized expression matrix - binary_expr1 = (self.avg_expr1 > threshold).astype(int) - binary_expr2 = (self.avg_expr2 > threshold).astype(int) - - celltypes1 = binary_expr1.index - celltypes2 = binary_expr2.index - sim_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - sim = 1 - jaccard(binary_expr1.loc[ct1], binary_expr2.loc[ct2]) - sim_matrix.at[ct1, ct2] = sim - - return sim_matrix.astype(float) - - def js_distance(self) -> pd.DataFrame: - """Computes the Jensen-Shannon divergence between two datasets. The expression data must first be normalized to a probability distribution. Returns a data frame with rows and columns containing the cell types of adata1 and adata2, respectively.""" - # def jsd(p, q): - # """ - # 计算两个概率分布 p 和 q 的 Jensen-Shannon 散度。 - # """ - # p = p + 1e-12 - # q = q + 1e-12 - # m = 0.5 * (p + q) - # return 0.5 * (entropy(p, m) + entropy(q, m)) - - # from scipy.stats import entropy - - celltypes1 = self.prob_expr1.index - celltypes2 = self.prob_expr2.index - js_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - jsd_value = 1 - self._jensen_shannon_divergence(self.prob_expr1.loc[ct1].values, - self.prob_expr2.loc[ct2].values) - js_matrix.at[ct1, ct2] = jsd_value - - return js_matrix.astype(float) - - def _jensen_shannon_divergence(self, p, q) -> float: - """Compute the Jensen-Shannon divergence of two probability distributions p and q.""" - from scipy.spatial.distance import jensenshannon - return jensenshannon(p, q) + self.filter_gene() + + def sample_cells(self, random_state): + """ + Randomly sample cells from each dataset if sample_size is specified. 
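In isolation, the per-dataset subsampling amounts to the sketch below; it is a simplified stand-in for sample_cells that uses numpy's Generator API instead of the global np.random.seed, and adata stands for either of the two AnnData objects.

    # Standalone subsampling sketch (illustrative only).
    import numpy as np

    def subsample(adata, sample_size: int, random_state: int):
        rng = np.random.default_rng(random_state)
        if adata.n_obs <= sample_size:
            return adata.copy()
        idx = rng.choice(adata.n_obs, size=sample_size, replace=False)
        return adata[idx, :].copy()

Because get_similarity_matrix_A2B repeats the sampling with a different seed on every run and averages the resulting scores, the reported similarities are less sensitive to which cells happen to be drawn.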
+ """ + np.random.seed(random_state) + if self.sample_size is None: + self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think + if self.adata1.n_obs > self.sample_size: + indices1 = np.random.choice(self.adata1.n_obs, size=self.sample_size, replace=False) + self.sampled_adata1 = self.adata1[indices1, :].copy() + else: + self.sampled_adata1 = self.adata1.copy() + if self.adata2.n_obs > self.sample_size: + indices2 = np.random.choice(self.adata2.n_obs, size=self.sample_size, replace=False) + self.sampled_adata2 = self.adata2[indices2, :].copy() + else: + self.sampled_adata2 = self.adata2.copy() + + def normalize_data(self): # I am not sure + """ + Normalize the data by total counts per cell and log-transform. + """ + sc.pp.normalize_total(self.adata1, target_sum=1e4) + sc.pp.log1p(self.adata1) + sc.pp.normalize_total(self.adata2, target_sum=1e4) + sc.pp.log1p(self.adata2) + + def set_prob_data(self, sampled=False): + # Normalize the data to probability distributions + if sampled: + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + else: + prob_adata1 = self.adata1.X / self.adata1.X.sum(axis=1) + prob_adata2 = self.adata2.X / self.adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero + self.X = np.nan_to_num(prob_adata1).toarray() + self.Y = np.nan_to_num(prob_adata2).toarray() + + def cosine_sim_sampled(self) -> pd.DataFrame: + """ + Computes the average cosine similarity between all pairs of cells from the two datasets. + """ + # Compute cosine similarity matrix + sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) + # Return the average similarity + return sim_matrix.mean() + + def pearson_corr_sampled(self) -> pd.DataFrame: + """ + Computes the average Pearson correlation coefficient between all pairs of cells from the two datasets. + """ + # Compute Pearson correlation matrix + corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), + self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, + self.sampled_adata1.n_obs:] + # Return the average correlation + return np.nanmean(corr_matrix) + + def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: + """ + Computes the average Jaccard similarity between all pairs of binarized cells from the two datasets. + """ + # Binarize the data + binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) + binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) + # Compute Jaccard distance matrix + distance_matrix = cdist(binary_adata1.A, binary_adata2.A, metric='jaccard') + # Convert to similarity and compute the average + similarity_matrix = 1 - distance_matrix + return similarity_matrix.mean() + + def js_divergence_sampled(self) -> float: + """ + Computes the average Jensen-Shannon divergence between all pairs of cells from the two datasets. 
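For a single pair of normalized expression vectors, the per-pair computation reduces to the toy example below; note that scipy's jensenshannon returns the Jensen-Shannon distance (the square root of the divergence), and 1 minus that value is what gets averaged as the similarity.

    # Toy illustration with made-up probability vectors.
    import numpy as np
    from scipy.spatial.distance import jensenshannon

    p = np.array([0.7, 0.2, 0.1])
    q = np.array([0.5, 0.3, 0.2])
    similarity = 1.0 - jensenshannon(p, q)  # close to 1 when the profiles are similar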
+ """ + # Normalize the data to probability distributions + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero + prob_adata1 = np.nan_to_num(prob_adata1).toarray() + prob_adata2 = np.nan_to_num(prob_adata2).toarray() + + # Define a function to compute JS divergence for a pair of probability vectors + def jsd(p, q): + return jensenshannon(p, q) + + # Compute JS divergence matrix + jsd_vectorized = np.vectorize(jsd, signature='(n),(n)->()') + divergence_matrix = np.zeros((prob_adata1.shape[0], prob_adata2.shape[0])) + for i in range(prob_adata1.shape[0]): + divergence_matrix[i, :] = jsd_vectorized( + np.repeat(prob_adata1[i, :], prob_adata2.shape[0], axis=0).reshape(-1, prob_adata1.shape[1]), + prob_adata2) + + # Convert divergence to similarity and compute the average + similarity_matrix = 1 - divergence_matrix + return np.nanmean(similarity_matrix) + + def compute_mmd(self) -> float: + X = self.X + Y = self.Y + kernel = "rbf" + gamma = 1.0 + if kernel == 'rbf': + K_X = np.exp(-gamma * cdist(X, X, 'sqeuclidean')) + K_Y = np.exp(-gamma * cdist(Y, Y, 'sqeuclidean')) + K_XY = np.exp(-gamma * cdist(X, Y, 'sqeuclidean')) + elif kernel == 'linear': + K_X = np.dot(X, X.T) + K_Y = np.dot(Y, Y.T) + K_XY = np.dot(X, Y.T) + else: + raise ValueError("Unsupported kernel type") + + m = X.shape[0] + n = Y.shape[0] + + sum_X = (np.sum(K_X) - np.sum(np.diag(K_X))) / (m * (m - 1)) + sum_Y = (np.sum(K_Y) - np.sum(np.diag(K_Y))) / (n * (n - 1)) + sum_XY = np.sum(K_XY) / (m * n) + + mmd_squared = sum_X + sum_Y - 2 * sum_XY + mmd = np.sqrt(max(mmd_squared, 0)) + return 1 / (1 + mmd) def common_genes_num(self): return len(self.common_genes) - def otdd(): + def otdd(self): """Compute the OTDD between two data sets.""" raise NotImplementedError("OTDD!") - def data_company(): + def data_company(self): raise NotImplementedError("data company") - def wasserstein_dist(self) -> pd.DataFrame: - """Compute the Wasserstein distance between two datasets. Return a data frame with the cell types in rows and columns of adata1 and adata2 respectively.""" - celltypes1 = self.avg_expr1.index - celltypes2 = self.avg_expr2.index - wasserstein_matrix = pd.DataFrame(index=celltypes1, columns=celltypes2) - - for ct1 in celltypes1: - for ct2 in celltypes2: - wd = wasserstein_distance(self.avg_expr1.loc[ct1], self.avg_expr2.loc[ct2]) - wasserstein_matrix.at[ct1, ct2] = wd - - return wasserstein_matrix.astype(float) + def wasserstein_dist(self) -> float: + """ + Computes the average Wasserstein distance between all pairs of cells from the two datasets. 
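A toy illustration of the optimal-transport step is given below; random matrices stand in for the sampled expression profiles, uniform weights are placed on the cells, and the distance is mapped to a similarity as 1 / (1 + distance), in line with the other distance-based scores in this class.

    # Illustrative only: ot.emd2 returns the exact earth mover's distance.
    import numpy as np
    import ot

    X = np.random.rand(50, 20)             # 50 "cells" x 20 "genes"
    Y = np.random.rand(60, 20)
    a = np.ones(X.shape[0]) / X.shape[0]    # uniform weights over cells
    b = np.ones(Y.shape[0]) / Y.shape[0]
    M = ot.dist(X, Y, metric="euclidean")   # ground-cost matrix
    similarity = 1.0 / (1.0 + ot.emd2(a, b, M))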
+ """ + X = self.X + Y = self.Y + a = np.ones((X.shape[0], )) / X.shape[0] + b = np.ones((Y.shape[0], )) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + wasserstein_dist = ot.emd2(a, b, M) + return 1 / 1 + wasserstein_dist + + def get_Hausdorff(self): + X = self.X + Y = self.Y + forward = directed_hausdorff(X, Y)[0] + backward = directed_hausdorff(X, Y)[0] + hausdorff_distance = max(forward, backward) + normalized_hausdorff = hausdorff_distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_hausdorff + return similarity + + def chamfer_distance(self): + X = self.X + Y = self.Y + tree_A = cKDTree(X) + tree_B = cKDTree(Y) + + distances_A_to_B, _ = tree_A.query(Y) + distances_B_to_A, _ = tree_B.query(X) + + chamfer_A_to_B = np.mean(distances_A_to_B) + chamfer_B_to_A = np.mean(distances_B_to_A) + distance = chamfer_A_to_B + chamfer_B_to_A + normalized_chamfer = distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_chamfer + return similarity + + def energy_distance_metric(self): + X = self.X + Y = self.Y + XX = cdist(X, X, 'euclidean') + YY = cdist(Y, Y, 'euclidean') + XY = cdist(X, Y, 'euclidean') + distance = 2 * np.mean(XY) - np.mean(XX) - np.mean(YY) + return 1 / (1 + distance) + + def get_sinkhorn2(self): + X = self.X + Y = self.Y + a = np.ones(X.shape[0]) / X.shape[0] + b = np.ones(Y.shape[0]) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + reg = 0.1 + sinkhorn_dist = ot.sinkhorn2(a, b, M, reg) + return 1 / (1 + sinkhorn_dist) + + def bures_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + sqrt_C1 = sqrtm(C1) + product = sqrt_C1 @ C2 @ sqrt_C1 + sqrt_product = sqrtm(product) + trace = np.trace(C1) + np.trace(C2) - 2 * np.trace(sqrt_product) + return 1 / (1 + np.sqrt(max(trace, 0))) + + def spectral_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + eig_A = np.linalg.eigvalsh(C1) + eig_B = np.linalg.eigvalsh(C2) + return 1 / (1 + np.linalg.norm(eig_A - eig_B)) def get_dataset_meta_sim(self): # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] @@ -177,7 +290,7 @@ def get_con_sim(con_data_1, con_data_2): def get_dataset_info(data: ad.AnnData): con_sim = {} - con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) + con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) #sample 10000之后这里是应该更新的 con_sim["nnz_var"] = np.var(data.obs["nnz"]) nnz_values = data.X[data.X.nonzero()] con_sim["nnz_counts_mean"] = np.mean(nnz_values) @@ -232,10 +345,10 @@ def get_targets(dataset_truth: str): return sim_targets def compute_similarity( - self, methods: List[str] = [ + self, random_state: int, methods: List[str] = [ 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" ] - ) -> Dict[str, pd.DataFrame]: + ) -> Dict[str, float]: """Computes the specified similarity measure. Parameters: methods: List of similarity measures to be computed. 
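A typical end-to-end call looks like the hypothetical sketch below, assuming adata1 and adata2 are AnnData objects with raw counts; the dataset names, tissue, and metric subset are examples only.

    # Hypothetical usage of the class defined in this patch.
    sim = AnnDataSimilarity(
        adata1, adata2,
        sample_size=1000,            # cells drawn from each dataset per run
        init_random_state=0,         # run i uses seed init_random_state + i
        n_runs=10,                   # all scores except ground_truth are averaged over runs
        adata1_name="query", adata2_name="reference", tissue="heart",
    )
    scores = sim.get_similarity_matrix_A2B(
        methods=["wasserstein", "Hausdorff", "chamfer", "mmd", "common_genes_num"])

The methods argument accepts any of the metric names handled in the dispatch below.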
Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' @@ -243,60 +356,101 @@ def compute_similarity( Dictionary containing the similarity matrices """ + self.adata1 = self.origin_adata1.copy() + self.adata2 = self.origin_adata2.copy() + self.normalize_data() + self.sample_cells(random_state) + self.set_prob_data() + results = {} for method in methods: + print(method) if method == 'cosine': - results['cosine'] = self.cosine_sim() + results['cosine'] = self.cosine_sim_sampled() elif method == 'pearson': - results['pearson'] = self.pearson_corr() + results['pearson'] = self.pearson_corr_sampled() elif method == 'jaccard': - results['jaccard'] = self.jaccard_sim() + results['jaccard'] = self.jaccard_sim_sampled() elif method == 'js_distance': - results['js_distance'] = self.js_distance() + results['js_distance'] = self.js_divergence_sampled() elif method == 'wasserstein': results['wasserstein'] = self.wasserstein_dist() elif method == "common_genes_num": results["common_genes_num"] = self.common_genes_num() + elif method == "Hausdorff": + results["Hausdorff"] = self.get_Hausdorff() + elif method == "chamfer": + results["chamfer"] = self.chamfer_distance() + elif method == "energy": + results["energy"] = self.energy_distance_metric() + elif method == "sinkhorn2": + results["sinkhorn2"] = self.get_sinkhorn2() + elif method == "bures": + results["bures"] = self.bures_distance() + elif method == "spectral": + results["spectral"] = self.spectral_distance() elif method == "otdd": results['otdd'] = self.otdd() elif method == "ground_truth": results["ground_truth"] = self.get_ground_truth() elif method == "metadata_sim": results["metadata_sim"] = self.get_dataset_meta_sim() + elif method == "mmd": + results["mmd"] = self.compute_mmd() else: raise ValueError(f"Unsupported similarity method: {method}") return results - def get_similarity_matrix( + def get_similarity_matrix_A2B( self, methods: List[str] = [ - 'cosine', 'pearson', 'jaccard', 'js_distance', "common_genes_num", "ground_truth", "metadata_sim" + "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", + "ground_truth", "metadata_sim", "mmd" ] - ) -> Dict[str, pd.DataFrame]: + ) -> Dict[str, float]: """Same as compute_similarity, keeping method name consistency.""" - self.results = self.compute_similarity(methods) - return self.results + cumulative_results = {method: 0.0 for method in methods} - def get_max_similarity_A_to_B(self): - if self.results is None: - raise ValueError(f"need results!") - else: - self.results_score = {} - for key in self.results: - if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: - self.results_score[key] = self._get_max_similarity(self.results[key]) + for run in range(self.n_runs): + # Update random state for each run + if self.init_random_state is not None: + current_random_state = self.init_random_state + run + else: + current_random_state = None + run_results = self.compute_similarity(methods=methods, random_state=current_random_state) + for method in methods: + if method in ["ground_truth"]: + cumulative_results[method] = run_results[method] else: - self.results_score[key] = self.results[key] - return self.results_score - - def _get_max_similarity(self, similarity_matrix: pd.DataFrame): - """Maximum matching average similarity score.""" - matched_values = [ - similarity_matrix.loc[label, - label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() - for label in similarity_matrix.index - ] # need to 
ask - overall_similarity = np.mean(matched_values) - return overall_similarity + cumulative_results[method] += run_results[method] + # Average the results over the number of runs + averaged_results = { + method: + cumulative_results[method] if method in ["ground_truth"] else cumulative_results[method] / self.n_runs + for method in methods + } + return averaged_results + + # def get_max_similarity_A_to_B(self): + # if self.results is None: + # raise ValueError(f"need results!") + # else: + # self.results_score = {} + # for key in self.results: + # if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: + # self.results_score[key] = self._get_max_similarity(self.results[key]) + # else: + # self.results_score[key] = self.results[key] + # return self.results_score + + # def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + # """Maximum matching average similarity score.""" + # matched_values = [ + # similarity_matrix.loc[label, + # label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() + # for label in similarity_matrix.index + # ] # need to ask + # overall_similarity = np.mean(matched_values) + # return overall_similarity def extract_type_target_params(item_text): From 9eba40ad9e14ee0c85467c03d14ee2d52fdfb5b3 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Mon, 9 Dec 2024 22:03:14 +0800 Subject: [PATCH 126/203] minor --- dance/metadata/scdeepsort.csv | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 6d5eb666..ec72d557 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -130,3 +130,6 @@ human,Brain,10000,train,,,train_human_Brain9372df2d-13d6-4fac-980b-919a5b7eb483_ human,Brain,10000,train,,,train_human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad,https://www.dropbox.com/scl/fi/q6vtn80sf8jpkvte55zpm/human_Brain94c41723-b2c4-4b59-a49a-64c9b851903e_data.h5ad?rlkey=6ca2gf47w5r53rw5y5h5b53e8&st=3wkp779m&dl=1 human,Brain,10000,train,,,train_human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad,https://www.dropbox.com/scl/fi/nl14mhuuwlq9zmjntot7g/human_Brain9813a1d4-d107-459e-9b2e-7687be935f69_data.h5ad?rlkey=o158zcyq781w4rj71pfsw8yf1&st=ds6kabvk&dl=1 human,Brain,10000,train,,,train_human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad,https://www.dropbox.com/scl/fi/pyphmfixfeyu2wzyr216p/human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_data.h5ad?rlkey=sd6su8cmlc4g4k3hixihg1zo9&st=99diymvk&dl=1 +human,Brain,10000,train,,,train_human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad,https://www.dropbox.com/scl/fi/0v3auavah96csg1f486f2/human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad?rlkey=72cwgt8wh1421v32l3fsg6mvx&st=3fghbaz3&dl=1 +human,Brain,10000,train,,,train_human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad,https://www.dropbox.com/scl/fi/ne31m4apt1q90942cvfpy/human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad?rlkey=46s37qp2qpf8rqwfeef0gw5p2&st=8npktyyr&dl=1 +human,Brain,10000,train,,,train_human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad,https://www.dropbox.com/scl/fi/g9yrnvpj68nohpq97psoq/human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad?rlkey=tkqnbytv0yl7v0f0gngml83jq&st=a7lynn7i&dl=1 From cdeebd9615186a0852d6991c8eed11cb899c9b42 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 10 Dec 2024 22:27:19 +0800 Subject: [PATCH 127/203] minor --- dance/metadata/scdeepsort.csv | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 
ec72d557..46d46646 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -133,3 +133,7 @@ human,Brain,10000,train,,,train_human_Brainc893ddc3-f25b-45e2-8c9e-155918b4261c_ human,Brain,10000,train,,,train_human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad,https://www.dropbox.com/scl/fi/0v3auavah96csg1f486f2/human_Braine8681d74-ac9e-4be5-be14-1cf1bbd54dd7_data.h5ad?rlkey=72cwgt8wh1421v32l3fsg6mvx&st=3fghbaz3&dl=1 human,Brain,10000,train,,,train_human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad,https://www.dropbox.com/scl/fi/ne31m4apt1q90942cvfpy/human_Brain364348b4-bc34-4fe1-a851-60d99e36cafa_data.h5ad?rlkey=46s37qp2qpf8rqwfeef0gw5p2&st=8npktyyr&dl=1 human,Brain,10000,train,,,train_human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad,https://www.dropbox.com/scl/fi/g9yrnvpj68nohpq97psoq/human_Brain93cb76aa-a84b-4a92-8e6c-66a914e26d4c_data.h5ad?rlkey=tkqnbytv0yl7v0f0gngml83jq&st=a7lynn7i&dl=1 +human,Blood,6368,train,,,train_human_Bloodfe52003e-1460-4a65-a213-2bb1a508332f_data.h5ad,https://www.dropbox.com/scl/fi/esqgoi7vgwwt9j6apipn3/human_Bloodfe52003e-1460-4a65-a213-2bb1a508332f_data.h5ad?rlkey=i5tw4dsprwonypls9q1sjv6z1&st=rvpv0bx4&dl=1 +human,Blood,10000,train,,,train_human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad,https://www.dropbox.com/scl/fi/2ze4zzjl9ho0yioypet94/human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad?rlkey=4tjvgzj69eqnwqt34y6ykjxx7&st=6eazya61&dl=1 +human,Blood,10000,train,,,train_human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad,https://www.dropbox.com/scl/fi/ppmitw72imo7hoiqk02uj/human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad?rlkey=qkvls3xesyu4wdc4f46ayy9wc&st=8wc95ewv&dl=1 +human,Blood,10000,train,,,train_human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad,https://www.dropbox.com/scl/fi/troppy0ouk9w60xx3gucv/human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad?rlkey=vm77ead52n9fy9e4lp9kpt8y3&st=zjdzi0rs&dl=1 From 225b6558f0a7478811e336f09d5591e74835d4e9 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 11 Dec 2024 15:10:51 +0800 Subject: [PATCH 128/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index beefdeb0..0f89ce64 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -45,11 +45,11 @@ def parameter_setting(): parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") parser.add_argument("-device", "--device", default="cuda") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) - + parser.add_argument("--span", type=float, default=0.3) return parser @@ -65,7 +65,7 @@ def parameter_setting(): args.lr2 = 0.005 args.flr2 = 0.0005 - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection") + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) data = 
dataset.load_data() le = preprocessing.LabelEncoder() @@ -132,7 +132,7 @@ def parameter_setting(): adata = adata[adata_sol.obs_names] adata_sol.obsm['X_emb'] = adata.X score = metrics.labeled_clustering_evaluate(adata, adata_sol) - score.update(metrics.integration_openproblems_evaluate(adata_sol)) + # score.update(metrics.integration_openproblems_evaluate(adata_sol)) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, From f70ee3ee39b58b43d84244a5f9f4d5351d354b76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 07:12:09 +0000 Subject: [PATCH 129/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_modality/joint_embedding/dcca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 0f89ce64..1467a737 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -65,7 +65,8 @@ def parameter_setting(): args.lr2 = 0.005 args.flr2 = 0.0005 - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", + span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() From 5645a6add8c80f2eebfd18f013db945189283402 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 09:46:37 +0800 Subject: [PATCH 130/203] update example --- .../multi_modality/joint_embedding/jae.py | 18 +++++++++++++---- .../multi_modality/joint_embedding/scmogcn.py | 20 ++++++++++++++----- .../multi_modality/joint_embedding/scmvae.py | 8 ++++---- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index cca32808..ab136764 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -10,8 +10,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -21,6 +21,7 @@ parser.add_argument("-bs", "--batch_size", default=128, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--span", type=float, default=0.3) args = parser.parse_args() @@ -30,7 +31,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) data = dataset.load_data() data.set_config( @@ -39,6 +40,15 @@ 
feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) + if True: + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) (X_mod1_train, X_mod2_train), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_train_data(return_type="torch") (X_mod1_test, X_mod2_test), (cell_type_test, _, _, _, _) = data.get_test_data(return_type="torch") @@ -61,7 +71,7 @@ print(embeds) score = model.score(X_test, test_id, labels, metric="clustering") - score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(X_test, test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, diff --git a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 0ed73f3f..1e80786c 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -11,8 +11,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2"]) + parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", + choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -24,7 +24,8 @@ parser.add_argument("-bs", "--batch_size", default=512, type=int) parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") - + parser.add_argument("--span", type=float, default=0.3) + args = parser.parse_args() device = args.device @@ -33,7 +34,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="aux", normalize=True) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) data = dataset.load_data() train_size = len(data.get_split_idx("train")) @@ -45,6 +46,15 @@ feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) + if True: + cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() + cell_type_labels_unique = list(np.unique(cell_type_labels)) + c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) + data.data['mod1'].obsm["cell_type"] = c_labels + data.data["mod1"].obsm["S_scores"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["G2M_scores"] = 
np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["batch_label"] = np.zeros(data.data['mod1'].shape[0]) + data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) (x_mod1, x_mod2), (cell_type, batch_label, phase_label, S_score, G2M_score) = data.get_data(return_type="torch") phase_score = torch.cat([S_score[:, None], G2M_score[:, None]], 1) test_id = np.arange(x_mod1.shape[0]) @@ -68,7 +78,7 @@ embeds = model.predict(test_id).cpu().numpy() print(embeds) score = model.score(test_id, labels, metric="clustering") - score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) + # score.update(model.score(test_id, labels, adata_sol=adata_sol, metric="openproblems")) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index 65464c0f..b913c5f3 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -32,11 +32,11 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, help="Epoch per test, must smaller than max iteration.") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex") parser.add_argument("-device", "--device", default="cuda") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) - + parser.add_argument("--span", type=float, default=0.3) return parser @@ -46,7 +46,7 @@ def parameter_setting(): set_seed(args.seed) assert args.max_iteration > args.epoch_per_test - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection") + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() @@ -121,7 +121,7 @@ def parameter_setting(): embeds = model.predict(x_test, y_test).cpu().numpy() print(embeds.shape) score = model.score(x_test, y_test, labels) - score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) + # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) score.update({ 'seed': args.seed + k, 'subtask': args.subtask, From 06d282daf412141287c2fb20949cd1c055e2dcde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 01:47:03 +0000 Subject: [PATCH 131/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_modality/joint_embedding/jae.py | 10 +++++++--- examples/multi_modality/joint_embedding/scmogcn.py | 12 ++++++++---- examples/multi_modality/joint_embedding/scmvae.py | 3 ++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index ab136764..c726dd8b 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -10,8 +10,11 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", - 
choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) + parser.add_argument( + "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", + "GSE140203_SKIN_atac2gex" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -31,7 +34,8 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + span=args.span) data = dataset.load_data() data.set_config( diff --git a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 1e80786c..1ef52647 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -11,8 +11,11 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex", - choices=["openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2","GSE140203_BRAIN_atac2gex","GSE140203_SKIN_atac2gex"]) + parser.add_argument( + "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", + "GSE140203_SKIN_atac2gex" + ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") parser.add_argument("-csv", "--csv_path", default="decoupled_lsi.csv") @@ -25,7 +28,7 @@ parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") parser.add_argument("--span", type=float, default=0.3) - + args = parser.parse_args() device = args.device @@ -34,7 +37,8 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True,span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + span=args.span) data = dataset.load_data() train_size = len(data.get_split_idx("train")) diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index b913c5f3..461adcd4 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -46,7 +46,8 @@ def parameter_setting(): set_seed(args.seed) assert args.max_iteration > args.epoch_per_test - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", + span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() From cee920432d1d053c78135d90f407806c4e1291da Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Thu, 12 Dec 2024 11:34:12 +0800 Subject: [PATCH 132/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 1467a737..338f67fb 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -50,6 +50,7 @@ def parameter_setting(): parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) parser.add_argument("--span", type=float, default=0.3) + parser.add_argument("--selection_threshold", type=int, default=3000) return parser @@ -66,7 +67,7 @@ def parameter_setting(): args.flr2 = 0.0005 dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", - span=args.span) + span=args.span,selection_threshold=args.selection_threshold) data = dataset.load_data() le = preprocessing.LabelEncoder() From ed0b6b4b3104284bd93190dff210a6e01f6c5a31 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 03:34:36 +0000 Subject: [PATCH 133/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/multi_modality/joint_embedding/dcca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 338f67fb..9c172a07 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -67,7 +67,7 @@ def parameter_setting(): args.flr2 = 0.0005 dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", - span=args.span,selection_threshold=args.selection_threshold) + span=args.span, selection_threshold=args.selection_threshold) data = dataset.load_data() le = preprocessing.LabelEncoder() From a19a0508bdd3f73feef68f29aac072fec2ed5028 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 11:35:23 +0800 Subject: [PATCH 134/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index 0f89ce64..c604fac2 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -46,7 +46,7 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") - parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-device", "--device", default="cuda:5") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) parser.add_argument("--span", type=float, default=0.3) @@ -65,7 +65,8 @@ def parameter_setting(): args.lr2 = 0.005 args.flr2 = 0.0005 - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection",span=args.span) + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", + span=args.span) data = dataset.load_data() le = preprocessing.LabelEncoder() From 33eaa23cd888c84ff1e42d4965757c99ef9b0ce0 Mon Sep 17 00:00:00 2001 From: xzy Date: 
Thu, 12 Dec 2024 15:18:32 +0800 Subject: [PATCH 135/203] minor --- examples/multi_modality/joint_embedding/dcca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/dcca.py b/examples/multi_modality/joint_embedding/dcca.py index beefdeb0..47792c40 100644 --- a/examples/multi_modality/joint_embedding/dcca.py +++ b/examples/multi_modality/joint_embedding/dcca.py @@ -45,8 +45,8 @@ def parameter_setting(): parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") parser.add_argument("--epoch_per_test", "-ept", type=int, default=5, help="Epoch per test") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") - parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-t", "--subtask", default="GSE140203_BRAIN_atac2gex") + parser.add_argument("-device", "--device", default="cuda:4") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) From b08ce147742e8c226e30d5bf3b100497ab3b955e Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 15:18:40 +0800 Subject: [PATCH 136/203] minor --- examples/multi_modality/joint_embedding/scmvae.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index 65464c0f..dfb5f4a6 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -32,8 +32,8 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, help="Epoch per test, must smaller than max iteration.") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") - parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") + parser.add_argument("-device", "--device", default="cuda:4") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) From 1ad3cecabd7ad7d4e2f2fce145b1fa76be203a79 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Thu, 12 Dec 2024 20:28:45 +0800 Subject: [PATCH 137/203] minor --- dance/metadata/scdeepsort.csv | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 46d46646..04085cfa 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -137,3 +137,10 @@ human,Blood,6368,train,,,train_human_Bloodfe52003e-1460-4a65-a213-2bb1a508332f_d human,Blood,10000,train,,,train_human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad,https://www.dropbox.com/scl/fi/2ze4zzjl9ho0yioypet94/human_Bloodc2a461b1-0c15-4047-9fcb-1f966fe55100_data.h5ad?rlkey=4tjvgzj69eqnwqt34y6ykjxx7&st=6eazya61&dl=1 human,Blood,10000,train,,,train_human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad,https://www.dropbox.com/scl/fi/ppmitw72imo7hoiqk02uj/human_Bloodb0e547f0-462b-4f81-b31b-5b0a5d96f537_data.h5ad?rlkey=qkvls3xesyu4wdc4f46ayy9wc&st=8wc95ewv&dl=1 
human,Blood,10000,train,,,train_human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad,https://www.dropbox.com/scl/fi/troppy0ouk9w60xx3gucv/human_Bloodd7d7e89c-c93a-422d-8958-9b4a90b69558_data.h5ad?rlkey=vm77ead52n9fy9e4lp9kpt8y3&st=zjdzi0rs&dl=1 +human,Brain,8077,train,,,train_human_Braind5452b83-7c3d-4d7c-ab7a-c7fece7196c5_data.h5ad,https://www.dropbox.com/scl/fi/yqk7qe9qynbysy2qzuymp/human_Braind5452b83-7c3d-4d7c-ab7a-c7fece7196c5_data.h5ad?rlkey=9zbbwyewq97ff9eaermqx8ers&st=j1ordew8&dl=1 +human,Brain,3581,train,,,train_human_Brain774c18c5-efa1-4dc5-9e5e-2c824bab2e34_data.h5ad,https://www.dropbox.com/scl/fi/6jvi5wnl28u4dw6notnpo/human_Brain774c18c5-efa1-4dc5-9e5e-2c824bab2e34_data.h5ad?rlkey=5cmvurxmnv9u2gmigw5cc250s&st=id2kc01n&dl=1 +human,Brain,1318,train,,,train_human_Brain3d044b52-140a-4528-bf0d-a2dbef9e1f40_data.h5ad,https://www.dropbox.com/scl/fi/ocowzkh5d6jlo7stam48h/human_Brain3d044b52-140a-4528-bf0d-a2dbef9e1f40_data.h5ad?rlkey=rjkcpggc3btgsti8sx0tychif&st=z79g3sv1&dl=1 +human,Brain,6877,train,,,train_human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_data.h5ad,https://www.dropbox.com/scl/fi/jem4d9yaa7ovg4ahhxken/human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_data.h5ad?rlkey=xjzehuoucxjamouyw7tfyo54h&st=0ti1j2kq&dl=1 +human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad,https://www.dropbox.com/scl/fi/xutjy05pxtqlt2nyk35kp/human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad?rlkey=34ahhk9g9cxrugt8canfmfl3u&st=ia53zjuo&dl=1 +human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 +human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 From c26f490d78f231689fc8e7c1902e61d6c4e6a2c8 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 20:33:08 +0800 Subject: [PATCH 138/203] minor --- examples/multi_modality/joint_embedding/scmvae.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/multi_modality/joint_embedding/scmvae.py b/examples/multi_modality/joint_embedding/scmvae.py index 461adcd4..97354164 100644 --- a/examples/multi_modality/joint_embedding/scmvae.py +++ b/examples/multi_modality/joint_embedding/scmvae.py @@ -32,11 +32,12 @@ def parameter_setting(): parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, help="Epoch per test, must smaller than max iteration.") parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="GSE140203_SKIN_atac2gex") + parser.add_argument("-t", "--subtask", default="openproblems_2022_multi_atac2gex") parser.add_argument("-device", "--device", default="cuda") parser.add_argument("--final_rate", type=float, default=1e-4) parser.add_argument("--scale_factor", type=float, default=4) parser.add_argument("--span", type=float, default=0.3) + parser.add_argument("--selection_threshold", type=int, default=3000) return parser @@ -47,7 +48,7 @@ def parameter_setting(): assert args.max_iteration > args.epoch_per_test dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding", preprocess="feature_selection", - span=args.span) + span=args.span, selection_threshold=args.selection_threshold) data = 
dataset.load_data() le = preprocessing.LabelEncoder() From 467d4a01a8df5f07041c3781fdf8d5896461311c Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 12 Dec 2024 22:26:58 +0800 Subject: [PATCH 139/203] minor --- dance/metadata/scdeepsort.csv | 61 +++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 04085cfa..381f8efc 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -144,3 +144,64 @@ human,Brain,6877,train,,,train_human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_d human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad,https://www.dropbox.com/scl/fi/xutjy05pxtqlt2nyk35kp/human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad?rlkey=34ahhk9g9cxrugt8canfmfl3u&st=ia53zjuo&dl=1 human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 +human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&dl=1 +human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&dl=1 +human,Intestine,6444,train,,,train_human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad,https://www.dropbox.com/scl/fi/6rd98oo0z68dmqpuhn7ap/human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad?rlkey=v6zz40z2f73h4oivfx67y7cgr&dl=1 +human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad,https://www.dropbox.com/scl/fi/u57hnsh6nv88r8nnrwfnl/human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad?rlkey=suzpw1ikma8kpr3uxkv7nosue&dl=1 +human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&dl=1 +human,Intestine,7443,train,,,train_human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad,https://www.dropbox.com/scl/fi/i9271g8nx3kmxi6rb12iv/human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad?rlkey=2vfdh7k5amnul3d786g2zmr6g&dl=1 +human,Kidney,10000,train,,,train_human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad,https://www.dropbox.com/scl/fi/3zfgu6g7mzv2v7f7qpwvq/human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad?rlkey=euso7cvuwcpauk21o3yme3959&dl=1 +human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&dl=1 
+human,Kidney,10000,train,,,train_human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/krhe97shrnuofdlthnopj/human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=0lft0ebfaz4e0rzo1xpotmt79&dl=1 +human,Kidney,10000,train,,,train_human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad,https://www.dropbox.com/scl/fi/mhr92khp2j4ydit9mjiky/human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad?rlkey=9eg9ota05td575buw4vh6qdjd&dl=1 +human,Kidney,10000,train,,,train_human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad,https://www.dropbox.com/scl/fi/29d15brkbpsx14tr9qt0w/human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad?rlkey=sxzcd0btog8nhm13sf36r0q78&dl=1 +human,Kidney,10000,train,,,train_human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad,https://www.dropbox.com/scl/fi/gkqbvzh99bhegjar2b9q0/human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad?rlkey=emfdlopj8v08yqqp9c82n16je&dl=1 +human,Kidney,10000,train,,,train_human_Kidney9df60c57-fdf3-4e93-828e-fe9303f20438_data.h5ad,https://www.dropbox.com/scl/fi/0uc53tlnh0c38kjtmoa76/human_Kidney9df60c57-fdf3-4e93-828e-fe9303f20438_data.h5ad?rlkey=633zoidj6app2k616pue50d7u&dl=1 +human,Kidney,10000,train,,,train_human_Kidneybe39785b-67cb-4177-be19-a40ee3747e45_data.h5ad,https://www.dropbox.com/scl/fi/uo55loqew4nsh2yntiec6/human_Kidneybe39785b-67cb-4177-be19-a40ee3747e45_data.h5ad?rlkey=z2qc037d8g6nc3rbgkchnsv1v&dl=1 +human,Kidney,10000,train,,,train_human_Kidney32b9bdce-2481-4c85-ba1b-6ad5fcea844c_data.h5ad,https://www.dropbox.com/scl/fi/ugqjbf78tlc5g55dygysz/human_Kidney32b9bdce-2481-4c85-ba1b-6ad5fcea844c_data.h5ad?rlkey=h0d4nxyftqt9ktsh7s1tn623e&dl=1 +human,Kidney,10000,train,,,train_human_Kidney0b4a15a7-4e9e-4555-9733-2423e5c66469_data.h5ad,https://www.dropbox.com/scl/fi/aql02yzq6rosod8071qlu/human_Kidney0b4a15a7-4e9e-4555-9733-2423e5c66469_data.h5ad?rlkey=0oaq4962yw2642wa7mlxpkzab&dl=1 +human,Kidney,9641,train,,,train_human_Kidney53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad,https://www.dropbox.com/scl/fi/bqnrkyjzmbppgnej9l1qq/human_Kidney53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad?rlkey=co1531kyzzbghf4bvcqgg5rjn&dl=1 +human,Kidney,86,train,,,train_human_Kidney6a30bf44-c490-41ac-965b-0bb58432b10a_data.h5ad,https://www.dropbox.com/scl/fi/8vs3bwrk84shbekth3the/human_Kidney6a30bf44-c490-41ac-965b-0bb58432b10a_data.h5ad?rlkey=kbcz4p48v0rhu18hb8r2y47hq&dl=1 +human,Kidney,6044,train,,,train_human_Kidneyf801b7a9-80a6-4d09-9161-71474deb58ae_data.h5ad,https://www.dropbox.com/scl/fi/60uxtvicy2n8srhqfmub3/human_Kidneyf801b7a9-80a6-4d09-9161-71474deb58ae_data.h5ad?rlkey=x0im2udw8litcyzcsywipm49u&dl=1 +human,Kidney,7802,train,,,train_human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad,https://www.dropbox.com/scl/fi/xmzomvt0c8bza3fy8me0p/human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad?rlkey=iqzword5254z5rujjdey1u8hc&dl=1 +human,Kidney,6847,train,,,train_human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad,https://www.dropbox.com/scl/fi/rhngz2alde48jotpy5c5v/human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad?rlkey=u0x4dsnt569wq07l3h1rqjzum&dl=1 +human,Kidney,10000,train,,,train_human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad,https://www.dropbox.com/scl/fi/ybml7y2bth0qjnv3x1ieg/human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad?rlkey=qkjgdqttk3s10ht54109a4cad&dl=1 
+human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/553s0af5q2nibafj4nkux/human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=p85qlsjixsuuutgwnms3w4y30&dl=1 +human,Kidney,10000,train,,,train_human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad,https://www.dropbox.com/scl/fi/feklth6jvnc5qqwvgaydy/human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad?rlkey=28vpy2m90lnri9aekfthrsvr1&dl=1 +human,Kidney,5848,train,,,train_human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad,https://www.dropbox.com/scl/fi/1jq1wrqo1rcl041antcm8/human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad?rlkey=ssgfsiobqfah3pxgqnrsaff6l&dl=1 +human,Kidney,9641,train,,,train_human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad,https://www.dropbox.com/scl/fi/o2cnntkrd5j6coeqehv8b/human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad?rlkey=5tbupfd3cdvqzy2rix6scvwzu&dl=1 +human,Lung,10000,train,,,train_human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/w0n6axa32nej87tw4rk49/human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=8lgoi54y9wtxtfwpmnumpmzex&dl=1 +human,Lung,10000,train,,,train_human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad,https://www.dropbox.com/scl/fi/dqhei15s96dg3q8bdd31b/human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad?rlkey=ykpxbucys97t327fwehflkoa2&dl=1 +human,Lung,10000,train,,,train_human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad,https://www.dropbox.com/scl/fi/pwhyse079mo9radk2xzuw/human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad?rlkey=t60bp7w5mf3k877q1i430oc14&dl=1 +human,Lung,10000,train,,,train_human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad,https://www.dropbox.com/scl/fi/w2r13kqrkzdxecvhizm0i/human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad?rlkey=6s4wbv2ii1d8ged5l8s8lwt6l&dl=1 +human,Lung,10000,train,,,train_human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad,https://www.dropbox.com/scl/fi/ubcw0cyn5uvaq034ysgxl/human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad?rlkey=m57pb8bx4936fnao2yyljqdgz&dl=1 +human,Lung,10000,train,,,train_human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/0vqe7wmb0afoubwnb5srb/human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=7necb5o9afgpnppsj74tga5y2&dl=1 +human,Lung,10000,train,,,train_human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/sbe6h2v5dijlu36qd6nyw/human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=gunxweprd7r8e0xlk9mo2kkv3&dl=1 +human,Lung,10000,train,,,train_human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad,https://www.dropbox.com/scl/fi/mz6umlbnjoxynhqklwyxg/human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad?rlkey=upom03ch71gebjvxq15x59gk9&dl=1 +human,Lung,10000,train,,,train_human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad,https://www.dropbox.com/scl/fi/b2e6gr542wah0t5xtgshh/human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad?rlkey=7pkq1kh7wz6z0qzj4wdj94i79&dl=1 +human,Lung,10000,train,,,train_human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad,https://www.dropbox.com/scl/fi/ymmdfevzihlcyjosugyuq/human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad?rlkey=71rly1fkb21yl8gxy8af42ke7&dl=1 
+human,Lung,4138,train,,,train_human_Lung7b3368a5-c1a0-4973-9e75-d95b4150c7da_data.h5ad,https://www.dropbox.com/scl/fi/kkfqus7bbbc5fyammhvs6/human_Lung7b3368a5-c1a0-4973-9e75-d95b4150c7da_data.h5ad?rlkey=7pi96515hn1wp6vdb2mgx5wke&dl=1 +human,Lung,8657,train,,,train_human_Lunge04daea4-4412-45b5-989e-76a9be070a89_data.h5ad,https://www.dropbox.com/scl/fi/3yi17ckqej50wbvrhrmev/human_Lunge04daea4-4412-45b5-989e-76a9be070a89_data.h5ad?rlkey=dhvki1bayogmxfvwbgdmr63m0&dl=1 +human,Lung,9784,train,,,train_human_Lunge9175006-8978-4417-939f-819855eab80e_data.h5ad,https://www.dropbox.com/scl/fi/1n3ztjbo8v7pksbwocwvt/human_Lunge9175006-8978-4417-939f-819855eab80e_data.h5ad?rlkey=x7mhr3hqee4yon8iiqncjxzlv&dl=1 +human,Lung,6947,train,,,train_human_Lung0ba16f4b-cb87-4fa3-9363-19fc51eec6e7_data.h5ad,https://www.dropbox.com/scl/fi/u1l3vw5jfjzo5j438hns0/human_Lung0ba16f4b-cb87-4fa3-9363-19fc51eec6e7_data.h5ad?rlkey=bdrqcl1rnqj3ckzae2nivdryk&dl=1 +human,Lung,1135,train,,,train_human_Lunga68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/wzajeyqibto1nmg6lcnnu/human_Lunga68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=yaj6fw3lmf4gj2om9myv2xqe2&dl=1 +human,Lung,10000,train,,,train_human_Lung8c42cfd0-0b0a-46d5-910c-fc833d83c45e_data.h5ad,https://www.dropbox.com/scl/fi/0octhu7p45vhm141xpxho/human_Lung8c42cfd0-0b0a-46d5-910c-fc833d83c45e_data.h5ad?rlkey=f2god56i85fy7kbicd3omlcrr&dl=1 +human,Lung,10000,train,,,train_human_Lung2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/h943ko4tvioc5nbf74xh4/human_Lung2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=8tr5zd09f5w6bh81do2plszem&dl=1 +human,Lung,10000,train,,,train_human_Lungd8da613f-e681-4c69-b463-e94f5e66847f_data.h5ad,https://www.dropbox.com/scl/fi/87y1haxwnvw18ip34fuc3/human_Lungd8da613f-e681-4c69-b463-e94f5e66847f_data.h5ad?rlkey=zhxl8bv8ttize8x0iomrln7bu&dl=1 +human,Lung,9096,train,,,train_human_Lung4023a2bc-6325-47db-bfdf-9639e91042c2_data.h5ad,https://www.dropbox.com/scl/fi/5lyw275vtgivn93cvryho/human_Lung4023a2bc-6325-47db-bfdf-9639e91042c2_data.h5ad?rlkey=0bbi53hwtg0bpche9zmsbe3bb&dl=1 +human,Lung,329,train,,,train_human_Lung71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad,https://www.dropbox.com/scl/fi/322l1c9u9yw96m5jcrgx2/human_Lung71be997d-ff75-41b9-8a9f-1288c865f921_data.h5ad?rlkey=no8s8y76yfh0h5rj2oycto9e4&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad,https://www.dropbox.com/scl/fi/isf2fm5xy6ymlxmy28j42/human_Pancreas53d208b0-2cfd-4366-9866-c3c6114081bc_data.h5ad?rlkey=sr9rmdejoqevjl7nfy5eieg1h&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas78f10833-3e61-4fad-96c9-4bbd4f14bdfa_data.h5ad,https://www.dropbox.com/scl/fi/6euec8e6la5lb536eca9v/human_Pancreas78f10833-3e61-4fad-96c9-4bbd4f14bdfa_data.h5ad?rlkey=h048sako37cm7fldz0qyanbbt&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreasf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/13dwwcqh8uxne0fkv1nec/human_Pancreasf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=71velvjv1y9xp433ite74lc4o&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas37b21763-7f0f-41ae-9001-60bad6e2841d_data.h5ad,https://www.dropbox.com/scl/fi/4saarn2fj0p9m1wz31oqm/human_Pancreas37b21763-7f0f-41ae-9001-60bad6e2841d_data.h5ad?rlkey=at8y4ydgoldzxo45dwd8r5q63&dl=1 
+human,Pancreas,10000,train,,,train_human_Pancreas9c4c8515-8f82-4c72-b0c6-f87647b00bbe_data.h5ad,https://www.dropbox.com/scl/fi/xie7wdjl99rijwubmkm8v/human_Pancreas9c4c8515-8f82-4c72-b0c6-f87647b00bbe_data.h5ad?rlkey=ev0t2j0oug6jn5lx3217nfilv&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreasfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/m57p32xedb57iy039t2u7/human_Pancreasfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=biux7hptrp8dkiu4bt5xsibdh&dl=1 +human,Pancreas,2126,train,,,train_human_Pancreasb07e5164-baf6-43d2-bdba-5a249d0da879_data.h5ad,https://www.dropbox.com/scl/fi/eklbb3ecg87j4ioh7j3yj/human_Pancreasb07e5164-baf6-43d2-bdba-5a249d0da879_data.h5ad?rlkey=d4rjld6ngijy8ozavipf62k7j&dl=1 +human,Pancreas,2742,train,,,train_human_Pancreasc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/wkmrwsu12z50p10knvhij/human_Pancreasc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=k4uaaq8dj1glvwjtw7dguhffu&dl=1 +human,Pancreas,440,train,,,train_human_Pancreasa68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad,https://www.dropbox.com/scl/fi/96fmfcse1tkek9fmt3l9i/human_Pancreasa68b64d8-aee3-4947-81b7-36b8fe5a44d2_data.h5ad?rlkey=iblncgxaczniac0tdhwluv3hd&dl=1 +human,Pancreas,2100,train,,,train_human_Pancreas5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad,https://www.dropbox.com/scl/fi/l437mlea7j4kndeuqmgfc/human_Pancreas5a11f879-d1ef-458a-910c-9b0bdfca5ebf_data.h5ad?rlkey=hf9cp43iokqzr3zkt7xhji46o&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas3294d050-6eeb-4a00-b24c-71aacc9b777f_data.h5ad,https://www.dropbox.com/scl/fi/vu8iqocybwv1ntyfvniq0/human_Pancreas3294d050-6eeb-4a00-b24c-71aacc9b777f_data.h5ad?rlkey=t8kg0khgxloxvppl8n5yck5qw&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreas2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/1ohvymmbhbnuxztcg073v/human_Pancreas2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=4wqv2ep9tw0962rk3wit5qf1l&dl=1 +human,Pancreas,10000,train,,,train_human_Pancreasff45e623-7f5f-46e3-b47d-56be0341f66b_data.h5ad,https://www.dropbox.com/scl/fi/zg4zyy9g3ana3aozo8wpx/human_Pancreasff45e623-7f5f-46e3-b47d-56be0341f66b_data.h5ad?rlkey=os9h57ravoaw8dg4mav9qahxm&dl=1 +human,Pancreas,2544,train,,,train_human_Pancreas66d15835-5dc8-4e96-b0eb-f48971cb65e8_data.h5ad,https://www.dropbox.com/scl/fi/p2zo6qt4j0hq2xtd8yudn/human_Pancreas66d15835-5dc8-4e96-b0eb-f48971cb65e8_data.h5ad?rlkey=hum53j5mvk3vs3ybqwe0fb2r1&dl=1 +human,Pancreas,8215,train,,,train_human_Pancreas97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad,https://www.dropbox.com/scl/fi/43r5btoo1z1r43xwlg1st/human_Pancreas97a17473-e2b1-4f31-a544-44a60773e2dd_data.h5ad?rlkey=2zlbl33carcm5xsyp9dbazn84&dl=1 From 30eb4fd46ddba62f7944e352dbe5f6651c8425ee Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Fri, 13 Dec 2024 23:10:10 +0800 Subject: [PATCH 140/203] minor --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 381f8efc..9f9e5ae9 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -144,7 +144,7 @@ human,Brain,6877,train,,,train_human_Brainf6d9f2ad-5ec7-4d53-b7f0-ceb0e7bcd181_d human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad,https://www.dropbox.com/scl/fi/xutjy05pxtqlt2nyk35kp/human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_data.h5ad?rlkey=34ahhk9g9cxrugt8canfmfl3u&st=ia53zjuo&dl=1 
human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 -human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&dl=1 +human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Intestine-_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&st=nay2m851&dl=1 human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&dl=1 human,Intestine,6444,train,,,train_human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad,https://www.dropbox.com/scl/fi/6rd98oo0z68dmqpuhn7ap/human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad?rlkey=v6zz40z2f73h4oivfx67y7cgr&dl=1 human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad,https://www.dropbox.com/scl/fi/u57hnsh6nv88r8nnrwfnl/human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad?rlkey=suzpw1ikma8kpr3uxkv7nosue&dl=1 From 2d4070d81a4944c14ee179b2fa4ff6f7cb7f4039 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 09:28:46 +0800 Subject: [PATCH 141/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 9f9e5ae9..0f537ded 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -145,10 +145,10 @@ human,Brain,5070,train,,,train_human_Brainf64e1be1-de15-4d27-8da4-82225cd4c035_d human,Brain,6044,train,,,train_human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad,https://www.dropbox.com/scl/fi/e4r8d8nfoogzzkeleocle/human_Brain576f193c-75d0-4a11-bd25-8676587e6dc2_data.h5ad?rlkey=oyici6185mp45rwahx870wiwf&st=93274l53&dl=1 human,Brain,8573,train,,,train_human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad,https://www.dropbox.com/scl/fi/06j981vjht86i5pmy7oqy/human_Brained11cc3e-2947-407c-883c-c53b043917c3_data.h5ad?rlkey=c93stddc1kylxyrgme7448sva&st=pwxnqclk&dl=1 human,Intestine,10000,train,,,train_human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/iofy51et57gmuayf8rcg4/human_Intestine2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Intestine-_data.h5ad?rlkey=u6j55oe6rj4nuocjz3cag56uw&st=nay2m851&dl=1 -human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&dl=1 
+human,Intestine,10000,train,,,train_human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/42brjtap331l6a04ev85i/human_Intestinef7c1c579-2dc0-47e2-ba19-8165c5a0e353-Intestine-_data.h5ad?rlkey=10zmxwpvtnzkz77embs997lw2&st=odk1wce6&dl=1 human,Intestine,6444,train,,,train_human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad,https://www.dropbox.com/scl/fi/6rd98oo0z68dmqpuhn7ap/human_Intestinee40c6272-af77-4a10-9385-62a398884f27_data.h5ad?rlkey=v6zz40z2f73h4oivfx67y7cgr&dl=1 human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad,https://www.dropbox.com/scl/fi/u57hnsh6nv88r8nnrwfnl/human_Intestine6a270451-b4d9-43e0-aa89-e33aac1ac74b_data.h5ad?rlkey=suzpw1ikma8kpr3uxkv7nosue&dl=1 -human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&dl=1 +human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67-Intestine-_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&st=u1nnlhbw&dl=1 human,Intestine,7443,train,,,train_human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad,https://www.dropbox.com/scl/fi/i9271g8nx3kmxi6rb12iv/human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad?rlkey=2vfdh7k5amnul3d786g2zmr6g&dl=1 human,Kidney,10000,train,,,train_human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad,https://www.dropbox.com/scl/fi/3zfgu6g7mzv2v7f7qpwvq/human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad?rlkey=euso7cvuwcpauk21o3yme3959&dl=1 human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&dl=1 From da9b8a67d45b4adfc615ce8c82ee44f3b4ee7fb7 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 09:32:33 +0800 Subject: [PATCH 142/203] minor --- examples/tuning/cta_singlecellnet/main.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index cc5406d9..790b2f2f 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,12 +6,12 @@ from typing import get_args import numpy as np -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet -from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.pipeline import Pipeline, PipelinePlaner, get_step3_yaml, run_step3, save_summary_data from dance.typing import LogLevel from dance.utils import set_seed @@ -56,7 +56,15 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) - + if "run_kwargs" in pipeline_planer.config and tune_mode == "params": + wandb_config = dict(wandb.config) + config = {'pipeline': wandb_config["run_kwargs"], "type": "preprocessor"} + preprocessing_pipeline = Pipeline(config) + + else: + # Prepare preprocessing pipeline and apply it to data + 
kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) # Initialize model and get model specific preprocessing pipeline model = SingleCellNet(num_trees=args.num_trees) @@ -64,8 +72,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, species=args.species, tissue=args.tissue, valid_dataset=args.valid_dataset, data_dir="../temp_data", filetype=args.filetype).load_data(cache=args.cache) - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) + # kwargs = {tune_mode: dict(wandb.config)} + # preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) From 3881988492be5ce0f550e951fca9f5c1b757e19d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 01:33:14 +0000 Subject: [PATCH 143/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_singlecellnet/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/cta_singlecellnet/main.py b/examples/tuning/cta_singlecellnet/main.py index 790b2f2f..bfe8bb88 100644 --- a/examples/tuning/cta_singlecellnet/main.py +++ b/examples/tuning/cta_singlecellnet/main.py @@ -6,8 +6,8 @@ from typing import get_args import numpy as np - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.singlecellnet import SingleCellNet From 27d9817c0094b3b5c580522abf660cdbb0f6437c Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 09:35:23 +0800 Subject: [PATCH 144/203] ignore result --- .gitignore | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.gitignore b/.gitignore index bf917f79..ffb7f309 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,14 @@ temp_data *.egg* __pycache__ build/ + +#ignore example result +examples/tuning/**/*.log +examples/tuning/**/*.yaml +examples/tuning/**/*.csv +examples/tuning/**/*.h5ad +examples/tuning/**/*.sh +examples/tuning/**/*.h5 +examples/tuning/**/*.tar.gz +examples/tuning/**/*.tif +examples/tuning/**/*.txt From f42e6fcdb128b03355c6020ae0d1d04f32bbb3df Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 14:54:34 +0800 Subject: [PATCH 145/203] minor --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bf917f79..0a74ef7b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ temp_data *.egg* __pycache__ build/ +*.log From 66923ab3643938370c3f1e89da7ff8c3c1077cfb Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 15:01:53 +0800 Subject: [PATCH 146/203] minor --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index aa5948e7..5adb8a91 100644 --- a/.gitignore +++ b/.gitignore @@ -17,10 +17,9 @@ build/ *.log #ignore example result -examples/tuning/**/*.log +examples/**/*.h5ad examples/tuning/**/*.yaml examples/tuning/**/*.csv -examples/tuning/**/*.h5ad examples/tuning/**/*.sh examples/tuning/**/*.h5 examples/tuning/**/*.tar.gz From 5859db7d4c5a2e2572ab2ccdb58fd90d08331991 Mon Sep 17 00:00:00 2001 From: xzy Date: Mon, 16 Dec 2024 15:52:23 +0800 Subject: [PATCH 147/203] minor --- 
.../tuning/joint_embedding_scmvae/main.py | 100 ++++++------------ 1 file changed, 30 insertions(+), 70 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 8b091eb7..1f778b6a 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE @@ -73,31 +73,18 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) wandb_config = wandb.config - if "run_kwargs" in pipeline_planer.config: - if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): - wandb_config = wandb_config["run_kwargs"] - else: - wandb.log({"skip": 1}) - wandb.finish() - return - try: - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - wandb_config = wandb.config - if "run_kwargs" in pipeline_planer.config: - if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): - wandb_config = wandb_config["run_kwargs"] - else: - wandb.log({"skip": 1}) - wandb.finish() - return try: + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") data = dataset.load_data() - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels le = preprocessing.LabelEncoder() labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) data.mod["mod1"].obsm["labels"] = labels @@ -107,19 +94,23 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) preprocessing_pipeline = pipeline_planer.generate(**kwargs) print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") preprocessing_pipeline(data) - train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) - - # train_size=data.mod["meta1"].shape[0] - # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train",train_idx) - data.set_split_idx("test",test_idx) - (x_train, y_train,x_train_raw,y_train_raw),_ = data.get_train_data(return_type="torch") - (x_test, y_test,x_test_raw,y_test_raw), labels = data.get_test_data(return_type="torch") + # train_name=[item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + # train_idx= [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + # test_idx=list(set([i for i in range(data.mod["mod1"].shape[0])]).difference(set(train_idx))) + # data.set_split_idx("train",train_idx) + # data.set_split_idx("test",test_idx) + + (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch") + (x_test, y_test, x_test_raw, 
y_test_raw), labels = data.get_test_data(return_type="torch") + + train_size = len(x_train) + test_size = len(x_test) + train_idx = np.arange(train_size) + test_idx = np.arange(test_size) + train_size + # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) - lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) + lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train.numpy(), x_test.numpy()])) + lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train.numpy(), y_test.numpy()])) lib_mean1 = torch.from_numpy(lib_mean1) lib_var1 = torch.from_numpy(lib_var1) lib_mean2 = torch.from_numpy(lib_mean2) @@ -127,13 +118,13 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) Nfeature1 = x_train.shape[1] Nfeature2 = y_train.shape[1] - # train_size = len(data.get_split_idx("train")) - # train_size=x_train.shape[0] + + temp = lib_mean1[train_idx] train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], - lib_var2[train_idx], y_train) + lib_var2[train_idx], y_train) valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], - lib_var2[test_idx], y_test) + lib_var2[test_idx], y_test) total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) @@ -171,37 +162,6 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) model.to(device) model.init_gmm_params(total_loader) model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - # embeds = model.predict(x_test, y_test).cpu().numpy() score = model.score(x_test, y_test, labels) # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) @@ -210,7 +170,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) wandb.log(score) wandb.finish() finally: - locals_keys=list(locals().keys()) + locals_keys = list(locals().keys()) for var in locals_keys: try: exec(f"del {var}") From 00c1b82136f0d5bd052dc2e5f5e0eb6d0f018f6b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 07:54:38 +0000 Subject: [PATCH 148/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 
--- examples/tuning/joint_embedding_scmvae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index 1f778b6a..e20be682 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE From 7bbef3ea5390265e688dbdd86d3fed00a5adc880 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 10:26:47 +0800 Subject: [PATCH 149/203] update pre-commit --- .pre-commit-config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 194f88fa..3af9d357 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,13 +32,13 @@ repos: name: Sort imports args: [--line-width, "120", --profile, black] - # - repo: https://github.com/PyCQA/docformatter - # rev: v1.7.5 - # hooks: - # - id: docformatter - # name: Format docstring - # additional_dependencies: [tomli] - # args: [--config, pyproject.toml] + - repo: https://github.com/PyCQA/docformatter + rev: eb1df34 + hooks: + - id: docformatter + name: Format docstring + additional_dependencies: [tomli] + args: [--config, pyproject.toml] - repo: https://github.com/executablebooks/mdformat rev: 0.7.17 From 505e5d61d3b31ebc9d7a4b64f122d2c43484e625 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 02:28:03 +0000 Subject: [PATCH 150/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/sc_similarity/anndata_similarity.py | 36 +++++++++-------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 9a84fa4b..5409fdb6 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -55,13 +55,12 @@ def filter_gene(self, n_top_genes=3000): self.common_genes = common_hvg def preprocess(self): - """Preprocess the data, including log normalization and normalization to probability distribution.""" + """Preprocess the data, including log normalization and normalization to + probability distribution.""" self.filter_gene() def sample_cells(self, random_state): - """ - Randomly sample cells from each dataset if sample_size is specified. - """ + """Randomly sample cells from each dataset if sample_size is specified.""" np.random.seed(random_state) if self.sample_size is None: self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think @@ -77,9 +76,7 @@ def sample_cells(self, random_state): self.sampled_adata2 = self.adata2.copy() def normalize_data(self): # I am not sure - """ - Normalize the data by total counts per cell and log-transform. 
- """ + """Normalize the data by total counts per cell and log-transform.""" sc.pp.normalize_total(self.adata1, target_sum=1e4) sc.pp.log1p(self.adata1) sc.pp.normalize_total(self.adata2, target_sum=1e4) @@ -98,18 +95,16 @@ def set_prob_data(self, sampled=False): self.Y = np.nan_to_num(prob_adata2).toarray() def cosine_sim_sampled(self) -> pd.DataFrame: - """ - Computes the average cosine similarity between all pairs of cells from the two datasets. - """ + """Computes the average cosine similarity between all pairs of cells from the + two datasets.""" # Compute cosine similarity matrix sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) # Return the average similarity return sim_matrix.mean() def pearson_corr_sampled(self) -> pd.DataFrame: - """ - Computes the average Pearson correlation coefficient between all pairs of cells from the two datasets. - """ + """Computes the average Pearson correlation coefficient between all pairs of + cells from the two datasets.""" # Compute Pearson correlation matrix corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, @@ -118,9 +113,8 @@ def pearson_corr_sampled(self) -> pd.DataFrame: return np.nanmean(corr_matrix) def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: - """ - Computes the average Jaccard similarity between all pairs of binarized cells from the two datasets. - """ + """Computes the average Jaccard similarity between all pairs of binarized cells + from the two datasets.""" # Binarize the data binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) @@ -131,9 +125,8 @@ def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: return similarity_matrix.mean() def js_divergence_sampled(self) -> float: - """ - Computes the average Jensen-Shannon divergence between all pairs of cells from the two datasets. - """ + """Computes the average Jensen-Shannon divergence between all pairs of cells + from the two datasets.""" # Normalize the data to probability distributions prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) @@ -195,9 +188,8 @@ def data_company(self): raise NotImplementedError("data company") def wasserstein_dist(self) -> float: - """ - Computes the average Wasserstein distance between all pairs of cells from the two datasets. 
- """ + """Computes the average Wasserstein distance between all pairs of cells from the + two datasets.""" X = self.X Y = self.Y a = np.ones((X.shape[0], )) / X.shape[0] From b7683af51bd8824860c9fdcf32c042a293be7582 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 12:59:45 +0800 Subject: [PATCH 151/203] minor --- dance/datasets/multimodality.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 14e9c5c6..87c8689c 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -575,7 +575,7 @@ def __init__(self, subtask, root="./data", preprocess=None, normalize=False, pre def _raw_to_dance(self, raw_data): mod1, mod2, meta1, meta2, test_sol = self._maybe_preprocess(raw_data) - self.to_array([mod1, mod2, meta1, meta2, test_sol]) + self._to_csr([mod1, mod2, meta1, meta2, test_sol]) assert all(mod2.obs_names == mod1.obs_names), "Modalities not aligned" mdata = md.MuData({"mod1": mod1, "mod2": mod2, "meta1": meta1, "meta2": meta2, "test_sol": test_sol}) @@ -585,7 +585,7 @@ def _raw_to_dance(self, raw_data): return data - def to_array(self, datas): + def _to_csr(self, datas): for data in datas: if scipy.sparse.issparse(data.X): if not isinstance(data.X, scipy.sparse.csr_matrix): From 7026ee0f1db028438f235e82ced2c42451c70504 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 14:34:48 +0800 Subject: [PATCH 152/203] minor --- dance/sc_similarity/anndata_similarity.py | 36 +++++++++-------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py index 9a84fa4b..5409fdb6 100644 --- a/dance/sc_similarity/anndata_similarity.py +++ b/dance/sc_similarity/anndata_similarity.py @@ -55,13 +55,12 @@ def filter_gene(self, n_top_genes=3000): self.common_genes = common_hvg def preprocess(self): - """Preprocess the data, including log normalization and normalization to probability distribution.""" + """Preprocess the data, including log normalization and normalization to + probability distribution.""" self.filter_gene() def sample_cells(self, random_state): - """ - Randomly sample cells from each dataset if sample_size is specified. - """ + """Randomly sample cells from each dataset if sample_size is specified.""" np.random.seed(random_state) if self.sample_size is None: self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think @@ -77,9 +76,7 @@ def sample_cells(self, random_state): self.sampled_adata2 = self.adata2.copy() def normalize_data(self): # I am not sure - """ - Normalize the data by total counts per cell and log-transform. - """ + """Normalize the data by total counts per cell and log-transform.""" sc.pp.normalize_total(self.adata1, target_sum=1e4) sc.pp.log1p(self.adata1) sc.pp.normalize_total(self.adata2, target_sum=1e4) @@ -98,18 +95,16 @@ def set_prob_data(self, sampled=False): self.Y = np.nan_to_num(prob_adata2).toarray() def cosine_sim_sampled(self) -> pd.DataFrame: - """ - Computes the average cosine similarity between all pairs of cells from the two datasets. 
- """ + """Computes the average cosine similarity between all pairs of cells from the + two datasets.""" # Compute cosine similarity matrix sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) # Return the average similarity return sim_matrix.mean() def pearson_corr_sampled(self) -> pd.DataFrame: - """ - Computes the average Pearson correlation coefficient between all pairs of cells from the two datasets. - """ + """Computes the average Pearson correlation coefficient between all pairs of + cells from the two datasets.""" # Compute Pearson correlation matrix corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, @@ -118,9 +113,8 @@ def pearson_corr_sampled(self) -> pd.DataFrame: return np.nanmean(corr_matrix) def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: - """ - Computes the average Jaccard similarity between all pairs of binarized cells from the two datasets. - """ + """Computes the average Jaccard similarity between all pairs of binarized cells + from the two datasets.""" # Binarize the data binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) @@ -131,9 +125,8 @@ def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: return similarity_matrix.mean() def js_divergence_sampled(self) -> float: - """ - Computes the average Jensen-Shannon divergence between all pairs of cells from the two datasets. - """ + """Computes the average Jensen-Shannon divergence between all pairs of cells + from the two datasets.""" # Normalize the data to probability distributions prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) @@ -195,9 +188,8 @@ def data_company(self): raise NotImplementedError("data company") def wasserstein_dist(self) -> float: - """ - Computes the average Wasserstein distance between all pairs of cells from the two datasets. 
- """ + """Computes the average Wasserstein distance between all pairs of cells from the + two datasets.""" X = self.X Y = self.Y a = np.ones((X.shape[0], )) / X.shape[0] From d7c63add4ae8ab3e32661d45ee0d6c0b70bd20c9 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Tue, 17 Dec 2024 14:42:02 +0800 Subject: [PATCH 153/203] minor --- examples/tuning/cta_celltypist/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index c625065f..58a10303 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -42,6 +42,8 @@ args = parser.parse_args() logger.setLevel(args.log_level) logger.info(f"Running Celltypist with the following parameters:\n{pprint.pformat(vars(args))}") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + # os.environ["WANDB_AGENT_DISABLE_FLAPPING"]="true" file_root_path = Path( args.root_path, "_".join([ "-".join([str(num) for num in dataset]) @@ -51,7 +53,6 @@ logger.info(f"\n files is saved in {file_root_path}") MAINDIR = Path(__file__).resolve().parent pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") - os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) From 81b83309d842edc18906ce035ef10803dfbd14f4 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 17 Dec 2024 21:46:21 +0800 Subject: [PATCH 154/203] update data --- dance/metadata/scdeepsort.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 0f537ded..44091d47 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -151,8 +151,8 @@ human,Intestine,2720,train,,,train_human_Intestine6a270451-b4d9-43e0-aa89-e33aac human,Intestine,10000,train,,,train_human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67(Intestine)_data.h5ad,https://www.dropbox.com/scl/fi/zygwfo73ukmc5mt260ni8/human_Intestinefa27492b-82ff-4ab7-ac61-0e2b184eee67-Intestine-_data.h5ad?rlkey=48nu6ho30540pirxxznzfdgs1&st=u1nnlhbw&dl=1 human,Intestine,7443,train,,,train_human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad,https://www.dropbox.com/scl/fi/i9271g8nx3kmxi6rb12iv/human_Intestined6dfdef1-406d-4efb-808c-3c5eddbfe0cb_data.h5ad?rlkey=2vfdh7k5amnul3d786g2zmr6g&dl=1 human,Kidney,10000,train,,,train_human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad,https://www.dropbox.com/scl/fi/3zfgu6g7mzv2v7f7qpwvq/human_Kidneya51c6ece-5731-4128-8c1e-5060e80c69e4_data.h5ad?rlkey=euso7cvuwcpauk21o3yme3959&dl=1 -human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&dl=1 -human,Kidney,10000,train,,,train_human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad,https://www.dropbox.com/scl/fi/krhe97shrnuofdlthnopj/human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353_data.h5ad?rlkey=0lft0ebfaz4e0rzo1xpotmt79&dl=1 +human,Kidney,10000,train,,,train_human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67(Kidney)_data.h5ad,https://www.dropbox.com/scl/fi/cqtulramrepe6rzvmjvh7/human_Kidneyfa27492b-82ff-4ab7-ac61-0e2b184eee67-Kidney-_data.h5ad?rlkey=qe991uhuf6m3o0yabjev1gepr&st=jwhicxrp&dl=1 
+human,Kidney,10000,train,,,train_human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353(Kidney)_data.h5ad,https://www.dropbox.com/scl/fi/krhe97shrnuofdlthnopj/human_Kidneyf7c1c579-2dc0-47e2-ba19-8165c5a0e353-Kidney-_data.h5ad?rlkey=0lft0ebfaz4e0rzo1xpotmt79&st=7ie3uc6i&dl=1 human,Kidney,10000,train,,,train_human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad,https://www.dropbox.com/scl/fi/mhr92khp2j4ydit9mjiky/human_Kidney105c7dad-0468-4628-a5be-2bb42c6a8ae4_data.h5ad?rlkey=9eg9ota05td575buw4vh6qdjd&dl=1 human,Kidney,10000,train,,,train_human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad,https://www.dropbox.com/scl/fi/29d15brkbpsx14tr9qt0w/human_Kidney5af90777-6760-4003-9dba-8f945fec6fdf_data.h5ad?rlkey=sxzcd0btog8nhm13sf36r0q78&dl=1 human,Kidney,10000,train,,,train_human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad,https://www.dropbox.com/scl/fi/gkqbvzh99bhegjar2b9q0/human_Kidneydea717d4-7bc0-4e46-950f-fd7e1cc8df7d_data.h5ad?rlkey=emfdlopj8v08yqqp9c82n16je&dl=1 From 7f37be45ec8410ebec7486754c3bdccedf87ff78 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 09:21:09 +0800 Subject: [PATCH 155/203] update notes --- dance/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/pipeline.py b/dance/pipeline.py index 0841eed7..aa4a0c00 100644 --- a/dance/pipeline.py +++ b/dance/pipeline.py @@ -1068,7 +1068,7 @@ def get_step3_yaml(conf_save_path="config_yamls/params/", conf_load_path="step3_ for target, d_p in p1.default_params.items(): if target == p2["target"]: p2["params"] = d_p - #顺序不对,参考_sanitize_pipeline进行修改 TODO + #The order is wrong, refer to _sanitize_pipeline for modification TODO use test to check step2_pipeline = step2_pipeline_planer.config.pipeline # step2_pipeline=sorted(step2_pipeline_planer.config.pipeline,key=lambda x: float(x.split('.')[1])) for p1, p2 in zip(step2_pipeline, pipeline): #need order From 759912422da292a90b1fc5108c9a9cff42181f32 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:18:09 +0800 Subject: [PATCH 156/203] update forest and other minor changes --- .../atlas/sc_similarity/anndata_similarity.py | 520 ++++++++++++++++++ dance/sc_similarity/download_data.py | 9 - .../multi_modality/joint_embedding/jae.py | 4 +- .../multi_modality/joint_embedding/scmogcn.py | 4 +- .../result_analysis/get_important_pattern.py | 156 ++++-- .../get_important_pattern_sweep.py | 77 ++- examples/result_analysis/get_num.py | 30 + 7 files changed, 745 insertions(+), 55 deletions(-) create mode 100644 dance/atlas/sc_similarity/anndata_similarity.py delete mode 100644 dance/sc_similarity/download_data.py create mode 100644 examples/result_analysis/get_num.py diff --git a/dance/atlas/sc_similarity/anndata_similarity.py b/dance/atlas/sc_similarity/anndata_similarity.py new file mode 100644 index 00000000..ab44e6e7 --- /dev/null +++ b/dance/atlas/sc_similarity/anndata_similarity.py @@ -0,0 +1,520 @@ +# anndata_similarity.py +# TODO translate notes +import re +import warnings +from typing import Callable, Dict, List, Optional + +import anndata +import anndata as ad +import numpy as np +import ot +import pandas as pd +import scanpy as sc +import scipy +import yaml +from omegaconf import OmegaConf +from scipy.linalg import sqrtm +from scipy.spatial import cKDTree +from scipy.spatial.distance import cdist, directed_hausdorff, jaccard, jensenshannon +from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel + +# Suppress scipy warnings for constant input in Pearson correlation +warnings.filterwarnings("ignore", message="An 
input array is constant") +from dance.datasets.singlemodality import CellTypeAnnotationDataset + + +def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = "h5ad", train_dataset=[], + test_dataset=[], valid_dataset=[], data_dir="../temp_data"): + data = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, + valid_dataset=valid_dataset, data_dir=data_dir, tissue=tissue, species=species, + filetype=filetype).load_data() + return data.data + + +class AnnDataSimilarity: + + def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, + init_random_state: Optional[int] = None, n_runs: int = 10, + ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, + adata2_name: Optional[str] = None, + methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): + """Initialize the AnnDataSimilarity object and perform data preprocessing.""" + self.origin_adata1 = adata1.copy() + self.origin_adata2 = adata2.copy() + self.sample_size = sample_size + self.init_random_state = init_random_state + self.preprocess() + self.results = {} + self.ground_truth_conf_path = ground_truth_conf_path + self.adata1_name = adata1_name + self.adata2_name = adata2_name + self.methods = methods + self.tissue = tissue + self.n_runs = n_runs + + def filter_gene(self, n_top_genes=3000): + sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') + sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') + + common_hvg = self.origin_adata1.var_names[self.origin_adata1.var['highly_variable']].intersection( + self.origin_adata2.var_names[self.origin_adata2.var['highly_variable']]) + + self.origin_adata1 = self.origin_adata1[:, common_hvg].copy() + self.origin_adata2 = self.origin_adata2[:, common_hvg].copy() + self.common_genes = common_hvg + + def preprocess(self): + """Preprocess the data, including log normalization and normalization to + probability distribution.""" + self.filter_gene() + + def sample_cells(self, random_state): + """Randomly sample cells from each dataset if sample_size is specified.""" + np.random.seed(random_state) + if self.sample_size is None: + self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think + if self.adata1.n_obs > self.sample_size: + indices1 = np.random.choice(self.adata1.n_obs, size=self.sample_size, replace=False) + self.sampled_adata1 = self.adata1[indices1, :].copy() + else: + self.sampled_adata1 = self.adata1.copy() + if self.adata2.n_obs > self.sample_size: + indices2 = np.random.choice(self.adata2.n_obs, size=self.sample_size, replace=False) + self.sampled_adata2 = self.adata2[indices2, :].copy() + else: + self.sampled_adata2 = self.adata2.copy() + + def normalize_data(self): # I am not sure + """Normalize the data by total counts per cell and log-transform.""" + sc.pp.normalize_total(self.adata1, target_sum=1e4) + sc.pp.log1p(self.adata1) + sc.pp.normalize_total(self.adata2, target_sum=1e4) + sc.pp.log1p(self.adata2) + + def set_prob_data(self, sampled=False): + # Normalize the data to probability distributions + if sampled: + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + else: + prob_adata1 = self.adata1.X / self.adata1.X.sum(axis=1) + prob_adata2 = self.adata2.X / self.adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero 
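+        # (a cell whose counts sum to zero yields a 0/0 = NaN row in the division
+        # above; np.nan_to_num below maps such rows to all-zero probability vectors)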
+ self.X = np.nan_to_num(prob_adata1).toarray() + self.Y = np.nan_to_num(prob_adata2).toarray() + + def cosine_sim_sampled(self) -> pd.DataFrame: + """Computes the average cosine similarity between all pairs of cells from the + two datasets.""" + # Compute cosine similarity matrix + sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) + # Return the average similarity + return sim_matrix.mean() + + def pearson_corr_sampled(self) -> pd.DataFrame: + """Computes the average Pearson correlation coefficient between all pairs of + cells from the two datasets.""" + # Compute Pearson correlation matrix + corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), + self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, + self.sampled_adata1.n_obs:] + # Return the average correlation + return np.nanmean(corr_matrix) + + def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: + """Computes the average Jaccard similarity between all pairs of binarized cells + from the two datasets.""" + # Binarize the data + binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) + binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) + # Compute Jaccard distance matrix + distance_matrix = cdist(binary_adata1.A, binary_adata2.A, metric='jaccard') + # Convert to similarity and compute the average + similarity_matrix = 1 - distance_matrix + return similarity_matrix.mean() + + def js_divergence_sampled(self) -> float: + """Computes the average Jensen-Shannon divergence between all pairs of cells + from the two datasets.""" + # Normalize the data to probability distributions + prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) + prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) + # Handle any NaN values resulting from division by zero + prob_adata1 = np.nan_to_num(prob_adata1).toarray() + prob_adata2 = np.nan_to_num(prob_adata2).toarray() + + # Define a function to compute JS divergence for a pair of probability vectors + def jsd(p, q): + return jensenshannon(p, q) + + # Compute JS divergence matrix + jsd_vectorized = np.vectorize(jsd, signature='(n),(n)->()') + divergence_matrix = np.zeros((prob_adata1.shape[0], prob_adata2.shape[0])) + for i in range(prob_adata1.shape[0]): + divergence_matrix[i, :] = jsd_vectorized( + np.repeat(prob_adata1[i, :], prob_adata2.shape[0], axis=0).reshape(-1, prob_adata1.shape[1]), + prob_adata2) + + # Convert divergence to similarity and compute the average + similarity_matrix = 1 - divergence_matrix + return np.nanmean(similarity_matrix) + + def compute_mmd(self) -> float: + X = self.X + Y = self.Y + kernel = "rbf" + gamma = 1.0 + if kernel == 'rbf': + K_X = np.exp(-gamma * cdist(X, X, 'sqeuclidean')) + K_Y = np.exp(-gamma * cdist(Y, Y, 'sqeuclidean')) + K_XY = np.exp(-gamma * cdist(X, Y, 'sqeuclidean')) + elif kernel == 'linear': + K_X = np.dot(X, X.T) + K_Y = np.dot(Y, Y.T) + K_XY = np.dot(X, Y.T) + else: + raise ValueError("Unsupported kernel type") + + m = X.shape[0] + n = Y.shape[0] + + sum_X = (np.sum(K_X) - np.sum(np.diag(K_X))) / (m * (m - 1)) + sum_Y = (np.sum(K_Y) - np.sum(np.diag(K_Y))) / (n * (n - 1)) + sum_XY = np.sum(K_XY) / (m * n) + + mmd_squared = sum_X + sum_Y - 2 * sum_XY + mmd = np.sqrt(max(mmd_squared, 0)) + return 1 / (1 + mmd) + + def common_genes_num(self): + return len(self.common_genes) + + def otdd(self): + """Compute the OTDD between two data sets.""" + raise NotImplementedError("OTDD!") + + def data_company(self): + raise NotImplementedError("data company") 
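+    # The distance-based metrics below fold a non-negative distance into a (0, 1]
+    # similarity score, either as 1 / (1 + distance) (Wasserstein, energy, Sinkhorn,
+    # Bures, spectral, matching the MMD conversion above) or as
+    # 1 - distance / sqrt(n_features) (Hausdorff, Chamfer). For example, a distance
+    # of 0 maps to a similarity of 1.0 and a distance of 3 maps to 0.25 under the
+    # first form; note that the parentheses are required in Python, since
+    # 1 / 1 + d evaluates to 1 + d rather than 1 / (1 + d).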
+ + def wasserstein_dist(self) -> float: + """Computes the average Wasserstein distance between all pairs of cells from the + two datasets.""" + X = self.X + Y = self.Y + a = np.ones((X.shape[0], )) / X.shape[0] + b = np.ones((Y.shape[0], )) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + wasserstein_dist = ot.emd2(a, b, M) + return 1 / 1 + wasserstein_dist + + def get_Hausdorff(self): + X = self.X + Y = self.Y + forward = directed_hausdorff(X, Y)[0] + backward = directed_hausdorff(X, Y)[0] + hausdorff_distance = max(forward, backward) + normalized_hausdorff = hausdorff_distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_hausdorff + return similarity + + def chamfer_distance(self): + X = self.X + Y = self.Y + tree_A = cKDTree(X) + tree_B = cKDTree(Y) + + distances_A_to_B, _ = tree_A.query(Y) + distances_B_to_A, _ = tree_B.query(X) + + chamfer_A_to_B = np.mean(distances_A_to_B) + chamfer_B_to_A = np.mean(distances_B_to_A) + distance = chamfer_A_to_B + chamfer_B_to_A + normalized_chamfer = distance / np.sqrt(X.shape[1]) + similarity = 1 - normalized_chamfer + return similarity + + def energy_distance_metric(self): + X = self.X + Y = self.Y + XX = cdist(X, X, 'euclidean') + YY = cdist(Y, Y, 'euclidean') + XY = cdist(X, Y, 'euclidean') + distance = 2 * np.mean(XY) - np.mean(XX) - np.mean(YY) + return 1 / (1 + distance) + + def get_sinkhorn2(self): + X = self.X + Y = self.Y + a = np.ones(X.shape[0]) / X.shape[0] + b = np.ones(Y.shape[0]) / Y.shape[0] + M = ot.dist(X, Y, metric='euclidean') + reg = 0.1 + sinkhorn_dist = ot.sinkhorn2(a, b, M, reg) + return 1 / (1 + sinkhorn_dist) + + def bures_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + sqrt_C1 = sqrtm(C1) + product = sqrt_C1 @ C2 @ sqrt_C1 + sqrt_product = sqrtm(product) + trace = np.trace(C1) + np.trace(C2) - 2 * np.trace(sqrt_product) + return 1 / (1 + np.sqrt(max(trace, 0))) + + def spectral_distance(self): + X = self.X + Y = self.Y + C1 = np.cov(X, rowvar=False) + C2 = np.cov(Y, rowvar=False) + eig_A = np.linalg.eigvalsh(C1) + eig_B = np.linalg.eigvalsh(C2) + return 1 / (1 + np.linalg.norm(eig_A - eig_B)) + + def get_dataset_meta_sim(self): + # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] + con_cols = [ + "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", + "n_counts_var", "var_n_counts_mean", "var_n_counts_var" + ] + dis_cols = ['assay', 'tissue'] + + def get_discrete_sim(col_list1, col_list2): + set1 = set(col_list1) + set2 = set(col_list2) + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + return intersection / union + + def get_con_sim(con_data_1, con_data_2): + return abs(con_data_1 - con_data_2) / max(con_data_1, con_data_2) + + def get_dataset_info(data: ad.AnnData): + con_sim = {} + con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) + con_sim["nnz_var"] = np.var(data.obs["nnz"]) + nnz_values = data.X[data.X.nonzero()] + con_sim["nnz_counts_mean"] = np.mean(nnz_values) + con_sim["nnz_counts_var"] = np.var(nnz_values) + con_sim["n_measured_vars"] = np.mean(data.obs["n_measured_vars"]) + con_sim["cell_num"] = len(data.obs) + con_sim["gene_num"] = len(data.var) + con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) + con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) + if "n_counts" not in data.var.columns: + if scipy.sparse.issparse(data.X): + gene_counts = 
np.array(data.X.sum(axis=0)).flatten() + else: + gene_counts = data.X.sum(axis=0) + data.var["n_counts"] = gene_counts + data.var["n_counts"] = data.var["n_counts"].astype(float) + con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) + con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) + data.uns["con_sim"] = con_sim + return data + + data_1 = self.adata1.copy() + data_2 = self.adata2.copy() + data_1 = get_dataset_info(data_1) + data_2 = get_dataset_info(data_2) + ans = {} + obs_1 = data_1.obs + obs_2 = data_2.obs + con_sim_1 = data_1.uns["con_sim"] + con_sim_2 = data_2.uns["con_sim"] + for dis_col in dis_cols: + ans[f"{dis_col}_sim"] = get_discrete_sim(obs_1[dis_col].values, obs_2[dis_col].values) + for con_col in con_cols: + ans[f"{con_col}_sim"] = get_con_sim(con_sim_1[con_col], con_sim_2[con_col]) + return np.mean(list(ans.values())) + + def get_ground_truth(self): + assert self.ground_truth_conf_path is not None + assert self.adata1_name is not None + assert self.adata2_name is not None + ground_truth_conf = pd.read_excel(self.ground_truth_conf_path, sheet_name=self.tissue, index_col=0) + + def get_targets(dataset_truth: str): + dataset_truth = OmegaConf.create(fix_yaml_string(dataset_truth)) + targets = [] + for item in dataset_truth: + targets.append(item["target"]) + return targets + + sim_targets = [] + for method in self.methods: + query_dataset_truth = ground_truth_conf.loc[self.adata1_name, f"{method}_method"] + atlas_dataset_truth = ground_truth_conf.loc[self.adata2_name, f"{method}_method"] + query_targets = get_targets(query_dataset_truth) + atlas_targets = get_targets(atlas_dataset_truth) + assert len(query_targets) == len(atlas_targets) + sim_targets.append((sum(a == b for a, b in zip(query_targets, atlas_targets)), len(query_targets))) + sim_targets.append((sum(x for x, y in sim_targets), sum(y for x, y in sim_targets))) + return sim_targets + + def compute_similarity( + self, random_state: int, methods: List[str] = [ + 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" + ] + ) -> Dict[str, float]: + """Computes the specified similarity measure. Parameters: + + methods: List of similarity measures to be computed. 
Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' + Returns: + Dictionary containing the similarity matrices + + """ + self.adata1 = self.origin_adata1.copy() + self.adata2 = self.origin_adata2.copy() + self.normalize_data() + self.sample_cells(random_state) + self.set_prob_data() + + results = {} + for method in methods: + print(method) + if method == 'cosine': + results['cosine'] = self.cosine_sim_sampled() + elif method == 'pearson': + results['pearson'] = self.pearson_corr_sampled() + elif method == 'jaccard': + results['jaccard'] = self.jaccard_sim_sampled() + elif method == 'js_distance': + results['js_distance'] = self.js_divergence_sampled() + elif method == 'wasserstein': + results['wasserstein'] = self.wasserstein_dist() + elif method == "common_genes_num": + results["common_genes_num"] = self.common_genes_num() + elif method == "Hausdorff": + results["Hausdorff"] = self.get_Hausdorff() + elif method == "chamfer": + results["chamfer"] = self.chamfer_distance() + elif method == "energy": + results["energy"] = self.energy_distance_metric() + elif method == "sinkhorn2": + results["sinkhorn2"] = self.get_sinkhorn2() + elif method == "bures": + results["bures"] = self.bures_distance() + elif method == "spectral": + results["spectral"] = self.spectral_distance() + elif method == "otdd": + results['otdd'] = self.otdd() + elif method == "ground_truth": + results["ground_truth"] = self.get_ground_truth() + elif method == "metadata_sim": + results["metadata_sim"] = self.get_dataset_meta_sim() + elif method == "mmd": + results["mmd"] = self.compute_mmd() + else: + raise ValueError(f"Unsupported similarity method: {method}") + return results + + def get_similarity_matrix_A2B( + self, methods: List[str] = [ + "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", + "ground_truth", "metadata_sim", "mmd" + ] + ) -> Dict[str, float]: + """Same as compute_similarity, keeping method name consistency.""" + cumulative_results = {method: 0.0 for method in methods} + + for run in range(self.n_runs): + # Update random state for each run + if self.init_random_state is not None: + current_random_state = self.init_random_state + run + else: + current_random_state = None + run_results = self.compute_similarity(methods=methods, random_state=current_random_state) + for method in methods: + if method in ["ground_truth"]: + cumulative_results[method] = run_results[method] + else: + cumulative_results[method] += run_results[method] + # Average the results over the number of runs + averaged_results = { + method: + cumulative_results[method] if method in ["ground_truth"] else cumulative_results[method] / self.n_runs + for method in methods + } + return averaged_results + + # def get_max_similarity_A_to_B(self): + # if self.results is None: + # raise ValueError(f"need results!") + # else: + # self.results_score = {} + # for key in self.results: + # if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: + # self.results_score[key] = self._get_max_similarity(self.results[key]) + # else: + # self.results_score[key] = self.results[key] + # return self.results_score + + # def _get_max_similarity(self, similarity_matrix: pd.DataFrame): + # """Maximum matching average similarity score.""" + # matched_values = [ + # similarity_matrix.loc[label, + # label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() + # for label in similarity_matrix.index + # ] # need to ask + # overall_similarity = 
np.mean(matched_values) + # return overall_similarity + + +def extract_type_target_params(item_text): + lines = item_text.strip().split('\n') + item_dict = {} + params_dict = {} + current_param_key = None + in_params = False + for line in lines: + stripped_line = line.strip() + if stripped_line.startswith('- type:'): + item_dict['type'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('target:'): + item_dict['target'] = stripped_line.split(':', 1)[1].strip() + elif stripped_line.startswith('params:'): + params_content = stripped_line.split(':', 1)[1].strip() + if params_content == '{}': + params_dict = {} + in_params = False + else: + params_dict = {} + in_params = True + elif in_params: + if re.match(r'^\w+:$', stripped_line): + current_param_key = stripped_line[:-1].strip() + params_dict[current_param_key] = {} + elif re.match(r'^- ', stripped_line): + list_item = stripped_line[2:].strip() + if current_param_key: + if not isinstance(params_dict[current_param_key], list): + params_dict[current_param_key] = [] + params_dict[current_param_key].append(list_item) + elif ':' in stripped_line: + key, value = map(str.strip, stripped_line.split(':', 1)) + if current_param_key and isinstance(params_dict.get(current_param_key, None), dict): + params_dict[current_param_key][key] = yaml.safe_load(value) + else: + params_dict[key] = yaml.safe_load(value) + item_dict['params'] = params_dict + return item_dict + + +def fix_yaml_string(original_str): + #It will be deleted + yaml_str = original_str.replace('\\n', '\n').strip() + items = re.split(r'(?=-\s*type:)', yaml_str) + config_list = [] + for item in items: + if not item.strip(): + continue + if not item.strip().startswith('- type:'): + print(item) + print("Warning: An item does not start with '- type:', skipping this item.") + continue + item_dict = extract_type_target_params(item) + config_list.append(item_dict) + fixed_yaml = yaml.dump(config_list, sort_keys=False) + return fixed_yaml diff --git a/dance/sc_similarity/download_data.py b/dance/sc_similarity/download_data.py deleted file mode 100644 index 83c705fd..00000000 --- a/dance/sc_similarity/download_data.py +++ /dev/null @@ -1,9 +0,0 @@ -from dance.datasets.singlemodality import CellTypeAnnotationDataset - - -def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = "h5ad", train_dataset=[], - test_dataset=[], valid_dataset=[], data_dir="../temp_data"): - data = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, - valid_dataset=valid_dataset, data_dir=data_dir, tissue=tissue, species=species, - filetype=filetype).load_data() - return data.data diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index c726dd8b..0b1d79bb 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -11,9 +11,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "-t", "--subtask", default="openproblems_2022_multi_atac2gex", choices=[ "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", - "GSE140203_SKIN_atac2gex" + "GSE140203_SKIN_atac2gex", "openproblems_2022_multi_atac2gex" ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") diff --git 
a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 1ef52647..44e2a748 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -12,9 +12,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-t", "--subtask", default="GSE140203_SKIN_atac2gex", choices=[ + "-t", "--subtask", default="openproblems_2022_multi_atac2gex", choices=[ "openproblems_bmmc_cite_phase2", "openproblems_bmmc_multiome_phase2", "GSE140203_BRAIN_atac2gex", - "GSE140203_SKIN_atac2gex" + "GSE140203_SKIN_atac2gex", "openproblems_2022_multi_atac2gex" ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py index b03731cc..657fe9b0 100644 --- a/examples/result_analysis/get_important_pattern.py +++ b/examples/result_analysis/get_important_pattern.py @@ -4,7 +4,9 @@ import itertools import pathlib from collections import Counter +from copy import deepcopy from itertools import combinations +from os import X_OK from pathlib import Path import matplotlib.pyplot as plt @@ -12,10 +14,18 @@ import pandas as pd import scikit_posthocs as sp import seaborn as sns +import shapiq from mlxtend.frequent_patterns import apriori from mlxtend.preprocessing import TransactionEncoder from networkx import parse_adjlist from scipy import cluster, stats +from scipy.stats import pointbiserialr +from sklearn.compose import ColumnTransformer +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import GridSearchCV, KFold, LeaveOneOut, cross_val_score +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from typing_extensions import deprecated #TODO need to sync all files or get sweep,not file @@ -68,7 +78,7 @@ def change_real_rank(rank_item, real_rank): return [] -def replace_nan_in_2d(lst): +def replace_nan_in_2d(lst): #nan应该是个极差的值而不是直接删掉 return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] @@ -83,7 +93,9 @@ def are_all_elements_same_direct(list_2d): return True if first_element is not None else True -def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1): +def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1, multi_mod=False): + if multi_mod: + raise NotImplementedError("need multimod") threshold = int(len(step2_data) * threshold_per) step2_data.loc[:, metric_name] = step2_data.loc[:, metric_name].astype(float) df_sorted = step2_data.sort_values(metric_name, ascending=ascending) @@ -93,43 +105,35 @@ def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1) te = TransactionEncoder() te_ary = te.fit(transactions).transform(transactions) df = pd.DataFrame(te_ary, columns=te.columns_) - frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) + frequent_itemsets = apriori(df, use_colnames=True, min_support=0.3) # print(frequent_itemsets) # rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) - return [tuple(a) for a in frequent_itemsets["itemsets"]] + frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: tuple(x)) + return frequent_itemsets.to_dict(orient='records') -# def get_significant_top_n_zscore(data, n=3, threshold=1.0, 
ascending=False): -# if not data: -# return [] - -# n = max(1, n) - -# mean = np.mean(data) -# std = np.std(data) - -# if std == 0: -# return sorted(data, reverse=not ascending)[:n] - -# z_scores = [(x, (x - mean) / std) for x in data] - -# significant_values = [x for x, z in z_scores if z > threshold] - -# significant_values_sorted = sorted(significant_values, reverse=not ascending) - -# if len(significant_values_sorted) < n: -# remaining = sorted(data, reverse=not ascending)[:n - len(significant_values_sorted)] -# significant_values_sorted.extend(remaining) - -# return significant_values_sorted[:n] +def get_significant_top_n_zscore(data, n=3, threshold=1.0, ascending=False): + if not data: + return [] + n = max(1, n) + mean = np.mean(data) + std = np.std(data) + if std == 0: + return sorted(data, reverse=not ascending)[:n] + z_scores = [(x, (x - mean) / std) for x in data] + significant_values = [x for x, z in z_scores if z > threshold] + significant_values_sorted = sorted(significant_values, reverse=not ascending) + if len(significant_values_sorted) < n: + remaining = sorted(data, reverse=not ascending)[:n - len(significant_values_sorted)] + significant_values_sorted.extend(remaining) + return significant_values_sorted[:n] -def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): - ans_all = [] +def get_test_acc_and_names(step2_data, metric_name): columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) test_accs = [] test_acc_names = [] - for r in range(1, len(columns)): #全流程的单独处理 + for r in range(1, len(columns) + 1): for com in itertools.combinations(columns, r): test_accs_arrays = [] groups = step2_data.groupby(by=list(com)) @@ -142,7 +146,14 @@ def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): # if are_all_elements_same_direct(test_accs): # continue test_accs = replace_nan_in_2d(test_accs) - final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title=" ".join(list(com)), vis=vis) + return test_accs, test_acc_names + + +@deprecated("not used") +def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): + ans_all = [] + test_accs, test_acc_names = get_test_acc_and_names(step2_data, metric_name) + final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title="all_pattern", vis=vis) if len(final_ranks) > 0: #TODO maybe need to think ascending max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) @@ -155,6 +166,89 @@ def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): return ans_all +def get_significant_items(data): + abs_values = np.abs(list(data.values())) + percentile = 60 + threshold = np.percentile(abs_values, percentile) + significant_items = {k: v for k, v in data.items() if abs(v) >= threshold} + return significant_items + + +def get_forest_model_pattern(step2_data, metric_name): + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) + X = step2_data.loc[:, columns] + y = step2_data.loc[:, metric_name] + preprocessor = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'), + columns) # drop='first'防止虚拟变量陷阱 + ]) + pipeline = Pipeline(steps=[('preprocessor', preprocessor), + ('regressor', + RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_split=2, + min_samples_leaf=1, random_state=42))]) + + param_grid = { + 'regressor__n_estimators': [10, 50, 100, 200], + 'regressor__max_depth': [3, 5, 7], + 'regressor__min_samples_split': [2, 5], + 
'regressor__min_samples_leaf': [1, 2] + } + loo = LeaveOneOut() + + grid_search = GridSearchCV( + estimator=pipeline, + param_grid=param_grid, + cv=loo, + scoring='neg_mean_squared_error', + n_jobs=-1, + verbose=1, + refit=True # 确保在所有数据上重新训练最佳模型 + ) + grid_search.fit(X, y) + best_pipeline = grid_search.best_estimator_ + model = best_pipeline.named_steps['regressor'] + X_preprocessed = best_pipeline.named_steps['preprocessor'].transform( + X) #TODO best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)是否和X_preprocessed一定是对应的? + explainer = shapiq.TreeExplainer(model=model, index="k-SII", max_order=3) #思考为什么没有负值,因为是绝对值相加,可能是为了正负值不会相互抵消 + list_of_interaction_values = explainer.explain_X(X_preprocessed.toarray(), n_jobs=96, random_state=42) + plt.cla() + ax = shapiq.plot.bar_plot(list_of_interaction_values, + feature_names=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns), + max_display=None, show=False, need_abbreviate=False) + ax.yaxis.get_major_locator().MAXTICKS = 1000000 + plt.show() + rects = ax.containers[0] + yticklabels = ax.get_yticklabels() #label和rect是否重合需要验证 + shap_ans = {} + for rect, label in zip(rects, yticklabels): + xy = rect.get_xy() + height = rect.get_height() + width = rect.get_width() + k = label.get_text() + v = width + if k in shap_ans: + raise RuntimeError("Features should not be repeated") + shap_ans[k] = v + + ans = get_significant_items(shap_ans) #检查一下是不是真的pattern,好像结果不太好,再检验一下 + preprocessed_df = pd.DataFrame(X_preprocessed.toarray(), index=X.index, + columns=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)) + preprocessed_df[metric_name] = step2_data[metric_name] + preprocessed_df_copy = deepcopy(preprocessed_df) + real_ans = {} + for k, v in ans.items(): + feature_name = k.split(' x ') + one_col = f"{','.join(feature_name)}__all__one" + preprocessed_df_copy[one_col] = preprocessed_df_copy[feature_name].eq(1).all(axis=1) + # method='pearson' + # pearson_corr = preprocessed_df_copy.loc[:,one_col].corr(preprocessed_df_copy.loc[:,metric_name], method=method) + r_pb, p_value = pointbiserialr(preprocessed_df_copy.loc[:, one_col].astype('category'), + preprocessed_df_copy.loc[:, metric_name]) + real_ans[k] = {"shapiq": v, "pointbiserialr": {"r_pb": r_pb, "p_value": p_value}} + real_ans["best_params"] = grid_search.best_params_ + real_ans["best_mse"] = -grid_search.best_score_ + return real_ans + + def summary_pattern(data_path, metric_name, ascending, alpha=0.05, vis=False): step2_origin_data = pd.read_csv(data_path) step2_data = step2_origin_data.dropna() diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 7010505d..896a0d0a 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -1,3 +1,4 @@ +import argparse import json import sys from pathlib import Path @@ -5,7 +6,8 @@ import pandas as pd import requests -from get_important_pattern import get_com_all, get_frequent_itemsets +from get_important_pattern import get_com_all, get_forest_model_pattern, get_frequent_itemsets +from numpy import choose sys.path.append("..") from get_result_web import spilt_web @@ -18,11 +20,26 @@ tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"] mertic_names = ["test_acc", "acc", "MRE", "ARI", "MSE"] ascendings = [False, False, True, False, True] + +multi_mod = False +if multi_mod: + raise 
NotImplementedError("multi mod") + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("--positive", action='store_true') +parser.add_argument("--only_apr", action='store_true') +parser.add_argument("--choose_tasks", nargs="+", default=tasks) +args = parser.parse_args() +choose_tasks = args.choose_tasks +positive = args.positive +only_apr = args.only_apr +if not positive: + assert only_apr + ascendings = [not item for item in ascendings] file_root = Path(__file__).resolve().parent prefix = f'https://wandb.ai/{entity}/{project}' runs_sum = 0 wandb = try_import("wandb") -positive = True def get_additional_sweep(sweep_id): @@ -47,28 +64,61 @@ def get_additional_sweep(sweep_id): def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=False): # try: - step2_data = step2_origin_data.dropna() - com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) + columns = sorted([col for col in step2_origin_data.columns if col.startswith("pipeline")]) + step2_data = step2_origin_data.loc[:, columns + [metric_name]] + # com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) + step2_data[metric_name] = step2_data[metric_name].astype(float) + if not ascending: + min_metric = step2_data[metric_name].min() + if pd.isna(min_metric): + return { + "error": + f"All {metric_name} values ​​are NaN and the minimum cannot be calculated. Please check your data." + } + step2_data[metric_name] = step2_data[metric_name].fillna(0) #if ascending=False + else: + max_metric = step2_data[metric_name].max() + if pd.isna(max_metric): + return { + "error": + f"All {metric_name} values ​​are NaN and the maximum cannot be calculated. Please check your data." + } + print(f"\nmax {metric_name}:{max_metric}") + buffer_percentage = 0.2 # 20% + replacement = max_metric * (1 + buffer_percentage) + step2_data[metric_name] = step2_data[metric_name].fillna(replacement) apr_ans = get_frequent_itemsets(step2_data, metric_name, ascending) - return list(set(com_ans) & set(apr_ans)) + if positive and not only_apr: + return {"forest_model": get_forest_model_pattern(step2_data, metric_name), "apr_ans": apr_ans} + else: + return {"apr_ans": apr_ans} # except Exception as e: # print(e) # return str(e) if __name__ == "__main__": + start = True ans_all = [] for i, task in enumerate(tasks): + + if task not in choose_tasks: + continue data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str) data = data.ffill().set_index(['Methods']) for row_idx in range(data.shape[0]): for col_idx in range(data.shape[1]): + method = data.index[row_idx] dataset = data.columns[col_idx] value = data.iloc[row_idx, col_idx] step_name = data.iloc[row_idx]["Unnamed: 1"] - if method != "SVM" or dataset != "Dataset 1: GSE67835 Brain": + # if dataset=="Dataset6:pancreatic_cancer" and method == "Stlearn": + # start=True + if not start: continue + # if method !="ACTINN" : + # continue if isinstance(value, str) and value.startswith(prefix) and ( str(step_name).lower() == "step2" or str(step_name).lower() == "step 2"): #TODO add step3 sweep_url = value @@ -86,13 +136,18 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F summary_data.append(flatten_dict(result)) # get result and config ans = pd.DataFrame(summary_data).set_index(["id"]) ans.sort_index(axis=1, inplace=True) - print(dataset) - print(method) - ans_all.append({ + ans_single = { "task": task, "dataset": dataset, "method": method, "pattern": 
summary_pattern(ans, mertic_names[i], ascendings[i]) - }) - with open(f"positive:{positive}_pattern.json", "w") as f: + } + with open( + f"dance_auto_preprocess/patterns/{'only_apr_' if only_apr else ''}{'neg_' if not positive else ''}{task}_{dataset}_{method}_pattern.json", + "w") as f: + json.dump(ans_single, f, indent=2) + ans_all.append(ans_single) + print(dataset) + print(method) + with open(f"pattern.json", "w") as f: json.dump(ans_all, f, indent=2) diff --git a/examples/result_analysis/get_num.py b/examples/result_analysis/get_num.py new file mode 100644 index 00000000..6573bd70 --- /dev/null +++ b/examples/result_analysis/get_num.py @@ -0,0 +1,30 @@ +import sys +from pathlib import Path + +import pandas as pd + +sys.path.append("..") +import urllib + +from get_result_web import spilt_web + +from dance.utils import try_import + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"] +file_root = Path(__file__).resolve().parent +prefix = 'https://wandb.ai/xzy11632/dance-dev' + +runs_sum = 0 + +for task in tasks: + data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str) + matched_list = data.applymap(lambda x: x if isinstance(x, str) and x.startswith(prefix) else None).stack().tolist() + for sweep_url in matched_list: + _, _, sweep_id = spilt_web(sweep_url) + print(sweep_id) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + runs_sum += (len(sweep.runs)) +print(runs_sum) From 8e1d33f6df65c9145e5572661740d5ef5290cd3a Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:24:47 +0800 Subject: [PATCH 157/203] minor --- dance/transforms/filter.py | 2 +- dance/transforms/misc.py | 2 +- examples/single_modality/clustering/graphsc.py | 2 +- examples/single_modality/imputation/graphsci.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 8ec0ac1f..d34600fc 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -157,7 +157,7 @@ def prepCounts(self, x): @register_preprocessor("filter", "cell") -class FilterCellsScanpy(FilterScanpy): +class FilterCellsScanpy(FilterScanpy): """Scanpy filtering cell transformation with additional options. 
Allow passing gene counts as ratio diff --git a/dance/transforms/misc.py b/dance/transforms/misc.py index 8b47c8b5..877eb6f6 100644 --- a/dance/transforms/misc.py +++ b/dance/transforms/misc.py @@ -169,7 +169,7 @@ def __init__(self, **kwargs): def __call__(self, data: Data) -> Data: mod1, mod2, meta1, meta2, test_sol = data.data.mod.values() meta1 = meta1[:, mod1.var.index] - meta2 = meta2[:, mod2.var.index] + meta2 = meta2[:, mod2.var.index] test_sol = test_sol[:, mod1.var.index] data.data.mod["meta1"] = meta1 data.data.mod["meta2"] = meta2 diff --git a/examples/single_modality/clustering/graphsc.py b/examples/single_modality/clustering/graphsc.py index b1d72cef..af2c7576 100644 --- a/examples/single_modality/clustering/graphsc.py +++ b/examples/single_modality/clustering/graphsc.py @@ -37,7 +37,7 @@ parser.add_argument("-data", "--dataset", default="10X_PBMC", choices=["10X_PBMC", "mouse_bladder_cell", "mouse_ES_cell", "worm_neuron_cell"]) parser.add_argument("--seed", type=int, default=0, help="Initial seed random, offset for each repeatition") - parser.add_argument("--num_runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--num_runs", type=int, default=5, help="Number of repetitions") parser.add_argument("--cache", action="store_true", help="Cache processed data.") args = parser.parse_args() aris = [] diff --git a/examples/single_modality/imputation/graphsci.py b/examples/single_modality/imputation/graphsci.py index c107614d..be3a272b 100644 --- a/examples/single_modality/imputation/graphsci.py +++ b/examples/single_modality/imputation/graphsci.py @@ -32,7 +32,7 @@ parser.add_argument("--cache", action="store_true", help="Cache processed data.") parser.add_argument("--mask", type=bool, default=True, help="Mask data for validation.") parser.add_argument("--seed", type=int, default=0, help="Initial seed random, offset for each repeatition") - parser.add_argument("--num_runs", type=int, default=1, help="Number of repetitions") + parser.add_argument("--num_runs", type=int, default=5, help="Number of repetitions") params = parser.parse_args() print(vars(params)) rmses = [] From 1087e3c17a0344e8210efd3e29e9be8a620cce67 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:27:12 +0800 Subject: [PATCH 158/203] minor --- examples/tuning/get_important_pattern.py | 90 +++++++++++++++++++----- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py index 39fcfb2f..c93d7114 100644 --- a/examples/tuning/get_important_pattern.py +++ b/examples/tuning/get_important_pattern.py @@ -10,6 +10,7 @@ import seaborn as sns from mlxtend.frequent_patterns import apriori from mlxtend.preprocessing import TransactionEncoder +from mlxtend.frequent_patterns import association_rules from networkx import parse_adjlist from scipy import stats @@ -17,13 +18,13 @@ ascending = False -def get_important_pattern(test_accs, vis=True, alpha=0.8, title=""): +def get_important_pattern(test_accs, vis=True, alpha=0.8, title="",test_acc_names=None): medians = [np.median(group) for group in test_accs] _, p_value = stats.kruskal(*test_accs) if vis: fig = plt.figure(figsize=(12, 4)) sns.boxplot(data=test_accs) - plt.xticks(list(range(len(test_accs))), [f"{i}" for i in range(len(test_accs))]) + plt.xticks(list(range(len(test_accs))), ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names),rotation=45, fontsize=10) plt.title(title) plt.show() if p_value < alpha: @@ -71,7 +72,7 
@@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) test_accs = [i[metric_name] for i in test_accs_arrays] test_acc_names = [i["name"] for i in test_accs_arrays] - final_ranks = get_important_pattern(test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis) + final_ranks = get_important_pattern(test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis,test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) if len(final_ranks) > 0: max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) @@ -82,9 +83,57 @@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): print(f"index={index},name={test_acc_name},rank={rank}") ans.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) return ans - - -def get_frequent_itemsets(step2_data, threshold_per=0.1): +def draw_graph(rules, rules_to_show): + import networkx as nx + G1 = nx.DiGraph() + + color_map=[] + N = 50 + colors = np.random.rand(N) + strs=['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] + + + for i in range (rules_to_show): + G1.add_nodes_from(["R"+str(i)]) + + + for a in rules.iloc[i]['antecedents']: + + G1.add_nodes_from([a]) + + G1.add_edge(a, "R"+str(i), color=colors[i] , weight = 2) + + for c in rules.iloc[i]['consequents']: + + G1.add_nodes_from([c]) + + G1.add_edge("R"+str(i), c, color=colors[i], weight=2) + + for node in G1: + found_a_string = False + for item in strs: + if node==item: + found_a_string = True + if found_a_string: + color_map.append('yellow') + else: + color_map.append('green') + + + + edges = G1.edges() + colors = [G1[u][v]['color'] for u,v in edges] + weights = [G1[u][v]['weight'] for u,v in edges] + + pos = nx.spring_layout(G1, k=16, scale=1) + nx.draw(G1, pos, node_color = color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) + + for p in pos: # raise text positions + pos[p][1] += 0.07 + nx.draw_networkx_labels(G1, pos) + plt.show() + +def get_frequent_itemsets(step2_data, threshold_per=0.1,vis=False): threshold = int(len(step2_data) * threshold_per) df_sorted = step2_data.sort_values(metric_name, ascending=ascending) top_10_percent = df_sorted.head(threshold) @@ -95,7 +144,16 @@ def get_frequent_itemsets(step2_data, threshold_per=0.1): df = pd.DataFrame(te_ary, columns=te.columns_) frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) # print(frequent_itemsets) - # rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) + rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) + if vis: + # print(frequent_itemsets) + # print(frequent_itemsets) + # draw_graph(rules=rules,rules_to_show=10) + frequent_itemsets_copy=frequent_itemsets.copy() + frequent_itemsets_copy=frequent_itemsets_copy.sort_values(by="support") + frequent_itemsets_copy.plot(x="itemsets",y="support",kind="bar") + plt.xticks(rotation=30, fontsize=7) + # print(type(rules)) return [tuple(a) for a in frequent_itemsets["itemsets"]] @@ -111,7 +169,7 @@ def summary_pattern(data_path, alpha=0.8, vis=False): step2_origin_data = pd.read_csv(data_path) step2_data = step2_origin_data.dropna() com_ans = get_com_all(step2_data, vis=vis, alpha=alpha) - apr_ans = get_frequent_itemsets(step2_data) + apr_ans = get_frequent_itemsets(step2_data,vis=vis) return list(set(com_ans) & set(apr_ans)) @@ -136,11 +194,11 @@ def list_files(directories, file_name="best_test_acc.csv", 
alpha=0.8, vis=False) if __name__ == "__main__": - directories = [] - for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - if path.is_dir(): - if str(path.name).startswith("cluster"): - directories.append(path) - list_files(directories) - - # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_scdeepsort/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) + # directories = [] + # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): + # if path.is_dir(): + # if str(path.name).startswith("cluster"): + # directories.append(path) + # list_files(directories) + + print(summary_pattern("/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv",alpha=0.3,vis=False)) From 1e843d3507e280c4c9b6633481fdcd3296eee14e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:27:58 +0000 Subject: [PATCH 159/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/transforms/filter.py | 2 +- dance/transforms/misc.py | 2 +- examples/tuning/get_important_pattern.py | 119 ++++++++++++----------- 3 files changed, 64 insertions(+), 59 deletions(-) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index d34600fc..8ec0ac1f 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -157,7 +157,7 @@ def prepCounts(self, x): @register_preprocessor("filter", "cell") -class FilterCellsScanpy(FilterScanpy): +class FilterCellsScanpy(FilterScanpy): """Scanpy filtering cell transformation with additional options. Allow passing gene counts as ratio diff --git a/dance/transforms/misc.py b/dance/transforms/misc.py index 877eb6f6..8b47c8b5 100644 --- a/dance/transforms/misc.py +++ b/dance/transforms/misc.py @@ -169,7 +169,7 @@ def __init__(self, **kwargs): def __call__(self, data: Data) -> Data: mod1, mod2, meta1, meta2, test_sol = data.data.mod.values() meta1 = meta1[:, mod1.var.index] - meta2 = meta2[:, mod2.var.index] + meta2 = meta2[:, mod2.var.index] test_sol = test_sol[:, mod1.var.index] data.data.mod["meta1"] = meta1 data.data.mod["meta2"] = meta2 diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py index c93d7114..04c0dc44 100644 --- a/examples/tuning/get_important_pattern.py +++ b/examples/tuning/get_important_pattern.py @@ -8,9 +8,8 @@ import pandas as pd import scikit_posthocs as sp import seaborn as sns -from mlxtend.frequent_patterns import apriori +from mlxtend.frequent_patterns import apriori, association_rules from mlxtend.preprocessing import TransactionEncoder -from mlxtend.frequent_patterns import association_rules from networkx import parse_adjlist from scipy import stats @@ -18,13 +17,15 @@ ascending = False -def get_important_pattern(test_accs, vis=True, alpha=0.8, title="",test_acc_names=None): +def get_important_pattern(test_accs, vis=True, alpha=0.8, title="", test_acc_names=None): medians = [np.median(group) for group in test_accs] _, p_value = stats.kruskal(*test_accs) if vis: fig = plt.figure(figsize=(12, 4)) sns.boxplot(data=test_accs) - plt.xticks(list(range(len(test_accs))), ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names),rotation=45, fontsize=10) + plt.xticks(list(range(len(test_accs))), + ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names), rotation=45, + fontsize=10) plt.title(title) plt.show() if p_value < alpha: 
@@ -72,7 +73,9 @@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) test_accs = [i[metric_name] for i in test_accs_arrays] test_acc_names = [i["name"] for i in test_accs_arrays] - final_ranks = get_important_pattern(test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis,test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) + final_ranks = get_important_pattern( + test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis, + test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) if len(final_ranks) > 0: max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) @@ -83,57 +86,56 @@ def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): print(f"index={index},name={test_acc_name},rank={rank}") ans.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) return ans + + def draw_graph(rules, rules_to_show): - import networkx as nx - G1 = nx.DiGraph() - - color_map=[] - N = 50 - colors = np.random.rand(N) - strs=['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] - - - for i in range (rules_to_show): - G1.add_nodes_from(["R"+str(i)]) - - - for a in rules.iloc[i]['antecedents']: - - G1.add_nodes_from([a]) - - G1.add_edge(a, "R"+str(i), color=colors[i] , weight = 2) - - for c in rules.iloc[i]['consequents']: - + import networkx as nx + G1 = nx.DiGraph() + + color_map = [] + N = 50 + colors = np.random.rand(N) + strs = ['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] + + for i in range(rules_to_show): + G1.add_nodes_from(["R" + str(i)]) + + for a in rules.iloc[i]['antecedents']: + + G1.add_nodes_from([a]) + + G1.add_edge(a, "R" + str(i), color=colors[i], weight=2) + + for c in rules.iloc[i]['consequents']: + G1.add_nodes_from([c]) - - G1.add_edge("R"+str(i), c, color=colors[i], weight=2) - - for node in G1: - found_a_string = False - for item in strs: - if node==item: + + G1.add_edge("R" + str(i), c, color=colors[i], weight=2) + + for node in G1: + found_a_string = False + for item in strs: + if node == item: found_a_string = True - if found_a_string: + if found_a_string: color_map.append('yellow') - else: - color_map.append('green') - - - - edges = G1.edges() - colors = [G1[u][v]['color'] for u,v in edges] - weights = [G1[u][v]['weight'] for u,v in edges] - - pos = nx.spring_layout(G1, k=16, scale=1) - nx.draw(G1, pos, node_color = color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) - - for p in pos: # raise text positions - pos[p][1] += 0.07 - nx.draw_networkx_labels(G1, pos) - plt.show() - -def get_frequent_itemsets(step2_data, threshold_per=0.1,vis=False): + else: + color_map.append('green') + + edges = G1.edges() + colors = [G1[u][v]['color'] for u, v in edges] + weights = [G1[u][v]['weight'] for u, v in edges] + + pos = nx.spring_layout(G1, k=16, scale=1) + nx.draw(G1, pos, node_color=color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) + + for p in pos: # raise text positions + pos[p][1] += 0.07 + nx.draw_networkx_labels(G1, pos) + plt.show() + + +def get_frequent_itemsets(step2_data, threshold_per=0.1, vis=False): threshold = int(len(step2_data) * threshold_per) df_sorted = step2_data.sort_values(metric_name, ascending=ascending) top_10_percent = df_sorted.head(threshold) @@ -149,9 +151,9 @@ def get_frequent_itemsets(step2_data, threshold_per=0.1,vis=False): # print(frequent_itemsets) # 
print(frequent_itemsets) # draw_graph(rules=rules,rules_to_show=10) - frequent_itemsets_copy=frequent_itemsets.copy() - frequent_itemsets_copy=frequent_itemsets_copy.sort_values(by="support") - frequent_itemsets_copy.plot(x="itemsets",y="support",kind="bar") + frequent_itemsets_copy = frequent_itemsets.copy() + frequent_itemsets_copy = frequent_itemsets_copy.sort_values(by="support") + frequent_itemsets_copy.plot(x="itemsets", y="support", kind="bar") plt.xticks(rotation=30, fontsize=7) # print(type(rules)) return [tuple(a) for a in frequent_itemsets["itemsets"]] @@ -169,7 +171,7 @@ def summary_pattern(data_path, alpha=0.8, vis=False): step2_origin_data = pd.read_csv(data_path) step2_data = step2_origin_data.dropna() com_ans = get_com_all(step2_data, vis=vis, alpha=alpha) - apr_ans = get_frequent_itemsets(step2_data,vis=vis) + apr_ans = get_frequent_itemsets(step2_data, vis=vis) return list(set(com_ans) & set(apr_ans)) @@ -201,4 +203,7 @@ def list_files(directories, file_name="best_test_acc.csv", alpha=0.8, vis=False) # directories.append(path) # list_files(directories) - print(summary_pattern("/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv",alpha=0.3,vis=False)) + print( + summary_pattern( + "/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv", + alpha=0.3, vis=False)) From 46b5cb4a5109dc299383110a94dd2bc58bb6f8b3 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 10:41:01 +0800 Subject: [PATCH 160/203] minor --- examples/tuning/get_important_pattern.py | 19 +++++++++++-------- examples/tuning/joint_embedding_jae/main.py | 7 ++++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py index 39fcfb2f..25c1b90c 100644 --- a/examples/tuning/get_important_pattern.py +++ b/examples/tuning/get_important_pattern.py @@ -136,11 +136,14 @@ def list_files(directories, file_name="best_test_acc.csv", alpha=0.8, vis=False) if __name__ == "__main__": - directories = [] - for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - if path.is_dir(): - if str(path.name).startswith("cluster"): - directories.append(path) - list_files(directories) - - # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_scdeepsort/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) + # directories = [] + # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): + # if path.is_dir(): + # if str(path.name).startswith("cluster"): + # directories.append(path) + # list_files(directories) + + print( + summary_pattern( + "/home/zyxing/dance/examples/tuning/cta_actinn/1013-1247-598-732-767-768-770-784-845-864_315-340-376-381-390-404-437-490-551-559/results/pipeline/best_test_acc.csv", + alpha=0.3, vis=True)) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index 3ba455d0..a45a9b40 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper @@ -21,7 +21,7 @@ parser.add_argument( "-t", "--subtask", default="openproblems_bmmc_cite_phase2", choices=[ "GSE140203_BRAIN_atac2gex", "GSE140203_SKIN_atac2gex", "openproblems_bmmc_cite_phase2", - 
"openproblems_bmmc_multiome_phase2" + "openproblems_bmmc_multiome_phase2", "openproblems_2022_multi_atac2gex" ]) parser.add_argument("-d", "--data_folder", default="./data/joint_embedding") parser.add_argument("-pre", "--pretrained_folder", default="./data/joint_embedding/pretrained") @@ -136,7 +136,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", root_path=file_root_path, required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", - "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], + metric="ARI") # need to delete required_funs and required_indexes if args.tune_mode == "pipeline_params": run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) """To reproduce JAE on other samples, please refer to command lines belows: From 10f82995a7ab0547e705681847614340ab636ac4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:45:22 +0000 Subject: [PATCH 161/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_jae/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_jae/main.py b/examples/tuning/joint_embedding_jae/main.py index a45a9b40..0c5d283b 100644 --- a/examples/tuning/joint_embedding_jae/main.py +++ b/examples/tuning/joint_embedding_jae/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.jae import JAEWrapper From f6b39897c0c97deece5c364bd44ed6c4f82c52da Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 10:46:18 +0800 Subject: [PATCH 162/203] add scmvae --- .../joint_embedding_scmvae/main.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py diff --git a/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py b/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py new file mode 100644 index 00000000..9fb85885 --- /dev/null +++ b/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py @@ -0,0 +1,208 @@ +import argparse +import gc +import os +import pprint +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +import torch.utils.data as data_utils +import wandb +from sklearn import preprocessing + +from dance import logger +from dance.datasets.multimodality import JointEmbeddingNIPSDataset +from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE +from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.transforms.preprocess import calculate_log_library_size +from dance.utils import set_seed + + +def parameter_setting(): + parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") + + parser.add_argument("--workdir", "-wk", type=str, default="./new_test", help="work path") + parser.add_argument("--outdir", "-od", type=str, default="./new_test", help="Output path") + + parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") + parser.add_argument("--weight_decay", type=float, 
default=1e-6, help="weight decay") + parser.add_argument("--eps", type=float, default=0.01, help="eps") + parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") + + parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") + parser.add_argument('-seed', '--seed', type=int, default=1, help='Random seed for repeat results') + parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") + parser.add_argument("--max_epoch", "-me", type=int, default=25, help="Max epoches") + parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") + parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") + parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, + help="Epoch per test, must smaller than max iteration.") + parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") + parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") + parser.add_argument("-device", "--device", default="cuda") + parser.add_argument("--final_rate", type=float, default=1e-4) + parser.add_argument("--scale_factor", type=float, default=4) + + parser.add_argument("--cache", action="store_true", help="Cache processed data.") + parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) + parser.add_argument("--count", type=int, default=2) + parser.add_argument("--sweep_id", type=str, default=None) + parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", type=str) + parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) + + return parser + + +if __name__ == "__main__": + parser = parameter_setting() + args = parser.parse_args() + assert args.max_iteration > args.epoch_per_test + device = torch.device(args.device) + args.lr = 0.001 + args.anneal_epoch = 200 + res = None + logger.info(f"\n{pprint.pformat(vars(args))}") + file_root_path = Path(args.root_path, args.subtask).resolve() + logger.info(f"\n files is saved in {file_root_path}") + pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") + os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" + + def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): + wandb.init(settings=wandb.Settings(start_method='thread')) + set_seed(args.seed) + wandb_config = wandb.config + if "run_kwargs" in pipeline_planer.config: + if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): + wandb_config = wandb_config["run_kwargs"] + else: + wandb.log({"skip": 1}) + wandb.finish() + return + try: + dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") + data = dataset.load_data() + + le = preprocessing.LabelEncoder() + labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + data.mod["mod1"].obsm["labels"] = labels + + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb_config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) + print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") + preprocessing_pipeline(data) + train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] + train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] + test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) 
+
+            # train_size=data.mod["meta1"].shape[0]
+            # test_size=data.mod["mod1"].shape[0]-train_size
+            data.set_split_idx("train", train_idx)
+            data.set_split_idx("test", test_idx)
+            (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch")
+            (x_test, y_test, x_test_raw, y_test_raw), labels = data.get_test_data(return_type="torch")
+            # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels)
+            lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()]))
+            lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()]))
+            lib_mean1 = torch.from_numpy(lib_mean1)
+            lib_var1 = torch.from_numpy(lib_var1)
+            lib_mean2 = torch.from_numpy(lib_mean2)
+            lib_var2 = torch.from_numpy(lib_var2)
+
+            Nfeature1 = x_train.shape[1]
+            Nfeature2 = y_train.shape[1]
+            # train_size = len(data.get_split_idx("train"))
+            # train_size=x_train.shape[0]
+            train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx],
+                                             lib_var2[train_idx], y_train)
+
+            valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx],
+                                             lib_var2[test_idx], y_test)
+
+            total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test]))
+
+            total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False)
+
+            x_test = torch.cat([x_train, x_test])
+            y_test = torch.cat([y_train, y_test])
+            labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]))  #There is probably a problem here, most likely caused by the dimensionality reduction
+            model = scMVAE(
+                encoder_1=[Nfeature1, 1024, 128, 128],
+                hidden_1=128,
+                Z_DIMS=22,
+                decoder_share=[22, 128, 256],
+                share_hidden=128,
+                decoder_1=[128, 128, 1024],
+                hidden_2=1024,
+                encoder_l=[Nfeature1, 128],
+                hidden3=128,
+                encoder_2=[Nfeature2, 1024, 128, 128],
+                hidden_4=128,
+                encoder_l1=[Nfeature2, 128],
+                hidden3_1=128,
+                decoder_2=[128, 128, 1024],
+                hidden_5=1024,
+                drop_rate=0.1,
+                log_variational=True,
+                Type="ZINB",
+                device=device,
+                n_centroids=22,
+                penality="GMM",
+                model=1,
+            )
+            model.to(device)
+            model.init_gmm_params(total_loader)
+            model.fit(args, train, valid, args.final_rate, args.scale_factor, device)
+
+            # embeds = model.predict(x_test, y_test).cpu().numpy()
+            score = model.score(x_test, y_test, labels)
+            # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems"))
+            score["ARI"] = score["dance_ari"]
+            del score["dance_ari"]
+            wandb.log(score)
+            wandb.finish()
+        finally:
+            locals_keys = list(locals().keys())
+            for var in locals_keys:
+                try:
+                    exec(f"del {var}")
+                    logger.info(f"Deleted '{var}'")
+                except NameError:
+                    logger.info(f"Variable '{var}' does not exist, continuing...")
+            torch.cuda.empty_cache()
+            gc.collect()
+        # score.update({
+        #     'seed': args.seed + k,
+        #     'subtask': args.subtask,
+        #     'method': 'scmvae',
+        # })
+
+        # if res is not None:
+        #     res = res.append(score, ignore_index=True)
+        # else:
+        #     for s in score:
+        #         score[s] = [score[s]]
+        #     res = pd.DataFrame(score)
+
+    entity, project, sweep_id = pipeline_planer.wandb_sweep_agent(
+        evaluate_pipeline, sweep_id=args.sweep_id, count=args.count)  #Score can be recorded for each epoch
+    save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path)
+    if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params":
get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, + conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", + root_path=file_root_path, + required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", + "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") + if args.tune_mode == "pipeline_params": + run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) +"""To reproduce scMVAE on other samples, please refer to command lines belows: + +GEX-ADT: +$ python scmvae.py --subtask openproblems_bmmc_cite_phase2 --device cuda + +GEX-ATAC: +$ python scmvae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda + +""" From d0c5c07e6a6f5f4ee59c15323b5febd13bf910b7 Mon Sep 17 00:00:00 2001 From: xzy Date: Tue, 17 Dec 2024 21:54:12 -0500 Subject: [PATCH 163/203] minor --- examples/tuning/cta_celltypist/main.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index c625065f..c8936b90 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,12 +7,12 @@ import numpy as np import torch -import wandb +import wandb from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist -from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data +from dance.pipeline import Pipeline, PipelinePlaner, get_step3_yaml, run_step3, save_summary_data from dance.typing import LogLevel from dance.utils import set_seed @@ -56,12 +56,20 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): wandb.init(settings=wandb.Settings(start_method='thread')) set_seed(args.seed) + if "run_kwargs" in pipeline_planer.config and tune_mode == "params": + wandb_config = dict(wandb.config) + config = {'pipeline': wandb_config["run_kwargs"], "type": "preprocessor"} + preprocessing_pipeline = Pipeline(config) + else: + # Prepare preprocessing pipeline and apply it to data + kwargs = {tune_mode: dict(wandb.config)} + preprocessing_pipeline = pipeline_planer.generate(**kwargs) # Initialize model and get model specific preprocessing pipeline model = Celltypist(majority_voting=args.majority_voting) # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb.config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) + # kwargs = {tune_mode: dict(wandb.config)} + # preprocessing_pipeline = pipeline_planer.generate(**kwargs) # Load data and perform necessary preprocessing data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset, From 577919a0fc7809877bcf6a1778fedb1fee9cfc17 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:56:25 +0000 Subject: [PATCH 164/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/cta_celltypist/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/cta_celltypist/main.py b/examples/tuning/cta_celltypist/main.py index 089e63a2..58870699 100644 --- a/examples/tuning/cta_celltypist/main.py +++ b/examples/tuning/cta_celltypist/main.py @@ -7,8 +7,8 @@ import numpy as np import 
torch - import wandb + from dance import logger from dance.datasets.singlemodality import CellTypeAnnotationDataset from dance.modules.single_modality.cell_type_annotation.celltypist import Celltypist From c7169ebf6e769a15736a299394cfd0d4aba5e246 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 11:11:00 +0800 Subject: [PATCH 165/203] minor --- dance/atlas/sc_similarity/anndata_similarity.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dance/atlas/sc_similarity/anndata_similarity.py b/dance/atlas/sc_similarity/anndata_similarity.py index ab44e6e7..88784a9d 100644 --- a/dance/atlas/sc_similarity/anndata_similarity.py +++ b/dance/atlas/sc_similarity/anndata_similarity.py @@ -1,5 +1,4 @@ # anndata_similarity.py -# TODO translate notes import re import warnings from typing import Callable, Dict, List, Optional From fa7be0a73b90a81d995557d6f5a5c60b987e956b Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 11:13:04 +0800 Subject: [PATCH 166/203] minor --- dance/sc_similarity/anndata_similarity.py | 511 ---------------------- 1 file changed, 511 deletions(-) delete mode 100644 dance/sc_similarity/anndata_similarity.py diff --git a/dance/sc_similarity/anndata_similarity.py b/dance/sc_similarity/anndata_similarity.py deleted file mode 100644 index 5409fdb6..00000000 --- a/dance/sc_similarity/anndata_similarity.py +++ /dev/null @@ -1,511 +0,0 @@ -# anndata_similarity.py -# TODO translate notes -import re -import warnings -from typing import Callable, Dict, List, Optional - -import anndata -import anndata as ad -import numpy as np -import ot -import pandas as pd -import scanpy as sc -import scipy -import yaml -from omegaconf import OmegaConf -from scipy.linalg import sqrtm -from scipy.spatial import cKDTree -from scipy.spatial.distance import cdist, directed_hausdorff, jaccard, jensenshannon -from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel - -# Suppress scipy warnings for constant input in Pearson correlation -warnings.filterwarnings("ignore", message="An input array is constant") - - -class AnnDataSimilarity: - - def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, - init_random_state: Optional[int] = None, n_runs: int = 10, - ground_truth_conf_path: Optional[str] = None, adata1_name: Optional[str] = None, - adata2_name: Optional[str] = None, - methods=['cta_actinn', 'cta_celltypist', 'cta_scdeepsort', 'cta_singlecellnet'], tissue="blood"): - """Initialize the AnnDataSimilarity object and perform data preprocessing.""" - self.origin_adata1 = adata1.copy() - self.origin_adata2 = adata2.copy() - self.sample_size = sample_size - self.init_random_state = init_random_state - self.preprocess() - self.results = {} - self.ground_truth_conf_path = ground_truth_conf_path - self.adata1_name = adata1_name - self.adata2_name = adata2_name - self.methods = methods - self.tissue = tissue - self.n_runs = n_runs - - def filter_gene(self, n_top_genes=3000): - sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') - sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') - - common_hvg = self.origin_adata1.var_names[self.origin_adata1.var['highly_variable']].intersection( - self.origin_adata2.var_names[self.origin_adata2.var['highly_variable']]) - - self.origin_adata1 = self.origin_adata1[:, common_hvg].copy() - self.origin_adata2 = self.origin_adata2[:, common_hvg].copy() - self.common_genes = common_hvg - - def preprocess(self): - """Preprocess the data, 
including log normalization and normalization to - probability distribution.""" - self.filter_gene() - - def sample_cells(self, random_state): - """Randomly sample cells from each dataset if sample_size is specified.""" - np.random.seed(random_state) - if self.sample_size is None: - self.sample_size = min(self.adata1.n_obs, self.adata2.n_obs) #need to think - if self.adata1.n_obs > self.sample_size: - indices1 = np.random.choice(self.adata1.n_obs, size=self.sample_size, replace=False) - self.sampled_adata1 = self.adata1[indices1, :].copy() - else: - self.sampled_adata1 = self.adata1.copy() - if self.adata2.n_obs > self.sample_size: - indices2 = np.random.choice(self.adata2.n_obs, size=self.sample_size, replace=False) - self.sampled_adata2 = self.adata2[indices2, :].copy() - else: - self.sampled_adata2 = self.adata2.copy() - - def normalize_data(self): # I am not sure - """Normalize the data by total counts per cell and log-transform.""" - sc.pp.normalize_total(self.adata1, target_sum=1e4) - sc.pp.log1p(self.adata1) - sc.pp.normalize_total(self.adata2, target_sum=1e4) - sc.pp.log1p(self.adata2) - - def set_prob_data(self, sampled=False): - # Normalize the data to probability distributions - if sampled: - prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) - prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) - else: - prob_adata1 = self.adata1.X / self.adata1.X.sum(axis=1) - prob_adata2 = self.adata2.X / self.adata2.X.sum(axis=1) - # Handle any NaN values resulting from division by zero - self.X = np.nan_to_num(prob_adata1).toarray() - self.Y = np.nan_to_num(prob_adata2).toarray() - - def cosine_sim_sampled(self) -> pd.DataFrame: - """Computes the average cosine similarity between all pairs of cells from the - two datasets.""" - # Compute cosine similarity matrix - sim_matrix = cosine_similarity(self.sampled_adata1.X, self.sampled_adata2.X) - # Return the average similarity - return sim_matrix.mean() - - def pearson_corr_sampled(self) -> pd.DataFrame: - """Computes the average Pearson correlation coefficient between all pairs of - cells from the two datasets.""" - # Compute Pearson correlation matrix - corr_matrix = np.corrcoef(self.sampled_adata1.X.toarray(), - self.sampled_adata2.X.toarray())[:self.sampled_adata1.n_obs, - self.sampled_adata1.n_obs:] - # Return the average correlation - return np.nanmean(corr_matrix) - - def jaccard_sim_sampled(self, threshold: float = 0.5) -> pd.DataFrame: - """Computes the average Jaccard similarity between all pairs of binarized cells - from the two datasets.""" - # Binarize the data - binary_adata1 = (self.sampled_adata1.X > threshold).astype(int) - binary_adata2 = (self.sampled_adata2.X > threshold).astype(int) - # Compute Jaccard distance matrix - distance_matrix = cdist(binary_adata1.A, binary_adata2.A, metric='jaccard') - # Convert to similarity and compute the average - similarity_matrix = 1 - distance_matrix - return similarity_matrix.mean() - - def js_divergence_sampled(self) -> float: - """Computes the average Jensen-Shannon divergence between all pairs of cells - from the two datasets.""" - # Normalize the data to probability distributions - prob_adata1 = self.sampled_adata1.X / self.sampled_adata1.X.sum(axis=1) - prob_adata2 = self.sampled_adata2.X / self.sampled_adata2.X.sum(axis=1) - # Handle any NaN values resulting from division by zero - prob_adata1 = np.nan_to_num(prob_adata1).toarray() - prob_adata2 = np.nan_to_num(prob_adata2).toarray() - - # Define a function to compute JS divergence for a pair 
of probability vectors - def jsd(p, q): - return jensenshannon(p, q) - - # Compute JS divergence matrix - jsd_vectorized = np.vectorize(jsd, signature='(n),(n)->()') - divergence_matrix = np.zeros((prob_adata1.shape[0], prob_adata2.shape[0])) - for i in range(prob_adata1.shape[0]): - divergence_matrix[i, :] = jsd_vectorized( - np.repeat(prob_adata1[i, :], prob_adata2.shape[0], axis=0).reshape(-1, prob_adata1.shape[1]), - prob_adata2) - - # Convert divergence to similarity and compute the average - similarity_matrix = 1 - divergence_matrix - return np.nanmean(similarity_matrix) - - def compute_mmd(self) -> float: - X = self.X - Y = self.Y - kernel = "rbf" - gamma = 1.0 - if kernel == 'rbf': - K_X = np.exp(-gamma * cdist(X, X, 'sqeuclidean')) - K_Y = np.exp(-gamma * cdist(Y, Y, 'sqeuclidean')) - K_XY = np.exp(-gamma * cdist(X, Y, 'sqeuclidean')) - elif kernel == 'linear': - K_X = np.dot(X, X.T) - K_Y = np.dot(Y, Y.T) - K_XY = np.dot(X, Y.T) - else: - raise ValueError("Unsupported kernel type") - - m = X.shape[0] - n = Y.shape[0] - - sum_X = (np.sum(K_X) - np.sum(np.diag(K_X))) / (m * (m - 1)) - sum_Y = (np.sum(K_Y) - np.sum(np.diag(K_Y))) / (n * (n - 1)) - sum_XY = np.sum(K_XY) / (m * n) - - mmd_squared = sum_X + sum_Y - 2 * sum_XY - mmd = np.sqrt(max(mmd_squared, 0)) - return 1 / (1 + mmd) - - def common_genes_num(self): - return len(self.common_genes) - - def otdd(self): - """Compute the OTDD between two data sets.""" - raise NotImplementedError("OTDD!") - - def data_company(self): - raise NotImplementedError("data company") - - def wasserstein_dist(self) -> float: - """Computes the average Wasserstein distance between all pairs of cells from the - two datasets.""" - X = self.X - Y = self.Y - a = np.ones((X.shape[0], )) / X.shape[0] - b = np.ones((Y.shape[0], )) / Y.shape[0] - M = ot.dist(X, Y, metric='euclidean') - wasserstein_dist = ot.emd2(a, b, M) - return 1 / 1 + wasserstein_dist - - def get_Hausdorff(self): - X = self.X - Y = self.Y - forward = directed_hausdorff(X, Y)[0] - backward = directed_hausdorff(X, Y)[0] - hausdorff_distance = max(forward, backward) - normalized_hausdorff = hausdorff_distance / np.sqrt(X.shape[1]) - similarity = 1 - normalized_hausdorff - return similarity - - def chamfer_distance(self): - X = self.X - Y = self.Y - tree_A = cKDTree(X) - tree_B = cKDTree(Y) - - distances_A_to_B, _ = tree_A.query(Y) - distances_B_to_A, _ = tree_B.query(X) - - chamfer_A_to_B = np.mean(distances_A_to_B) - chamfer_B_to_A = np.mean(distances_B_to_A) - distance = chamfer_A_to_B + chamfer_B_to_A - normalized_chamfer = distance / np.sqrt(X.shape[1]) - similarity = 1 - normalized_chamfer - return similarity - - def energy_distance_metric(self): - X = self.X - Y = self.Y - XX = cdist(X, X, 'euclidean') - YY = cdist(Y, Y, 'euclidean') - XY = cdist(X, Y, 'euclidean') - distance = 2 * np.mean(XY) - np.mean(XX) - np.mean(YY) - return 1 / (1 + distance) - - def get_sinkhorn2(self): - X = self.X - Y = self.Y - a = np.ones(X.shape[0]) / X.shape[0] - b = np.ones(Y.shape[0]) / Y.shape[0] - M = ot.dist(X, Y, metric='euclidean') - reg = 0.1 - sinkhorn_dist = ot.sinkhorn2(a, b, M, reg) - return 1 / (1 + sinkhorn_dist) - - def bures_distance(self): - X = self.X - Y = self.Y - C1 = np.cov(X, rowvar=False) - C2 = np.cov(Y, rowvar=False) - sqrt_C1 = sqrtm(C1) - product = sqrt_C1 @ C2 @ sqrt_C1 - sqrt_product = sqrtm(product) - trace = np.trace(C1) + np.trace(C2) - 2 * np.trace(sqrt_product) - return 1 / (1 + np.sqrt(max(trace, 0))) - - def spectral_distance(self): - X = self.X - Y = self.Y - C1 = 
np.cov(X, rowvar=False) - C2 = np.cov(Y, rowvar=False) - eig_A = np.linalg.eigvalsh(C1) - eig_B = np.linalg.eigvalsh(C2) - return 1 / (1 + np.linalg.norm(eig_A - eig_B)) - - def get_dataset_meta_sim(self): - # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] - con_cols = [ - "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", - "n_counts_var", "var_n_counts_mean", "var_n_counts_var" - ] - dis_cols = ['assay', 'tissue'] - - def get_discrete_sim(col_list1, col_list2): - set1 = set(col_list1) - set2 = set(col_list2) - intersection = len(set1.intersection(set2)) - union = len(set1.union(set2)) - return intersection / union - - def get_con_sim(con_data_1, con_data_2): - return abs(con_data_1 - con_data_2) / max(con_data_1, con_data_2) - - def get_dataset_info(data: ad.AnnData): - con_sim = {} - con_sim["nnz_mean"] = np.mean(data.obs["nnz"]) #sample 10000之后这里是应该更新的 - con_sim["nnz_var"] = np.var(data.obs["nnz"]) - nnz_values = data.X[data.X.nonzero()] - con_sim["nnz_counts_mean"] = np.mean(nnz_values) - con_sim["nnz_counts_var"] = np.var(nnz_values) - con_sim["n_measured_vars"] = np.mean(data.obs["n_measured_vars"]) - con_sim["cell_num"] = len(data.obs) - con_sim["gene_num"] = len(data.var) - con_sim["n_counts_mean"] = np.mean(data.obs["n_counts"]) - con_sim["n_counts_var"] = np.var(data.obs["n_counts"]) - if "n_counts" not in data.var.columns: - if scipy.sparse.issparse(data.X): - gene_counts = np.array(data.X.sum(axis=0)).flatten() - else: - gene_counts = data.X.sum(axis=0) - data.var["n_counts"] = gene_counts - data.var["n_counts"] = data.var["n_counts"].astype(float) - con_sim["var_n_counts_mean"] = np.mean(data.var["n_counts"]) - con_sim["var_n_counts_var"] = np.var(data.var["n_counts"]) - data.uns["con_sim"] = con_sim - return data - - data_1 = self.adata1.copy() - data_2 = self.adata2.copy() - data_1 = get_dataset_info(data_1) - data_2 = get_dataset_info(data_2) - ans = {} - obs_1 = data_1.obs - obs_2 = data_2.obs - con_sim_1 = data_1.uns["con_sim"] - con_sim_2 = data_2.uns["con_sim"] - for dis_col in dis_cols: - ans[f"{dis_col}_sim"] = get_discrete_sim(obs_1[dis_col].values, obs_2[dis_col].values) - for con_col in con_cols: - ans[f"{con_col}_sim"] = get_con_sim(con_sim_1[con_col], con_sim_2[con_col]) - return np.mean(list(ans.values())) - - def get_ground_truth(self): - assert self.ground_truth_conf_path is not None - assert self.adata1_name is not None - assert self.adata2_name is not None - ground_truth_conf = pd.read_excel(self.ground_truth_conf_path, sheet_name=self.tissue, index_col=0) - - def get_targets(dataset_truth: str): - dataset_truth = OmegaConf.create(fix_yaml_string(dataset_truth)) - targets = [] - for item in dataset_truth: - targets.append(item["target"]) - return targets - - sim_targets = [] - for method in self.methods: - query_dataset_truth = ground_truth_conf.loc[self.adata1_name, f"{method}_method"] - atlas_dataset_truth = ground_truth_conf.loc[self.adata2_name, f"{method}_method"] - query_targets = get_targets(query_dataset_truth) - atlas_targets = get_targets(atlas_dataset_truth) - assert len(query_targets) == len(atlas_targets) - sim_targets.append((sum(a == b for a, b in zip(query_targets, atlas_targets)), len(query_targets))) - sim_targets.append((sum(x for x, y in sim_targets), sum(y for x, y in sim_targets))) - return sim_targets - - def compute_similarity( - self, random_state: int, methods: 
List[str] = [ - 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" - ] - ) -> Dict[str, float]: - """Computes the specified similarity measure. Parameters: - - methods: List of similarity measures to be computed. Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' - Returns: - Dictionary containing the similarity matrices - - """ - self.adata1 = self.origin_adata1.copy() - self.adata2 = self.origin_adata2.copy() - self.normalize_data() - self.sample_cells(random_state) - self.set_prob_data() - - results = {} - for method in methods: - print(method) - if method == 'cosine': - results['cosine'] = self.cosine_sim_sampled() - elif method == 'pearson': - results['pearson'] = self.pearson_corr_sampled() - elif method == 'jaccard': - results['jaccard'] = self.jaccard_sim_sampled() - elif method == 'js_distance': - results['js_distance'] = self.js_divergence_sampled() - elif method == 'wasserstein': - results['wasserstein'] = self.wasserstein_dist() - elif method == "common_genes_num": - results["common_genes_num"] = self.common_genes_num() - elif method == "Hausdorff": - results["Hausdorff"] = self.get_Hausdorff() - elif method == "chamfer": - results["chamfer"] = self.chamfer_distance() - elif method == "energy": - results["energy"] = self.energy_distance_metric() - elif method == "sinkhorn2": - results["sinkhorn2"] = self.get_sinkhorn2() - elif method == "bures": - results["bures"] = self.bures_distance() - elif method == "spectral": - results["spectral"] = self.spectral_distance() - elif method == "otdd": - results['otdd'] = self.otdd() - elif method == "ground_truth": - results["ground_truth"] = self.get_ground_truth() - elif method == "metadata_sim": - results["metadata_sim"] = self.get_dataset_meta_sim() - elif method == "mmd": - results["mmd"] = self.compute_mmd() - else: - raise ValueError(f"Unsupported similarity method: {method}") - return results - - def get_similarity_matrix_A2B( - self, methods: List[str] = [ - "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", - "ground_truth", "metadata_sim", "mmd" - ] - ) -> Dict[str, float]: - """Same as compute_similarity, keeping method name consistency.""" - cumulative_results = {method: 0.0 for method in methods} - - for run in range(self.n_runs): - # Update random state for each run - if self.init_random_state is not None: - current_random_state = self.init_random_state + run - else: - current_random_state = None - run_results = self.compute_similarity(methods=methods, random_state=current_random_state) - for method in methods: - if method in ["ground_truth"]: - cumulative_results[method] = run_results[method] - else: - cumulative_results[method] += run_results[method] - # Average the results over the number of runs - averaged_results = { - method: - cumulative_results[method] if method in ["ground_truth"] else cumulative_results[method] / self.n_runs - for method in methods - } - return averaged_results - - # def get_max_similarity_A_to_B(self): - # if self.results is None: - # raise ValueError(f"need results!") - # else: - # self.results_score = {} - # for key in self.results: - # if key not in ["common_genes_num", "ground_truth", "metadata_sim"]: - # self.results_score[key] = self._get_max_similarity(self.results[key]) - # else: - # self.results_score[key] = self.results[key] - # return self.results_score - - # def _get_max_similarity(self, similarity_matrix: pd.DataFrame): - # """Maximum matching average 
similarity score.""" - # matched_values = [ - # similarity_matrix.loc[label, - # label] if label in similarity_matrix.columns else similarity_matrix.loc[label].max() - # for label in similarity_matrix.index - # ] # need to ask - # overall_similarity = np.mean(matched_values) - # return overall_similarity - - -def extract_type_target_params(item_text): - lines = item_text.strip().split('\n') - item_dict = {} - params_dict = {} - current_param_key = None - in_params = False - for line in lines: - stripped_line = line.strip() - if stripped_line.startswith('- type:'): - item_dict['type'] = stripped_line.split(':', 1)[1].strip() - elif stripped_line.startswith('target:'): - item_dict['target'] = stripped_line.split(':', 1)[1].strip() - elif stripped_line.startswith('params:'): - params_content = stripped_line.split(':', 1)[1].strip() - if params_content == '{}': - params_dict = {} - in_params = False - else: - params_dict = {} - in_params = True - elif in_params: - if re.match(r'^\w+:$', stripped_line): - current_param_key = stripped_line[:-1].strip() - params_dict[current_param_key] = {} - elif re.match(r'^- ', stripped_line): - list_item = stripped_line[2:].strip() - if current_param_key: - if not isinstance(params_dict[current_param_key], list): - params_dict[current_param_key] = [] - params_dict[current_param_key].append(list_item) - elif ':' in stripped_line: - key, value = map(str.strip, stripped_line.split(':', 1)) - if current_param_key and isinstance(params_dict.get(current_param_key, None), dict): - params_dict[current_param_key][key] = yaml.safe_load(value) - else: - params_dict[key] = yaml.safe_load(value) - item_dict['params'] = params_dict - return item_dict - - -def fix_yaml_string(original_str): - #It will be deleted - yaml_str = original_str.replace('\\n', '\n').strip() - items = re.split(r'(?=-\s*type:)', yaml_str) - config_list = [] - for item in items: - if not item.strip(): - continue - if not item.strip().startswith('- type:'): - print(item) - print("警告: 某个项未以 '- type:' 开头,跳过此项.") - continue - item_dict = extract_type_target_params(item) - config_list.append(item_dict) - fixed_yaml = yaml.dump(config_list, sort_keys=False) - return fixed_yaml From a66bf43829b35b4d63386ff1389744d372c5ae86 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 11:31:10 +0800 Subject: [PATCH 167/203] add note --- dance/transforms/filter.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dance/transforms/filter.py b/dance/transforms/filter.py index 8ec0ac1f..ef703d0d 100644 --- a/dance/transforms/filter.py +++ b/dance/transforms/filter.py @@ -274,6 +274,20 @@ def __init__( @register_preprocessor("filter", "cell") @add_mod_and_transform class FilterCellsCommonMod(BaseTransform): + """Initialize the FilterCellsCommonMod class. + + Parameters + ---------- + mod1 : str + Name of the first modality in the single-cell dataset. + mod2 : str + Name of the second modality in the single-cell dataset. + sol : Optional[str], default=None + Name of the optional solution dataset containing cell labels or annotations. + **kwargs : dict + Additional keyword arguments passed to the base transformation class. 
+ + """ def __init__(self, mod1: str, mod2: str, sol: Optional[str] = None, **kwargs): super().__init__(**kwargs) From 78493331a45724c09ece9fab37eb0705ed67f176 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 11:49:24 +0800 Subject: [PATCH 168/203] minor --- dance/utils/wrappers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 28ffefe8..74336b9c 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -107,6 +107,16 @@ def new_init(self, *args, **kwargs): @functools.wraps(original_call) def new_call(self, data: Data, *args, **kwargs): + """ + Parameters + ---------- + data : Data + The input data object containing the `mudata` with multiple modalities. + Returns + ------- + Any + The result of the original_call method. + """ if hasattr(self, 'mod') and self.mod is not None: md_data = data.data ad_data = Data(data=transform_mod_to_anndata(md_data, self.mod)) From 1b8dd4769a7778cf0a7f4e2450e57ee7dcdb4b6a Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Wed, 18 Dec 2024 11:51:33 +0800 Subject: [PATCH 169/203] minor --- dance/utils/wrappers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dance/utils/wrappers.py b/dance/utils/wrappers.py index 74336b9c..a7f308ee 100644 --- a/dance/utils/wrappers.py +++ b/dance/utils/wrappers.py @@ -95,6 +95,8 @@ def wrapped_func(*args): def add_mod_and_transform(cls): + """A decorator that modifies a class to add functionality for working with specific + modalities (`mod`) in a `mudata` object.""" original_init = cls.__init__ original_call = cls.__call__ cls.add_mod_and_transform = "add_mod_and_transform" From e420a6d049014762be0e8fb87846ab053c7ec523 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 17:00:36 +0800 Subject: [PATCH 170/203] translate notes --- examples/get_result_web.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index 891c98d9..c4d6a0d8 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -16,22 +16,24 @@ def check_identical_strings(string_list): if not string_list: - raise ValueError("列表为空") + raise ValueError("The list is empty") arr = np.array(string_list) if not np.all(arr == arr[0]): - raise ValueError("发现不同的字符串") + raise ValueError("Different strings found") return string_list[0] # if not string_list: - # raise ValueError("列表为空") + # raise ValueError("The list is empty") # first_string = string_list[0] # for s in string_list[1:]: # if s != first_string: - # raise ValueError(f"发现不同的字符串: '{first_string}' 和 '{s}'") + # raise ValueError(f"Different strings found: '{first_string}' and '{s}'") # return first_string + + def get_sweep_url(step_csv: pd.DataFrame, single=True): ids = step_csv["id"] sweep_urls = [] @@ -57,7 +59,7 @@ def spilt_web(url: str): if match: entity = match.group(1) project = match.group(2) - pattern = r'/sweeps/([^/?]+)' # 正则表达式模式 + pattern = r'/sweeps/([^/?]+)' # Regular expression pattern match = re.search(pattern, url) if match: sweep_id = match.group(1) @@ -161,8 +163,8 @@ def write_ans(tissue): step3_urls = [] for i in range(3): file_csv = f"{file_path}/results/params/{i}_best_test_acc.csv" - if not os.path.exists(file_csv): #no parameter - print(f"文件 {file_csv} 不存在,跳过。") + if not os.path.exists(file_csv): # no parameter + print(f"File {file_csv} does not exist, skipping.") continue step3_urls.append(get_sweep_url(pd.read_csv(file_csv))) step3_str = ",".join(step3_urls) From 
67fb57b97f0a8bde492c22d2fa27043446087938 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:02:17 +0000 Subject: [PATCH 171/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/get_result_web.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index c4d6a0d8..c69e9d4e 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -24,7 +24,6 @@ def check_identical_strings(string_list): return string_list[0] - # if not string_list: # raise ValueError("The list is empty") # first_string = string_list[0] From d4529734e25a5b8c1db01012a8642412728cd2d4 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 20:07:17 +0800 Subject: [PATCH 172/203] add argument preprocess --- examples/multi_modality/joint_embedding/jae.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_modality/joint_embedding/jae.py b/examples/multi_modality/joint_embedding/jae.py index 0b1d79bb..7504f2e7 100644 --- a/examples/multi_modality/joint_embedding/jae.py +++ b/examples/multi_modality/joint_embedding/jae.py @@ -25,7 +25,7 @@ parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") parser.add_argument("--span", type=float, default=0.3) - + parser.add_argument("--preprocess", default="aux") args = parser.parse_args() device = args.device @@ -34,7 +34,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess, normalize=True, span=args.span) data = dataset.load_data() @@ -44,7 +44,7 @@ feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) - if True: + if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) From 631ce663eefa9478cb1c4d3fb906e56dbcd50ec1 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 20:10:29 +0800 Subject: [PATCH 173/203] add argument preprocess --- examples/multi_modality/joint_embedding/scmogcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_modality/joint_embedding/scmogcn.py b/examples/multi_modality/joint_embedding/scmogcn.py index 44e2a748..51e556c2 100644 --- a/examples/multi_modality/joint_embedding/scmogcn.py +++ b/examples/multi_modality/joint_embedding/scmogcn.py @@ -28,7 +28,7 @@ parser.add_argument("-nm", "--normalize", default=1, type=int, choices=[0, 1]) parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") parser.add_argument("--span", type=float, default=0.3) - + parser.add_argument("--preprocess", default="aux") args = parser.parse_args() device = args.device @@ -37,7 +37,7 @@ rndseed = args.seed set_seed(rndseed) - dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess="pca", normalize=True, + dataset = JointEmbeddingNIPSDataset(args.subtask, root=args.data_folder, preprocess=args.preprocess, normalize=True, span=args.span) data = dataset.load_data() train_size = 
len(data.get_split_idx("train")) @@ -50,7 +50,7 @@ feature_channel=["X_pca", "X_pca"], label_channel=["cell_type", "batch_label", "phase_labels", "S_scores", "G2M_scores"], ) - if True: + if args.preprocess != "aux": cell_type_labels = data.data['test_sol'].obs["cell_type"].to_numpy() cell_type_labels_unique = list(np.unique(cell_type_labels)) c_labels = np.array([cell_type_labels_unique.index(item) for item in cell_type_labels]) From 0f3c2686f7d4a05c8b52f0a058c3689f048cce31 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 21:15:09 +0800 Subject: [PATCH 174/203] translate notes --- .../result_analysis/get_important_pattern.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py index 657fe9b0..e44a871c 100644 --- a/examples/result_analysis/get_important_pattern.py +++ b/examples/result_analysis/get_important_pattern.py @@ -8,6 +8,7 @@ from itertools import combinations from os import X_OK from pathlib import Path +from venv import logger import matplotlib.pyplot as plt import numpy as np @@ -28,9 +29,9 @@ from typing_extensions import deprecated -#TODO need to sync all files or get sweep,not file +#use get_important_pattern_sweep.py #asceding need to think -#负向的pattern,换一下顺序就可以吧 +#Negative pattern, just need to change the order def get_important_pattern(test_accs, ascending, vis=True, alpha=0.05, title=""): if vis: @@ -78,10 +79,6 @@ def change_real_rank(rank_item, real_rank): return [] -def replace_nan_in_2d(lst): #nan应该是个极差的值而不是直接删掉 - return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] - - def are_all_elements_same_direct(list_2d): first_element = None for sublist in list_2d: @@ -130,6 +127,10 @@ def get_significant_top_n_zscore(data, n=3, threshold=1.0, ascending=False): def get_test_acc_and_names(step2_data, metric_name): + + def replace_nan_in_2d(lst): #nan should be an extreme value rather than being directly deleted + return [[np.nan if item == 'NaN' else item for item in sublist] for sublist in lst] + columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) test_accs = [] test_acc_names = [] @@ -154,7 +155,7 @@ def get_com_all(step2_data, metric_name, ascending, vis=True, alpha=0.05): ans_all = [] test_accs, test_acc_names = get_test_acc_and_names(step2_data, metric_name) final_ranks = get_important_pattern(test_accs, ascending, alpha=alpha, title="all_pattern", vis=vis) - if len(final_ranks) > 0: #TODO maybe need to think ascending + if len(final_ranks) > 0: max_rank = max(final_ranks) max_rank_count = final_ranks.count(max_rank) if max_rank_count < len(final_ranks) / 2: @@ -179,7 +180,7 @@ def get_forest_model_pattern(step2_data, metric_name): X = step2_data.loc[:, columns] y = step2_data.loc[:, metric_name] preprocessor = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'), - columns) # drop='first'防止虚拟变量陷阱 + columns) # drop='first' to prevent dummy variable trap ]) pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', @@ -201,23 +202,26 @@ def get_forest_model_pattern(step2_data, metric_name): scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, - refit=True # 确保在所有数据上重新训练最佳模型 + refit=True # Ensure the best model is retrained on all data ) grid_search.fit(X, y) best_pipeline = grid_search.best_estimator_ model = best_pipeline.named_steps['regressor'] - X_preprocessed = best_pipeline.named_steps['preprocessor'].transform( - X) #TODO 
best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)是否和X_preprocessed一定是对应的? - explainer = shapiq.TreeExplainer(model=model, index="k-SII", max_order=3) #思考为什么没有负值,因为是绝对值相加,可能是为了正负值不会相互抵消 + X_preprocessed = best_pipeline.named_steps['preprocessor'].transform(X) + feature_names = best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns) + logger.info(f"X_preprocessed.columns={X_preprocessed.columns}") + logger.info(f"feature_names={feature_names}") + explainer = shapiq.TreeExplainer( + model=model, index="k-SII", max_order=3 + ) # Consider why there are no negative values, possibly to prevent cancellation of positive and negative values list_of_interaction_values = explainer.explain_X(X_preprocessed.toarray(), n_jobs=96, random_state=42) plt.cla() - ax = shapiq.plot.bar_plot(list_of_interaction_values, - feature_names=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns), - max_display=None, show=False, need_abbreviate=False) + ax = shapiq.plot.bar_plot(list_of_interaction_values, feature_names=feature_names, max_display=None, show=False, + need_abbreviate=False) ax.yaxis.get_major_locator().MAXTICKS = 1000000 plt.show() rects = ax.containers[0] - yticklabels = ax.get_yticklabels() #label和rect是否重合需要验证 + yticklabels = ax.get_yticklabels() # Need to verify if labels and rectangles overlap shap_ans = {} for rect, label in zip(rects, yticklabels): xy = rect.get_xy() @@ -229,7 +233,7 @@ def get_forest_model_pattern(step2_data, metric_name): raise RuntimeError("Features should not be repeated") shap_ans[k] = v - ans = get_significant_items(shap_ans) #检查一下是不是真的pattern,好像结果不太好,再检验一下 + ans = get_significant_items(shap_ans) # Check if it's really a pattern, the results seem not good, need to verify preprocessed_df = pd.DataFrame(X_preprocessed.toarray(), index=X.index, columns=best_pipeline.named_steps['preprocessor'].get_feature_names_out(columns)) preprocessed_df[metric_name] = step2_data[metric_name] From 53551304dcc4d2b0252d2f862de9e9f17b4b9ea3 Mon Sep 17 00:00:00 2001 From: xzy Date: Wed, 18 Dec 2024 21:34:47 +0800 Subject: [PATCH 175/203] translate notes --- examples/result_analysis/get_important_pattern_sweep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 896a0d0a..9fe72ada 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -50,7 +50,7 @@ def get_additional_sweep(sweep_id): #last run command run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None) additional_sweep_ids = [sweep_id] - if run is None: #check summary data num,note aznph5wt,数量可能不一致。 + if run is None: # check summary data count, note aznph5wt, quantities may be inconsistent return additional_sweep_ids run_id = run.id web_abs = requests.get(f"https://api.wandb.ai/files/{run.entity}/{run.project}/{run_id}/wandb-metadata.json") @@ -73,7 +73,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(min_metric): return { "error": - f"All {metric_name} values ​​are NaN and the minimum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the minimum cannot be calculated. Please check your data." 
} step2_data[metric_name] = step2_data[metric_name].fillna(0) #if ascending=False else: @@ -81,7 +81,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(max_metric): return { "error": - f"All {metric_name} values ​​are NaN and the maximum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the maximum cannot be calculated. Please check your data." } print(f"\nmax {metric_name}:{max_metric}") buffer_percentage = 0.2 # 20% From 7c8c6575110847a30121e0df68cd14b43c81b631 Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 17:14:16 +0800 Subject: [PATCH 176/203] add atlas --- dance/atlas/data_dropbox_upload.py | 101 ++++++++++++++++++ .../pipeline_params_tuning_config.yaml | 73 +++++++++++++ examples/atlas/config/commands.yaml | 2 + examples/atlas/config/run_config.yaml | 8 ++ examples/atlas/setup_run.py | 71 ++++++++++++ examples/atlas/upload_data.py | 69 ++++++++++++ .../get_important_pattern_sweep.py | 6 +- 7 files changed, 327 insertions(+), 3 deletions(-) create mode 100644 dance/atlas/data_dropbox_upload.py create mode 100644 examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml create mode 100644 examples/atlas/config/commands.yaml create mode 100644 examples/atlas/config/run_config.yaml create mode 100644 examples/atlas/setup_run.py create mode 100644 examples/atlas/upload_data.py diff --git a/dance/atlas/data_dropbox_upload.py b/dance/atlas/data_dropbox_upload.py new file mode 100644 index 00000000..718e07c9 --- /dev/null +++ b/dance/atlas/data_dropbox_upload.py @@ -0,0 +1,101 @@ +import json +import os +import pathlib + +import dropbox +import numpy as np +import pandas as pd +import scanpy as sc +from dropbox.exceptions import ApiError, AuthError + +from dance.utils import logger + + +def upload_file_to_dropbox(dropbox_path, access_token, local_path): + dbx = dropbox.Dropbox(access_token) + + # Verify access token + try: + dbx.users_get_current_account() + except AuthError as err: + print("ERROR: Invalid access token; please check your access token.") + return None + try: + file_upload(dbx=dbx, local_path=local_path, remote_path=dropbox_path) + print("Upload successful.") + except ApiError as err: + print(f"API error: {err}") + return None + + +def file_upload(dbx: dropbox.Dropbox, local_path: pathlib.Path, remote_path: str): + CHUNKSIZE = 100 * 1024 * 1024 + upload_session_start_result = dbx.files_upload_session_start(b'') + cursor = dropbox.files.UploadSessionCursor(session_id=upload_session_start_result.session_id, offset=0) + with local_path.open("rb") as f: + while True: + data = f.read(CHUNKSIZE) + if data == b"": + break + logger.debug("Pushing %d bytes", len(data)) + dbx.files_upload_session_append_v2(data, cursor) + cursor.offset += len(data) + commit = dropbox.files.CommitInfo(path=remote_path) + dbx.files_upload_session_finish(b'', cursor, commit) + + +def create_shared_link(dbx, dropbox_path): + """Create or get existing shared link. 
+ + :param dbx: Dropbox object + :param dropbox_path: File path on Dropbox + :return: Shared link URL + + """ + try: + links = dbx.sharing_list_shared_links(path=dropbox_path, direct_only=True).links + if links: + # If shared link already exists, return the first one + return links[0].url + else: + # Create a new shared link + link = dbx.sharing_create_shared_link_with_settings(dropbox_path) + return link.url + except ApiError as err: + print(f"Error creating shared link: {err}") + return None + + +def get_link(data_fname, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): + DROPBOX_DEST_PATH = DROPBOX_DEST_PATH + "/" + data_fname + + upload_file_to_dropbox(dropbox_path=DROPBOX_DEST_PATH, access_token=ACCESS_TOKEN, local_path=local_path) + + # Create Dropbox object to get shared link + dbx = dropbox.Dropbox(ACCESS_TOKEN) + # Get shared link + shared_link = create_shared_link(dbx, DROPBOX_DEST_PATH) + if shared_link: + # Dropbox shared link defaults to `dl=0` at the end, which means preview in browser. + # change it to `dl=1`. + download_link = shared_link.replace('&dl=0', '&dl=1') + print(f"Download link: {download_link}") + return download_link + else: + print("Unable to get shared link.") + + +def get_ans(data: sc.AnnData, tissue: str, dataset_id: str, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): + # keys=["species","tissue","dataset","split","celltype_fname","celltype_url","data_fname","data_url"] + ans = {} + ans["species"] = "human" + ans["tissue"] = tissue.capitalize() + ans["dataset"] = data.n_obs + ans["split"] = "train" + ans["celltype_fname"] = "" + ans["celltype_url"] = "" + ans["data_fname"] = f"train_human_{tissue.capitalize()}{dataset_id}_data.h5ad" + ans["data_url"] = get_link(data_fname=ans["data_fname"].split("_", 1)[1], local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH) + ans["is_ALL_Integer"] = np.all(np.equal(data.X.data, data.X.data.astype(int))) + return ans diff --git a/examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml b/examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml new file mode 100644 index 00000000..fb607022 --- /dev/null +++ b/examples/atlas/config/atlas_template_yamls/cta_actinn/pipeline_params_tuning_config.yaml @@ -0,0 +1,73 @@ +type: preprocessor +tune_mode: pipeline_params +pipeline_tuning_top_k: 3 +parameter_tuning_freq_n: 20 +pipeline: + - type: filter.gene + include: + - FilterGenesPercentile + - FilterGenesScanpyOrder + - FilterGenesPlaceHolder + default_params: + FilterGenesScanpyOrder: + order: ["min_counts", "min_cells", "max_counts", "max_cells"] + min_counts: 0.01 + max_counts: 0.99 + min_cells: 0.01 + max_cells: 0.99 + - type: normalize + include: + - ScaleFeature + - ScTransform + - Log1P + - NormalizeTotal + - NormalizePlaceHolder + default_params: + ScTransform: + processes_num: 8 + - type: filter.gene + include: + - HighlyVariableGenesLogarithmizedByMeanAndDisp + - HighlyVariableGenesRawCount + - HighlyVariableGenesLogarithmizedByTopGenes + - FilterGenesTopK + - FilterGenesRegression + - FilterGenesNumberPlaceHolder + default_params: + FilterGenesTopK: + num_genes: 3000 + FilterGenesRegression: + num_genes: 3000 + HighlyVariableGenesRawCount: + n_top_genes: 3000 + HighlyVariableGenesLogarithmizedByTopGenes: + n_top_genes: 3000 + - type: feature.cell + include: + - WeightedFeaturePCA + - WeightedFeatureSVD + - CellPCA + - CellSVD + - GaussRandProjFeature # Registered custom preprocessing func + - FeatureCellPlaceHolder + params: + 
out: feature.cell + log_level: INFO + default_params: + WeightedFeaturePCA: + split_name: train + WeightedFeatureSVD: + split_name: train + - type: misc + target: SetConfig + params: + config_dict: + feature_channel: feature.cell + label_channel: cell_type +wandb: + entity: xzy11632 + project: dance-dev + method: grid #try grid to provide a comprehensive search + metric: + name: acc # val/acc + goal: maximize diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml new file mode 100644 index 00000000..fc68d057 --- /dev/null +++ b/examples/atlas/config/commands.yaml @@ -0,0 +1,2 @@ +cta_actinn: + command:"python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" diff --git a/examples/atlas/config/run_config.yaml b/examples/atlas/config/run_config.yaml new file mode 100644 index 00000000..c88f3378 --- /dev/null +++ b/examples/atlas/config/run_config.yaml @@ -0,0 +1,8 @@ +runs: + - algorithm_name: cta_actinn + dataset_id: "0bc7235a-ae5a-479d-a487-510435377e55" + species: human + tissue: Brain + filetype: h5ad + count: 800 + device: cuda:0 diff --git a/examples/atlas/setup_run.py b/examples/atlas/setup_run.py new file mode 100644 index 00000000..d29d9e37 --- /dev/null +++ b/examples/atlas/setup_run.py @@ -0,0 +1,71 @@ +import argparse +import os +import shutil +import sys + +import yaml + +from dance.settings import DANCEDIR + + +def load_commands(config_path): + with open(config_path, encoding='utf-8') as f: + return yaml.safe_load(f) + + +def load_run_configs(run_config_path): + with open(run_config_path, encoding='utf-8') as f: + return yaml.safe_load(f) + + +def main(): + parser = argparse.ArgumentParser(description='Setup run parameters') + parser.add_argument('--config', type=str, default="config/run_config.yaml", help='Run configuration YAML file') + + args = parser.parse_args() + + run_configs = load_run_configs(args.config) + + commands_config = load_commands("commands.yaml") + + for run in run_configs.get("runs", []): + algorithm_name = run.get('algorithm_name') + dataset_id = run.get('dataset_id') + species = run.get('species') + tissue = run.get('tissue') + filetype = run.get('filetype') + count = run.get('count') + device = run.get('device') + + # Define paths + template_path = os.path.join("config/atlas_template_yamls", + f"{algorithm_name}/pipeline_params_tuning_config.yaml") + config_dir = f"{DANCEDIR}/examples/tuning/{algorithm_name}/{dataset_id}" + os.makedirs(config_dir, exist_ok=True) + config_filename = f"pipeline_params_tuning_config.yaml" + config_path = os.path.join(config_dir, config_filename) + + # Copy configuration file + shutil.copy(template_path, config_path) + print(f"Template copied to {config_path}") + + if algorithm_name not in commands_config.get("algorithms", {}): + print(f"Error: Command not found for algorithm '{algorithm_name}'. 
Please check commands.yaml file.") + continue + + command_template = commands_config["algorithms"][algorithm_name]["command"] + run_command = command_template.format(dataset_id=dataset_id, species=species, tissue=tissue, filetype=filetype, + count=count, device=device) + + # Append the run command to run.sh + run_sh_path = f"{DANCEDIR}/examples/tuning/{algorithm_name}/run.sh" + with open(run_sh_path, "a", encoding='utf-8') as run_script: + run_script.write(f"{run_command}\n") + + print(f"Run command appended to {run_sh_path}: {run_command}") + + print("All run configurations have been processed.") + + +if __name__ == "__main__": + main() diff --git a/examples/atlas/upload_data.py b/examples/atlas/upload_data.py new file mode 100644 index 00000000..29b45a18 --- /dev/null +++ b/examples/atlas/upload_data.py @@ -0,0 +1,69 @@ +import argparse +import json +import pathlib + +import pandas as pd +import scanpy as sc + +from dance.atlas.data_dropbox_upload import get_ans, get_data + +if __name__ == "__main__": + args = argparse.ArgumentParser() + args.add_argument("--maindir", type=str) + args.add_argument("--filedir", type=str) + args.add_argument("--tissues", type=str, nargs="+") + args.add_argument("--access_token", type=str) + args.add_argument("--dropbox_dest_path", type=str, + default="/preprocessing_benchmarking/cell_type_annotation/TEMP_Tran_5_Datasets/human") + args = args.parse_args() + MAINDIR = pathlib.Path(args.maindir) + FILEDIR = pathlib.Path(args.filedir) + tissues = args.tissues + # tissues=["kidney","lung","pancreas"] + # Configuration parameters + ACCESS_TOKEN = args.access_token + DROPBOX_DEST_PATH = args.dropbox_dest_path # Destination path on Dropbox + + def get_data(dataset_id, in_atlas=False, large=False): + if large: + if in_atlas: + local_path = MAINDIR / f"sampled-10000/{tissue}/{dataset_id}.h5ad" + else: + local_path = FILEDIR / f"sampled-10000/{tissue}/{dataset_id}.h5ad" + else: + local_path = MAINDIR / f"{tissue}/{dataset_id}.h5ad" + data = sc.read_h5ad(local_path) + return data, local_path + + ans_all = [] + + with open(FILEDIR / "results/atlas_result.json") as f: + result = json.load(f) + with open(FILEDIR / "results/query_result.json") as f: + query_result = json.load(f) + for tissue in tissues: + large_dataset_ids = result[tissue][0] + small_dataset_ids = result[tissue][1] + for large_dataset_id in large_dataset_ids: + data, local_path = get_data(dataset_id=large_dataset_id, in_atlas=True, large=True) + ans_all.append( + get_ans(dataset_id=large_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + for small_dataset_id in small_dataset_ids: + data, local_path = get_data(dataset_id=small_dataset_id, in_atlas=True, large=False) + ans_all.append( + get_ans(dataset_id=small_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + large_query_dataset_ids = query_result[tissue][0] + small_query_dataset_ids = query_result[tissue][1] + for large_query_dataset_id in large_query_dataset_ids: + data, local_path = get_data(dataset_id=large_query_dataset_id, in_atlas=False, large=True) + ans_all.append( + get_ans(dataset_id=large_query_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + for small_query_dataset_id in small_query_dataset_ids: + data, local_path = get_data(dataset_id=small_query_dataset_id, in_atlas=False, large=False) + ans_all.append( + 
get_ans(dataset_id=small_query_dataset_id, tissue=tissue, data=data, local_path=local_path, + ACCESS_TOKEN=ACCESS_TOKEN, DROPBOX_DEST_PATH=DROPBOX_DEST_PATH)) + pd.DataFrame(ans_all).set_index("species").to_csv(",".join(tissues) + "scdeeepsort.csv") diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 896a0d0a..9fe72ada 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -50,7 +50,7 @@ def get_additional_sweep(sweep_id): #last run command run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None) additional_sweep_ids = [sweep_id] - if run is None: #check summary data num,note aznph5wt,数量可能不一致。 + if run is None: # check summary data count, note aznph5wt, quantities may be inconsistent return additional_sweep_ids run_id = run.id web_abs = requests.get(f"https://api.wandb.ai/files/{run.entity}/{run.project}/{run_id}/wandb-metadata.json") @@ -73,7 +73,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(min_metric): return { "error": - f"All {metric_name} values ​​are NaN and the minimum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the minimum cannot be calculated. Please check your data." } step2_data[metric_name] = step2_data[metric_name].fillna(0) #if ascending=False else: @@ -81,7 +81,7 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F if pd.isna(max_metric): return { "error": - f"All {metric_name} values ​​are NaN and the maximum cannot be calculated. Please check your data." + f"All {metric_name} values are NaN and the maximum cannot be calculated. Please check your data." 
} print(f"\nmax {metric_name}:{max_metric}") buffer_percentage = 0.2 # 20% From 7738d7d0d24ace2b61b9d91d1037336e9a0f3ceb Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 17:27:47 +0800 Subject: [PATCH 177/203] minor --- examples/atlas/upload_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/atlas/upload_data.py b/examples/atlas/upload_data.py index 29b45a18..2a6f3f8b 100644 --- a/examples/atlas/upload_data.py +++ b/examples/atlas/upload_data.py @@ -13,8 +13,7 @@ args.add_argument("--filedir", type=str) args.add_argument("--tissues", type=str, nargs="+") args.add_argument("--access_token", type=str) - args.add_argument("--dropbox_dest_path", type=str, - default="/preprocessing_benchmarking/cell_type_annotation/TEMP_Tran_5_Datasets/human") + args.add_argument("--dropbox_dest_path", type=str) args = args.parse_args() MAINDIR = pathlib.Path(args.maindir) FILEDIR = pathlib.Path(args.filedir) From 35b105c06b813677dbc6738aa08eb3991867594f Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 21:25:04 +0800 Subject: [PATCH 178/203] add atlas --- examples/atlas/config/commands.yaml | 5 ++-- examples/atlas/config/run_config.csv | 9 ++++++++ examples/atlas/config/run_config.yaml | 8 ------- examples/atlas/setup_run.py | 33 +++++++++++++++------------ 4 files changed, 31 insertions(+), 24 deletions(-) create mode 100644 examples/atlas/config/run_config.csv delete mode 100644 examples/atlas/config/run_config.yaml diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index fc68d057..32c66dd0 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -1,2 +1,3 @@ -cta_actinn: - command:"python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" +algorithms: + cta_actinn: + ommand: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" diff --git a/examples/atlas/config/run_config.csv b/examples/atlas/config/run_config.csv new file mode 100644 index 00000000..e08184cc --- /dev/null +++ b/examples/atlas/config/run_config.csv @@ -0,0 +1,9 @@ +algorithm_name,dataset_id,species,tissue,filetype,count,device +cta_actinn,6a30bf44-c490-41ac-965b-0bb58432b10a,human,Kidney,h5ad,800,cuda:0 +cta_actinn,f801b7a9-80a6-4d09-9161-71474deb58ae,human,Kidney,h5ad,800,cuda:1 +cta_actinn,20d87640-4be8-487f-93d4-dce38378d00f,human,Kidney,h5ad,800,cuda:2 +cta_actinn,2d31c0ca-0233-41ce-bd1a-05aa8404b073,human,Kidney,h5ad,800,cuda:3 +cta_actinn,fd072bc3-2dfb-46f8-b4e3-467cb3223182,human,Kidney,h5ad,800,cuda:4 +cta_actinn,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Kidney),human,Kidney,h5ad,800,cuda:5 +cta_actinn,0b75c598-0893-4216-afe8-5414cab7739d,human,Kidney,h5ad,800,cuda:6 +cta_actinn,2aa1c93c-4ef3-4e9a-98e7-0bd37933953c,human,Kidney,h5ad,800,cuda:7 diff --git a/examples/atlas/config/run_config.yaml b/examples/atlas/config/run_config.yaml deleted file mode 100644 index c88f3378..00000000 --- a/examples/atlas/config/run_config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -runs: - - algorithm_name: cta_actinn - dataset_id: "0bc7235a-ae5a-479d-a487-510435377e55" - species: human - tissue: Brain - filetype: h5ad - count: 800 - device: cuda:0 diff --git a/examples/atlas/setup_run.py b/examples/atlas/setup_run.py index d29d9e37..4b3bdf5f 100644 --- a/examples/atlas/setup_run.py +++ b/examples/atlas/setup_run.py @@ 
-3,9 +3,11 @@ import shutil import sys +import pandas as pd import yaml from dance.settings import DANCEDIR +from dance.utils import logger def load_commands(config_path): @@ -14,34 +16,37 @@ def load_commands(config_path): def load_run_configs(run_config_path): - with open(run_config_path, encoding='utf-8') as f: - return yaml.safe_load(f) + return pd.read_csv(run_config_path) def main(): parser = argparse.ArgumentParser(description='Setup run parameters') - parser.add_argument('--config', type=str, default="config/run_config.yaml", help='Run configuration YAML file') + parser.add_argument('--config', type=str, default="config/run_config.csv", help='Run configuration CSV file') args = parser.parse_args() - run_configs = load_run_configs(args.config) + run_configs_df = load_run_configs(args.config) - commands_config = load_commands("commands.yaml") + commands_config = load_commands("config/commands.yaml") - for run in run_configs.get("runs", []): - algorithm_name = run.get('algorithm_name') - dataset_id = run.get('dataset_id') - species = run.get('species') - tissue = run.get('tissue') - filetype = run.get('filetype') - count = run.get('count') - device = run.get('device') + for _, run in run_configs_df.iterrows(): + algorithm_name = run['algorithm_name'] + dataset_id = run['dataset_id'] + species = run['species'] + tissue = run['tissue'] + filetype = run['filetype'] + count = run['count'] + device = run['device'] # Define paths template_path = os.path.join("config/atlas_template_yamls", f"{algorithm_name}/pipeline_params_tuning_config.yaml") config_dir = f"{DANCEDIR}/examples/tuning/{algorithm_name}/{dataset_id}" - os.makedirs(config_dir, exist_ok=True) + try: + os.makedirs(config_dir, exist_ok=False) + except FileExistsError: + logger.warning(f"Error: Directory {config_dir} already exists. 
Please remove it before running again.") + continue config_filename = f"pipeline_params_tuning_config.yaml" config_path = os.path.join(config_dir, config_filename) From 4d3a0d263c3d80dd70fac35677dcf49705951daa Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 21:40:09 +0800 Subject: [PATCH 179/203] update scdeepsort --- dance/metadata/scdeepsort.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 44091d47..39e41209 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -166,7 +166,7 @@ human,Kidney,6044,train,,,train_human_Kidneyf801b7a9-80a6-4d09-9161-71474deb58ae human,Kidney,7802,train,,,train_human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad,https://www.dropbox.com/scl/fi/xmzomvt0c8bza3fy8me0p/human_Kidney20d87640-4be8-487f-93d4-dce38378d00f_data.h5ad?rlkey=iqzword5254z5rujjdey1u8hc&dl=1 human,Kidney,6847,train,,,train_human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad,https://www.dropbox.com/scl/fi/rhngz2alde48jotpy5c5v/human_Kidney2d31c0ca-0233-41ce-bd1a-05aa8404b073_data.h5ad?rlkey=u0x4dsnt569wq07l3h1rqjzum&dl=1 human,Kidney,10000,train,,,train_human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad,https://www.dropbox.com/scl/fi/ybml7y2bth0qjnv3x1ieg/human_Kidneyfd072bc3-2dfb-46f8-b4e3-467cb3223182_data.h5ad?rlkey=qkjgdqttk3s10ht54109a4cad&dl=1 -human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad,https://www.dropbox.com/scl/fi/553s0af5q2nibafj4nkux/human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf_data.h5ad?rlkey=p85qlsjixsuuutgwnms3w4y30&dl=1 +human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf(Kidney)_data.h5ad,https://www.dropbox.com/scl/fi/553s0af5q2nibafj4nkux/human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4bf-Kidney-_data.h5ad?rlkey=p85qlsjixsuuutgwnms3w4y30&st=igznlz90&dl=1 human,Kidney,10000,train,,,train_human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad,https://www.dropbox.com/scl/fi/feklth6jvnc5qqwvgaydy/human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad?rlkey=28vpy2m90lnri9aekfthrsvr1&dl=1 human,Kidney,5848,train,,,train_human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad,https://www.dropbox.com/scl/fi/1jq1wrqo1rcl041antcm8/human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad?rlkey=ssgfsiobqfah3pxgqnrsaff6l&dl=1 human,Kidney,9641,train,,,train_human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad,https://www.dropbox.com/scl/fi/o2cnntkrd5j6coeqehv8b/human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad?rlkey=5tbupfd3cdvqzy2rix6scvwzu&dl=1 From c7ac320b1e23b38830f53a80002938b3669a9e3d Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 22:01:42 +0800 Subject: [PATCH 180/203] minor --- examples/atlas/config/commands.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index 32c66dd0..41a3f2ef 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -1,3 +1,5 @@ algorithms: cta_actinn: - ommand: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" + command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" + cta_singlecellnet: + command: "python main.py --species 
{species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" From 2cf8e366ec7a46e427f2567ee675498be15e1702 Mon Sep 17 00:00:00 2001 From: xingzhongyu Date: Thu, 19 Dec 2024 22:21:38 +0800 Subject: [PATCH 181/203] minor --- examples/atlas/config/commands.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index 41a3f2ef..92b66890 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -3,3 +3,5 @@ algorithms: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" cta_singlecellnet: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" + cta_celltypist: + command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" From 5b3d2de3bb164f09381fe4068a76d5f7134cd8ae Mon Sep 17 00:00:00 2001 From: xzy Date: Thu, 19 Dec 2024 20:33:04 -0500 Subject: [PATCH 182/203] minor --- examples/atlas/config/commands.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/atlas/config/commands.yaml b/examples/atlas/config/commands.yaml index 92b66890..7d058c51 100644 --- a/examples/atlas/config/commands.yaml +++ b/examples/atlas/config/commands.yaml @@ -5,3 +5,5 @@ algorithms: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" cta_celltypist: command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} > {dataset_id}/out.log 2>&1 &" + cta_scdeepsort: + command: "python main.py --species {species} --tissue {tissue} --train_dataset {dataset_id} --filetype {filetype} --count {count} --device {device} > {dataset_id}/out.log 2>&1 &" From a18f3ae9878fdf836289e8c792f0aa2136c266af Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 09:48:19 +0800 Subject: [PATCH 183/203] add notes --- examples/atlas/setup_run.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/atlas/setup_run.py b/examples/atlas/setup_run.py index 4b3bdf5f..06217714 100644 --- a/examples/atlas/setup_run.py +++ b/examples/atlas/setup_run.py @@ -11,11 +11,14 @@ def load_commands(config_path): + """Load YAML configuration file containing command templates for different + algorithms.""" with open(config_path, encoding='utf-8') as f: return yaml.safe_load(f) def load_run_configs(run_config_path): + """Load CSV file containing run configurations for different experiments.""" return pd.read_csv(run_config_path) @@ -25,11 +28,13 @@ def main(): args = parser.parse_args() + # Load configuration files run_configs_df = load_run_configs(args.config) - commands_config = load_commands("config/commands.yaml") + # Process each run configuration for _, run in run_configs_df.iterrows(): + # Extract parameters for current run algorithm_name = run['algorithm_name'] dataset_id = run['dataset_id'] species = run['species'] @@ -38,15 +43,18 @@ def main(): count = run['count'] device = run['device'] - # Define paths + # Setup directory structure for the algorithm configuration template_path = os.path.join("config/atlas_template_yamls", 
f"{algorithm_name}/pipeline_params_tuning_config.yaml") config_dir = f"{DANCEDIR}/examples/tuning/{algorithm_name}/{dataset_id}" + + # Create configuration directory if it doesn't exist try: os.makedirs(config_dir, exist_ok=False) except FileExistsError: logger.warning(f"Error: Directory {config_dir} already exists. Please remove it before running again.") continue + config_filename = f"pipeline_params_tuning_config.yaml" config_path = os.path.join(config_dir, config_filename) @@ -54,15 +62,17 @@ def main(): shutil.copy(template_path, config_path) print(f"Template copied to {config_path}") + # Validate algorithm exists in commands configuration if algorithm_name not in commands_config.get("algorithms", {}): print(f"Error: Command not found for algorithm '{algorithm_name}'. Please check commands.yaml file.") continue + # Format command template with run parameters command_template = commands_config["algorithms"][algorithm_name]["command"] run_command = command_template.format(dataset_id=dataset_id, species=species, tissue=tissue, filetype=filetype, count=count, device=device) - # Append the run command to run.sh + # Append generated command to run script run_sh_path = f"{DANCEDIR}/examples/tuning/{algorithm_name}/run.sh" with open(run_sh_path, "a", encoding='utf-8') as run_script: run_script.write(f"{run_command}\n") From dbd1fa322ba5b0575627c6b1b0a180a451a43442 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 10:07:48 +0800 Subject: [PATCH 184/203] add notes --- examples/get_result_web.py | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/examples/get_result_web.py b/examples/get_result_web.py index c69e9d4e..2c0413da 100644 --- a/examples/get_result_web.py +++ b/examples/get_result_web.py @@ -15,6 +15,15 @@ def check_identical_strings(string_list): + """ + Check if all strings in the list are identical + Args: + string_list: List of strings to compare + Returns: + The common string if all strings are identical + Raises: + ValueError if list is empty or strings are different + """ if not string_list: raise ValueError("The list is empty") @@ -34,6 +43,14 @@ def check_identical_strings(string_list): def get_sweep_url(step_csv: pd.DataFrame, single=True): + """ + Extract wandb sweep URL from a DataFrame containing run IDs + Args: + step_csv: DataFrame containing run IDs + single: If True, only process the first run + Returns: + The sweep URL + """ ids = step_csv["id"] sweep_urls = [] for run_id in tqdm(reversed(ids), @@ -51,6 +68,13 @@ def get_sweep_url(step_csv: pd.DataFrame, single=True): def spilt_web(url: str): + """ + Parse wandb URL to extract entity, project and sweep ID + Args: + url: wandb sweep URL + Returns: + Tuple of (entity, project, sweep_id) or None if parsing fails + """ pattern = r"https://wandb\.ai/([^/]+)/([^/]+)/sweeps/([^/]+)" match = re.search(pattern, url) @@ -70,6 +94,14 @@ def spilt_web(url: str): def get_best_method(urls, metric_col="test_acc"): + """ + Find the best performing method across multiple sweeps + Args: + urls: List of sweep URLs to compare + metric_col: Metric column name to use for comparison + Returns: + Tuple of (best_step_name, best_run, best_metric_value) + """ all_best_run = None all_best_step_name = None step_names = ["step2", "step3_0", "step3_1", "step3_2"] @@ -105,6 +137,15 @@ def get_metric(run): def get_best_yaml(step_name, best_run, file_path): + """ + Generate YAML configuration for the best performing run + Args: + step_name: Name of the step ('step2' or 'step3_X') + best_run: Best wandb run 
object + file_path: Path to configuration files + Returns: + YAML string containing the best configuration + """ if step_name == "step2": conf = OmegaConf.load(f"{file_path}/pipeline_params_tuning_config.yaml") for i, fun in enumerate(conf["pipeline"]): @@ -141,6 +182,13 @@ def get_best_yaml(step_name, best_run, file_path): def check_exist(file_path): + """ + Check if results directory exists and contains multiple files + Args: + file_path: Path to check + Returns: + Boolean indicating if valid results exist + """ file_path = f"{file_path}/results/params/" if os.path.exists(file_path) and os.path.isdir(file_path): file_num = len(os.listdir(file_path)) @@ -150,6 +198,12 @@ def check_exist(file_path): def write_ans(tissue): + """ + Process results for a specific tissue type and write to CSV + Args: + tissue: Name of the tissue to process + Writes results to a CSV file named '{tissue}_ans.csv' + """ ans = [] collect_datasets = all_datasets[tissue] @@ -182,9 +236,12 @@ def write_ans(tissue): if __name__ == "__main__": + # Initialize wandb and set global configuration wandb = try_import("wandb") entity = "xzy11632" project = "dance-dev" + + # Load dataset configuration and process results for tissue file_root = str(Path(__file__).resolve().parent) with open(f"{file_root}/dataset_server.json") as f: all_datasets = json.load(f) From b096c259e2b5db781c2c996ce5d5da9115476de0 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 10:45:05 +0800 Subject: [PATCH 185/203] add note --- .../result_analysis/get_important_pattern.py | 114 ++++++++++++++---- 1 file changed, 92 insertions(+), 22 deletions(-) diff --git a/examples/result_analysis/get_important_pattern.py b/examples/result_analysis/get_important_pattern.py index e44a871c..af742553 100644 --- a/examples/result_analysis/get_important_pattern.py +++ b/examples/result_analysis/get_important_pattern.py @@ -30,9 +30,33 @@ #use get_important_pattern_sweep.py -#asceding need to think #Negative pattern, just need to change the order def get_important_pattern(test_accs, ascending, vis=True, alpha=0.05, title=""): + """Identify important patterns in test accuracies using statistical tests. + + Given multiple groups of test accuracies, this function performs Kruskal-Wallis test followed by + Dunn's post-hoc test to identify statistically significant differences between groups. The results + are then used to rank the groups based on their relative performance. + + Parameters + ---------- + test_accs + List of test accuracy groups to compare. + ascending + Boolean indicating whether to sort results in ascending order. + vis + Whether to visualize the results using box plots. + alpha + Significance level for statistical tests. + title + Title for the visualization plot. + + Returns + ------- + list + List of ranks indicating the relative importance of each group. + + """ if vis: fig = plt.figure(figsize=(12, 4)) @@ -91,6 +115,30 @@ def are_all_elements_same_direct(list_2d): def get_frequent_itemsets(step2_data, metric_name, ascending, threshold_per=0.1, multi_mod=False): + """Extract frequent patterns from top performing pipeline configurations. + + Given a DataFrame containing pipeline configurations and their performance metrics, this function + identifies frequent patterns in the top performing configurations using the Apriori algorithm. + + Parameters + ---------- + step2_data + DataFrame containing pipeline configurations and metrics. + metric_name + Name of the performance metric to optimize. 
+ ascending + Boolean indicating whether to sort in ascending order. + threshold_per + Percentage of top configurations to consider. + multi_mod + Whether to use multiple modalities (not implemented). + + Returns + ------- + list + List of dictionaries containing frequent itemsets and their support values. + + """ if multi_mod: raise NotImplementedError("need multimod") threshold = int(len(step2_data) * threshold_per) @@ -176,6 +224,28 @@ def get_significant_items(data): def get_forest_model_pattern(step2_data, metric_name): + """Analyze feature importance using Random Forest and SHAP values. + + Given pipeline configurations and their performance metrics, this function trains a Random Forest model + and uses SHAP values to identify important feature interactions. It also computes point-biserial + correlations to validate the importance of identified patterns. + + Parameters + ---------- + step2_data + DataFrame containing pipeline configurations and metrics. + metric_name + Target metric to predict. + + Returns + ------- + dict + Dictionary containing: + - Important feature interactions and their SHAP values + - Point-biserial correlation statistics + - Best model parameters and MSE + + """ columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) X = step2_data.loc[:, columns] y = step2_data.loc[:, metric_name] @@ -287,24 +357,24 @@ def list_files(directories, metric_name, ascending, file_name="best_test_acc.csv return ans_all -if __name__ == "__main__": - directories = [] - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("task", default="cluster") - parser.add_argument("metric_name", default="acc") - parser.add_argument("ascending", default=False) - args = parser.parse_args() - task = args.task - metric_name = args.metric_name - ascending = args.ascending - file_root = Path(__file__).resolve().parent.parent / "tuning" - for path in file_root.iterdir(): - if path.is_dir(): - if str(path.name).startswith(task): - directories.append(path) - ans_all = list_files(directories, metric_name, ascending) - df = pd.DataFrame(ans_all) - pivot_df = df.pivot(index="dataset", columns="method", values="ans") - pivot_df.to_csv(f"{task}_pattern.csv") - - # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_actinn/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) +# if __name__ == "__main__": +# directories = [] +# parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +# parser.add_argument("task", default="cluster") +# parser.add_argument("metric_name", default="acc") +# parser.add_argument("ascending", default=False) +# args = parser.parse_args() +# task = args.task +# metric_name = args.metric_name +# ascending = args.ascending +# file_root = Path(__file__).resolve().parent.parent / "tuning" +# for path in file_root.iterdir(): +# if path.is_dir(): +# if str(path.name).startswith(task): +# directories.append(path) +# ans_all = list_files(directories, metric_name, ascending) +# df = pd.DataFrame(ans_all) +# pivot_df = df.pivot(index="dataset", columns="method", values="ans") +# pivot_df.to_csv(f"{task}_pattern.csv") + +# # print(summary_pattern("/home/zyxing/dance/examples/tuning/cta_actinn/328_138/results/pipeline/best_test_acc.csv",alpha=0.3,vis=True)) From 4fb61450325cf1311ca6c3ef80ac1516ca0842e9 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 10:56:38 +0800 Subject: [PATCH 186/203] add notes --- .../get_important_pattern_sweep.py | 66 
++++++++++++++++--- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/examples/result_analysis/get_important_pattern_sweep.py b/examples/result_analysis/get_important_pattern_sweep.py index 9fe72ada..93f30adc 100644 --- a/examples/result_analysis/get_important_pattern_sweep.py +++ b/examples/result_analysis/get_important_pattern_sweep.py @@ -15,10 +15,14 @@ from dance.pipeline import flatten_dict from dance.utils import try_import +# Define basic configuration parameters entity = "xzy11632" project = "dance-dev" +# List of tasks to analyze tasks = ["cell type annotation new", "clustering", "imputation_new", "spatial domain", "cell type deconvolution"] +# Corresponding metrics for each task mertic_names = ["test_acc", "acc", "MRE", "ARI", "MSE"] +# Whether higher values are better for each metric ascendings = [False, False, True, False, True] multi_mod = False @@ -43,13 +47,27 @@ def get_additional_sweep(sweep_id): - # if sweep has piror runs - # every run get command , get additional sweep id - # or last run command + """Recursively retrieve all related sweep IDs from a given sweep. + + Given a sweep ID, this function recursively finds all related sweep IDs by examining the command + arguments of the runs within each sweep. It handles cases where sweeps may have prior runs or + additional sweep references. + + Parameters + ---------- + sweep_id : str + The initial sweep ID to start the search from. + + Returns + ------- + list + A list containing all related sweep IDs, including the input sweep_id. + + """ sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + additional_sweep_ids = [sweep_id] #last run command run = next((t_run for t_run in sweep.runs if t_run.state == "finished"), None) - additional_sweep_ids = [sweep_id] if run is None: # check summary data count, note aznph5wt, quantities may be inconsistent return additional_sweep_ids run_id = run.id @@ -63,7 +81,34 @@ def get_additional_sweep(sweep_id): def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=False): - # try: + """Analyze patterns in pipeline configurations and their impact on performance + metrics. + + This function examines the relationship between pipeline configurations and their corresponding + performance metrics. It handles missing values differently based on whether higher or lower + metric values are better, and can optionally visualize the results. + + Parameters + ---------- + step2_origin_data : pd.DataFrame + DataFrame containing pipeline configurations and their results. + metric_name : str + Name of the performance metric to analyze. + ascending : bool + Whether higher metric values indicate better performance. + alpha : float, optional + Significance level for statistical tests, by default 0.05. + vis : bool, optional + Whether to generate visualizations, by default False. 
+ + Returns + ------- + dict + A dictionary containing either: + - Error message if all metric values are NaN + - Pattern analysis results including forest model and/or APR analysis + + """ columns = sorted([col for col in step2_origin_data.columns if col.startswith("pipeline")]) step2_data = step2_origin_data.loc[:, columns + [metric_name]] # com_ans = get_com_all(step2_data, metric_name, ascending, vis=vis, alpha=alpha) @@ -92,23 +137,24 @@ def summary_pattern(step2_origin_data, metric_name, ascending, alpha=0.05, vis=F return {"forest_model": get_forest_model_pattern(step2_data, metric_name), "apr_ans": apr_ans} else: return {"apr_ans": apr_ans} - # except Exception as e: - # print(e) - # return str(e) if __name__ == "__main__": start = True ans_all = [] for i, task in enumerate(tasks): - + # Skip tasks not in choose_tasks list if task not in choose_tasks: continue + + # Read and preprocess results from Excel file data = pd.read_excel(file_root / "results.xlsx", sheet_name=task, dtype=str) data = data.ffill().set_index(['Methods']) + + # Iterate through each method and dataset combination for row_idx in range(data.shape[0]): for col_idx in range(data.shape[1]): - + # Extract metadata method = data.index[row_idx] dataset = data.columns[col_idx] value = data.iloc[row_idx, col_idx] From 059984e1c9f2f43920482cab0109fc8f9005b8ca Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 11:42:10 +0800 Subject: [PATCH 187/203] add notes --- .../atlas/sc_similarity_examples/cal_w1_w2.py | 94 ++++++++ .../example_usage_anndata.py | 121 ++++++++++ .../sc_similarity_examples/sim_query_atlas.py | 0 examples/atlas/sc_similarity_examples/vis.py | 112 +++++++++ examples/result_analysis/get_num.py | 19 ++ examples/tuning/get_important_pattern.py | 215 ------------------ 6 files changed, 346 insertions(+), 215 deletions(-) create mode 100644 examples/atlas/sc_similarity_examples/cal_w1_w2.py create mode 100644 examples/atlas/sc_similarity_examples/example_usage_anndata.py rename examples/{ => atlas}/sc_similarity_examples/sim_query_atlas.py (100%) create mode 100644 examples/atlas/sc_similarity_examples/vis.py delete mode 100644 examples/tuning/get_important_pattern.py diff --git a/examples/atlas/sc_similarity_examples/cal_w1_w2.py b/examples/atlas/sc_similarity_examples/cal_w1_w2.py new file mode 100644 index 00000000..63b0e3f4 --- /dev/null +++ b/examples/atlas/sc_similarity_examples/cal_w1_w2.py @@ -0,0 +1,94 @@ +import ast +import re +from pathlib import Path + +import numpy as np +import pandas as pd + +from dance.utils import try_import + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +query_datasets = [ + "c7775e88-49bf-4ba2-a03b-93f00447c958", + "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", + # "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" +] +methods = ["cta_actinn", "cta_celltypist", "cta_scdeepsort", "cta_singlecellnet"] +file_root = Path(__file__).resolve().parent +feature_names = ["wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "mmd"] + + +def get_ans(): + ans = {} + for query_dataset in query_datasets: + data = pd.read_excel(file_root / "Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) + ans[query_dataset] = data + return ans + + +def get_rank(): + for query_dataset, data in ans.items(): + for method in methods: + rank_col = 'rank_' + method + data.loc[rank_col, :] = data.loc[method, :].rank(ascending=False, method='min', na_option='bottom') 
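+            # Rank atlas datasets per method across columns: ascending=False gives rank 1 to
+            # the best score, and na_option='bottom' places datasets with missing scores last.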
+            # data.loc[rank_col,:] = data.loc[rank_col,:].fillna(10000)
+
+
+def convert_to_complex(s):
+    if isinstance(s, float) or isinstance(s, int):
+        return float(s)
+    try:
+        return ast.literal_eval(s)
+    except (ValueError, SyntaxError):
+        return np.nan
+
+
+def objective(w1, feature_name):
+    w2 = 1 - w1
+    total_rank = 0
+    for query_dataset, data in ans.items():
+        df_A = data.copy()
+        if feature_name == "bures":
+            df_A.loc[feature_name, :] = df_A.loc[feature_name, :].apply(convert_to_complex)
+            df_A.loc[feature_name, :] = df_A.loc[feature_name, :].apply(lambda x: x.real
+                                                                        if isinstance(x, complex) else np.nan)
+            print(df_A.loc[feature_name, :])
+        df_A.loc['score_similarity', :] = w1 * df_A.loc[feature_name, :].values.astype(float) + w2 * df_A.loc[
+            'metadata_sim', :].values.astype(float)
+        # df_A.loc['score_similarity',:]= df_A.loc['score_similarity',:].fillna(0)
+        max_idx = df_A.loc['score_similarity', :].idxmax()
+        max_B = df_A.loc[:, max_idx]
+        ranks = []
+        for method in methods:
+            ranks.append(max_B.loc['rank_' + method])
+        total_rank += np.sum(ranks)
+    return total_rank
+
+
+ans = get_ans()
+get_rank()
+all_results = []
+for query_dataset, data in ans.items():
+    data.to_csv(f"ranks/{query_dataset}_rank.csv")
+for feature_name in feature_names:
+    w1_values = np.linspace(0, 1, 101)
+    results = []
+    for w1 in w1_values:
+        total_rank = objective(w1, feature_name)
+        results.append({'feature_name': feature_name, 'w1': w1, 'total_rank': total_rank})
+    all_results.extend(results)
+# for w1 in w1_values:
+#     total_rank = objective(w1)
+#     results.append({'w1': w1, 'total_rank': total_rank})
+
+results_df = pd.DataFrame(all_results)
+results_df.to_csv("temp/results_df.csv")
+best_result = results_df.loc[results_df['total_rank'].idxmin()]
+
+print('Best similarity feature:', best_result['feature_name'])
+print('Best w1:', best_result['w1'])
+print('Corresponding total rank:', best_result['total_rank'])
diff --git a/examples/atlas/sc_similarity_examples/example_usage_anndata.py b/examples/atlas/sc_similarity_examples/example_usage_anndata.py
new file mode 100644
index 00000000..8b124f10
--- /dev/null
+++ b/examples/atlas/sc_similarity_examples/example_usage_anndata.py
@@ -0,0 +1,121 @@
+import argparse
+import json
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+import torch
+from anndata import AnnData
+from scipy.sparse import issparse
+from torch.utils.data import TensorDataset
+
+from dance.atlas.sc_similarity import AnnDataSimilarity, get_anndata
+from dance.otdd.pytorch.distance import DatasetDistance
+from dance.utils import set_seed
+
+data_root = "/home/zyxing/dance/examples/tuning/temp_data/train/human"
+
+target_files = [
+    "01209dce-3575-4bed-b1df-129f57fbc031", "055ca631-6ffb-40de-815e-b931e10718c0",
+    "2a498ace-872a-4935-984b-1afa70fd9886", "2adb1f8a-a6b1-4909-8ee8-484814e2d4bf",
+    "3faad104-2ab8-4434-816d-474d8d2641db", "471647b3-04fe-4c76-8372-3264feb950e8",
+    "4c4cd77c-8fee-4836-9145-16562a8782fe", "84230ea4-998d-4aa8-8456-81dd54ce23af",
+    "8a554710-08bc-4005-87cd-da9675bdc2e7", "ae29ebd0-1973-40a4-a6af-d15a5f77a80f",
+    "bc260987-8ee5-4b6e-8773-72805166b3f7", "bc2a7b3d-f04e-477e-96c9-9d5367d5425c",
+    "d3566d6a-a455-4a15-980f-45eb29114cab", "d9b4bc69-ed90-4f5f-99b2-61b0681ba436",
+    "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569"
+]
+parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument(
+    "--source_files", default=[
+        "71be997d-ff75-41b9-8a9f-1288c865f921", "456e8b9b-f872-488b-871d-94534090a865",
+        
"738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "c7775e88-49bf-4ba2-a03b-93f00447c958" + ]) +parser.add_argument("--data_dir", default="../tuning/temp_data") +args = parser.parse_args() +source_files = args.source_files +data_dir = args.data_dir +file_root = Path(__file__).resolve().parent +set_seed(42) + + +class CustomEncoder(json.JSONEncoder): + + def default(self, obj): + if isinstance(obj, (np.float32, np.float64)): + return float(obj) + if isinstance(obj, (np.int32, np.int64)): + return int(obj) + if isinstance(obj, pd.DataFrame): + return obj.to_dict(orient='records') + return super().default(obj) + + +def dataset_from_anndata(adata: AnnData, label_key: str = 'cell_type', classes=None): + X = adata.X + if issparse(X): + X = X.toarray() + X_tensor = torch.from_numpy(X).float() + Y = adata.obs[label_key].values + if pd.api.types.is_numeric_dtype(Y): + targets = torch.LongTensor(Y) + if classes is None: + classes = sorted(np.unique(Y)) + else: + unique_classes = sorted(np.unique(Y)) + # class_to_idx = {cls: idx for idx, cls in enumerate(unique_classes)} + # Y_encoded = np.array([class_to_idx[cls] for cls in Y]) + targets = torch.LongTensor(Y.codes) + if classes is None: + classes = unique_classes + ds = TensorDataset(X_tensor, targets) + ds.targets = targets + ds.classes = classes + return ds + + +def run_test_otdd(): + for target_file in target_files: + source_data = sc.read_h5ad(f"{data_root}/human_Blood{source_file}_data.h5ad") + target_data = sc.read_h5ad(f"{data_root}/human_Blood{target_file}_data.h5ad") + source_ds = dataset_from_anndata(source_data) + target_ds = dataset_from_anndata(target_data) + dist = DatasetDistance(source_ds, target_ds) + dist.distance() + + +def run_test_case(source_file): + ans = {} + for target_file in target_files: + # source_data=sc.read_h5ad(f"{data_root}/{source_file}.h5ad") + # target_data=sc.read_h5ad(f"{data_root}/{target_file}.h5ad") + source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir) + target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir) + similarity_calculator = AnnDataSimilarity(adata1=source_data, adata2=target_data, sample_size=10, + init_random_state=42, n_runs=1, + ground_truth_conf_path="Cell Type Annotation Atlas.xlsx", + adata1_name=source_file, adata2_name=target_file) + ans[target_file] = similarity_calculator.get_similarity_matrix_A2B(methods=[ + "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", + "ground_truth", "mmd", "metadata_sim" + ]) + # with open(f'sim_{source_file}.json', 'w') as f: + # json.dump(ans, f,indent=4,cls=CustomEncoder) + ans = pd.DataFrame(ans) + ans.to_csv(f'sim_{source_file}.csv') + return ans + + +query_data = os.listdir(file_root / "query_data") +with pd.ExcelWriter(file_root / "Blood_similarity.xlsx", engine='openpyxl') as writer: + for source_file in source_files: + query_ans = [ + pd.read_csv(file_root / "query_data" / element, index_col=0) for element in query_data + if element.split("_")[-3] == source_file + ] + ans = run_test_case(source_file) + merged_df = pd.concat(query_ans + [ans], join='inner') + merged_df.to_excel(writer, sheet_name=source_file[:4], index=True) diff --git a/examples/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py similarity index 100% rename from examples/sc_similarity_examples/sim_query_atlas.py rename to examples/atlas/sc_similarity_examples/sim_query_atlas.py diff --git 
a/examples/atlas/sc_similarity_examples/vis.py b/examples/atlas/sc_similarity_examples/vis.py new file mode 100644 index 00000000..a547851c --- /dev/null +++ b/examples/atlas/sc_similarity_examples/vis.py @@ -0,0 +1,112 @@ +import re +import sys +from pathlib import Path + +import numpy as np +import pandas as pd + +from dance.utils import try_import + +sys.path.append("..") +import json +from collections import defaultdict + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from get_result_web import spilt_web + +wandb = try_import("wandb") +entity = "xzy11632" +project = "dance-dev" +query_datasets = [ + "c7775e88-49bf-4ba2-a03b-93f00447c958", "456e8b9b-f872-488b-871d-94534090a865", + "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8", + "71be997d-ff75-41b9-8a9f-1288c865f921" +] +file_root = Path(__file__).resolve().parent +ground_truth_conf = pd.read_excel(file_root / "Cell Type Annotation Atlas.xlsx", sheet_name="blood", index_col=0) +methods = ["cta_actinn", "cta_celltypist", "cta_scdeepsort", "cta_singlecellnet"] +feature_name = "spectral" + + +def get_accs(sweep): + ans = [] + for run in sweep.runs: + if "test_acc" in run.summary: + ans.append(run.summary["test_acc"]) + return ans + + +def get_runs(sweep_record): + step_links = {} + pattern = r'(step\d+):((?:https?://[^|,]+(?:,)?)+)' + matches = re.finditer(pattern, sweep_record) + for match in matches: + step = match.group(1) # e.g., 'step2' + links_str = match.group(2) # e.g., 'https://...y31tzbnv' + links = links_str.split(',') + step_links[step] = links + ans = [] + for step, links in step_links.items(): + for sweep_url in links: + _, _, sweep_id = spilt_web(sweep_url) + sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}") + ans += get_accs(sweep) + return ans + + +def get_atlas_ans(query_dataset, method): + data = pd.read_excel("Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) + weight1 = 1.0 + weight2 = 0.0 + weighted_sum = data.loc[feature_name, :] * weight1 + data.loc["metadata_sim", :] * weight2 + atlas_dataset_res = weighted_sum.idxmax() + max_value = weighted_sum.max() + return data.loc[:, atlas_dataset_res][method] + + +def vis(data, target_value, title, ax): + # sns.boxplot(data=data, color='skyblue',ax=ax) + # if target_value is not np.nan: + # ax.axhline(y=target_value, color='red', linestyle='--', linewidth=2, label=f'atlas_value = {target_value}') + # ax.text(0, target_value + (max(data)-min(data))*0.01, f'{target_value}', color='red', ha='center',size=16) + + data = np.array(data) + data_df = pd.DataFrame({'test_acc': data}) + sns.violinplot(y='test_acc', data=data_df, inner=None, color='skyblue', ax=ax) + median = np.median(data) + ax.axhline(median, color='gray', linestyle='--', label=f'Median: {median:.1f}') + if not np.isnan(target_value): + percentile = (np.sum(data < float(target_value)) / len(data)) * 100 + ax.scatter(0, float(target_value), color='red', s=100, zorder=5, + label=f'Specific Value: {target_value}\n({percentile:.1f} percentile)') + ax.set_title(str(title)) + ax.set_ylabel('test_acc') + ax.title.set_size(16) + ax.yaxis.label.set_size(14) + ax.tick_params(axis='both', which='major', labelsize=10) + ax.legend() + + +if __name__ == "__main__": + # ans_all=defaultdict(dict) + # for query_dataset in query_datasets: + # for method in methods: + # sweep_record=ground_truth_conf.loc[query_dataset,method] + # ans_all[query_dataset][method]=get_runs(sweep_record) + # with open("runs.json","w") as f: + # json.dump(ans_all,f) + 
+ with open("runs.json") as f: + runs = json.load(f) + plt.style.use("default") + + for query_dataset in query_datasets: + fig, axes = plt.subplots(2, 2, figsize=(15, 10)) + axes = axes.flatten() + for i, method in enumerate(methods): + vis(runs[query_dataset][method], get_atlas_ans(query_dataset, method), f"{query_dataset}_{method}", axes[i]) + plt.tight_layout() + plt.savefig(f"imgs/{query_dataset}.png", dpi=300) + plt.show() diff --git a/examples/result_analysis/get_num.py b/examples/result_analysis/get_num.py index 6573bd70..2432ae38 100644 --- a/examples/result_analysis/get_num.py +++ b/examples/result_analysis/get_num.py @@ -1,3 +1,22 @@ +"""Count the total number of experiment runs across different tasks in W&B project. + +This script analyzes experiment results stored in a W&B project by: +1. Reading task data from Excel sheets +2. Extracting sweep URLs for each task +3. Querying W&B API to count runs in each sweep +4. Computing the total number of experimental runs + +Parameters +---------- +None + +Returns +------- +int + Total number of runs across all tasks and sweeps + +""" + import sys from pathlib import Path diff --git a/examples/tuning/get_important_pattern.py b/examples/tuning/get_important_pattern.py deleted file mode 100644 index e542b068..00000000 --- a/examples/tuning/get_important_pattern.py +++ /dev/null @@ -1,215 +0,0 @@ -import itertools -import pathlib -from itertools import combinations -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import scikit_posthocs as sp -import seaborn as sns -from mlxtend.frequent_patterns import apriori, association_rules -from mlxtend.preprocessing import TransactionEncoder -from networkx import parse_adjlist -from scipy import stats - -metric_name = "acc" -ascending = False - - -def get_important_pattern(test_accs, vis=True, alpha=0.8, title="", test_acc_names=None): - medians = [np.median(group) for group in test_accs] - _, p_value = stats.kruskal(*test_accs) - if vis: - fig = plt.figure(figsize=(12, 4)) - sns.boxplot(data=test_accs) - plt.xticks(list(range(len(test_accs))), - ([f"{i}" for i in range(len(test_accs))] if test_acc_names is None else test_acc_names), rotation=45, - fontsize=10) - plt.title(title) - plt.show() - if p_value < alpha: - data = test_accs - p_values_matrix = sp.posthoc_dunn(a=data) - sorted_indices = np.argsort(np.argsort(medians * -1 if ascending else medians)) - ranks = { - index: { - "rank": rank, - "before": None, - "after": [], - "real_rank": rank - } - for index, rank in enumerate(sorted_indices) - } - for (rank1, rank2) in combinations(range(max(sorted_indices) + 1), 2): - for idx1 in [index for index, value in ranks.items() if value["rank"] == rank1]: - for idx2 in [index for index, value in ranks.items() if value["rank"] == rank2]: - if p_values_matrix.iloc[idx1, idx2] > alpha: - if ranks[idx2]["before"] is None: - ranks[idx1]["after"].append(idx2) - ranks[idx2]["before"] = idx1 - - def change_real_rank(rank_item, real_rank): - rank_item["real_rank"] = real_rank - for idx in rank_item["after"]: - change_real_rank(ranks[idx], real_rank) - - for rank_item in ranks.values(): - if rank_item["before"] is None: - for idx in rank_item["after"]: - change_real_rank(ranks[idx], rank_item["real_rank"]) - return [v["real_rank"] for k, v in ranks.items()] - else: - if vis: - print("No significant differences found between the groups.") - return [] - - -def get_com(step2_data, r=2, alpha=0.8, columns=None, vis=True): - ans = [] - for com in 
itertools.combinations(columns, r): - test_accs_arrays = [] - for g in step2_data.groupby(by=list(com)): - test_accs_arrays.append({"name": g[0], metric_name: list(g[1][metric_name])}) - test_accs = [i[metric_name] for i in test_accs_arrays] - test_acc_names = [i["name"] for i in test_accs_arrays] - final_ranks = get_important_pattern( - test_accs, alpha=alpha, title=" ".join(list(com)), vis=vis, - test_acc_names=[" ".join(test_acc_name) for test_acc_name in test_acc_names]) - if len(final_ranks) > 0: - max_rank = max(final_ranks) - max_rank_count = final_ranks.count(max_rank) - if max_rank_count < len(final_ranks) / 2: - for index, (test_acc_name, rank) in enumerate(zip(test_acc_names, final_ranks)): - if rank == max_rank: - if vis: - print(f"index={index},name={test_acc_name},rank={rank}") - ans.append(test_acc_name if isinstance(test_acc_name, tuple) else (test_acc_name, )) - return ans - - -def draw_graph(rules, rules_to_show): - import networkx as nx - G1 = nx.DiGraph() - - color_map = [] - N = 50 - colors = np.random.rand(N) - strs = ['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11'] - - for i in range(rules_to_show): - G1.add_nodes_from(["R" + str(i)]) - - for a in rules.iloc[i]['antecedents']: - - G1.add_nodes_from([a]) - - G1.add_edge(a, "R" + str(i), color=colors[i], weight=2) - - for c in rules.iloc[i]['consequents']: - - G1.add_nodes_from([c]) - - G1.add_edge("R" + str(i), c, color=colors[i], weight=2) - - for node in G1: - found_a_string = False - for item in strs: - if node == item: - found_a_string = True - if found_a_string: - color_map.append('yellow') - else: - color_map.append('green') - - edges = G1.edges() - colors = [G1[u][v]['color'] for u, v in edges] - weights = [G1[u][v]['weight'] for u, v in edges] - - pos = nx.spring_layout(G1, k=16, scale=1) - nx.draw(G1, pos, node_color=color_map, edge_color=colors, width=weights, font_size=16, with_labels=False) - - for p in pos: # raise text positions - pos[p][1] += 0.07 - nx.draw_networkx_labels(G1, pos) - plt.show() - - -def get_frequent_itemsets(step2_data, threshold_per=0.1, vis=False): - threshold = int(len(step2_data) * threshold_per) - df_sorted = step2_data.sort_values(metric_name, ascending=ascending) - top_10_percent = df_sorted.head(threshold) - columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) - transactions = top_10_percent[columns].values.tolist() - te = TransactionEncoder() - te_ary = te.fit(transactions).transform(transactions) - df = pd.DataFrame(te_ary, columns=te.columns_) - frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True) - # print(frequent_itemsets) - rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) - if vis: - # print(frequent_itemsets) - # print(frequent_itemsets) - # draw_graph(rules=rules,rules_to_show=10) - frequent_itemsets_copy = frequent_itemsets.copy() - frequent_itemsets_copy = frequent_itemsets_copy.sort_values(by="support") - frequent_itemsets_copy.plot(x="itemsets", y="support", kind="bar") - plt.xticks(rotation=30, fontsize=7) - # print(type(rules)) - return [tuple(a) for a in frequent_itemsets["itemsets"]] - - -def get_com_all(step2_data, vis=True, alpha=0.8): - ans = [] - columns = sorted([col for col in step2_data.columns if col.startswith("pipeline")]) - for i in range(1, len(columns)): - ans += get_com(step2_data, i, columns=columns, vis=vis, alpha=alpha) - return ans - - -def summary_pattern(data_path, alpha=0.8, vis=False): - step2_origin_data = pd.read_csv(data_path) - 
step2_data = step2_origin_data.dropna() - com_ans = get_com_all(step2_data, vis=vis, alpha=alpha) - apr_ans = get_frequent_itemsets(step2_data, vis=vis) - return list(set(com_ans) & set(apr_ans)) - - -# def list_files(directory,file_name="best_test_acc.csv",save_path="summary_file"): -# ans=[] -# path = Path(directory) -# for file_path in path.rglob('*'): -# if file_path.is_file(): -# if file_path.name==file_name: -# algorithm,dataset=file_path.relative_to(directory).parts[:2] -# ans.append({"algorithm":algorithm,"dataset":dataset,"summary_pattern":summary_pattern(file_path)}) -# pd.DataFrame(ans).to_csv(save_path) -def list_files(directories, file_name="best_test_acc.csv", alpha=0.8, vis=False): - for directory in directories: - path = Path(directory) - for file_path in path.rglob('*'): - if file_path.is_file(): - if file_path.name == file_name: - print(file_path) - with open(Path(file_path.parent.resolve(), "pipeline_summary_pattern.txt"), 'w') as f: - f.write(str(summary_pattern(file_path, alpha=alpha, vis=vis))) - - -if __name__ == "__main__": - # directories = [] - # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - # if path.is_dir(): - # if str(path.name).startswith("cluster"): - # directories.append(path) - # list_files(directories) - # directories = [] - # for path in Path('/home/zyxing/dance/examples/tuning').iterdir(): - # if path.is_dir(): - # if str(path.name).startswith("cluster"): - # directories.append(path) - # list_files(directories) - - print( - summary_pattern( - "/home/zyxing/dance/examples/tuning/cluster_graphsc/mouse_ES_cell/results/pipeline/best_test_acc.csv", - alpha=0.3, vis=False)) From 104934929a897af41f89be5f42d117c8d845fc76 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 11:52:04 +0800 Subject: [PATCH 188/203] add notes --- .../atlas/sc_similarity_examples/cal_w1_w2.py | 66 ++++++++++++++++++- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/cal_w1_w2.py b/examples/atlas/sc_similarity_examples/cal_w1_w2.py index 63b0e3f4..710b5787 100644 --- a/examples/atlas/sc_similarity_examples/cal_w1_w2.py +++ b/examples/atlas/sc_similarity_examples/cal_w1_w2.py @@ -1,3 +1,21 @@ +"""Calculate optimal weights for combining similarity metrics in cell type annotation. + +This script analyzes different similarity metrics (like Wasserstein, Hausdorff, etc.) and metadata similarity +to find optimal weights that minimize the total rank of correct cell type predictions across multiple datasets. + +The script: +1. Loads similarity scores from Excel files +2. Computes rankings for different cell type annotation methods +3. Finds optimal weights (w1, w2) for combining feature-based and metadata-based similarity +4. Outputs the best performing feature and its corresponding weight + +Returns +------- +DataFrame + Results containing feature names, weights, and corresponding total ranks + +""" + import ast import re from pathlib import Path @@ -23,6 +41,14 @@ def get_ans(): + """Load similarity scores from Excel files for each dataset. + + Returns + ------- + dict + Dictionary mapping dataset IDs to their similarity score DataFrames + + """ ans = {} for query_dataset in query_datasets: data = pd.read_excel(file_root / "Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) @@ -31,6 +57,12 @@ def get_ans(): def get_rank(): + """Calculate rankings for each cell type annotation method. + + Updates the input DataFrames with rank columns for each method, where lower ranks + indicate better performance. 
+ + """ for query_dataset, data in ans.items(): for method in methods: rank_col = 'rank_' + method @@ -39,6 +71,19 @@ def get_rank(): def convert_to_complex(s): + """Convert string representations of complex numbers to float values. + + Parameters + ---------- + s : str or float + Input value to convert + + Returns + ------- + float + Real part of complex number or NaN if conversion fails + + """ if isinstance(s, float) or isinstance(s, int): return float(s) try: @@ -48,6 +93,21 @@ def convert_to_complex(s): def objective(w1, feature_name): + """Calculate total rank score for given weights and feature. + + Parameters + ---------- + w1 : float + Weight for the feature-based similarity (0-1) + feature_name : str + Name of the similarity feature to evaluate + + Returns + ------- + float + Total rank score (lower is better) + + """ w2 = 1 - w1 total_rank = 0 for query_dataset, data in ans.items(): @@ -89,6 +149,6 @@ def objective(w1, feature_name): results_df.to_csv("temp/results_df.csv") best_result = results_df.loc[results_df['total_rank'].idxmin()] -print('最佳相似性特征:', best_result['feature_name']) -print('最佳 w1:', best_result['w1']) -print('对应的总排名:', best_result['total_rank']) +print('Best similarity feature:', best_result['feature_name']) +print('Best w1:', best_result['w1']) +print('Corresponding total rank:', best_result['total_rank']) From 609db041b071c1c2f8f379353a4959f6d5b71cc3 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 12:51:50 +0800 Subject: [PATCH 189/203] translate notes --- .../sc_similarity_examples/sim_query_atlas.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py index de783f20..8e78393c 100644 --- a/examples/atlas/sc_similarity_examples/sim_query_atlas.py +++ b/examples/atlas/sc_similarity_examples/sim_query_atlas.py @@ -26,40 +26,42 @@ def find_unique_matching_row(df, config_col, input_dict_list): - """在 DataFrame 中查找指定列中与输入字典列表匹配的唯一一行。 + """Find a unique matching row in DataFrame based on the specified column and input + dictionary list. - :param df: pandas.DataFrame,包含要搜索的数据。 - :param config_col: str,DataFrame 中包含字典列表字符串的列名。 - :param input_dict_list: list of dicts,输入的字典列表,用于匹配。 - :return: pandas.Series,匹配的行。 - :raises ValueError: 如果匹配的行数不等于1。 + :param df: pandas.DataFrame, containing the data to search. + :param config_col: str, name of the DataFrame column containing dictionary list + strings. + :param input_dict_list: list of dicts, input dictionary list for matching. + :return: pandas.Series, the matching row. + :raises ValueError: if the number of matching rows is not equal to 1. 
""" - # 定义一个函数,用于解析字符串并比较 + # Define a function for parsing strings and comparing def is_match(config_str): try: - # 使用 ast.literal_eval 安全地解析字符串为 Python 对象 + # Safely parse string to Python object using ast.literal_eval config = ast.literal_eval(config_str) return config == input_dict_list except (ValueError, SyntaxError): - # 如果解析失败,则不匹配 + # If parsing fails, no match return False - # 应用比较函数,得到一个布尔系列 + # Apply comparison function to get a boolean series matches = df[config_col].apply(is_match) - # 获取所有匹配的行 + # Get all matching rows matching_rows = df[matches] - # 检查匹配的行数 + # Check number of matching rows num_matches = len(matching_rows) if num_matches == 1: return matching_rows.iloc[0] elif num_matches == 0: - raise ValueError("未找到匹配的行。") + raise ValueError("No matching rows found.") else: - raise ValueError(f"找到 {num_matches} 行匹配,预期恰好一行。") + raise ValueError(f"Found {num_matches} matching rows, expected exactly one.") wandb = try_import("wandb") From b26151f987d52cd75f096e34f5befdf5dca3368e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 15:16:42 +0800 Subject: [PATCH 190/203] add notes --- .../example_usage_anndata.py | 38 +++++++++- examples/atlas/sc_similarity_examples/vis.py | 70 ++++++++++++++++++- 2 files changed, 103 insertions(+), 5 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/example_usage_anndata.py b/examples/atlas/sc_similarity_examples/example_usage_anndata.py index 8b124f10..651ab4a1 100644 --- a/examples/atlas/sc_similarity_examples/example_usage_anndata.py +++ b/examples/atlas/sc_similarity_examples/example_usage_anndata.py @@ -55,6 +55,23 @@ def default(self, obj): def dataset_from_anndata(adata: AnnData, label_key: str = 'cell_type', classes=None): + """Convert AnnData object to PyTorch TensorDataset. + + Parameters + ---------- + adata : AnnData + Input AnnData object + label_key : str, default='cell_type' + Column name in adata.obs containing cell type labels + classes : list, optional + Predefined class labels. If None, will be inferred from data + + Returns + ------- + TensorDataset + PyTorch dataset with features and labels + + """ X = adata.X if issparse(X): X = X.toarray() @@ -88,22 +105,39 @@ def run_test_otdd(): def run_test_case(source_file): + """Calculate similarity matrices between source and target datasets. 
+ + Parameters + ---------- + source_file : str + Name of the source dataset file + + Returns + ------- + pandas.DataFrame + Similarity scores for different metrics + + """ ans = {} for target_file in target_files: # source_data=sc.read_h5ad(f"{data_root}/{source_file}.h5ad") # target_data=sc.read_h5ad(f"{data_root}/{target_file}.h5ad") source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir) target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir) + + # Initialize similarity calculator with multiple metrics similarity_calculator = AnnDataSimilarity(adata1=source_data, adata2=target_data, sample_size=10, init_random_state=42, n_runs=1, ground_truth_conf_path="Cell Type Annotation Atlas.xlsx", adata1_name=source_file, adata2_name=target_file) + + # Calculate similarity using multiple methods ans[target_file] = similarity_calculator.get_similarity_matrix_A2B(methods=[ "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", "ground_truth", "mmd", "metadata_sim" ]) - # with open(f'sim_{source_file}.json', 'w') as f: - # json.dump(ans, f,indent=4,cls=CustomEncoder) + + # Convert results to DataFrame and save ans = pd.DataFrame(ans) ans.to_csv(f'sim_{source_file}.csv') return ans diff --git a/examples/atlas/sc_similarity_examples/vis.py b/examples/atlas/sc_similarity_examples/vis.py index a547851c..a9f28ec5 100644 --- a/examples/atlas/sc_similarity_examples/vis.py +++ b/examples/atlas/sc_similarity_examples/vis.py @@ -28,9 +28,29 @@ ground_truth_conf = pd.read_excel(file_root / "Cell Type Annotation Atlas.xlsx", sheet_name="blood", index_col=0) methods = ["cta_actinn", "cta_celltypist", "cta_scdeepsort", "cta_singlecellnet"] feature_name = "spectral" +"""Visualization script for comparing model performance across different datasets and +methods. + +This script loads experiment results from wandb and compares them with atlas-based +predictions, generating violin plots to visualize the distribution of accuracies. + +""" def get_accs(sweep): + """Extract test accuracies from a wandb sweep. + + Parameters + ---------- + sweep : wandb.Sweep + Sweep object containing multiple runs + + Returns + ------- + list + List of test accuracies from all runs + + """ ans = [] for run in sweep.runs: if "test_acc" in run.summary: @@ -39,6 +59,19 @@ def get_accs(sweep): def get_runs(sweep_record): + """Parse sweep URLs and collect all run results. + + Parameters + ---------- + sweep_record : str + String containing sweep URLs for different steps + + Returns + ------- + list + Combined list of test accuracies from all sweeps + + """ step_links = {} pattern = r'(step\d+):((?:https?://[^|,]+(?:,)?)+)' matches = re.finditer(pattern, sweep_record) @@ -57,16 +90,45 @@ def get_runs(sweep_record): def get_atlas_ans(query_dataset, method): + """Calculate atlas-based prediction accuracy for a given dataset and method. 
+ + Parameters + ---------- + query_dataset : str + Dataset identifier + method : str + Method name to evaluate + + Returns + ------- + float + Predicted accuracy based on atlas similarity + + """ data = pd.read_excel("Blood_similarity.xlsx", sheet_name=query_dataset[:4], index_col=0) - weight1 = 1.0 - weight2 = 0.0 + weight1 = 1.0 # Weight for feature-based similarity + weight2 = 0.0 # Weight for metadata similarity weighted_sum = data.loc[feature_name, :] * weight1 + data.loc["metadata_sim", :] * weight2 - atlas_dataset_res = weighted_sum.idxmax() + atlas_dataset_res = weighted_sum.idxmax() # Get most similar dataset max_value = weighted_sum.max() return data.loc[:, atlas_dataset_res][method] def vis(data, target_value, title, ax): + """Create violin plot comparing distribution of accuracies with atlas prediction. + + Parameters + ---------- + data : list + List of accuracy values + target_value : float + Atlas-predicted accuracy value + title : str + Plot title + ax : matplotlib.axes.Axes + Axes object to plot on + + """ # sns.boxplot(data=data, color='skyblue',ax=ax) # if target_value is not np.nan: # ax.axhline(y=target_value, color='red', linestyle='--', linewidth=2, label=f'atlas_value = {target_value}') @@ -102,9 +164,11 @@ def vis(data, target_value, title, ax): runs = json.load(f) plt.style.use("default") + # Generate visualization for each dataset for query_dataset in query_datasets: fig, axes = plt.subplots(2, 2, figsize=(15, 10)) axes = axes.flatten() + # Create subplot for each method for i, method in enumerate(methods): vis(runs[query_dataset][method], get_atlas_ans(query_dataset, method), f"{query_dataset}_{method}", axes[i]) plt.tight_layout() From 6878afa65c1ee765fc943e90005c14da6179ba97 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 15:41:25 +0800 Subject: [PATCH 191/203] add notes --- dance/atlas/data_dropbox_upload.py | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/dance/atlas/data_dropbox_upload.py b/dance/atlas/data_dropbox_upload.py index 718e07c9..87b6d78e 100644 --- a/dance/atlas/data_dropbox_upload.py +++ b/dance/atlas/data_dropbox_upload.py @@ -12,6 +12,23 @@ def upload_file_to_dropbox(dropbox_path, access_token, local_path): + """Upload a local file to Dropbox. + + Parameters + ---------- + dropbox_path : str + Destination path in Dropbox + access_token : str + Dropbox API access token + local_path : str or pathlib.Path + Path to local file to upload + + Returns + ------- + None + Returns None if upload fails + + """ dbx = dropbox.Dropbox(access_token) # Verify access token @@ -29,6 +46,18 @@ def upload_file_to_dropbox(dropbox_path, access_token, local_path): def file_upload(dbx: dropbox.Dropbox, local_path: pathlib.Path, remote_path: str): + """Upload large files to Dropbox using chunked upload. + + Parameters + ---------- + dbx : dropbox.Dropbox + Authenticated Dropbox client + local_path : pathlib.Path + Path to local file + remote_path : str + Destination path in Dropbox + + """ CHUNKSIZE = 100 * 1024 * 1024 upload_session_start_result = dbx.files_upload_session_start(b'') cursor = dropbox.files.UploadSessionCursor(session_id=upload_session_start_result.session_id, offset=0) @@ -86,10 +115,35 @@ def get_link(data_fname, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): def get_ans(data: sc.AnnData, tissue: str, dataset_id: str, local_path, ACCESS_TOKEN, DROPBOX_DEST_PATH): + """Generate metadata dictionary for dataset and upload to Dropbox. 
+ + Parameters + ---------- + data : sc.AnnData + Annotated data matrix + tissue : str + Tissue type + dataset_id : str + Unique identifier for dataset + local_path : str or pathlib.Path + Path to local data file + ACCESS_TOKEN : str + Dropbox API access token + DROPBOX_DEST_PATH : str + Base path in Dropbox for uploads + + Returns + ------- + dict + Metadata dictionary containing dataset information and Dropbox URLs + + """ # keys=["species","tissue","dataset","split","celltype_fname","celltype_url","data_fname","data_url"] + # Create metadata dictionary with dataset info ans = {} ans["species"] = "human" ans["tissue"] = tissue.capitalize() + # Store number of observations (cells) in dataset ans["dataset"] = data.n_obs ans["split"] = "train" ans["celltype_fname"] = "" From 04ab7eb2138754e0f9c656c5e46ca674a9e143a7 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 15:45:28 +0800 Subject: [PATCH 192/203] add notes --- .../atlas/sc_similarity/anndata_similarity.py | 79 +++++++++++++++++-- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/dance/atlas/sc_similarity/anndata_similarity.py b/dance/atlas/sc_similarity/anndata_similarity.py index 88784a9d..b7d392bf 100644 --- a/dance/atlas/sc_similarity/anndata_similarity.py +++ b/dance/atlas/sc_similarity/anndata_similarity.py @@ -31,6 +31,32 @@ def get_anndata(tissue: str = "Blood", species: str = "human", filetype: str = " class AnnDataSimilarity: + """A class to compute various similarity metrics between two AnnData objects. + + Parameters + ---------- + adata1 : anndata.AnnData + First AnnData object for comparison + adata2 : anndata.AnnData + Second AnnData object for comparison + sample_size : Optional[int] + Number of cells to sample from each dataset. If None, uses min(adata1.n_obs, adata2.n_obs) + init_random_state : Optional[int] + Random seed for reproducibility + n_runs : int + Number of times to run each similarity computation + ground_truth_conf_path : Optional[str] + Path to ground truth configuration file + adata1_name : Optional[str] + Name identifier for first dataset + adata2_name : Optional[str] + Name identifier for second dataset + methods : List[str] + List of cell type annotation methods to use + tissue : str + Tissue type being analyzed + + """ def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size: Optional[int] = None, init_random_state: Optional[int] = None, n_runs: int = 10, @@ -52,6 +78,14 @@ def __init__(self, adata1: anndata.AnnData, adata2: anndata.AnnData, sample_size self.n_runs = n_runs def filter_gene(self, n_top_genes=3000): + """Filter genes to keep only highly variable genes common between datasets. + + Parameters + ---------- + n_top_genes : int + Number of top variable genes to select + + """ sc.pp.highly_variable_genes(self.origin_adata1, n_top_genes=n_top_genes, flavor='seurat_v3') sc.pp.highly_variable_genes(self.origin_adata2, n_top_genes=n_top_genes, flavor='seurat_v3') @@ -159,6 +193,14 @@ def jsd(p, q): return np.nanmean(similarity_matrix) def compute_mmd(self) -> float: + """Compute Maximum Mean Discrepancy between datasets. + + Returns + ------- + float + Normalized MMD similarity score between 0 and 1 + + """ X = self.X Y = self.Y kernel = "rbf" @@ -196,8 +238,14 @@ def data_company(self): raise NotImplementedError("data company") def wasserstein_dist(self) -> float: - """Computes the average Wasserstein distance between all pairs of cells from the - two datasets.""" + """Compute Wasserstein distance between datasets. 
+ + Returns + ------- + float + Normalized Wasserstein similarity score between 0 and 1 + + """ X = self.X Y = self.Y a = np.ones((X.shape[0], )) / X.shape[0] @@ -272,6 +320,15 @@ def spectral_distance(self): return 1 / (1 + np.linalg.norm(eig_A - eig_B)) def get_dataset_meta_sim(self): + """Compute metadata similarity between datasets based on discrete and continuous + features. + + Returns + ------- + float + Average similarity score across all metadata features + + """ # dis_cols=['assay', 'cell_type', 'development_stage','disease','is_primary_data','self_reported_ethnicity','sex', 'suspension_type', 'tissue','tissue_type', 'tissue_general'] con_cols = [ "nnz_mean", "nnz_var", "nnz_counts_mean", "nnz_counts_var", "n_measured_vars", "n_counts_mean", @@ -357,11 +414,19 @@ def compute_similarity( 'cosine', 'pearson', 'jaccard', 'js_distance', 'otdd', 'common_genes_num', "ground_truth", "metadata_sim" ] ) -> Dict[str, float]: - """Computes the specified similarity measure. Parameters: - - methods: List of similarity measures to be computed. Supports 'cosine', 'pearson', 'jaccard', 'js_distance', 'wasserstein','otdd' - Returns: - Dictionary containing the similarity matrices + """Compute multiple similarity metrics between datasets. + + Parameters + ---------- + random_state : int + Random seed for cell sampling + methods : List[str] + List of similarity methods to compute + + Returns + ------- + Dict[str, float] + Dictionary mapping method names to similarity scores """ self.adata1 = self.origin_adata1.copy() From 844d8839a7e43f0152e4ea362fe7737162125e23 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:10:16 +0800 Subject: [PATCH 193/203] add notes --- .../sc_similarity_examples/sim_query_atlas.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/atlas/sc_similarity_examples/sim_query_atlas.py b/examples/atlas/sc_similarity_examples/sim_query_atlas.py index 8e78393c..a424df17 100644 --- a/examples/atlas/sc_similarity_examples/sim_query_atlas.py +++ b/examples/atlas/sc_similarity_examples/sim_query_atlas.py @@ -26,15 +26,26 @@ def find_unique_matching_row(df, config_col, input_dict_list): - """Find a unique matching row in DataFrame based on the specified column and input - dictionary list. - - :param df: pandas.DataFrame, containing the data to search. - :param config_col: str, name of the DataFrame column containing dictionary list - strings. - :param input_dict_list: list of dicts, input dictionary list for matching. - :return: pandas.Series, the matching row. - :raises ValueError: if the number of matching rows is not equal to 1. + """Find a unique matching row in DataFrame based on specified criteria. 
+ + Parameters + ---------- + df : pandas.DataFrame + DataFrame containing the data to search + config_col : str + Name of the DataFrame column containing dictionary list strings + input_dict_list : list of dict + Input dictionary list for matching + + Returns + ------- + pandas.Series + The matching row from the DataFrame + + Raises + ------ + ValueError + If the number of matching rows is not exactly one """ From 1512c01e9931c8ec39b4b688c1ae08ecaf93bc3b Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:20:20 +0800 Subject: [PATCH 194/203] add notes --- examples/atlas/upload_data.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/atlas/upload_data.py b/examples/atlas/upload_data.py index 2a6f3f8b..6713aff8 100644 --- a/examples/atlas/upload_data.py +++ b/examples/atlas/upload_data.py @@ -24,6 +24,25 @@ DROPBOX_DEST_PATH = args.dropbox_dest_path # Destination path on Dropbox def get_data(dataset_id, in_atlas=False, large=False): + """Load h5ad dataset from local path. + + Parameters + ---------- + dataset_id : str + Identifier for the dataset + in_atlas : bool + Whether dataset is from atlas (True) or query (False) + large : bool + Whether dataset is large (>10000 cells) requiring sampling + + Returns + ------- + AnnData + Loaded single cell data + Path + Local path to the data file + + """ if large: if in_atlas: local_path = MAINDIR / f"sampled-10000/{tissue}/{dataset_id}.h5ad" From 7549f1d28fcb57f7f5c9ff3e1c06f12a1be72d71 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:50:23 +0800 Subject: [PATCH 195/203] minor --- examples/tuning/joint_embedding_dcca/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index dd8f9f76..af76f595 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing import dance.utils.metrics as metrics +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA @@ -122,8 +122,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) # feature_channel_type=["layers", "layers", None, None, "obsm", "obsm"], # feature_channel=["counts", "counts", None, None, "size_factors", # "size_factors"], label_channel="labels") - #TODO 感觉layers中的counts才是raw - #TODO 的确感觉layers中的counts才是raw,不知道反过来影响大不大 + # TODO Feels like counts in layers should be raw + # TODO Indeed feels like counts in layers should be raw, not sure how big the reverse impact would be (x_train, y_train, x_train_raw, y_train_raw, x_train_size, y_train_size), train_labels = data.get_train_data(return_type="torch") (x_test, y_test, x_test_raw, y_test_raw, x_test_size, @@ -201,7 +201,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) logger.info(f"Variable '{var}' does not exist, continuing...") torch.cuda.empty_cache() gc.collect() - #主要是报错时没有执行这些命令导致的,我感觉 + # This is mainly caused by these commands not being executed when errors occur, I think entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( From f0b8db47f24db4985efd7c3ce048711882e071fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:51:32 +0000 
Subject: [PATCH 196/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_dcca/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_dcca/main.py b/examples/tuning/joint_embedding_dcca/main.py index af76f595..42465bcb 100644 --- a/examples/tuning/joint_embedding_dcca/main.py +++ b/examples/tuning/joint_embedding_dcca/main.py @@ -12,10 +12,10 @@ import scipy import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing import dance.utils.metrics as metrics -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.dcca import DCCA From 3c3f527c3f52ad61b0ec1e6863b235dcf2f9f825 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:55:00 +0800 Subject: [PATCH 197/203] minor --- examples/tuning/joint_embedding_scmogcn/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 39152202..14161f93 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper @@ -100,7 +100,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) data.data["mod1"].obsm["phase_labels"] = np.zeros(data.data['mod1'].shape[0]) # train_size = len(data.get_split_idx("train")) - #按理说meta1应该包括mod1前半部分的所有内容,可能中途打乱了顺序 + # In theory, meta1 should include all content from the first half of mod1, the order might have been shuffled during processing data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod1")(data) data = CellFeatureBipartiteGraph(cell_feature_channel="feature.cell", mod="mod2")(data) # data.set_config( From 93d96c1759869db34215b73eae9b4fb0e8eebffd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:55:54 +0000 Subject: [PATCH 198/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmogcn/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tuning/joint_embedding_scmogcn/main.py b/examples/tuning/joint_embedding_scmogcn/main.py index 14161f93..d860be0e 100644 --- a/examples/tuning/joint_embedding_scmogcn/main.py +++ b/examples/tuning/joint_embedding_scmogcn/main.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmogcn import ScMoGCNWrapper From e2bd54078403f0fea42f128a7613e947ad2d67f9 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:58:11 +0800 Subject: [PATCH 199/203] minor --- examples/tuning/joint_embedding_scmvae/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index e20be682..e4c0f3aa 100644 --- 
a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils -import wandb from sklearn import preprocessing +import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE @@ -134,7 +134,8 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test = torch.cat([x_train, x_test]) y_test = torch.cat([y_train, y_test]) - labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 + labels = torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"]) + ) # This might be problematic, likely due to dimensionality reduction issues model = scMVAE( encoder_1=[Nfeature1, 1024, 128, 128], hidden_1=128, From 312ab95f01e9cf5eea66985774373ef6a7ee1951 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 16:58:28 +0800 Subject: [PATCH 200/203] minor --- examples/tuning/predict_modality_babel/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py index 112317f7..5f47bbbb 100644 --- a/examples/tuning/predict_modality_babel/main.py +++ b/examples/tuning/predict_modality_babel/main.py @@ -6,8 +6,8 @@ import pandas as pd import torch -import wandb +import wandb from dance import logger from dance.datasets.multimodality import ModalityPredictionDataset from dance.modules.multi_modality.predict_modality.babel import BabelWrapper @@ -77,7 +77,7 @@ def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer) x_test, y_test = data.get_test_data(return_type="torch") x_train, y_train, x_test, y_test = x_train.float(), y_train.float(), x_test.float(), y_test.float() # Train and evaluate the model - #突然想到,或许有些算法可以降维,而有些算法不能降维,所以还是要依据算法而定 + # Just realized some algorithms can do dimensionality reduction while others cannot, so it depends on the algorithm model = BabelWrapper(args, dim_in=x_train.shape[1], dim_out=y_train.shape[1]) model.fit(x_train, y_train, val_ratio=0.15) wandb.log({'rmse': model.score(x_test, y_test)}) From 5e51a664505f0704fa844eae00b0f074b832920e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:59:11 +0000 Subject: [PATCH 201/203] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tuning/joint_embedding_scmvae/main.py | 2 +- examples/tuning/predict_modality_babel/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tuning/joint_embedding_scmvae/main.py b/examples/tuning/joint_embedding_scmvae/main.py index e4c0f3aa..5c1e264c 100644 --- a/examples/tuning/joint_embedding_scmvae/main.py +++ b/examples/tuning/joint_embedding_scmvae/main.py @@ -9,9 +9,9 @@ import pandas as pd import torch import torch.utils.data as data_utils +import wandb from sklearn import preprocessing -import wandb from dance import logger from dance.datasets.multimodality import JointEmbeddingNIPSDataset from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE diff --git a/examples/tuning/predict_modality_babel/main.py b/examples/tuning/predict_modality_babel/main.py index 5f47bbbb..97062150 100644 --- a/examples/tuning/predict_modality_babel/main.py +++ 
b/examples/tuning/predict_modality_babel/main.py @@ -6,8 +6,8 @@ import pandas as pd import torch - import wandb + from dance import logger from dance.datasets.multimodality import ModalityPredictionDataset from dance.modules.multi_modality.predict_modality.babel import BabelWrapper From 1a98457e0b1498391e9f1bcf38c0b4491204755e Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 17:03:45 +0800 Subject: [PATCH 202/203] minor --- .../joint_embedding_scmvae/main.py | 208 ------------------ 1 file changed, 208 deletions(-) delete mode 100644 examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py diff --git a/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py b/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py deleted file mode 100644 index 9fb85885..00000000 --- a/examples/tuning/temp_scmvae/joint_embedding_scmvae/main.py +++ /dev/null @@ -1,208 +0,0 @@ -import argparse -import gc -import os -import pprint -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -import torch -import torch.utils.data as data_utils -import wandb -from sklearn import preprocessing - -from dance import logger -from dance.datasets.multimodality import JointEmbeddingNIPSDataset -from dance.modules.multi_modality.joint_embedding.scmvae import scMVAE -from dance.pipeline import PipelinePlaner, get_step3_yaml, run_step3, save_summary_data -from dance.transforms.preprocess import calculate_log_library_size -from dance.utils import set_seed - - -def parameter_setting(): - parser = argparse.ArgumentParser(description="Single cell Multi-omics data analysis") - - parser.add_argument("--workdir", "-wk", type=str, default="./new_test", help="work path") - parser.add_argument("--outdir", "-od", type=str, default="./new_test", help="Output path") - - parser.add_argument("--lr", type=float, default=1E-3, help="Learning rate") - parser.add_argument("--weight_decay", type=float, default=1e-6, help="weight decay") - parser.add_argument("--eps", type=float, default=0.01, help="eps") - parser.add_argument("--runs", type=int, default=1, help="Number of repetitions") - - parser.add_argument("--batch_size", "-b", type=int, default=64, help="Batch size") - parser.add_argument('-seed', '--seed', type=int, default=1, help='Random seed for repeat results') - parser.add_argument("--latent", "-l", type=int, default=10, help="latent layer dim") - parser.add_argument("--max_epoch", "-me", type=int, default=25, help="Max epoches") - parser.add_argument("--max_iteration", "-mi", type=int, default=3000, help="Max iteration") - parser.add_argument("--anneal_epoch", "-ae", type=int, default=200, help="Anneal epoch") - parser.add_argument("--epoch_per_test", "-ept", type=int, default=1, - help="Epoch per test, must smaller than max iteration.") - parser.add_argument("--max_ARI", "-ma", type=int, default=-200, help="initial ARI") - parser.add_argument("-t", "--subtask", default="openproblems_bmmc_cite_phase2") - parser.add_argument("-device", "--device", default="cuda") - parser.add_argument("--final_rate", type=float, default=1e-4) - parser.add_argument("--scale_factor", type=float, default=4) - - parser.add_argument("--cache", action="store_true", help="Cache processed data.") - parser.add_argument("--tune_mode", default="pipeline_params", choices=["pipeline", "params", "pipeline_params"]) - parser.add_argument("--count", type=int, default=2) - parser.add_argument("--sweep_id", type=str, default=None) - parser.add_argument("--summary_file_path", default="results/pipeline/best_test_acc.csv", 
type=str) - parser.add_argument("--root_path", default=str(Path(__file__).resolve().parent), type=str) - - return parser - - -if __name__ == "__main__": - parser = parameter_setting() - args = parser.parse_args() - assert args.max_iteration > args.epoch_per_test - device = torch.device(args.device) - args.lr = 0.001 - args.anneal_epoch = 200 - res = None - logger.info(f"\n{pprint.pformat(vars(args))}") - file_root_path = Path(args.root_path, args.subtask).resolve() - logger.info(f"\n files is saved in {file_root_path}") - pipeline_planer = PipelinePlaner.from_config_file(f"{file_root_path}/{args.tune_mode}_tuning_config.yaml") - os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"] = "2000" - - def evaluate_pipeline(tune_mode=args.tune_mode, pipeline_planer=pipeline_planer): - wandb.init(settings=wandb.Settings(start_method='thread')) - set_seed(args.seed) - wandb_config = wandb.config - if "run_kwargs" in pipeline_planer.config: - if any(d == dict(wandb.config["run_kwargs"]) for d in pipeline_planer.config.run_kwargs): - wandb_config = wandb_config["run_kwargs"] - else: - wandb.log({"skip": 1}) - wandb.finish() - return - try: - dataset = JointEmbeddingNIPSDataset(args.subtask, root="./data/joint_embedding") - data = dataset.load_data() - - le = preprocessing.LabelEncoder() - labels = le.fit_transform(data.mod["test_sol"].obs["cell_type"]) - data.mod["mod1"].obsm["labels"] = labels - - # Prepare preprocessing pipeline and apply it to data - kwargs = {tune_mode: dict(wandb_config)} - preprocessing_pipeline = pipeline_planer.generate(**kwargs) - print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}") - preprocessing_pipeline(data) - train_name = [item for item in data.mod["mod1"].obs_names if item in data.mod["meta1"].obs_names] - train_idx = [data.mod["mod1"].obs_names.get_loc(name) for name in train_name] - test_idx = list({i for i in range(data.mod["mod1"].shape[0])}.difference(set(train_idx))) - - # train_size=data.mod["meta1"].shape[0] - # test_size=data.mod["mod1"].shape[0]-train_size - data.set_split_idx("train", train_idx) - data.set_split_idx("test", test_idx) - (x_train, y_train, x_train_raw, y_train_raw), _ = data.get_train_data(return_type="torch") - (x_test, y_test, x_test_raw, y_test_raw), labels = data.get_test_data(return_type="torch") - # x_train,y_train,x_test,y_test,labels=torch.nan_to_num(x_train),torch.nan_to_num(y_train),torch.nan_to_num(x_test),torch.nan_to_num(y_test),torch.nan_to_num(labels) - lib_mean1, lib_var1 = calculate_log_library_size(np.concatenate([x_train_raw.numpy(), x_test_raw.numpy()])) - lib_mean2, lib_var2 = calculate_log_library_size(np.concatenate([y_train_raw.numpy(), y_test_raw.numpy()])) - lib_mean1 = torch.from_numpy(lib_mean1) - lib_var1 = torch.from_numpy(lib_var1) - lib_mean2 = torch.from_numpy(lib_mean2) - lib_var2 = torch.from_numpy(lib_var2) - - Nfeature1 = x_train.shape[1] - Nfeature2 = y_train.shape[1] - # train_size = len(data.get_split_idx("train")) - # train_size=x_train.shape[0] - train = data_utils.TensorDataset(x_train, lib_mean1[train_idx], lib_var1[train_idx], lib_mean2[train_idx], - lib_var2[train_idx], y_train) - - valid = data_utils.TensorDataset(x_test, lib_mean1[test_idx], lib_var1[test_idx], lib_mean2[test_idx], - lib_var2[test_idx], y_test) - - total = data_utils.TensorDataset(torch.cat([x_train, x_test]), torch.cat([y_train, y_test])) - - total_loader = data_utils.DataLoader(total, batch_size=args.batch_size, shuffle=False) - - x_test = torch.cat([x_train, x_test]) - y_test = torch.cat([y_train, y_test]) - labels = 
torch.from_numpy(le.fit_transform(data.mod["test_sol"].obs["cell_type"])) #这里大概会有问题,很可能就是降维的问题 - model = scMVAE( - encoder_1=[Nfeature1, 1024, 128, 128], - hidden_1=128, - Z_DIMS=22, - decoder_share=[22, 128, 256], - share_hidden=128, - decoder_1=[128, 128, 1024], - hidden_2=1024, - encoder_l=[Nfeature1, 128], - hidden3=128, - encoder_2=[Nfeature2, 1024, 128, 128], - hidden_4=128, - encoder_l1=[Nfeature2, 128], - hidden3_1=128, - decoder_2=[128, 128, 1024], - hidden_5=1024, - drop_rate=0.1, - log_variational=True, - Type="ZINB", - device=device, - n_centroids=22, - penality="GMM", - model=1, - ) - model.to(device) - model.init_gmm_params(total_loader) - model.fit(args, train, valid, args.final_rate, args.scale_factor, device) - - # embeds = model.predict(x_test, y_test).cpu().numpy() - score = model.score(x_test, y_test, labels) - # score.update(model.score(x_test, y_test, labels, adata_sol=data.data['test_sol'], metric="openproblems")) - score["ARI"] = score["dance_ari"] - del score["dance_ari"] - wandb.log(score) - wandb.finish() - finally: - locals_keys = list(locals().keys()) - for var in locals_keys: - try: - exec(f"del {var}") - logger.info(f"Deleted '{var}'") - except NameError: - logger.info(f"Variable '{var}' does not exist, continuing...") - torch.cuda.empty_cache() - gc.collect() - # score.update({ - # 'seed': args.seed + k, - # 'subtask': args.subtask, - # 'method': 'scmvae', - # }) - - # if res is not None: - # res = res.append(score, ignore_index=True) - # else: - # for s in score: - # score[s] = [score[s]] - # res = pd.DataFrame(score) - - entity, project, sweep_id = pipeline_planer.wandb_sweep_agent( - evaluate_pipeline, sweep_id=args.sweep_id, count=args.count) #Score can be recorded for each epoch - save_summary_data(entity, project, sweep_id, summary_file_path=args.summary_file_path, root_path=file_root_path) - if args.tune_mode == "pipeline" or args.tune_mode == "pipeline_params": - get_step3_yaml(result_load_path=f"{args.summary_file_path}", step2_pipeline_planer=pipeline_planer, - conf_load_path=f"{Path(args.root_path).resolve().parent}/step3_default_params.yaml", - root_path=file_root_path, - required_funs=["AlignMod", "FilterCellsCommonMod", "FilterCellsCommonMod", - "SetConfig"], required_indexes=[2, 11, 14, sys.maxsize], metric="ARI") - if args.tune_mode == "pipeline_params": - run_step3(file_root_path, evaluate_pipeline, tune_mode="params", step2_pipeline_planer=pipeline_planer) -"""To reproduce scMVAE on other samples, please refer to command lines belows: - -GEX-ADT: -$ python scmvae.py --subtask openproblems_bmmc_cite_phase2 --device cuda - -GEX-ATAC: -$ python scmvae.py --subtask openproblems_bmmc_multiome_phase2 --device cuda - -""" From a19aa536eac2683f35292d30d967016259e066b9 Mon Sep 17 00:00:00 2001 From: xzy Date: Fri, 20 Dec 2024 21:38:00 +0800 Subject: [PATCH 203/203] update data --- dance/metadata/scdeepsort.csv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dance/metadata/scdeepsort.csv b/dance/metadata/scdeepsort.csv index 39e41209..446cc2f9 100644 --- a/dance/metadata/scdeepsort.csv +++ b/dance/metadata/scdeepsort.csv @@ -170,13 +170,13 @@ human,Kidney,10000,train,,,train_human_Kidney2adb1f8a-a6b1-4909-8ee8-484814e2d4b human,Kidney,10000,train,,,train_human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad,https://www.dropbox.com/scl/fi/feklth6jvnc5qqwvgaydy/human_Kidney0b75c598-0893-4216-afe8-5414cab7739d_data.h5ad?rlkey=28vpy2m90lnri9aekfthrsvr1&dl=1 
human,Kidney,5848,train,,,train_human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad,https://www.dropbox.com/scl/fi/1jq1wrqo1rcl041antcm8/human_Kidney2aa1c93c-4ef3-4e9a-98e7-0bd37933953c_data.h5ad?rlkey=ssgfsiobqfah3pxgqnrsaff6l&dl=1 human,Kidney,9641,train,,,train_human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad,https://www.dropbox.com/scl/fi/o2cnntkrd5j6coeqehv8b/human_Kidney2423ce2c-3149-4cca-a2ff-cf682ea29b5f_data.h5ad?rlkey=5tbupfd3cdvqzy2rix6scvwzu&dl=1 -human,Lung,10000,train,,,train_human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad,https://www.dropbox.com/scl/fi/w0n6axa32nej87tw4rk49/human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data.h5ad?rlkey=8lgoi54y9wtxtfwpmnumpmzex&dl=1 +human,Lung,10000,train,,,train_human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/w0n6axa32nej87tw4rk49/human_Lungfa27492b-82ff-4ab7-ac61-0e2b184eee67_data-Lung.h5ad?rlkey=8lgoi54y9wtxtfwpmnumpmzex&st=1ofxszp7&dl=1 human,Lung,10000,train,,,train_human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad,https://www.dropbox.com/scl/fi/dqhei15s96dg3q8bdd31b/human_Lungf72958f5-7f42-4ebb-98da-445b0c6de516_data.h5ad?rlkey=ykpxbucys97t327fwehflkoa2&dl=1 human,Lung,10000,train,,,train_human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad,https://www.dropbox.com/scl/fi/pwhyse079mo9radk2xzuw/human_Lung3de0ad6d-4378-4f62-b37b-ec0b75a50d94_data.h5ad?rlkey=t60bp7w5mf3k877q1i430oc14&dl=1 human,Lung,10000,train,,,train_human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad,https://www.dropbox.com/scl/fi/w2r13kqrkzdxecvhizm0i/human_Lung1e5bd3b8-6a0e-4959-8d69-cafed30fe814_data.h5ad?rlkey=6s4wbv2ii1d8ged5l8s8lwt6l&dl=1 -human,Lung,10000,train,,,train_human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad,https://www.dropbox.com/scl/fi/ubcw0cyn5uvaq034ysgxl/human_Lung4ed927e9-c099-49af-b8ce-a2652d069333_data.h5ad?rlkey=m57pb8bx4936fnao2yyljqdgz&dl=1 -human,Lung,10000,train,,,train_human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad,https://www.dropbox.com/scl/fi/0vqe7wmb0afoubwnb5srb/human_Lung01209dce-3575-4bed-b1df-129f57fbc031_data.h5ad?rlkey=7necb5o9afgpnppsj74tga5y2&dl=1 -human,Lung,10000,train,,,train_human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad,https://www.dropbox.com/scl/fi/sbe6h2v5dijlu36qd6nyw/human_Lungc5d88abe-f23a-45fa-a534-788985e93dad_data.h5ad?rlkey=gunxweprd7r8e0xlk9mo2kkv3&dl=1 +human,Lung,10000,train,,,train_human_Lung4ed927e9-c099-49af-b8ce-a2652d069333(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/ubcw0cyn5uvaq034ysgxl/human_Lung4ed927e9-c099-49af-b8ce-a2652d069333-Lung-_data.h5ad?rlkey=m57pb8bx4936fnao2yyljqdgz&st=hz86nc4q&dl=1 +human,Lung,10000,train,,,train_human_Lung01209dce-3575-4bed-b1df-129f57fbc031(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/0vqe7wmb0afoubwnb5srb/human_Lung01209dce-3575-4bed-b1df-129f57fbc031-Lung-_data.h5ad?rlkey=7necb5o9afgpnppsj74tga5y2&st=1a1tjz2c&dl=1 +human,Lung,10000,train,,,train_human_Lungc5d88abe-f23a-45fa-a534-788985e93dad(Lung)_data.h5ad,https://www.dropbox.com/scl/fi/sbe6h2v5dijlu36qd6nyw/human_Lungc5d88abe-f23a-45fa-a534-788985e93dad-Lung-_data.h5ad?rlkey=gunxweprd7r8e0xlk9mo2kkv3&st=yn1er34y&dl=1 human,Lung,10000,train,,,train_human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad,https://www.dropbox.com/scl/fi/mz6umlbnjoxynhqklwyxg/human_Lung9968be68-ab65-4a38-9e1a-c9b6abece194_data.h5ad?rlkey=upom03ch71gebjvxq15x59gk9&dl=1 
human,Lung,10000,train,,,train_human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad,https://www.dropbox.com/scl/fi/b2e6gr542wah0t5xtgshh/human_Lung1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1_data.h5ad?rlkey=7pkq1kh7wz6z0qzj4wdj94i79&dl=1 human,Lung,10000,train,,,train_human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad,https://www.dropbox.com/scl/fi/ymmdfevzihlcyjosugyuq/human_Lung486486d4-9462-43e5-9249-eb43fa5a49a6_data.h5ad?rlkey=71rly1fkb21yl8gxy8af42ke7&dl=1