From 702da0c53d8cf82e973756a346a30c052b344e0d Mon Sep 17 00:00:00 2001
From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com>
Date: Sat, 9 Sep 2023 09:28:09 +0800
Subject: [PATCH 01/11] add imputation metadata

---
 dance/metadata/imputation.csv |  9 +++++++
 dance/metadata/imputation.py  | 47 +++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 dance/metadata/imputation.csv
 create mode 100644 dance/metadata/imputation.py

diff --git a/dance/metadata/imputation.csv b/dance/metadata/imputation.csv
new file mode 100644
index 00000000..ace36502
--- /dev/null
+++ b/dance/metadata/imputation.csv
@@ -0,0 +1,9 @@
+pbmc_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0
+mouse_embryo_data,https://www.dropbox.com/s/8ftx1bydoy7kn6p/GSE65525.zip?dl=0
+mouse_brain_data,https://www.dropbox.com/s/zzpotaayy2i29hk/neuron_10k.zip?dl=0
+human_stemcell_data,https://www.dropbox.com/s/g2qua2j3rqcngn6/GSE75748.zip?dl=0
+pbmc_raw_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0
+human_breast_TGFb_data,https://dl.dropboxusercontent.com/scl/fi/qympicswl7slkksbjk7cp/GSE114397.zip?dl=0
+human_breast_Dox_data,https://dl.dropboxusercontent.com/scl/fi/f2ifl5druqjr2rji8h4qq/GSM3141014.zip?dl=0
+human_melanoma_data,https://dl.dropboxusercontent.com/scl/fi/ci9ihqytb4sys3u4xkdbq/human_melanoma_data.zip?dl=0
+mouse_visual_data,https://dl.dropboxusercontent.com/scl/fi/yic1iwhh7a3gp6njyk0nf/mouse_visual_data.zip?dl=0
\ No newline at end of file
diff --git a/dance/metadata/imputation.py b/dance/metadata/imputation.py
new file mode 100644
index 00000000..126a8e90
--- /dev/null
+++ b/dance/metadata/imputation.py
@@ -0,0 +1,47 @@
+import os.path as osp
+IMPUTATION_DATASET_TO_FILE = {
+        "pbmc_data": "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5",
+        "mouse_embryo_data": [
+            osp.join("GSE65525", i)
+            for i in [
+                "GSM1599494_ES_d0_main.csv",
+                "GSM1599497_ES_d2_LIFminus.csv",
+                "GSM1599498_ES_d4_LIFminus.csv",
+                "GSM1599499_ES_d7_LIFminus.csv",
+            ]
+        ],
+        "mouse_brain_data": "neuron_10k_v3_filtered_feature_bc_matrix.h5",
+        "human_stemcell_data": "GSE75748/GSE75748_sc_time_course_ec.csv.gz",
+        "human_breast_TGFb_data": "GSE114397_HMLE_TGFb.csv",
+        "human_breast_Dox_data": "GSM3141014_Zeb1_Dox.csv",
+        "human_melanoma_data": "human_melanoma_data.csv",
+        "mouse_visual_data": ['GSM2746905_B4_11_0h_counts.csv',
+                              # 'GSM2746906_B4_12_0h_counts.csv',
+                              # 'GSM2746922_B7_23_4h_B_counts.csv',
+                              # 'GSM2746895_B1_1_0h_counts.csv',
+                              # 'GSM2746916_B6_20_4h_A_counts.csv',
+                              # 'GSM2746903_B3_9_4h_counts.csv',
+                              # 'GSM2746914_B6_19_4h_A_counts.csv',
+                              # 'GSM2746908_B5_14_0h_counts.csv',
+                              # 'GSM2746907_B5_13_0h_counts.csv',
+                              # 'GSM2746917_B6_20_4h_B_counts.csv',
+                              # 'GSM2746918_B7_21_1h_counts.csv',
+                              # 'GSM2746898_B2_4_1h_counts.csv',
+                              # 'GSM2746909_B5_15_0h_counts.csv',
+                              # 'GSM2746915_B6_19_4h_B_counts.csv',
+                              # 'GSM2746897_B1_3_4h_counts.csv',
+                              # 'GSM2746902_B3_8_1h_counts.csv',
+                              # 'GSM2746911_B6_17_1h_A_counts.csv',
+                              # 'GSM2746904_B3_10_4h_counts.csv',
+                              # 'GSM2746900_B3_6_0h_counts.csv',
+                              # 'GSM2746920_B7_22_4h_B_counts.csv',
+                              # 'GSM2746896_B1_2_1h_counts.csv',
+                              # 'GSM2746921_B7_23_4h_A_counts.csv',
+                              # 'GSM2746899_B3_5_0h_counts.csv',
+                              # 'GSM2746919_B7_22_4h_A_counts.csv',
+                              # 'GSM2746901_B3_7_1h_counts.csv',
+                              # 'GSM2746910_B5_16_0h_counts.csv',
+                              # 'GSM2746912_B6_17_1h_B_counts.csv',
+                              'GSM2746913_B6_18_1h_counts.csv'
+                              ]
+    }
\ No newline at end of file

From 24b8af00005827e9fa01edab1972d0689c40e3f1 Mon Sep 17 00:00:00 2001
From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com>
Date: Sat, 9 Sep 2023 09:30:42 +0800
Subject: [PATCH 02/11] add 3 imputation datasets

---
 dance/datasets/singlemodality.py | 103 ++++++++++++++++---------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py
index bf778dec..7c46e160 100644
--- a/dance/datasets/singlemodality.py
+++ b/dance/datasets/singlemodality.py
@@ -22,7 +22,7 @@
 from dance.utils.download import download_file, download_unzip
 from dance.utils.io import load_data_url_dict_from_csv
 from dance.utils.preprocess import cell_label_to_df
-
+from dance.metadata.imputation import IMPUTATION_DATASET_TO_FILE
 
 def _load_scdeepsort_metadata():
     path = METADIR / "scdeepsort.csv"
@@ -40,13 +40,12 @@ def _load_scdeepsort_metadata():
 
 @register_dataset("scdeepsort")
 class ScDeepSortDataset(BaseDataset):
-
     _DISPLAY_ATTRS = ("species", "tissue", "train_dataset", "test_dataset")
     ALL_URL_DICT: Dict[str, str] = {
-        "train_human_cell_atlas":   "https://www.dropbox.com/s/1itq1pokplbqxhx?dl=1",
-        "test_human_test_data":     "https://www.dropbox.com/s/gpxjnnvwyblv3xb?dl=1",
-        "train_mouse_cell_atlas":   "https://www.dropbox.com/s/ng8d3eujfah9ppl?dl=1",
-        "test_mouse_test_data":     "https://www.dropbox.com/s/pkr28czk5g3al2p?dl=1",
+        "train_human_cell_atlas": "https://www.dropbox.com/s/1itq1pokplbqxhx?dl=1",
+        "test_human_test_data": "https://www.dropbox.com/s/gpxjnnvwyblv3xb?dl=1",
+        "train_mouse_cell_atlas": "https://www.dropbox.com/s/ng8d3eujfah9ppl?dl=1",
+        "test_mouse_test_data": "https://www.dropbox.com/s/pkr28czk5g3al2p?dl=1",
     }  # yapf: disable
     BENCH_URL_DICT, AVAILABLE_DATA = _load_scdeepsort_metadata()
 
@@ -82,17 +81,26 @@ def download_all(self):
                 pass
             os.rename(download_path, move_path)
 
+    def get_all_filenames(self, filetype: str = "csv",
+                          feat_suffix: str = "data", label_suffix: str = "celltype"):
+        filenames = []
+        for id in self.train_dataset:
+            filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{filetype}")
+            filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{filetype}")
+        return filenames
+
     def download(self, download_map=True):
         if self.is_complete():
             return
 
-        # TODO: only download missing files
+        filenames=self.get_all_filenames()
         # Download training and testing data
         for name, url in self.BENCH_URL_DICT.items():
             parts = name.split("_")  # [train|test]_{species}_{tissue}{id}_[celltype|data].csv
             filename = "_".join(parts[1:])
-            filepath = osp.join(self.data_dir, *parts[:2], filename)
-            download_file(url, filepath)
+            if filename in filenames:
+                filepath = osp.join(self.data_dir, *parts[:2], filename)
+                download_file(url, filepath)
 
         if download_map:
             # Download mapping data
@@ -115,7 +123,7 @@ def is_complete_all(self):
     def is_complete(self):
         """Check if benchmarking data is complete."""
         for name in self.BENCH_URL_DICT:
-            filename = name[name.find('mouse'):]
+            filename = name[name.find(self.species):]
             file_i = osp.join(self.data_dir, *name.split("_")[:2], filename)
             if not osp.exists(file_i):
                 logger.info(file_i)
@@ -276,27 +284,8 @@ def _raw_to_dance(self, raw_data: Tuple[ad.AnnData, np.ndarray]):
 
 @register_dataset("imputation")
 class ImputationDataset(BaseDataset):
-
-    URL = {
-        "pbmc_data": "https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0",
-        "mouse_embryo_data": "https://www.dropbox.com/s/8ftx1bydoy7kn6p/GSE65525.zip?dl=0",
-        "mouse_brain_data": "https://www.dropbox.com/s/zzpotaayy2i29hk/neuron_10k.zip?dl=0",
-        "human_stemcell_data": "https://www.dropbox.com/s/g2qua2j3rqcngn6/GSE75748.zip?dl=0"
-    }
-    DATASET_TO_FILE = {
-        "pbmc_data": "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5",
-        "mouse_embryo_data": [
-            osp.join("GSE65525", i)
-            for i in [
-                "GSM1599494_ES_d0_main.csv",
-                "GSM1599497_ES_d2_LIFminus.csv",
-                "GSM1599498_ES_d4_LIFminus.csv",
-                "GSM1599499_ES_d7_LIFminus.csv",
-            ]
-        ],
-        "mouse_brain_data": "neuron_10k_v3_filtered_feature_bc_matrix.h5",
-        "human_stemcell_data": "GSE75748/GSE75748_sc_time_course_ec.csv.gz"
-    }  # yapf: disable
+    URL = load_data_url_dict_from_csv(METADIR / "imputation.csv")
+    DATASET_TO_FILE =IMPUTATION_DATASET_TO_FILE   # yapf: disable
     AVAILABLE_DATA = sorted(URL)
 
     def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1):
@@ -307,20 +296,29 @@ def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1):
 
     def download(self):
 
-        gene_class = ["pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data"]
+        gene_class = ["pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data",
+                      "human_breast_TGFb_data", "human_breast_Dox_data", "human_melanoma_data", "mouse_visual_data"]
 
         file_name = {
             "pbmc_data": "5k.zip?dl=0",
             "mouse_embryo_data": "GSE65525.zip?dl=0",
             "mouse_brain_data": "neuron_10k.zip?dl=0",
-            "human_stemcell_data": "GSE75748.zip?dl=0"
+            "human_stemcell_data": "GSE75748.zip?dl=0",
+            "human_breast_TGFb_data": "GSE114397.zip?dl=0",
+            "human_breast_Dox_data": "GSM3141014.zip?dl=0",
+            "human_melanoma_data": "human_melanoma_data.zip?dl=0",
+            "mouse_visual_data": "mouse_visual_data.zip?dl=0"
         }
 
         dl_files = {
             "pbmc_data": "5k_*",
             "mouse_embryo_data": "GSE65525",
             "mouse_brain_data": "neuron*",
-            "human_stemcell_data": "GSE75748"
+            "human_stemcell_data": "GSE75748",
+            "human_breast_TGFb_data": "GSE11*",
+            "human_breast_Dox_data": "GSM31*",
+            "human_melanoma_data": "human*",
+            "mouse_visual_data": "GSM27*"
         }
 
         if sys.platform != 'win32':
@@ -330,12 +328,13 @@ def download(self):
                 os.system("mkdir " + self.data_dir + "/train")
 
             for class_name in gene_class:
-                if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))):
-                    os.system("mkdir " + self.data_dir + "/train/" + class_name)
-                    os.system("wget " + self.URL[class_name])  # assumes linux... mac needs to install
-                    os.system("unzip " + file_name[class_name])
-                    os.system("rm " + file_name[class_name])
-                    os.system("mv " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/")
+                if self.dataset==gene_class:
+                    if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))):
+                        os.system("mkdir " + self.data_dir + "/train/" + class_name)
+                        os.system("wget " + self.URL[class_name])  # assumes linux... mac needs to install
+                        os.system("unzip " + file_name[class_name])
+                        os.system("rm " + file_name[class_name])
+                        os.system("mv " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/")
             os.system("cp -r " + self.data_dir + "/train/ " + self.data_dir + "/test")
         if sys.platform == 'win32':
             if not osp.exists(self.data_dir):
@@ -343,12 +342,13 @@ def download(self):
             if not osp.exists(self.data_dir + "/train"):
                 os.mkdir(self.data_dir + "/train")
             for class_name in gene_class:
-                if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))):
-                    os.mkdir(self.data_dir + "/train/" + class_name)
-                    os.system("curl " + self.URL[class_name])
-                    os.system("tar -xf " + file_name[class_name])
-                    os.system("del -R " + file_name[class_name])
-                    os.system("move " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/")
+                if self.dataset==gene_class:
+                    if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))):
+                        os.mkdir(self.data_dir + "/train/" + class_name)
+                        os.system("curl " + self.URL[class_name])
+                        os.system("tar -xf " + file_name[class_name])
+                        os.system("del -R " + file_name[class_name])
+                        os.system("move " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/")
             os.system("copy /r " + self.data_dir + "/train/ " + self.data_dir + "/test")
 
     def is_complete(self):
@@ -370,7 +370,7 @@ def _load_raw_data(self) -> ad.AnnData:
         else:
             dataset = self.dataset
 
-        if self.dataset == 'mouse_embryo' or self.dataset == 'mouse_embryo_data':
+        if self.dataset == 'mouse_embryo' or self.dataset == 'mouse_embryo_data' or self.dataset == "mouse_visual_data":
             for i in range(len(self.DATASET_TO_FILE[dataset])):
                 fname = self.DATASET_TO_FILE[dataset][i]
                 data_path = f'{self.data_dir}/train/{dataset}/{fname}'
@@ -394,12 +394,15 @@ def _load_raw_data(self) -> ad.AnnData:
                 raise FileNotFoundError(f"{data_path} does not exist")
 
             if self.DATASET_TO_FILE[dataset][-3:] == 'csv':
-                counts = pd.read_csv(data_path, index_col=0, header=None)
+                counts = pd.read_csv(data_path, header=None, index_col=0)
+                nums = pd.Series(np.arange(counts.shape[1]))
+                nums = pd.DataFrame(nums)
+                nums.columns = ['nums']
                 counts = counts.T
+                counts.index = [i for i in range(counts.shape[0])]
                 adata = ad.AnnData(csr_matrix(counts.values))
-                # adata.obs_names = ["%d"%i for i in range(adata.shape[0])]
-                adata.obs_names = counts.index.tolist()
                 adata.var_names = counts.columns.tolist()
+                adata.obs['nums'] = nums.to_numpy()
             if self.DATASET_TO_FILE[dataset][-2:] == 'gz':
                 counts = pd.read_csv(data_path, index_col=0, compression='gzip', header=0)
                 counts = counts.T

From 3bf9575468a05e18030b03e37081c7ba9d865845 Mon Sep 17 00:00:00 2001
From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com>
Date: Sat, 9 Sep 2023 09:32:13 +0800
Subject: [PATCH 03/11] Update gene_holdout.py

---
 dance/transforms/gene_holdout.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dance/transforms/gene_holdout.py b/dance/transforms/gene_holdout.py
index a0749353..1b1b0982 100644
--- a/dance/transforms/gene_holdout.py
+++ b/dance/transforms/gene_holdout.py
@@ -40,8 +40,8 @@ def __call__(self, data):
         covariance_matrix = np.cov(feat, rowvar=False)
         predictors = []
         for targs in enumerate(targets):
-            genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs)
-            subMatrix = covariance_matrix[targs][:, genes_not_in_target]
+            genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs[1])
+            subMatrix = covariance_matrix[targs[1]][:, genes_not_in_target]
             sorted_idx = np.argsort(-subMatrix, axis=0)
             preds = genes_not_in_target[sorted_idx[:self.n_top].flatten()]
             predictors.append(np.unique(preds))

From 0e05872b901c5b3ca97dc7b225d9e74332e6ed45 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 9 Sep 2023 01:38:24 +0000
Subject: [PATCH 04/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 dance/datasets/singlemodality.py | 24 ++++----
 dance/metadata/imputation.csv    |  2 +-
 dance/metadata/imputation.py     | 97 +++++++++++++++++---------------
 3 files changed, 67 insertions(+), 56 deletions(-)

diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py
index 7c46e160..2870bcda 100644
--- a/dance/datasets/singlemodality.py
+++ b/dance/datasets/singlemodality.py
@@ -17,12 +17,13 @@
 from dance.config import METADIR
 from dance.data import Data
 from dance.datasets.base import BaseDataset
+from dance.metadata.imputation import IMPUTATION_DATASET_TO_FILE
 from dance.registers import register_dataset
 from dance.typing import Dict, List, Optional, Set, Tuple
 from dance.utils.download import download_file, download_unzip
 from dance.utils.io import load_data_url_dict_from_csv
 from dance.utils.preprocess import cell_label_to_df
-from dance.metadata.imputation import IMPUTATION_DATASET_TO_FILE
+
 
 def _load_scdeepsort_metadata():
     path = METADIR / "scdeepsort.csv"
@@ -81,8 +82,7 @@ def download_all(self):
                 pass
             os.rename(download_path, move_path)
 
-    def get_all_filenames(self, filetype: str = "csv",
-                          feat_suffix: str = "data", label_suffix: str = "celltype"):
+    def get_all_filenames(self, filetype: str = "csv", feat_suffix: str = "data", label_suffix: str = "celltype"):
         filenames = []
         for id in self.train_dataset:
             filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{filetype}")
@@ -93,7 +93,7 @@ def download(self, download_map=True):
         if self.is_complete():
             return
 
-        filenames=self.get_all_filenames()
+        filenames = self.get_all_filenames()
         # Download training and testing data
         for name, url in self.BENCH_URL_DICT.items():
             parts = name.split("_")  # [train|test]_{species}_{tissue}{id}_[celltype|data].csv
@@ -296,8 +296,10 @@ def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1):
 
     def download(self):
 
-        gene_class = ["pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data",
-                      "human_breast_TGFb_data", "human_breast_Dox_data", "human_melanoma_data", "mouse_visual_data"]
+        gene_class = [
+            "pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data", "human_breast_TGFb_data",
+            "human_breast_Dox_data", "human_melanoma_data", "mouse_visual_data"
+        ]
 
         file_name = {
             "pbmc_data": "5k.zip?dl=0",
@@ -328,8 +330,9 @@ def download(self):
                 os.system("mkdir " + self.data_dir + "/train")
 
             for class_name in gene_class:
-                if self.dataset==gene_class:
-                    if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))):
+                if class_name in (self.dataset, f"{self.dataset}_data"):  # name may omit "_data"
+                    if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name,
+                                                             dl_files[class_name])))):
                         os.system("mkdir " + self.data_dir + "/train/" + class_name)
                         os.system("wget " + self.URL[class_name])  # assumes linux... mac needs to install
                         os.system("unzip " + file_name[class_name])
@@ -342,8 +345,9 @@ def download(self):
             if not osp.exists(self.data_dir + "/train"):
                 os.mkdir(self.data_dir + "/train")
             for class_name in gene_class:
-                if self.dataset==gene_class:
-                    if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))):
+                if class_name in (self.dataset, f"{self.dataset}_data"):  # name may omit "_data"
+                    if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name,
+                                                             dl_files[class_name])))):
                         os.mkdir(self.data_dir + "/train/" + class_name)
                         os.system("curl " + self.URL[class_name])
                         os.system("tar -xf " + file_name[class_name])
diff --git a/dance/metadata/imputation.csv b/dance/metadata/imputation.csv
index ace36502..4d5480f1 100644
--- a/dance/metadata/imputation.csv
+++ b/dance/metadata/imputation.csv
@@ -6,4 +6,4 @@ pbmc_raw_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0
 human_breast_TGFb_data,https://dl.dropboxusercontent.com/scl/fi/qympicswl7slkksbjk7cp/GSE114397.zip?dl=0
 human_breast_Dox_data,https://dl.dropboxusercontent.com/scl/fi/f2ifl5druqjr2rji8h4qq/GSM3141014.zip?dl=0
 human_melanoma_data,https://dl.dropboxusercontent.com/scl/fi/ci9ihqytb4sys3u4xkdbq/human_melanoma_data.zip?dl=0
-mouse_visual_data,https://dl.dropboxusercontent.com/scl/fi/yic1iwhh7a3gp6njyk0nf/mouse_visual_data.zip?dl=0
\ No newline at end of file
+mouse_visual_data,https://dl.dropboxusercontent.com/scl/fi/yic1iwhh7a3gp6njyk0nf/mouse_visual_data.zip?dl=0
diff --git a/dance/metadata/imputation.py b/dance/metadata/imputation.py
index 126a8e90..fe1b3a52 100644
--- a/dance/metadata/imputation.py
+++ b/dance/metadata/imputation.py
@@ -1,47 +1,54 @@
 import os.path as osp
+
 IMPUTATION_DATASET_TO_FILE = {
-        "pbmc_data": "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5",
-        "mouse_embryo_data": [
-            osp.join("GSE65525", i)
-            for i in [
-                "GSM1599494_ES_d0_main.csv",
-                "GSM1599497_ES_d2_LIFminus.csv",
-                "GSM1599498_ES_d4_LIFminus.csv",
-                "GSM1599499_ES_d7_LIFminus.csv",
-            ]
-        ],
-        "mouse_brain_data": "neuron_10k_v3_filtered_feature_bc_matrix.h5",
-        "human_stemcell_data": "GSE75748/GSE75748_sc_time_course_ec.csv.gz",
-        "human_breast_TGFb_data": "GSE114397_HMLE_TGFb.csv",
-        "human_breast_Dox_data": "GSM3141014_Zeb1_Dox.csv",
-        "human_melanoma_data": "human_melanoma_data.csv",
-        "mouse_visual_data": ['GSM2746905_B4_11_0h_counts.csv',
-                              # 'GSM2746906_B4_12_0h_counts.csv',
-                              # 'GSM2746922_B7_23_4h_B_counts.csv',
-                              # 'GSM2746895_B1_1_0h_counts.csv',
-                              # 'GSM2746916_B6_20_4h_A_counts.csv',
-                              # 'GSM2746903_B3_9_4h_counts.csv',
-                              # 'GSM2746914_B6_19_4h_A_counts.csv',
-                              # 'GSM2746908_B5_14_0h_counts.csv',
-                              # 'GSM2746907_B5_13_0h_counts.csv',
-                              # 'GSM2746917_B6_20_4h_B_counts.csv',
-                              # 'GSM2746918_B7_21_1h_counts.csv',
-                              # 'GSM2746898_B2_4_1h_counts.csv',
-                              # 'GSM2746909_B5_15_0h_counts.csv',
-                              # 'GSM2746915_B6_19_4h_B_counts.csv',
-                              # 'GSM2746897_B1_3_4h_counts.csv',
-                              # 'GSM2746902_B3_8_1h_counts.csv',
-                              # 'GSM2746911_B6_17_1h_A_counts.csv',
-                              # 'GSM2746904_B3_10_4h_counts.csv',
-                              # 'GSM2746900_B3_6_0h_counts.csv',
-                              # 'GSM2746920_B7_22_4h_B_counts.csv',
-                              # 'GSM2746896_B1_2_1h_counts.csv',
-                              # 'GSM2746921_B7_23_4h_A_counts.csv',
-                              # 'GSM2746899_B3_5_0h_counts.csv',
-                              # 'GSM2746919_B7_22_4h_A_counts.csv',
-                              # 'GSM2746901_B3_7_1h_counts.csv',
-                              # 'GSM2746910_B5_16_0h_counts.csv',
-                              # 'GSM2746912_B6_17_1h_B_counts.csv',
-                              'GSM2746913_B6_18_1h_counts.csv'
-                              ]
-    }
\ No newline at end of file
+    "pbmc_data":
+    "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5",
+    "mouse_embryo_data": [
+        osp.join("GSE65525", i) for i in [
+            "GSM1599494_ES_d0_main.csv",
+            "GSM1599497_ES_d2_LIFminus.csv",
+            "GSM1599498_ES_d4_LIFminus.csv",
+            "GSM1599499_ES_d7_LIFminus.csv",
+        ]
+    ],
+    "mouse_brain_data":
+    "neuron_10k_v3_filtered_feature_bc_matrix.h5",
+    "human_stemcell_data":
+    "GSE75748/GSE75748_sc_time_course_ec.csv.gz",
+    "human_breast_TGFb_data":
+    "GSE114397_HMLE_TGFb.csv",
+    "human_breast_Dox_data":
+    "GSM3141014_Zeb1_Dox.csv",
+    "human_melanoma_data":
+    "human_melanoma_data.csv",
+    "mouse_visual_data": [
+        'GSM2746905_B4_11_0h_counts.csv',
+        # 'GSM2746906_B4_12_0h_counts.csv',
+        # 'GSM2746922_B7_23_4h_B_counts.csv',
+        # 'GSM2746895_B1_1_0h_counts.csv',
+        # 'GSM2746916_B6_20_4h_A_counts.csv',
+        # 'GSM2746903_B3_9_4h_counts.csv',
+        # 'GSM2746914_B6_19_4h_A_counts.csv',
+        # 'GSM2746908_B5_14_0h_counts.csv',
+        # 'GSM2746907_B5_13_0h_counts.csv',
+        # 'GSM2746917_B6_20_4h_B_counts.csv',
+        # 'GSM2746918_B7_21_1h_counts.csv',
+        # 'GSM2746898_B2_4_1h_counts.csv',
+        # 'GSM2746909_B5_15_0h_counts.csv',
+        # 'GSM2746915_B6_19_4h_B_counts.csv',
+        # 'GSM2746897_B1_3_4h_counts.csv',
+        # 'GSM2746902_B3_8_1h_counts.csv',
+        # 'GSM2746911_B6_17_1h_A_counts.csv',
+        # 'GSM2746904_B3_10_4h_counts.csv',
+        # 'GSM2746900_B3_6_0h_counts.csv',
+        # 'GSM2746920_B7_22_4h_B_counts.csv',
+        # 'GSM2746896_B1_2_1h_counts.csv',
+        # 'GSM2746921_B7_23_4h_A_counts.csv',
+        # 'GSM2746899_B3_5_0h_counts.csv',
+        # 'GSM2746919_B7_22_4h_A_counts.csv',
+        # 'GSM2746901_B3_7_1h_counts.csv',
+        # 'GSM2746910_B5_16_0h_counts.csv',
+        # 'GSM2746912_B6_17_1h_B_counts.csv',
+        'GSM2746913_B6_18_1h_counts.csv'
+    ]
+}

From 7482a043dbd8f3e701d41123d40b5c759f2a4ebb Mon Sep 17 00:00:00 2001
From: RemyLau <remylau961@gmail.com>
Date: Sat, 9 Sep 2023 10:35:29 -0400
Subject: [PATCH 05/11] remove enumerate

---
 dance/transforms/gene_holdout.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dance/transforms/gene_holdout.py b/dance/transforms/gene_holdout.py
index 1b1b0982..e3d50ec2 100644
--- a/dance/transforms/gene_holdout.py
+++ b/dance/transforms/gene_holdout.py
@@ -39,9 +39,9 @@ def __call__(self, data):
         # Use covariance to select predictors
         covariance_matrix = np.cov(feat, rowvar=False)
         predictors = []
-        for targs in enumerate(targets):
-            genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs[1])
-            subMatrix = covariance_matrix[targs[1]][:, genes_not_in_target]
+        for targs in targets:
+            genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs)
+            subMatrix = covariance_matrix[targs][:, genes_not_in_target]
             sorted_idx = np.argsort(-subMatrix, axis=0)
             preds = genes_not_in_target[sorted_idx[:self.n_top].flatten()]
             predictors.append(np.unique(preds))

From 2428e9e5e5705401dd817abd31ddb880b9f791fb Mon Sep 17 00:00:00 2001
From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com>
Date: Wed, 20 Sep 2023 00:32:33 +0800
Subject: [PATCH 06/11] Update multimodality.py

---
 dance/datasets/multimodality.py | 77 ++++++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 5 deletions(-)

diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py
index 0ff3d00b..dd5ba42a 100644
--- a/dance/datasets/multimodality.py
+++ b/dance/datasets/multimodality.py
@@ -62,8 +62,55 @@ def data_paths(self) -> List[str]:
                 osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
                 osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod2.h5ad"),
                 osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"),
-                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad")
             ]
+            if self.subtask=="10k_pbmc":
+                paths=[
+                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod2.h5ad")]
+            if self.subtask=="pbmc_cite":
+                paths=[
+                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod2.h5ad")]
+            if self.subtask.startswith("5k_pbmc"):
+                paths=[
+                    osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod2.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod2.h5ad")
+                ]
+            if self.subtask.startswith("openproblems_2022"):
+                paths=[
+                    osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod2.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod2.h5ad")
+                ]
+            if self.subtask.startswith("GSE127064"):
+                paths=[
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod2.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod2.h5ad")
+                ]
+            if self.subtask.startswith("GSE117089"):
+                paths=[
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod2.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod2.h5ad")
+                ]
+            if self.subtask.startswith("GSE140203"):
+                paths=[
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod2.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod2.h5ad")
+                ]
         elif self.TASK == "match_modality":
             paths = [
                 osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
@@ -100,6 +147,24 @@ class ModalityPredictionDataset(MultiModalityDataset):
         "https://www.dropbox.com/s/cz60vp7bwapz0kw/openproblems_bmmc_multiome_phase2_rna.zip?dl=1",
         "openproblems_bmmc_cite_phase2_rna_subset":
         "https://www.dropbox.com/s/veytldxkgzyoa8j/openproblems_bmmc_cite_phase2_rna_subset.zip?dl=1",
+        "5k_pbmc":"https://www.dropbox.com/scl/fi/uoyis946glh0oo7g833qj/5k_pbmc.zip?rlkey=mw9cvqq7e12iowfbr9rp7av5u&dl=1",
+        "5k_pbmc_subset":"https://www.dropbox.com/scl/fi/pykqc9zyt1fjypnjf4m1l/5k_pbmc_subset.zip?rlkey=brkmnqhfz5yl9axiuu0f8gmxy&dl=1",
+        "10k_pbmc":"https://www.dropbox.com/scl/fi/npz3n36d3w089creppph2/10k_pbmc.zip?rlkey=6yyv61omv2rw7sqqmfp6u7m1s&dl=1",
+        "pbmc_cite":"https://www.dropbox.com/scl/fi/8yvel9lu2f4pbemjeihzq/pbmc_cite.zip?rlkey=5f5jpjy1fcg14hwzot0hot7xd&dl=1",
+        "openproblems_2022_multi_atac2gex":"https://www.dropbox.com/scl/fi/4ynxepu306g3k6vqpi3aw/openproblems_2022_multi_atac2gex.zip?rlkey=2mq5vjnsh26gg5zgq9d85ikcp&dl=1",
+        "openproblems_2022_cite_gex2adt":"https://www.dropbox.com/scl/fi/dalt3qxwe440107ihjbpy/openproblems_2022_cite_gex2adt.zip?rlkey=ps1fvcr622vhibc1wc1umfdaw&dl=1",
+        "GSE127064_AdBrain_gex2atac":"https://www.dropbox.com/scl/fi/4ybsx6pgiuy6j9m0y92ly/GSE127064_AdBrain_gex2atac.zip?rlkey=6a5u7p7xr2dqsoduflzxjluja&dl=1",
+        "GSE127064_p0Brain_gex2atac":"https://www.dropbox.com/scl/fi/k4p3nkkqq56ev6ljyo5se/GSE127064_p0Brain_gex2atac.zip?rlkey=y7kayqmk2l72jjogzlvfxtl74&dl=1",
+        "GSE117089_mouse_gex2atac":"https://www.dropbox.com/scl/fi/egktuwiognr06xebeuouk/GSE117089_mouse_gex2atac.zip?rlkey=jadp3hlopc3112lmxe6nz5cd1&dl=1",
+        "GSE117089_A549_gex2atac":"https://www.dropbox.com/scl/fi/b7evc2n5ih5o3xxwcd7uq/GSE117089_A549_gex2atac.zip?rlkey=b5o0ykptfodim59qwnu2m89fh&dl=1",
+        "GSE117089_sciCAR_gex2atac":"https://www.dropbox.com/scl/fi/juibpvmtv2otvfsq1xyr7/GSE117089_sciCAR_gex2atac.zip?rlkey=qcdbfqsuhab56bc553cwm78gc&dl=1",
+        "GSE140203_3T3_HG19_atac2gex":"https://www.dropbox.com/scl/fi/v1vbypz87t1rz012vojkh/GSE140203_3T3_HG19_atac2gex.zip?rlkey=xmxrwso5e5ty3w53ctbm5bo9z&dl=1",
+        "GSE140203_3T3_MM10_atac2gex":"https://www.dropbox.com/scl/fi/po9k064twny51subze6df/GSE140203_3T3_MM10_atac2gex.zip?rlkey=q0b4y58bsvacnjrmvsclk4jqu&dl=1",
+        "GSE140203_12878.rep2_atac2gex":"https://www.dropbox.com/scl/fi/jqijimb7h6cv4w4hkax1q/GSE140203_12878.rep2_atac2gex.zip?rlkey=c837xkoacap4wjszffpfrmuak&dl=1",
+        "GSE140203_12878.rep3_atac2gex":"https://www.dropbox.com/scl/fi/wlv9dhvylz78kq8ezncmd/GSE140203_12878.rep3_atac2gex.zip?rlkey=5r607plnqzlxdgxtc4le8d6o1&dl=1",
+        "GSE140203_K562_HG19_atac2gex":"https://www.dropbox.com/scl/fi/n2he1br3u604p3mgniowz/GSE140203_K562_HG19_atac2gex.zip?rlkey=2lhe7s5run8ly5uk4b0vfemyj&dl=1",
+        "GSE140203_K562_MM10_atac2gex":"https://www.dropbox.com/scl/fi/dhdorqy87915uah3xl07a/GSE140203_K562_MM10_atac2gex.zip?rlkey=ecwsy5sp7f2i2gtjo1qyaf4zt&dl=1",
+        "GSE140203_LUNG_atac2gex":"https://www.dropbox.com/scl/fi/gabugiw244ky85j3ckq4d/GSE140203_LUNG_atac2gex.zip?rlkey=uj0we276s6ay2acpioj4tmfj3&dl=1"
     }
     SUBTASK_NAME_MAP = {
         "adt2gex": "openproblems_bmmc_cite_phase2_mod2",
@@ -110,9 +175,10 @@ class ModalityPredictionDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, root="./data", preprocess=None):
+    def __init__(self, subtask, root="./data", preprocess=None,span=0.3):
         # TODO: factor our preprocess
         self.preprocess = preprocess
+        self.span=span
         super().__init__(subtask, root)
 
     def _raw_to_dance(self, raw_data):
@@ -135,7 +201,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000):
         if self.preprocess == "feature_selection":
             if raw_data[0].shape[1] > selection_threshold:
                 sc.pp.highly_variable_genes(raw_data[0], layer="counts", flavor="seurat_v3",
-                                            n_top_genes=selection_threshold)
+                                            n_top_genes=selection_threshold,span=self.span)
                 raw_data[2].var["highly_variable"] = raw_data[0].var["highly_variable"]
                 for i in [0, 2]:
                     raw_data[i] = raw_data[i][:, raw_data[i].var["highly_variable"]]
@@ -169,10 +235,11 @@ class ModalityMatchingDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None):
+    def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None,span=0.3):
         # TODO: factor our preprocess
         self.preprocess = preprocess
         self.pkl_path = pkl_path
+        self.span=span
         super().__init__(subtask, root)
 
     def _raw_to_dance(self, raw_data):
@@ -252,7 +319,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000):
             for i in range(2):
                 if modalities[i].shape[1] > selection_threshold:
                     sc.pp.highly_variable_genes(modalities[i], layer="counts", flavor="seurat_v3",
-                                                n_top_genes=selection_threshold)
+                                                n_top_genes=selection_threshold,span=self.span)
                     modalities[i + 2].var["highly_variable"] = modalities[i].var["highly_variable"]
                     modalities[i] = modalities[i][:, modalities[i].var["highly_variable"]]
                     modalities[i + 2] = modalities[i + 2][:, modalities[i + 2].var["highly_variable"]]

From e5c63778ce427718a49ece01941722e452a308b7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 16:32:51 +0000
Subject: [PATCH 07/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 dance/datasets/multimodality.py | 102 +++++++++++++++++++-------------
 1 file changed, 61 insertions(+), 41 deletions(-)

diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py
index dd5ba42a..c8bc69cb 100644
--- a/dance/datasets/multimodality.py
+++ b/dance/datasets/multimodality.py
@@ -64,48 +64,50 @@ def data_paths(self) -> List[str]:
                 osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"),
                 osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad")
             ]
-            if self.subtask=="10k_pbmc":
-                paths=[
-                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod1.h5ad"),
-                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod2.h5ad"),
-                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod1.h5ad"),
-                osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod2.h5ad")]
-            if self.subtask=="pbmc_cite":
-                paths=[
-                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod1.h5ad"),
-                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod2.h5ad"),
-                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod1.h5ad"),
-                osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod2.h5ad")]
+            if self.subtask == "10k_pbmc":
+                paths = [
+                    osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod2.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod2.h5ad")
+                ]
+            if self.subtask == "pbmc_cite":
+                paths = [
+                    osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod2.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod1.h5ad"),
+                    osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod2.h5ad")
+                ]
             if self.subtask.startswith("5k_pbmc"):
-                paths=[
+                paths = [
                     osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod2.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod2.h5ad")
                 ]
             if self.subtask.startswith("openproblems_2022"):
-                paths=[
+                paths = [
                     osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod2.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod2.h5ad")
                 ]
             if self.subtask.startswith("GSE127064"):
-                paths=[
+                paths = [
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod2.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod2.h5ad")
                 ]
             if self.subtask.startswith("GSE117089"):
-                paths=[
+                paths = [
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod2.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod2.h5ad")
                 ]
             if self.subtask.startswith("GSE140203"):
-                paths=[
+                paths = [
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod1.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod2.h5ad"),
                     osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod1.h5ad"),
@@ -147,24 +149,42 @@ class ModalityPredictionDataset(MultiModalityDataset):
         "https://www.dropbox.com/s/cz60vp7bwapz0kw/openproblems_bmmc_multiome_phase2_rna.zip?dl=1",
         "openproblems_bmmc_cite_phase2_rna_subset":
         "https://www.dropbox.com/s/veytldxkgzyoa8j/openproblems_bmmc_cite_phase2_rna_subset.zip?dl=1",
-        "5k_pbmc":"https://www.dropbox.com/scl/fi/uoyis946glh0oo7g833qj/5k_pbmc.zip?rlkey=mw9cvqq7e12iowfbr9rp7av5u&dl=1",
-        "5k_pbmc_subset":"https://www.dropbox.com/scl/fi/pykqc9zyt1fjypnjf4m1l/5k_pbmc_subset.zip?rlkey=brkmnqhfz5yl9axiuu0f8gmxy&dl=1",
-        "10k_pbmc":"https://www.dropbox.com/scl/fi/npz3n36d3w089creppph2/10k_pbmc.zip?rlkey=6yyv61omv2rw7sqqmfp6u7m1s&dl=1",
-        "pbmc_cite":"https://www.dropbox.com/scl/fi/8yvel9lu2f4pbemjeihzq/pbmc_cite.zip?rlkey=5f5jpjy1fcg14hwzot0hot7xd&dl=1",
-        "openproblems_2022_multi_atac2gex":"https://www.dropbox.com/scl/fi/4ynxepu306g3k6vqpi3aw/openproblems_2022_multi_atac2gex.zip?rlkey=2mq5vjnsh26gg5zgq9d85ikcp&dl=1",
-        "openproblems_2022_cite_gex2adt":"https://www.dropbox.com/scl/fi/dalt3qxwe440107ihjbpy/openproblems_2022_cite_gex2adt.zip?rlkey=ps1fvcr622vhibc1wc1umfdaw&dl=1",
-        "GSE127064_AdBrain_gex2atac":"https://www.dropbox.com/scl/fi/4ybsx6pgiuy6j9m0y92ly/GSE127064_AdBrain_gex2atac.zip?rlkey=6a5u7p7xr2dqsoduflzxjluja&dl=1",
-        "GSE127064_p0Brain_gex2atac":"https://www.dropbox.com/scl/fi/k4p3nkkqq56ev6ljyo5se/GSE127064_p0Brain_gex2atac.zip?rlkey=y7kayqmk2l72jjogzlvfxtl74&dl=1",
-        "GSE117089_mouse_gex2atac":"https://www.dropbox.com/scl/fi/egktuwiognr06xebeuouk/GSE117089_mouse_gex2atac.zip?rlkey=jadp3hlopc3112lmxe6nz5cd1&dl=1",
-        "GSE117089_A549_gex2atac":"https://www.dropbox.com/scl/fi/b7evc2n5ih5o3xxwcd7uq/GSE117089_A549_gex2atac.zip?rlkey=b5o0ykptfodim59qwnu2m89fh&dl=1",
-        "GSE117089_sciCAR_gex2atac":"https://www.dropbox.com/scl/fi/juibpvmtv2otvfsq1xyr7/GSE117089_sciCAR_gex2atac.zip?rlkey=qcdbfqsuhab56bc553cwm78gc&dl=1",
-        "GSE140203_3T3_HG19_atac2gex":"https://www.dropbox.com/scl/fi/v1vbypz87t1rz012vojkh/GSE140203_3T3_HG19_atac2gex.zip?rlkey=xmxrwso5e5ty3w53ctbm5bo9z&dl=1",
-        "GSE140203_3T3_MM10_atac2gex":"https://www.dropbox.com/scl/fi/po9k064twny51subze6df/GSE140203_3T3_MM10_atac2gex.zip?rlkey=q0b4y58bsvacnjrmvsclk4jqu&dl=1",
-        "GSE140203_12878.rep2_atac2gex":"https://www.dropbox.com/scl/fi/jqijimb7h6cv4w4hkax1q/GSE140203_12878.rep2_atac2gex.zip?rlkey=c837xkoacap4wjszffpfrmuak&dl=1",
-        "GSE140203_12878.rep3_atac2gex":"https://www.dropbox.com/scl/fi/wlv9dhvylz78kq8ezncmd/GSE140203_12878.rep3_atac2gex.zip?rlkey=5r607plnqzlxdgxtc4le8d6o1&dl=1",
-        "GSE140203_K562_HG19_atac2gex":"https://www.dropbox.com/scl/fi/n2he1br3u604p3mgniowz/GSE140203_K562_HG19_atac2gex.zip?rlkey=2lhe7s5run8ly5uk4b0vfemyj&dl=1",
-        "GSE140203_K562_MM10_atac2gex":"https://www.dropbox.com/scl/fi/dhdorqy87915uah3xl07a/GSE140203_K562_MM10_atac2gex.zip?rlkey=ecwsy5sp7f2i2gtjo1qyaf4zt&dl=1",
-        "GSE140203_LUNG_atac2gex":"https://www.dropbox.com/scl/fi/gabugiw244ky85j3ckq4d/GSE140203_LUNG_atac2gex.zip?rlkey=uj0we276s6ay2acpioj4tmfj3&dl=1"
+        "5k_pbmc":
+        "https://www.dropbox.com/scl/fi/uoyis946glh0oo7g833qj/5k_pbmc.zip?rlkey=mw9cvqq7e12iowfbr9rp7av5u&dl=1",
+        "5k_pbmc_subset":
+        "https://www.dropbox.com/scl/fi/pykqc9zyt1fjypnjf4m1l/5k_pbmc_subset.zip?rlkey=brkmnqhfz5yl9axiuu0f8gmxy&dl=1",
+        "10k_pbmc":
+        "https://www.dropbox.com/scl/fi/npz3n36d3w089creppph2/10k_pbmc.zip?rlkey=6yyv61omv2rw7sqqmfp6u7m1s&dl=1",
+        "pbmc_cite":
+        "https://www.dropbox.com/scl/fi/8yvel9lu2f4pbemjeihzq/pbmc_cite.zip?rlkey=5f5jpjy1fcg14hwzot0hot7xd&dl=1",
+        "openproblems_2022_multi_atac2gex":
+        "https://www.dropbox.com/scl/fi/4ynxepu306g3k6vqpi3aw/openproblems_2022_multi_atac2gex.zip?rlkey=2mq5vjnsh26gg5zgq9d85ikcp&dl=1",
+        "openproblems_2022_cite_gex2adt":
+        "https://www.dropbox.com/scl/fi/dalt3qxwe440107ihjbpy/openproblems_2022_cite_gex2adt.zip?rlkey=ps1fvcr622vhibc1wc1umfdaw&dl=1",
+        "GSE127064_AdBrain_gex2atac":
+        "https://www.dropbox.com/scl/fi/4ybsx6pgiuy6j9m0y92ly/GSE127064_AdBrain_gex2atac.zip?rlkey=6a5u7p7xr2dqsoduflzxjluja&dl=1",
+        "GSE127064_p0Brain_gex2atac":
+        "https://www.dropbox.com/scl/fi/k4p3nkkqq56ev6ljyo5se/GSE127064_p0Brain_gex2atac.zip?rlkey=y7kayqmk2l72jjogzlvfxtl74&dl=1",
+        "GSE117089_mouse_gex2atac":
+        "https://www.dropbox.com/scl/fi/egktuwiognr06xebeuouk/GSE117089_mouse_gex2atac.zip?rlkey=jadp3hlopc3112lmxe6nz5cd1&dl=1",
+        "GSE117089_A549_gex2atac":
+        "https://www.dropbox.com/scl/fi/b7evc2n5ih5o3xxwcd7uq/GSE117089_A549_gex2atac.zip?rlkey=b5o0ykptfodim59qwnu2m89fh&dl=1",
+        "GSE117089_sciCAR_gex2atac":
+        "https://www.dropbox.com/scl/fi/juibpvmtv2otvfsq1xyr7/GSE117089_sciCAR_gex2atac.zip?rlkey=qcdbfqsuhab56bc553cwm78gc&dl=1",
+        "GSE140203_3T3_HG19_atac2gex":
+        "https://www.dropbox.com/scl/fi/v1vbypz87t1rz012vojkh/GSE140203_3T3_HG19_atac2gex.zip?rlkey=xmxrwso5e5ty3w53ctbm5bo9z&dl=1",
+        "GSE140203_3T3_MM10_atac2gex":
+        "https://www.dropbox.com/scl/fi/po9k064twny51subze6df/GSE140203_3T3_MM10_atac2gex.zip?rlkey=q0b4y58bsvacnjrmvsclk4jqu&dl=1",
+        "GSE140203_12878.rep2_atac2gex":
+        "https://www.dropbox.com/scl/fi/jqijimb7h6cv4w4hkax1q/GSE140203_12878.rep2_atac2gex.zip?rlkey=c837xkoacap4wjszffpfrmuak&dl=1",
+        "GSE140203_12878.rep3_atac2gex":
+        "https://www.dropbox.com/scl/fi/wlv9dhvylz78kq8ezncmd/GSE140203_12878.rep3_atac2gex.zip?rlkey=5r607plnqzlxdgxtc4le8d6o1&dl=1",
+        "GSE140203_K562_HG19_atac2gex":
+        "https://www.dropbox.com/scl/fi/n2he1br3u604p3mgniowz/GSE140203_K562_HG19_atac2gex.zip?rlkey=2lhe7s5run8ly5uk4b0vfemyj&dl=1",
+        "GSE140203_K562_MM10_atac2gex":
+        "https://www.dropbox.com/scl/fi/dhdorqy87915uah3xl07a/GSE140203_K562_MM10_atac2gex.zip?rlkey=ecwsy5sp7f2i2gtjo1qyaf4zt&dl=1",
+        "GSE140203_LUNG_atac2gex":
+        "https://www.dropbox.com/scl/fi/gabugiw244ky85j3ckq4d/GSE140203_LUNG_atac2gex.zip?rlkey=uj0we276s6ay2acpioj4tmfj3&dl=1"
     }
     SUBTASK_NAME_MAP = {
         "adt2gex": "openproblems_bmmc_cite_phase2_mod2",
@@ -175,10 +195,10 @@ class ModalityPredictionDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, root="./data", preprocess=None,span=0.3):
+    def __init__(self, subtask, root="./data", preprocess=None, span=0.3):
         # TODO: factor our preprocess
         self.preprocess = preprocess
-        self.span=span
+        self.span = span
         super().__init__(subtask, root)
 
     def _raw_to_dance(self, raw_data):
@@ -201,7 +221,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000):
         if self.preprocess == "feature_selection":
             if raw_data[0].shape[1] > selection_threshold:
                 sc.pp.highly_variable_genes(raw_data[0], layer="counts", flavor="seurat_v3",
-                                            n_top_genes=selection_threshold,span=self.span)
+                                            n_top_genes=selection_threshold, span=self.span)
                 raw_data[2].var["highly_variable"] = raw_data[0].var["highly_variable"]
                 for i in [0, 2]:
                     raw_data[i] = raw_data[i][:, raw_data[i].var["highly_variable"]]
@@ -235,11 +255,11 @@ class ModalityMatchingDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None,span=0.3):
+    def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None, span=0.3):
         # TODO: factor our preprocess
         self.preprocess = preprocess
         self.pkl_path = pkl_path
-        self.span=span
+        self.span = span
         super().__init__(subtask, root)
 
     def _raw_to_dance(self, raw_data):
@@ -319,7 +339,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000):
             for i in range(2):
                 if modalities[i].shape[1] > selection_threshold:
                     sc.pp.highly_variable_genes(modalities[i], layer="counts", flavor="seurat_v3",
-                                                n_top_genes=selection_threshold,span=self.span)
+                                                n_top_genes=selection_threshold, span=self.span)
                     modalities[i + 2].var["highly_variable"] = modalities[i].var["highly_variable"]
                     modalities[i] = modalities[i][:, modalities[i].var["highly_variable"]]
                     modalities[i + 2] = modalities[i + 2][:, modalities[i + 2].var["highly_variable"]]

From 4a7389e05e07ddde397c7326911918161d4b2454 Mon Sep 17 00:00:00 2001
From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com>
Date: Wed, 20 Sep 2023 00:33:53 +0800
Subject: [PATCH 08/11] Update base.py: drop out-of-range split indices in get_feature

---
 dance/data/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dance/data/base.py b/dance/data/base.py
index 3cb73377..f6bb69f0 100644
--- a/dance/data/base.py
+++ b/dance/data/base.py
@@ -456,6 +456,7 @@ def get_feature(self, *, split_name: Optional[str] = None, return_type: FeatType
         if split_name is not None:
             if channel_type in ["X", "raw_X", "obs", "obsm", "obsp", "layers"]:
                 idx = self.get_split_idx(split_name, error_on_miss=True)
+                idx=list(filter(lambda a:a<feature.shape[0],idx))
                 feature = feature[idx][:, idx] if channel_type == "obsp" else feature[idx]
             else:
                 logger.warning(f"Indexing option for {channel_type!r} not implemented yet.")

From 1a17cc1b3011192de12b1330b60d3258e8cc58d4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 16:34:10 +0000
Subject: [PATCH 09/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 dance/data/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dance/data/base.py b/dance/data/base.py
index f6bb69f0..f004f740 100644
--- a/dance/data/base.py
+++ b/dance/data/base.py
@@ -456,7 +456,7 @@ def get_feature(self, *, split_name: Optional[str] = None, return_type: FeatType
         if split_name is not None:
             if channel_type in ["X", "raw_X", "obs", "obsm", "obsp", "layers"]:
                 idx = self.get_split_idx(split_name, error_on_miss=True)
-                idx=list(filter(lambda a:a<feature.shape[0],idx))
+                idx = list(filter(lambda a: a < feature.shape[0], idx))
                 feature = feature[idx][:, idx] if channel_type == "obsp" else feature[idx]
             else:
                 logger.warning(f"Indexing option for {channel_type!r} not implemented yet.")

From 9941fb52a0f1062aedcc940daceadfcc908c05c4 Mon Sep 17 00:00:00 2001
From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com>
Date: Wed, 20 Sep 2023 00:35:09 +0800
Subject: [PATCH 10/11] Update scmogcn_graph.py: cap feature_size at the graph's feature node count

---
 dance/transforms/graph/scmogcn_graph.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dance/transforms/graph/scmogcn_graph.py b/dance/transforms/graph/scmogcn_graph.py
index d17ec986..29d284e5 100644
--- a/dance/transforms/graph/scmogcn_graph.py
+++ b/dance/transforms/graph/scmogcn_graph.py
@@ -189,6 +189,7 @@ def construct_enhanced_feature_graph(u, v, e, train_size, feature_size, cell_nod
             graph.nodes['cell'].data['id'] = cell_node_features[:train_size] if not _test_graph else cell_node_features
         else:
             graph.nodes['cell'].data['id'] = cell_node_features
+        feature_size=min(graph.num_nodes('feature'),feature_size)
         graph.nodes['feature'].data['id'] = torch.arange(feature_size).long()
         graph.edges['feature2cell'].data['weight'] = e
         graph.edges['cell2feature'].data['weight'] = e[:graph.edges(etype='cell2feature')[0].shape[0]]

From e839fe883a2c5e66b4d1f87697d344b72dcdfe17 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 16:35:26 +0000
Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 dance/transforms/graph/scmogcn_graph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dance/transforms/graph/scmogcn_graph.py b/dance/transforms/graph/scmogcn_graph.py
index 29d284e5..428c0469 100644
--- a/dance/transforms/graph/scmogcn_graph.py
+++ b/dance/transforms/graph/scmogcn_graph.py
@@ -189,7 +189,7 @@ def construct_enhanced_feature_graph(u, v, e, train_size, feature_size, cell_nod
             graph.nodes['cell'].data['id'] = cell_node_features[:train_size] if not _test_graph else cell_node_features
         else:
             graph.nodes['cell'].data['id'] = cell_node_features
-        feature_size=min(graph.num_nodes('feature'),feature_size)
+        feature_size = min(graph.num_nodes('feature'), feature_size)
         graph.nodes['feature'].data['id'] = torch.arange(feature_size).long()
         graph.edges['feature2cell'].data['weight'] = e
         graph.edges['cell2feature'].data['weight'] = e[:graph.edges(etype='cell2feature')[0].shape[0]]