Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessing benchmarking #337

Merged
merged 11 commits into from
Sep 21, 2023
105 changes: 56 additions & 49 deletions dance/datasets/singlemodality.py
Copy link
Collaborator

@RemyLau RemyLau Sep 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xingzhongyu could you help make some preliminary efforts to migrate the current download functions for the imputation dataset to use the downloading utilities in dance/utils/download.py like in the clustering dataset class? Imputation dataset is currently the only dataset that uses wget and curl (for historical reasons that are no longer relevant now).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will finish the task this week.

Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from dance.config import METADIR
from dance.data import Data
from dance.datasets.base import BaseDataset
from dance.metadata.imputation import IMPUTATION_DATASET_TO_FILE
from dance.registers import register_dataset
from dance.typing import Dict, List, Optional, Set, Tuple
from dance.utils.download import download_file, download_unzip
Expand All @@ -40,13 +41,12 @@ def _load_scdeepsort_metadata():

@register_dataset("scdeepsort")
class ScDeepSortDataset(BaseDataset):

_DISPLAY_ATTRS = ("species", "tissue", "train_dataset", "test_dataset")
ALL_URL_DICT: Dict[str, str] = {
"train_human_cell_atlas": "https://www.dropbox.com/s/1itq1pokplbqxhx?dl=1",
"test_human_test_data": "https://www.dropbox.com/s/gpxjnnvwyblv3xb?dl=1",
"train_mouse_cell_atlas": "https://www.dropbox.com/s/ng8d3eujfah9ppl?dl=1",
"test_mouse_test_data": "https://www.dropbox.com/s/pkr28czk5g3al2p?dl=1",
"train_human_cell_atlas": "https://www.dropbox.com/s/1itq1pokplbqxhx?dl=1",
"test_human_test_data": "https://www.dropbox.com/s/gpxjnnvwyblv3xb?dl=1",
"train_mouse_cell_atlas": "https://www.dropbox.com/s/ng8d3eujfah9ppl?dl=1",
"test_mouse_test_data": "https://www.dropbox.com/s/pkr28czk5g3al2p?dl=1",
} # yapf: disable
BENCH_URL_DICT, AVAILABLE_DATA = _load_scdeepsort_metadata()

Expand Down Expand Up @@ -82,17 +82,25 @@ def download_all(self):
pass
os.rename(download_path, move_path)

def get_all_filenames(self, filetype: str = "csv", feat_suffix: str = "data", label_suffix: str = "celltype"):
    """Return the expected data file names for the configured datasets.

    For each dataset id in ``self.train_dataset``, two file names are
    generated: one for the feature matrix (``feat_suffix``) and one for the
    cell-type labels (``label_suffix``), e.g. ``mouse_Brain695_data.csv``
    and ``mouse_Brain695_celltype.csv``.

    NOTE(review): only ``self.train_dataset`` ids are included here, while
    the caller (``download``) uses this list to filter both train and test
    URLs — confirm whether ``self.test_dataset`` ids should be added too.
    """
    filenames = []
    # Use ``dataset_id`` rather than ``id`` to avoid shadowing the builtin.
    for dataset_id in self.train_dataset:
        filenames.append(f"{self.species}_{self.tissue}{dataset_id}_{feat_suffix}.{filetype}")
        filenames.append(f"{self.species}_{self.tissue}{dataset_id}_{label_suffix}.{filetype}")
    return filenames

def download(self, download_map=True):
if self.is_complete():
return

# TODO: only download missing files
filenames = self.get_all_filenames()
# Download training and testing data
for name, url in self.BENCH_URL_DICT.items():
parts = name.split("_") # [train|test]_{species}_{tissue}{id}_[celltype|data].csv
filename = "_".join(parts[1:])
filepath = osp.join(self.data_dir, *parts[:2], filename)
download_file(url, filepath)
if filename in filenames:
filepath = osp.join(self.data_dir, *parts[:2], filename)
download_file(url, filepath)

if download_map:
# Download mapping data
Expand All @@ -115,7 +123,7 @@ def is_complete_all(self):
def is_complete(self):
"""Check if benchmarking data is complete."""
for name in self.BENCH_URL_DICT:
filename = name[name.find('mouse'):]
filename = name[name.find(self.species):]
file_i = osp.join(self.data_dir, *name.split("_")[:2], filename)
if not osp.exists(file_i):
logger.info(file_i)
Expand Down Expand Up @@ -276,27 +284,8 @@ def _raw_to_dance(self, raw_data: Tuple[ad.AnnData, np.ndarray]):

@register_dataset("imputation")
class ImputationDataset(BaseDataset):

URL = {
"pbmc_data": "https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0",
"mouse_embryo_data": "https://www.dropbox.com/s/8ftx1bydoy7kn6p/GSE65525.zip?dl=0",
"mouse_brain_data": "https://www.dropbox.com/s/zzpotaayy2i29hk/neuron_10k.zip?dl=0",
"human_stemcell_data": "https://www.dropbox.com/s/g2qua2j3rqcngn6/GSE75748.zip?dl=0"
}
DATASET_TO_FILE = {
"pbmc_data": "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5",
"mouse_embryo_data": [
osp.join("GSE65525", i)
for i in [
"GSM1599494_ES_d0_main.csv",
"GSM1599497_ES_d2_LIFminus.csv",
"GSM1599498_ES_d4_LIFminus.csv",
"GSM1599499_ES_d7_LIFminus.csv",
]
],
"mouse_brain_data": "neuron_10k_v3_filtered_feature_bc_matrix.h5",
"human_stemcell_data": "GSE75748/GSE75748_sc_time_course_ec.csv.gz"
} # yapf: disable
URL = load_data_url_dict_from_csv(METADIR / "imputation.csv")
DATASET_TO_FILE =IMPUTATION_DATASET_TO_FILE # yapf: disable
AVAILABLE_DATA = sorted(URL)

def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1):
Expand All @@ -307,20 +296,31 @@ def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1):

def download(self):
    """Download and unpack the raw archive for ``self.dataset``.

    Uses shell commands (``wget``/``unzip`` on POSIX, ``curl``/``tar`` on
    Windows) for historical reasons; migrating to the utilities in
    ``dance/utils/download.py`` is planned. Downloaded payloads are placed
    under ``{data_dir}/train/{dataset}/`` and then mirrored to
    ``{data_dir}/test``.
    """
    gene_class = [
        "pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data", "human_breast_TGFb_data",
        "human_breast_Dox_data", "human_melanoma_data", "mouse_visual_data"
    ]

    # Archive name of each dataset as saved from Dropbox (the "?dl=0" query
    # string becomes part of the saved file name).
    file_name = {
        "pbmc_data": "5k.zip?dl=0",
        "mouse_embryo_data": "GSE65525.zip?dl=0",
        "mouse_brain_data": "neuron_10k.zip?dl=0",
        "human_stemcell_data": "GSE75748.zip?dl=0",
        "human_breast_TGFb_data": "GSE114397.zip?dl=0",
        "human_breast_Dox_data": "GSM3141014.zip?dl=0",
        "human_melanoma_data": "human_melanoma_data.zip?dl=0",
        "mouse_visual_data": "mouse_visual_data.zip?dl=0"
    }

    # Glob pattern matching each archive's unpacked payload; used both to
    # detect an existing download and to move the payload into place.
    dl_files = {
        "pbmc_data": "5k_*",
        "mouse_embryo_data": "GSE65525",
        "mouse_brain_data": "neuron*",
        "human_stemcell_data": "GSE75748",
        "human_breast_TGFb_data": "GSE11*",
        "human_breast_Dox_data": "GSM31*",
        "human_melanoma_data": "human*",
        "mouse_visual_data": "GSM27*"
    }

    if sys.platform != 'win32':
        if not osp.exists(self.data_dir):
            os.system("mkdir " + self.data_dir)
        if not osp.exists(self.data_dir + "/train"):
            os.system("mkdir " + self.data_dir + "/train")

        for class_name in gene_class:
            # BUG FIX: the previous guard compared ``self.dataset`` against
            # the *list* ``gene_class`` (always False), so no dataset was
            # ever downloaded. Compare against the current entry instead.
            # NOTE(review): this requires ``self.dataset`` to carry the
            # "_data" suffix (e.g. "human_stemcell_data") — confirm against
            # how callers construct the dataset name.
            if self.dataset == class_name:
                if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name,
                                                         dl_files[class_name])))):
                    os.system("mkdir " + self.data_dir + "/train/" + class_name)
                    os.system("wget " + self.URL[class_name])  # assumes linux... mac needs to install
                    os.system("unzip " + file_name[class_name])
                    os.system("rm " + file_name[class_name])
                    os.system("mv " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/")
        os.system("cp -r " + self.data_dir + "/train/ " + self.data_dir + "/test")
    if sys.platform == 'win32':
        if not osp.exists(self.data_dir):
            os.system("mkdir " + self.data_dir)
        if not osp.exists(self.data_dir + "/train"):
            os.mkdir(self.data_dir + "/train")
        for class_name in gene_class:
            # Same fix as the POSIX branch: compare to the current entry,
            # not the whole list.
            if self.dataset == class_name:
                if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name,
                                                         dl_files[class_name])))):
                    os.mkdir(self.data_dir + "/train/" + class_name)
                    os.system("curl " + self.URL[class_name])
                    os.system("tar -xf " + file_name[class_name])
                    os.system("del -R " + file_name[class_name])
                    os.system("move " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/")
        os.system("copy /r " + self.data_dir + "/train/ " + self.data_dir + "/test")

def is_complete(self):
Expand All @@ -370,7 +374,7 @@ def _load_raw_data(self) -> ad.AnnData:
else:
dataset = self.dataset

if self.dataset == 'mouse_embryo' or self.dataset == 'mouse_embryo_data':
if self.dataset == 'mouse_embryo' or self.dataset == 'mouse_embryo_data' or self.dataset == "mouse_visual_data":
for i in range(len(self.DATASET_TO_FILE[dataset])):
fname = self.DATASET_TO_FILE[dataset][i]
data_path = f'{self.data_dir}/train/{dataset}/{fname}'
Expand All @@ -394,12 +398,15 @@ def _load_raw_data(self) -> ad.AnnData:
raise FileNotFoundError(f"{data_path} does not exist")

if self.DATASET_TO_FILE[dataset][-3:] == 'csv':
counts = pd.read_csv(data_path, index_col=0, header=None)
counts = pd.read_csv(data_path, header=None, index_col=0)
nums = pd.Series(np.arange(counts.shape[1]))
nums = pd.DataFrame(nums)
nums.columns = ['nums']
counts = counts.T
counts.index = [i for i in range(counts.shape[0])]
adata = ad.AnnData(csr_matrix(counts.values))
# adata.obs_names = ["%d"%i for i in range(adata.shape[0])]
adata.obs_names = counts.index.tolist()
adata.var_names = counts.columns.tolist()
adata.obs['nums'] = nums.to_numpy()
if self.DATASET_TO_FILE[dataset][-2:] == 'gz':
counts = pd.read_csv(data_path, index_col=0, compression='gzip', header=0)
counts = counts.T
Expand Down
9 changes: 9 additions & 0 deletions dance/metadata/imputation.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
pbmc_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0
mouse_embryo_data,https://www.dropbox.com/s/8ftx1bydoy7kn6p/GSE65525.zip?dl=0
mouse_brain_data,https://www.dropbox.com/s/zzpotaayy2i29hk/neuron_10k.zip?dl=0
human_stemcell_data,https://www.dropbox.com/s/g2qua2j3rqcngn6/GSE75748.zip?dl=0
pbmc_raw_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0
human_breast_TGFb_data,https://dl.dropboxusercontent.com/scl/fi/qympicswl7slkksbjk7cp/GSE114397.zip?dl=0
human_breast_Dox_data,https://dl.dropboxusercontent.com/scl/fi/f2ifl5druqjr2rji8h4qq/GSM3141014.zip?dl=0
human_melanoma_data,https://dl.dropboxusercontent.com/scl/fi/ci9ihqytb4sys3u4xkdbq/human_melanoma_data.zip?dl=0
mouse_visual_data,https://dl.dropboxusercontent.com/scl/fi/yic1iwhh7a3gp6njyk0nf/mouse_visual_data.zip?dl=0
54 changes: 54 additions & 0 deletions dance/metadata/imputation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Mapping from imputation dataset names to their on-disk data files.

Keys match the dataset names listed in ``dance/metadata/imputation.csv``.
Values are either a single file path (``str``) or a list of file paths,
relative to the dataset's extracted download directory; a list value means
the dataset is split across multiple per-sample CSV files.
"""
import os.path as osp

IMPUTATION_DATASET_TO_FILE = {
    "pbmc_data":
    "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5",
    # Multi-file dataset: one CSV per embryonic-stem-cell time point.
    "mouse_embryo_data": [
        osp.join("GSE65525", i) for i in [
            "GSM1599494_ES_d0_main.csv",
            "GSM1599497_ES_d2_LIFminus.csv",
            "GSM1599498_ES_d4_LIFminus.csv",
            "GSM1599499_ES_d7_LIFminus.csv",
        ]
    ],
    "mouse_brain_data":
    "neuron_10k_v3_filtered_feature_bc_matrix.h5",
    "human_stemcell_data":
    "GSE75748/GSE75748_sc_time_course_ec.csv.gz",
    "human_breast_TGFb_data":
    "GSE114397_HMLE_TGFb.csv",
    "human_breast_Dox_data":
    "GSM3141014_Zeb1_Dox.csv",
    "human_melanoma_data":
    "human_melanoma_data.csv",
    # NOTE(review): most mouse_visual_data samples are commented out below,
    # presumably to keep the benchmark small/fast — confirm before
    # re-enabling any of them.
    "mouse_visual_data": [
        'GSM2746905_B4_11_0h_counts.csv',
        # 'GSM2746906_B4_12_0h_counts.csv',
        # 'GSM2746922_B7_23_4h_B_counts.csv',
        # 'GSM2746895_B1_1_0h_counts.csv',
        # 'GSM2746916_B6_20_4h_A_counts.csv',
        # 'GSM2746903_B3_9_4h_counts.csv',
        # 'GSM2746914_B6_19_4h_A_counts.csv',
        # 'GSM2746908_B5_14_0h_counts.csv',
        # 'GSM2746907_B5_13_0h_counts.csv',
        # 'GSM2746917_B6_20_4h_B_counts.csv',
        # 'GSM2746918_B7_21_1h_counts.csv',
        # 'GSM2746898_B2_4_1h_counts.csv',
        # 'GSM2746909_B5_15_0h_counts.csv',
        # 'GSM2746915_B6_19_4h_B_counts.csv',
        # 'GSM2746897_B1_3_4h_counts.csv',
        # 'GSM2746902_B3_8_1h_counts.csv',
        # 'GSM2746911_B6_17_1h_A_counts.csv',
        # 'GSM2746904_B3_10_4h_counts.csv',
        # 'GSM2746900_B3_6_0h_counts.csv',
        # 'GSM2746920_B7_22_4h_B_counts.csv',
        # 'GSM2746896_B1_2_1h_counts.csv',
        # 'GSM2746921_B7_23_4h_A_counts.csv',
        # 'GSM2746899_B3_5_0h_counts.csv',
        # 'GSM2746919_B7_22_4h_A_counts.csv',
        # 'GSM2746901_B3_7_1h_counts.csv',
        # 'GSM2746910_B5_16_0h_counts.csv',
        # 'GSM2746912_B6_17_1h_B_counts.csv',
        'GSM2746913_B6_18_1h_counts.csv'
    ]
}
2 changes: 1 addition & 1 deletion dance/transforms/gene_holdout.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @xingzhongyu, thanks for the PR! Could you provide a bit more context about why this edit (targs[1]) is necessary? From my understanding, targs is the list of gene indices to hold out, and targs[1] would be the second gene index to hold out. So I'm a bit confused about the reason for this change.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the loop previously used enumerate, each iteration yielded an (index, targets) tuple instead of the targets themselves, so the subsequent code would have had to unpack the tuple first — otherwise it raises an error. Iterating over targets directly avoids this.

Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __call__(self, data):
# Use covariance to select predictors
covariance_matrix = np.cov(feat, rowvar=False)
predictors = []
for targs in enumerate(targets):
for targs in targets:
genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs)
subMatrix = covariance_matrix[targs][:, genes_not_in_target]
sorted_idx = np.argsort(-subMatrix, axis=0)
Expand Down