From 702da0c53d8cf82e973756a346a30c052b344e0d Mon Sep 17 00:00:00 2001 From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com> Date: Sat, 9 Sep 2023 09:28:09 +0800 Subject: [PATCH 01/11] add imputation metadata --- dance/metadata/imputation.csv | 9 +++++++ dance/metadata/imputation.py | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 dance/metadata/imputation.csv create mode 100644 dance/metadata/imputation.py diff --git a/dance/metadata/imputation.csv b/dance/metadata/imputation.csv new file mode 100644 index 00000000..ace36502 --- /dev/null +++ b/dance/metadata/imputation.csv @@ -0,0 +1,9 @@ +pbmc_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0 +mouse_embryo_data,https://www.dropbox.com/s/8ftx1bydoy7kn6p/GSE65525.zip?dl=0 +mouse_brain_data,https://www.dropbox.com/s/zzpotaayy2i29hk/neuron_10k.zip?dl=0 +human_stemcell_data,https://www.dropbox.com/s/g2qua2j3rqcngn6/GSE75748.zip?dl=0 +pbmc_raw_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0 +human_breast_TGFb_data,https://dl.dropboxusercontent.com/scl/fi/qympicswl7slkksbjk7cp/GSE114397.zip?dl=0 +human_breast_Dox_data,https://dl.dropboxusercontent.com/scl/fi/f2ifl5druqjr2rji8h4qq/GSM3141014.zip?dl=0 +human_melanoma_data,https://dl.dropboxusercontent.com/scl/fi/ci9ihqytb4sys3u4xkdbq/human_melanoma_data.zip?dl=0 +mouse_visual_data,https://dl.dropboxusercontent.com/scl/fi/yic1iwhh7a3gp6njyk0nf/mouse_visual_data.zip?dl=0 \ No newline at end of file diff --git a/dance/metadata/imputation.py b/dance/metadata/imputation.py new file mode 100644 index 00000000..126a8e90 --- /dev/null +++ b/dance/metadata/imputation.py @@ -0,0 +1,47 @@ +import os.path as osp +IMPUTATION_DATASET_TO_FILE = { + "pbmc_data": "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5", + "mouse_embryo_data": [ + osp.join("GSE65525", i) + for i in [ + "GSM1599494_ES_d0_main.csv", + "GSM1599497_ES_d2_LIFminus.csv", + "GSM1599498_ES_d4_LIFminus.csv", + "GSM1599499_ES_d7_LIFminus.csv", + ] + ], + "mouse_brain_data": "neuron_10k_v3_filtered_feature_bc_matrix.h5", + "human_stemcell_data": "GSE75748/GSE75748_sc_time_course_ec.csv.gz", + "human_breast_TGFb_data": "GSE114397_HMLE_TGFb.csv", + "human_breast_Dox_data": "GSM3141014_Zeb1_Dox.csv", + "human_melanoma_data": "human_melanoma_data.csv", + "mouse_visual_data": ['GSM2746905_B4_11_0h_counts.csv', + # 'GSM2746906_B4_12_0h_counts.csv', + # 'GSM2746922_B7_23_4h_B_counts.csv', + # 'GSM2746895_B1_1_0h_counts.csv', + # 'GSM2746916_B6_20_4h_A_counts.csv', + # 'GSM2746903_B3_9_4h_counts.csv', + # 'GSM2746914_B6_19_4h_A_counts.csv', + # 'GSM2746908_B5_14_0h_counts.csv', + # 'GSM2746907_B5_13_0h_counts.csv', + # 'GSM2746917_B6_20_4h_B_counts.csv', + # 'GSM2746918_B7_21_1h_counts.csv', + # 'GSM2746898_B2_4_1h_counts.csv', + # 'GSM2746909_B5_15_0h_counts.csv', + # 'GSM2746915_B6_19_4h_B_counts.csv', + # 'GSM2746897_B1_3_4h_counts.csv', + # 'GSM2746902_B3_8_1h_counts.csv', + # 'GSM2746911_B6_17_1h_A_counts.csv', + # 'GSM2746904_B3_10_4h_counts.csv', + # 'GSM2746900_B3_6_0h_counts.csv', + # 'GSM2746920_B7_22_4h_B_counts.csv', + # 'GSM2746896_B1_2_1h_counts.csv', + # 'GSM2746921_B7_23_4h_A_counts.csv', + # 'GSM2746899_B3_5_0h_counts.csv', + # 'GSM2746919_B7_22_4h_A_counts.csv', + # 'GSM2746901_B3_7_1h_counts.csv', + # 'GSM2746910_B5_16_0h_counts.csv', + # 'GSM2746912_B6_17_1h_B_counts.csv', + 'GSM2746913_B6_18_1h_counts.csv' + ] + } \ No newline at end of file From 24b8af00005827e9fa01edab1972d0689c40e3f1 Mon Sep 17 00:00:00 2001 From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com> Date: Sat, 9 Sep 2023 09:30:42 +0800 Subject: [PATCH 02/11] add 3 imputation datasets --- dance/datasets/singlemodality.py | 103 ++++++++++++++++--------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index bf778dec..7c46e160 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -22,7 +22,7 @@ from dance.utils.download import download_file, download_unzip from dance.utils.io import load_data_url_dict_from_csv from dance.utils.preprocess import cell_label_to_df - +from dance.metadata.imputation import IMPUTATION_DATASET_TO_FILE def _load_scdeepsort_metadata(): path = METADIR / "scdeepsort.csv" @@ -40,13 +40,12 @@ def _load_scdeepsort_metadata(): @register_dataset("scdeepsort") class ScDeepSortDataset(BaseDataset): - _DISPLAY_ATTRS = ("species", "tissue", "train_dataset", "test_dataset") ALL_URL_DICT: Dict[str, str] = { - "train_human_cell_atlas": "https://www.dropbox.com/s/1itq1pokplbqxhx?dl=1", - "test_human_test_data": "https://www.dropbox.com/s/gpxjnnvwyblv3xb?dl=1", - "train_mouse_cell_atlas": "https://www.dropbox.com/s/ng8d3eujfah9ppl?dl=1", - "test_mouse_test_data": "https://www.dropbox.com/s/pkr28czk5g3al2p?dl=1", + "train_human_cell_atlas": "https://www.dropbox.com/s/1itq1pokplbqxhx?dl=1", + "test_human_test_data": "https://www.dropbox.com/s/gpxjnnvwyblv3xb?dl=1", + "train_mouse_cell_atlas": "https://www.dropbox.com/s/ng8d3eujfah9ppl?dl=1", + "test_mouse_test_data": "https://www.dropbox.com/s/pkr28czk5g3al2p?dl=1", } # yapf: disable BENCH_URL_DICT, AVAILABLE_DATA = _load_scdeepsort_metadata() @@ -82,17 +81,26 @@ def download_all(self): pass os.rename(download_path, move_path) + def get_all_filenames(self, filetype: str = "csv", + feat_suffix: str = "data", label_suffix: str = "celltype"): + filenames = [] + for id in self.train_dataset: + filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{filetype}") + filenames.append(f"{self.species}_{self.tissue}{id}_{label_suffix}.{filetype}") + return filenames + def download(self, download_map=True): if self.is_complete(): return - # TODO: only download missing files + filenames=self.get_all_filenames() # Download training and testing data for name, url in self.BENCH_URL_DICT.items(): parts = name.split("_") # [train|test]_{species}_{tissue}{id}_[celltype|data].csv filename = "_".join(parts[1:]) - filepath = osp.join(self.data_dir, *parts[:2], filename) - download_file(url, filepath) + if filename in filenames: + filepath = osp.join(self.data_dir, *parts[:2], filename) + download_file(url, filepath) if download_map: # Download mapping data @@ -115,7 +123,7 @@ def is_complete_all(self): def is_complete(self): """Check if benchmarking data is complete.""" for name in self.BENCH_URL_DICT: - filename = name[name.find('mouse'):] + filename = name[name.find(self.species):] file_i = osp.join(self.data_dir, *name.split("_")[:2], filename) if not osp.exists(file_i): logger.info(file_i) @@ -276,27 +284,8 @@ def _raw_to_dance(self, raw_data: Tuple[ad.AnnData, np.ndarray]): @register_dataset("imputation") class ImputationDataset(BaseDataset): - - URL = { - "pbmc_data": "https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0", - "mouse_embryo_data": "https://www.dropbox.com/s/8ftx1bydoy7kn6p/GSE65525.zip?dl=0", - "mouse_brain_data": "https://www.dropbox.com/s/zzpotaayy2i29hk/neuron_10k.zip?dl=0", - "human_stemcell_data": "https://www.dropbox.com/s/g2qua2j3rqcngn6/GSE75748.zip?dl=0" - } - DATASET_TO_FILE = { - "pbmc_data": "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5", - "mouse_embryo_data": [ - osp.join("GSE65525", i) - for i in [ - "GSM1599494_ES_d0_main.csv", - "GSM1599497_ES_d2_LIFminus.csv", - "GSM1599498_ES_d4_LIFminus.csv", - "GSM1599499_ES_d7_LIFminus.csv", - ] - ], - "mouse_brain_data": "neuron_10k_v3_filtered_feature_bc_matrix.h5", - "human_stemcell_data": "GSE75748/GSE75748_sc_time_course_ec.csv.gz" - } # yapf: disable + URL = load_data_url_dict_from_csv(METADIR / "imputation.csv") + DATASET_TO_FILE =IMPUTATION_DATASET_TO_FILE # yapf: disable AVAILABLE_DATA = sorted(URL) def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1): @@ -307,20 +296,29 @@ def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1): def download(self): - gene_class = ["pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data"] + gene_class = ["pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data", + "human_breast_TGFb_data", "human_breast_Dox_data", "human_melanoma_data", "mouse_visual_data"] file_name = { "pbmc_data": "5k.zip?dl=0", "mouse_embryo_data": "GSE65525.zip?dl=0", "mouse_brain_data": "neuron_10k.zip?dl=0", - "human_stemcell_data": "GSE75748.zip?dl=0" + "human_stemcell_data": "GSE75748.zip?dl=0", + "human_breast_TGFb_data": "GSE114397.zip?dl=0", + "human_breast_Dox_data": "GSM3141014.zip?dl=0", + "human_melanoma_data": "human_melanoma_data.zip?dl=0", + "mouse_visual_data": "mouse_visual_data.zip?dl=0" } dl_files = { "pbmc_data": "5k_*", "mouse_embryo_data": "GSE65525", "mouse_brain_data": "neuron*", - "human_stemcell_data": "GSE75748" + "human_stemcell_data": "GSE75748", + "human_breast_TGFb_data": "GSE11*", + "human_breast_Dox_data": "GSM31*", + "human_melanoma_data": "human*", + "mouse_visual_data": "GSM27*" } if sys.platform != 'win32': @@ -330,12 +328,13 @@ def download(self): os.system("mkdir " + self.data_dir + "/train") for class_name in gene_class: - if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))): - os.system("mkdir " + self.data_dir + "/train/" + class_name) - os.system("wget " + self.URL[class_name]) # assumes linux... mac needs to install - os.system("unzip " + file_name[class_name]) - os.system("rm " + file_name[class_name]) - os.system("mv " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/") + if self.dataset==gene_class: + if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))): + os.system("mkdir " + self.data_dir + "/train/" + class_name) + os.system("wget " + self.URL[class_name]) # assumes linux... mac needs to install + os.system("unzip " + file_name[class_name]) + os.system("rm " + file_name[class_name]) + os.system("mv " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/") os.system("cp -r " + self.data_dir + "/train/ " + self.data_dir + "/test") if sys.platform == 'win32': if not osp.exists(self.data_dir): @@ -343,12 +342,13 @@ def download(self): if not osp.exists(self.data_dir + "/train"): os.mkdir(self.data_dir + "/train") for class_name in gene_class: - if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))): - os.mkdir(self.data_dir + "/train/" + class_name) - os.system("curl " + self.URL[class_name]) - os.system("tar -xf " + file_name[class_name]) - os.system("del -R " + file_name[class_name]) - os.system("move " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/") + if self.dataset==gene_class: + if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))): + os.mkdir(self.data_dir + "/train/" + class_name) + os.system("curl " + self.URL[class_name]) + os.system("tar -xf " + file_name[class_name]) + os.system("del -R " + file_name[class_name]) + os.system("move " + dl_files[class_name] + " " + self.data_dir + "/train/" + class_name + "/") os.system("copy /r " + self.data_dir + "/train/ " + self.data_dir + "/test") def is_complete(self): @@ -370,7 +370,7 @@ def _load_raw_data(self) -> ad.AnnData: else: dataset = self.dataset - if self.dataset == 'mouse_embryo' or self.dataset == 'mouse_embryo_data': + if self.dataset == 'mouse_embryo' or self.dataset == 'mouse_embryo_data' or self.dataset == "mouse_visual_data": for i in range(len(self.DATASET_TO_FILE[dataset])): fname = self.DATASET_TO_FILE[dataset][i] data_path = f'{self.data_dir}/train/{dataset}/{fname}' @@ -394,12 +394,15 @@ def _load_raw_data(self) -> ad.AnnData: raise FileNotFoundError(f"{data_path} does not exist") if self.DATASET_TO_FILE[dataset][-3:] == 'csv': - counts = pd.read_csv(data_path, index_col=0, header=None) + counts = pd.read_csv(data_path, header=None, index_col=0) + nums = pd.Series(np.arange(counts.shape[1])) + nums = pd.DataFrame(nums) + nums.columns = ['nums'] counts = counts.T + counts.index = [i for i in range(counts.shape[0])] adata = ad.AnnData(csr_matrix(counts.values)) - # adata.obs_names = ["%d"%i for i in range(adata.shape[0])] - adata.obs_names = counts.index.tolist() adata.var_names = counts.columns.tolist() + adata.obs['nums'] = nums.to_numpy() if self.DATASET_TO_FILE[dataset][-2:] == 'gz': counts = pd.read_csv(data_path, index_col=0, compression='gzip', header=0) counts = counts.T From 3bf9575468a05e18030b03e37081c7ba9d865845 Mon Sep 17 00:00:00 2001 From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com> Date: Sat, 9 Sep 2023 09:32:13 +0800 Subject: [PATCH 03/11] Update gene_holdout.py --- dance/transforms/gene_holdout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dance/transforms/gene_holdout.py b/dance/transforms/gene_holdout.py index a0749353..1b1b0982 100644 --- a/dance/transforms/gene_holdout.py +++ b/dance/transforms/gene_holdout.py @@ -40,8 +40,8 @@ def __call__(self, data): covariance_matrix = np.cov(feat, rowvar=False) predictors = [] for targs in enumerate(targets): - genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs) - subMatrix = covariance_matrix[targs][:, genes_not_in_target] + genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs[1]) + subMatrix = covariance_matrix[targs[1]][:, genes_not_in_target] sorted_idx = np.argsort(-subMatrix, axis=0) preds = genes_not_in_target[sorted_idx[:self.n_top].flatten()] predictors.append(np.unique(preds)) From 0e05872b901c5b3ca97dc7b225d9e74332e6ed45 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 9 Sep 2023 01:38:24 +0000 Subject: [PATCH 04/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/datasets/singlemodality.py | 24 ++++---- dance/metadata/imputation.csv | 2 +- dance/metadata/imputation.py | 97 +++++++++++++++++--------------- 3 files changed, 67 insertions(+), 56 deletions(-) diff --git a/dance/datasets/singlemodality.py b/dance/datasets/singlemodality.py index 7c46e160..2870bcda 100644 --- a/dance/datasets/singlemodality.py +++ b/dance/datasets/singlemodality.py @@ -17,12 +17,13 @@ from dance.config import METADIR from dance.data import Data from dance.datasets.base import BaseDataset +from dance.metadata.imputation import IMPUTATION_DATASET_TO_FILE from dance.registers import register_dataset from dance.typing import Dict, List, Optional, Set, Tuple from dance.utils.download import download_file, download_unzip from dance.utils.io import load_data_url_dict_from_csv from dance.utils.preprocess import cell_label_to_df -from dance.metadata.imputation import IMPUTATION_DATASET_TO_FILE + def _load_scdeepsort_metadata(): path = METADIR / "scdeepsort.csv" @@ -81,8 +82,7 @@ def download_all(self): pass os.rename(download_path, move_path) - def get_all_filenames(self, filetype: str = "csv", - feat_suffix: str = "data", label_suffix: str = "celltype"): + def get_all_filenames(self, filetype: str = "csv", feat_suffix: str = "data", label_suffix: str = "celltype"): filenames = [] for id in self.train_dataset: filenames.append(f"{self.species}_{self.tissue}{id}_{feat_suffix}.{filetype}") @@ -93,7 +93,7 @@ def download(self, download_map=True): if self.is_complete(): return - filenames=self.get_all_filenames() + filenames = self.get_all_filenames() # Download training and testing data for name, url in self.BENCH_URL_DICT.items(): parts = name.split("_") # [train|test]_{species}_{tissue}{id}_[celltype|data].csv @@ -296,8 +296,10 @@ def __init__(self, data_dir="data", dataset="human_stemcell", train_size=0.1): def download(self): - gene_class = ["pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data", - "human_breast_TGFb_data", "human_breast_Dox_data", "human_melanoma_data", "mouse_visual_data"] + gene_class = [ + "pbmc_data", "mouse_brain_data", "mouse_embryo_data", "human_stemcell_data", "human_breast_TGFb_data", + "human_breast_Dox_data", "human_melanoma_data", "mouse_visual_data" + ] file_name = { "pbmc_data": "5k.zip?dl=0", @@ -328,8 +330,9 @@ def download(self): os.system("mkdir " + self.data_dir + "/train") for class_name in gene_class: - if self.dataset==gene_class: - if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))): + if self.dataset == gene_class: + if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, + dl_files[class_name])))): os.system("mkdir " + self.data_dir + "/train/" + class_name) os.system("wget " + self.URL[class_name]) # assumes linux... mac needs to install os.system("unzip " + file_name[class_name]) @@ -342,8 +345,9 @@ def download(self): if not osp.exists(self.data_dir + "/train"): os.mkdir(self.data_dir + "/train") for class_name in gene_class: - if self.dataset==gene_class: - if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, dl_files[class_name])))): + if self.dataset == gene_class: + if not any(map(osp.exists, glob(osp.join(self.data_dir, "train", class_name, + dl_files[class_name])))): os.mkdir(self.data_dir + "/train/" + class_name) os.system("curl " + self.URL[class_name]) os.system("tar -xf " + file_name[class_name]) diff --git a/dance/metadata/imputation.csv b/dance/metadata/imputation.csv index ace36502..4d5480f1 100644 --- a/dance/metadata/imputation.csv +++ b/dance/metadata/imputation.csv @@ -6,4 +6,4 @@ pbmc_raw_data,https://www.dropbox.com/s/brj3orsjbhnhawa/5k.zip?dl=0 human_breast_TGFb_data,https://dl.dropboxusercontent.com/scl/fi/qympicswl7slkksbjk7cp/GSE114397.zip?dl=0 human_breast_Dox_data,https://dl.dropboxusercontent.com/scl/fi/f2ifl5druqjr2rji8h4qq/GSM3141014.zip?dl=0 human_melanoma_data,https://dl.dropboxusercontent.com/scl/fi/ci9ihqytb4sys3u4xkdbq/human_melanoma_data.zip?dl=0 -mouse_visual_data,https://dl.dropboxusercontent.com/scl/fi/yic1iwhh7a3gp6njyk0nf/mouse_visual_data.zip?dl=0 \ No newline at end of file +mouse_visual_data,https://dl.dropboxusercontent.com/scl/fi/yic1iwhh7a3gp6njyk0nf/mouse_visual_data.zip?dl=0 diff --git a/dance/metadata/imputation.py b/dance/metadata/imputation.py index 126a8e90..fe1b3a52 100644 --- a/dance/metadata/imputation.py +++ b/dance/metadata/imputation.py @@ -1,47 +1,54 @@ import os.path as osp + IMPUTATION_DATASET_TO_FILE = { - "pbmc_data": "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5", - "mouse_embryo_data": [ - osp.join("GSE65525", i) - for i in [ - "GSM1599494_ES_d0_main.csv", - "GSM1599497_ES_d2_LIFminus.csv", - "GSM1599498_ES_d4_LIFminus.csv", - "GSM1599499_ES_d7_LIFminus.csv", - ] - ], - "mouse_brain_data": "neuron_10k_v3_filtered_feature_bc_matrix.h5", - "human_stemcell_data": "GSE75748/GSE75748_sc_time_course_ec.csv.gz", - "human_breast_TGFb_data": "GSE114397_HMLE_TGFb.csv", - "human_breast_Dox_data": "GSM3141014_Zeb1_Dox.csv", - "human_melanoma_data": "human_melanoma_data.csv", - "mouse_visual_data": ['GSM2746905_B4_11_0h_counts.csv', - # 'GSM2746906_B4_12_0h_counts.csv', - # 'GSM2746922_B7_23_4h_B_counts.csv', - # 'GSM2746895_B1_1_0h_counts.csv', - # 'GSM2746916_B6_20_4h_A_counts.csv', - # 'GSM2746903_B3_9_4h_counts.csv', - # 'GSM2746914_B6_19_4h_A_counts.csv', - # 'GSM2746908_B5_14_0h_counts.csv', - # 'GSM2746907_B5_13_0h_counts.csv', - # 'GSM2746917_B6_20_4h_B_counts.csv', - # 'GSM2746918_B7_21_1h_counts.csv', - # 'GSM2746898_B2_4_1h_counts.csv', - # 'GSM2746909_B5_15_0h_counts.csv', - # 'GSM2746915_B6_19_4h_B_counts.csv', - # 'GSM2746897_B1_3_4h_counts.csv', - # 'GSM2746902_B3_8_1h_counts.csv', - # 'GSM2746911_B6_17_1h_A_counts.csv', - # 'GSM2746904_B3_10_4h_counts.csv', - # 'GSM2746900_B3_6_0h_counts.csv', - # 'GSM2746920_B7_22_4h_B_counts.csv', - # 'GSM2746896_B1_2_1h_counts.csv', - # 'GSM2746921_B7_23_4h_A_counts.csv', - # 'GSM2746899_B3_5_0h_counts.csv', - # 'GSM2746919_B7_22_4h_A_counts.csv', - # 'GSM2746901_B3_7_1h_counts.csv', - # 'GSM2746910_B5_16_0h_counts.csv', - # 'GSM2746912_B6_17_1h_B_counts.csv', - 'GSM2746913_B6_18_1h_counts.csv' - ] - } \ No newline at end of file + "pbmc_data": + "5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5", + "mouse_embryo_data": [ + osp.join("GSE65525", i) for i in [ + "GSM1599494_ES_d0_main.csv", + "GSM1599497_ES_d2_LIFminus.csv", + "GSM1599498_ES_d4_LIFminus.csv", + "GSM1599499_ES_d7_LIFminus.csv", + ] + ], + "mouse_brain_data": + "neuron_10k_v3_filtered_feature_bc_matrix.h5", + "human_stemcell_data": + "GSE75748/GSE75748_sc_time_course_ec.csv.gz", + "human_breast_TGFb_data": + "GSE114397_HMLE_TGFb.csv", + "human_breast_Dox_data": + "GSM3141014_Zeb1_Dox.csv", + "human_melanoma_data": + "human_melanoma_data.csv", + "mouse_visual_data": [ + 'GSM2746905_B4_11_0h_counts.csv', + # 'GSM2746906_B4_12_0h_counts.csv', + # 'GSM2746922_B7_23_4h_B_counts.csv', + # 'GSM2746895_B1_1_0h_counts.csv', + # 'GSM2746916_B6_20_4h_A_counts.csv', + # 'GSM2746903_B3_9_4h_counts.csv', + # 'GSM2746914_B6_19_4h_A_counts.csv', + # 'GSM2746908_B5_14_0h_counts.csv', + # 'GSM2746907_B5_13_0h_counts.csv', + # 'GSM2746917_B6_20_4h_B_counts.csv', + # 'GSM2746918_B7_21_1h_counts.csv', + # 'GSM2746898_B2_4_1h_counts.csv', + # 'GSM2746909_B5_15_0h_counts.csv', + # 'GSM2746915_B6_19_4h_B_counts.csv', + # 'GSM2746897_B1_3_4h_counts.csv', + # 'GSM2746902_B3_8_1h_counts.csv', + # 'GSM2746911_B6_17_1h_A_counts.csv', + # 'GSM2746904_B3_10_4h_counts.csv', + # 'GSM2746900_B3_6_0h_counts.csv', + # 'GSM2746920_B7_22_4h_B_counts.csv', + # 'GSM2746896_B1_2_1h_counts.csv', + # 'GSM2746921_B7_23_4h_A_counts.csv', + # 'GSM2746899_B3_5_0h_counts.csv', + # 'GSM2746919_B7_22_4h_A_counts.csv', + # 'GSM2746901_B3_7_1h_counts.csv', + # 'GSM2746910_B5_16_0h_counts.csv', + # 'GSM2746912_B6_17_1h_B_counts.csv', + 'GSM2746913_B6_18_1h_counts.csv' + ] +} From 7482a043dbd8f3e701d41123d40b5c759f2a4ebb Mon Sep 17 00:00:00 2001 From: RemyLau Date: Sat, 9 Sep 2023 10:35:29 -0400 Subject: [PATCH 05/11] remove enumerate --- dance/transforms/gene_holdout.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dance/transforms/gene_holdout.py b/dance/transforms/gene_holdout.py index 1b1b0982..e3d50ec2 100644 --- a/dance/transforms/gene_holdout.py +++ b/dance/transforms/gene_holdout.py @@ -39,9 +39,9 @@ def __call__(self, data): # Use covariance to select predictors covariance_matrix = np.cov(feat, rowvar=False) predictors = [] - for targs in enumerate(targets): - genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs[1]) - subMatrix = covariance_matrix[targs[1]][:, genes_not_in_target] + for targs in targets: + genes_not_in_target = np.setdiff1d(range(feat.shape[1]), targs) + subMatrix = covariance_matrix[targs][:, genes_not_in_target] sorted_idx = np.argsort(-subMatrix, axis=0) preds = genes_not_in_target[sorted_idx[:self.n_top].flatten()] predictors.append(np.unique(preds)) From 2428e9e5e5705401dd817abd31ddb880b9f791fb Mon Sep 17 00:00:00 2001 From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com> Date: Wed, 20 Sep 2023 00:32:33 +0800 Subject: [PATCH 06/11] Update multimodality.py --- dance/datasets/multimodality.py | 77 ++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index 0ff3d00b..dd5ba42a 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -62,8 +62,55 @@ def data_paths(self) -> List[str]: osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod2.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"), - osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad") ] + if self.subtask=="10k_pbmc": + paths=[ + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod2.h5ad")] + if self.subtask=="pbmc_cite": + paths=[ + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod2.h5ad")] + if self.subtask.startswith("5k_pbmc"): + paths=[ + osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod2.h5ad") + ] + if self.subtask.startswith("openproblems_2022"): + paths=[ + osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod2.h5ad") + ] + if self.subtask.startswith("GSE127064"): + paths=[ + osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod2.h5ad") + ] + if self.subtask.startswith("GSE117089"): + paths=[ + osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod2.h5ad") + ] + if self.subtask.startswith("GSE140203"): + paths=[ + osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod2.h5ad") + ] elif self.TASK == "match_modality": paths = [ osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"), @@ -100,6 +147,24 @@ class ModalityPredictionDataset(MultiModalityDataset): "https://www.dropbox.com/s/cz60vp7bwapz0kw/openproblems_bmmc_multiome_phase2_rna.zip?dl=1", "openproblems_bmmc_cite_phase2_rna_subset": "https://www.dropbox.com/s/veytldxkgzyoa8j/openproblems_bmmc_cite_phase2_rna_subset.zip?dl=1", + "5k_pbmc":"https://www.dropbox.com/scl/fi/uoyis946glh0oo7g833qj/5k_pbmc.zip?rlkey=mw9cvqq7e12iowfbr9rp7av5u&dl=1", + "5k_pbmc_subset":"https://www.dropbox.com/scl/fi/pykqc9zyt1fjypnjf4m1l/5k_pbmc_subset.zip?rlkey=brkmnqhfz5yl9axiuu0f8gmxy&dl=1", + "10k_pbmc":"https://www.dropbox.com/scl/fi/npz3n36d3w089creppph2/10k_pbmc.zip?rlkey=6yyv61omv2rw7sqqmfp6u7m1s&dl=1", + "pbmc_cite":"https://www.dropbox.com/scl/fi/8yvel9lu2f4pbemjeihzq/pbmc_cite.zip?rlkey=5f5jpjy1fcg14hwzot0hot7xd&dl=1", + "openproblems_2022_multi_atac2gex":"https://www.dropbox.com/scl/fi/4ynxepu306g3k6vqpi3aw/openproblems_2022_multi_atac2gex.zip?rlkey=2mq5vjnsh26gg5zgq9d85ikcp&dl=1", + "openproblems_2022_cite_gex2adt":"https://www.dropbox.com/scl/fi/dalt3qxwe440107ihjbpy/openproblems_2022_cite_gex2adt.zip?rlkey=ps1fvcr622vhibc1wc1umfdaw&dl=1", + "GSE127064_AdBrain_gex2atac":"https://www.dropbox.com/scl/fi/4ybsx6pgiuy6j9m0y92ly/GSE127064_AdBrain_gex2atac.zip?rlkey=6a5u7p7xr2dqsoduflzxjluja&dl=1", + "GSE127064_p0Brain_gex2atac":"https://www.dropbox.com/scl/fi/k4p3nkkqq56ev6ljyo5se/GSE127064_p0Brain_gex2atac.zip?rlkey=y7kayqmk2l72jjogzlvfxtl74&dl=1", + "GSE117089_mouse_gex2atac":"https://www.dropbox.com/scl/fi/egktuwiognr06xebeuouk/GSE117089_mouse_gex2atac.zip?rlkey=jadp3hlopc3112lmxe6nz5cd1&dl=1", + "GSE117089_A549_gex2atac":"https://www.dropbox.com/scl/fi/b7evc2n5ih5o3xxwcd7uq/GSE117089_A549_gex2atac.zip?rlkey=b5o0ykptfodim59qwnu2m89fh&dl=1", + "GSE117089_sciCAR_gex2atac":"https://www.dropbox.com/scl/fi/juibpvmtv2otvfsq1xyr7/GSE117089_sciCAR_gex2atac.zip?rlkey=qcdbfqsuhab56bc553cwm78gc&dl=1", + "GSE140203_3T3_HG19_atac2gex":"https://www.dropbox.com/scl/fi/v1vbypz87t1rz012vojkh/GSE140203_3T3_HG19_atac2gex.zip?rlkey=xmxrwso5e5ty3w53ctbm5bo9z&dl=1", + "GSE140203_3T3_MM10_atac2gex":"https://www.dropbox.com/scl/fi/po9k064twny51subze6df/GSE140203_3T3_MM10_atac2gex.zip?rlkey=q0b4y58bsvacnjrmvsclk4jqu&dl=1", + "GSE140203_12878.rep2_atac2gex":"https://www.dropbox.com/scl/fi/jqijimb7h6cv4w4hkax1q/GSE140203_12878.rep2_atac2gex.zip?rlkey=c837xkoacap4wjszffpfrmuak&dl=1", + "GSE140203_12878.rep3_atac2gex":"https://www.dropbox.com/scl/fi/wlv9dhvylz78kq8ezncmd/GSE140203_12878.rep3_atac2gex.zip?rlkey=5r607plnqzlxdgxtc4le8d6o1&dl=1", + "GSE140203_K562_HG19_atac2gex":"https://www.dropbox.com/scl/fi/n2he1br3u604p3mgniowz/GSE140203_K562_HG19_atac2gex.zip?rlkey=2lhe7s5run8ly5uk4b0vfemyj&dl=1", + "GSE140203_K562_MM10_atac2gex":"https://www.dropbox.com/scl/fi/dhdorqy87915uah3xl07a/GSE140203_K562_MM10_atac2gex.zip?rlkey=ecwsy5sp7f2i2gtjo1qyaf4zt&dl=1", + "GSE140203_LUNG_atac2gex":"https://www.dropbox.com/scl/fi/gabugiw244ky85j3ckq4d/GSE140203_LUNG_atac2gex.zip?rlkey=uj0we276s6ay2acpioj4tmfj3&dl=1" } SUBTASK_NAME_MAP = { "adt2gex": "openproblems_bmmc_cite_phase2_mod2", @@ -110,9 +175,10 @@ class ModalityPredictionDataset(MultiModalityDataset): } AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP)) - def __init__(self, subtask, root="./data", preprocess=None): + def __init__(self, subtask, root="./data", preprocess=None,span=0.3): # TODO: factor our preprocess self.preprocess = preprocess + self.span=span super().__init__(subtask, root) def _raw_to_dance(self, raw_data): @@ -135,7 +201,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000): if self.preprocess == "feature_selection": if raw_data[0].shape[1] > selection_threshold: sc.pp.highly_variable_genes(raw_data[0], layer="counts", flavor="seurat_v3", - n_top_genes=selection_threshold) + n_top_genes=selection_threshold,span=self.span) raw_data[2].var["highly_variable"] = raw_data[0].var["highly_variable"] for i in [0, 2]: raw_data[i] = raw_data[i][:, raw_data[i].var["highly_variable"]] @@ -169,10 +235,11 @@ class ModalityMatchingDataset(MultiModalityDataset): } AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP)) - def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None): + def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None,span=0.3): # TODO: factor our preprocess self.preprocess = preprocess self.pkl_path = pkl_path + self.span=span super().__init__(subtask, root) def _raw_to_dance(self, raw_data): @@ -252,7 +319,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000): for i in range(2): if modalities[i].shape[1] > selection_threshold: sc.pp.highly_variable_genes(modalities[i], layer="counts", flavor="seurat_v3", - n_top_genes=selection_threshold) + n_top_genes=selection_threshold,span=self.span) modalities[i + 2].var["highly_variable"] = modalities[i].var["highly_variable"] modalities[i] = modalities[i][:, modalities[i].var["highly_variable"]] modalities[i + 2] = modalities[i + 2][:, modalities[i + 2].var["highly_variable"]] From e5c63778ce427718a49ece01941722e452a308b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:32:51 +0000 Subject: [PATCH 07/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/datasets/multimodality.py | 102 +++++++++++++++++++------------- 1 file changed, 61 insertions(+), 41 deletions(-) diff --git a/dance/datasets/multimodality.py b/dance/datasets/multimodality.py index dd5ba42a..c8bc69cb 100644 --- a/dance/datasets/multimodality.py +++ b/dance/datasets/multimodality.py @@ -64,48 +64,50 @@ def data_paths(self) -> List[str]: osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad") ] - if self.subtask=="10k_pbmc": - paths=[ - osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod1.h5ad"), - osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod2.h5ad"), - osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod1.h5ad"), - osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod2.h5ad")] - if self.subtask=="pbmc_cite": - paths=[ - osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod1.h5ad"), - osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod2.h5ad"), - osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod1.h5ad"), - osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod2.h5ad")] + if self.subtask == "10k_pbmc": + paths = [ + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod2.h5ad") + ] + if self.subtask == "pbmc_cite": + paths = [ + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod2.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod1.h5ad"), + osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod2.h5ad") + ] if self.subtask.startswith("5k_pbmc"): - paths=[ + paths = [ osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod2.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod2.h5ad") ] if self.subtask.startswith("openproblems_2022"): - paths=[ + paths = [ osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod2.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod2.h5ad") ] if self.subtask.startswith("GSE127064"): - paths=[ + paths = [ osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod2.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod2.h5ad") ] if self.subtask.startswith("GSE117089"): - paths=[ + paths = [ osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod2.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod2.h5ad") ] if self.subtask.startswith("GSE140203"): - paths=[ + paths = [ osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod1.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod2.h5ad"), osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod1.h5ad"), @@ -147,24 +149,42 @@ class ModalityPredictionDataset(MultiModalityDataset): "https://www.dropbox.com/s/cz60vp7bwapz0kw/openproblems_bmmc_multiome_phase2_rna.zip?dl=1", "openproblems_bmmc_cite_phase2_rna_subset": "https://www.dropbox.com/s/veytldxkgzyoa8j/openproblems_bmmc_cite_phase2_rna_subset.zip?dl=1", - "5k_pbmc":"https://www.dropbox.com/scl/fi/uoyis946glh0oo7g833qj/5k_pbmc.zip?rlkey=mw9cvqq7e12iowfbr9rp7av5u&dl=1", - "5k_pbmc_subset":"https://www.dropbox.com/scl/fi/pykqc9zyt1fjypnjf4m1l/5k_pbmc_subset.zip?rlkey=brkmnqhfz5yl9axiuu0f8gmxy&dl=1", - "10k_pbmc":"https://www.dropbox.com/scl/fi/npz3n36d3w089creppph2/10k_pbmc.zip?rlkey=6yyv61omv2rw7sqqmfp6u7m1s&dl=1", - "pbmc_cite":"https://www.dropbox.com/scl/fi/8yvel9lu2f4pbemjeihzq/pbmc_cite.zip?rlkey=5f5jpjy1fcg14hwzot0hot7xd&dl=1", - "openproblems_2022_multi_atac2gex":"https://www.dropbox.com/scl/fi/4ynxepu306g3k6vqpi3aw/openproblems_2022_multi_atac2gex.zip?rlkey=2mq5vjnsh26gg5zgq9d85ikcp&dl=1", - "openproblems_2022_cite_gex2adt":"https://www.dropbox.com/scl/fi/dalt3qxwe440107ihjbpy/openproblems_2022_cite_gex2adt.zip?rlkey=ps1fvcr622vhibc1wc1umfdaw&dl=1", - "GSE127064_AdBrain_gex2atac":"https://www.dropbox.com/scl/fi/4ybsx6pgiuy6j9m0y92ly/GSE127064_AdBrain_gex2atac.zip?rlkey=6a5u7p7xr2dqsoduflzxjluja&dl=1", - "GSE127064_p0Brain_gex2atac":"https://www.dropbox.com/scl/fi/k4p3nkkqq56ev6ljyo5se/GSE127064_p0Brain_gex2atac.zip?rlkey=y7kayqmk2l72jjogzlvfxtl74&dl=1", - "GSE117089_mouse_gex2atac":"https://www.dropbox.com/scl/fi/egktuwiognr06xebeuouk/GSE117089_mouse_gex2atac.zip?rlkey=jadp3hlopc3112lmxe6nz5cd1&dl=1", - "GSE117089_A549_gex2atac":"https://www.dropbox.com/scl/fi/b7evc2n5ih5o3xxwcd7uq/GSE117089_A549_gex2atac.zip?rlkey=b5o0ykptfodim59qwnu2m89fh&dl=1", - "GSE117089_sciCAR_gex2atac":"https://www.dropbox.com/scl/fi/juibpvmtv2otvfsq1xyr7/GSE117089_sciCAR_gex2atac.zip?rlkey=qcdbfqsuhab56bc553cwm78gc&dl=1", - "GSE140203_3T3_HG19_atac2gex":"https://www.dropbox.com/scl/fi/v1vbypz87t1rz012vojkh/GSE140203_3T3_HG19_atac2gex.zip?rlkey=xmxrwso5e5ty3w53ctbm5bo9z&dl=1", - "GSE140203_3T3_MM10_atac2gex":"https://www.dropbox.com/scl/fi/po9k064twny51subze6df/GSE140203_3T3_MM10_atac2gex.zip?rlkey=q0b4y58bsvacnjrmvsclk4jqu&dl=1", - "GSE140203_12878.rep2_atac2gex":"https://www.dropbox.com/scl/fi/jqijimb7h6cv4w4hkax1q/GSE140203_12878.rep2_atac2gex.zip?rlkey=c837xkoacap4wjszffpfrmuak&dl=1", - "GSE140203_12878.rep3_atac2gex":"https://www.dropbox.com/scl/fi/wlv9dhvylz78kq8ezncmd/GSE140203_12878.rep3_atac2gex.zip?rlkey=5r607plnqzlxdgxtc4le8d6o1&dl=1", - "GSE140203_K562_HG19_atac2gex":"https://www.dropbox.com/scl/fi/n2he1br3u604p3mgniowz/GSE140203_K562_HG19_atac2gex.zip?rlkey=2lhe7s5run8ly5uk4b0vfemyj&dl=1", - "GSE140203_K562_MM10_atac2gex":"https://www.dropbox.com/scl/fi/dhdorqy87915uah3xl07a/GSE140203_K562_MM10_atac2gex.zip?rlkey=ecwsy5sp7f2i2gtjo1qyaf4zt&dl=1", - "GSE140203_LUNG_atac2gex":"https://www.dropbox.com/scl/fi/gabugiw244ky85j3ckq4d/GSE140203_LUNG_atac2gex.zip?rlkey=uj0we276s6ay2acpioj4tmfj3&dl=1" + "5k_pbmc": + "https://www.dropbox.com/scl/fi/uoyis946glh0oo7g833qj/5k_pbmc.zip?rlkey=mw9cvqq7e12iowfbr9rp7av5u&dl=1", + "5k_pbmc_subset": + "https://www.dropbox.com/scl/fi/pykqc9zyt1fjypnjf4m1l/5k_pbmc_subset.zip?rlkey=brkmnqhfz5yl9axiuu0f8gmxy&dl=1", + "10k_pbmc": + "https://www.dropbox.com/scl/fi/npz3n36d3w089creppph2/10k_pbmc.zip?rlkey=6yyv61omv2rw7sqqmfp6u7m1s&dl=1", + "pbmc_cite": + "https://www.dropbox.com/scl/fi/8yvel9lu2f4pbemjeihzq/pbmc_cite.zip?rlkey=5f5jpjy1fcg14hwzot0hot7xd&dl=1", + "openproblems_2022_multi_atac2gex": + "https://www.dropbox.com/scl/fi/4ynxepu306g3k6vqpi3aw/openproblems_2022_multi_atac2gex.zip?rlkey=2mq5vjnsh26gg5zgq9d85ikcp&dl=1", + "openproblems_2022_cite_gex2adt": + "https://www.dropbox.com/scl/fi/dalt3qxwe440107ihjbpy/openproblems_2022_cite_gex2adt.zip?rlkey=ps1fvcr622vhibc1wc1umfdaw&dl=1", + "GSE127064_AdBrain_gex2atac": + "https://www.dropbox.com/scl/fi/4ybsx6pgiuy6j9m0y92ly/GSE127064_AdBrain_gex2atac.zip?rlkey=6a5u7p7xr2dqsoduflzxjluja&dl=1", + "GSE127064_p0Brain_gex2atac": + "https://www.dropbox.com/scl/fi/k4p3nkkqq56ev6ljyo5se/GSE127064_p0Brain_gex2atac.zip?rlkey=y7kayqmk2l72jjogzlvfxtl74&dl=1", + "GSE117089_mouse_gex2atac": + "https://www.dropbox.com/scl/fi/egktuwiognr06xebeuouk/GSE117089_mouse_gex2atac.zip?rlkey=jadp3hlopc3112lmxe6nz5cd1&dl=1", + "GSE117089_A549_gex2atac": + "https://www.dropbox.com/scl/fi/b7evc2n5ih5o3xxwcd7uq/GSE117089_A549_gex2atac.zip?rlkey=b5o0ykptfodim59qwnu2m89fh&dl=1", + "GSE117089_sciCAR_gex2atac": + "https://www.dropbox.com/scl/fi/juibpvmtv2otvfsq1xyr7/GSE117089_sciCAR_gex2atac.zip?rlkey=qcdbfqsuhab56bc553cwm78gc&dl=1", + "GSE140203_3T3_HG19_atac2gex": + "https://www.dropbox.com/scl/fi/v1vbypz87t1rz012vojkh/GSE140203_3T3_HG19_atac2gex.zip?rlkey=xmxrwso5e5ty3w53ctbm5bo9z&dl=1", + "GSE140203_3T3_MM10_atac2gex": + "https://www.dropbox.com/scl/fi/po9k064twny51subze6df/GSE140203_3T3_MM10_atac2gex.zip?rlkey=q0b4y58bsvacnjrmvsclk4jqu&dl=1", + "GSE140203_12878.rep2_atac2gex": + "https://www.dropbox.com/scl/fi/jqijimb7h6cv4w4hkax1q/GSE140203_12878.rep2_atac2gex.zip?rlkey=c837xkoacap4wjszffpfrmuak&dl=1", + "GSE140203_12878.rep3_atac2gex": + "https://www.dropbox.com/scl/fi/wlv9dhvylz78kq8ezncmd/GSE140203_12878.rep3_atac2gex.zip?rlkey=5r607plnqzlxdgxtc4le8d6o1&dl=1", + "GSE140203_K562_HG19_atac2gex": + "https://www.dropbox.com/scl/fi/n2he1br3u604p3mgniowz/GSE140203_K562_HG19_atac2gex.zip?rlkey=2lhe7s5run8ly5uk4b0vfemyj&dl=1", + "GSE140203_K562_MM10_atac2gex": + "https://www.dropbox.com/scl/fi/dhdorqy87915uah3xl07a/GSE140203_K562_MM10_atac2gex.zip?rlkey=ecwsy5sp7f2i2gtjo1qyaf4zt&dl=1", + "GSE140203_LUNG_atac2gex": + "https://www.dropbox.com/scl/fi/gabugiw244ky85j3ckq4d/GSE140203_LUNG_atac2gex.zip?rlkey=uj0we276s6ay2acpioj4tmfj3&dl=1" } SUBTASK_NAME_MAP = { "adt2gex": "openproblems_bmmc_cite_phase2_mod2", @@ -175,10 +195,10 @@ class ModalityPredictionDataset(MultiModalityDataset): } AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP)) - def __init__(self, subtask, root="./data", preprocess=None,span=0.3): + def __init__(self, subtask, root="./data", preprocess=None, span=0.3): # TODO: factor our preprocess self.preprocess = preprocess - self.span=span + self.span = span super().__init__(subtask, root) def _raw_to_dance(self, raw_data): @@ -201,7 +221,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000): if self.preprocess == "feature_selection": if raw_data[0].shape[1] > selection_threshold: sc.pp.highly_variable_genes(raw_data[0], layer="counts", flavor="seurat_v3", - n_top_genes=selection_threshold,span=self.span) + n_top_genes=selection_threshold, span=self.span) raw_data[2].var["highly_variable"] = raw_data[0].var["highly_variable"] for i in [0, 2]: raw_data[i] = raw_data[i][:, raw_data[i].var["highly_variable"]] @@ -235,11 +255,11 @@ class ModalityMatchingDataset(MultiModalityDataset): } AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP)) - def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None,span=0.3): + def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None, span=0.3): # TODO: factor our preprocess self.preprocess = preprocess self.pkl_path = pkl_path - self.span=span + self.span = span super().__init__(subtask, root) def _raw_to_dance(self, raw_data): @@ -319,7 +339,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000): for i in range(2): if modalities[i].shape[1] > selection_threshold: sc.pp.highly_variable_genes(modalities[i], layer="counts", flavor="seurat_v3", - n_top_genes=selection_threshold,span=self.span) + n_top_genes=selection_threshold, span=self.span) modalities[i + 2].var["highly_variable"] = modalities[i].var["highly_variable"] modalities[i] = modalities[i][:, modalities[i].var["highly_variable"]] modalities[i + 2] = modalities[i + 2][:, modalities[i + 2].var["highly_variable"]] From 4a7389e05e07ddde397c7326911918161d4b2454 Mon Sep 17 00:00:00 2001 From: xingzhongyu <57212168+xingzhongyu@users.noreply.github.com> Date: Wed, 20 Sep 2023 00:33:53 +0800 Subject: [PATCH 08/11] Update base.py --- dance/data/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/data/base.py b/dance/data/base.py index 3cb73377..f6bb69f0 100644 --- a/dance/data/base.py +++ b/dance/data/base.py @@ -456,6 +456,7 @@ def get_feature(self, *, split_name: Optional[str] = None, return_type: FeatType if split_name is not None: if channel_type in ["X", "raw_X", "obs", "obsm", "obsp", "layers"]: idx = self.get_split_idx(split_name, error_on_miss=True) + idx=list(filter(lambda a:a Date: Tue, 19 Sep 2023 16:34:10 +0000 Subject: [PATCH 09/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/data/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/data/base.py b/dance/data/base.py index f6bb69f0..f004f740 100644 --- a/dance/data/base.py +++ b/dance/data/base.py @@ -456,7 +456,7 @@ def get_feature(self, *, split_name: Optional[str] = None, return_type: FeatType if split_name is not None: if channel_type in ["X", "raw_X", "obs", "obsm", "obsp", "layers"]: idx = self.get_split_idx(split_name, error_on_miss=True) - idx=list(filter(lambda a:a Date: Wed, 20 Sep 2023 00:35:09 +0800 Subject: [PATCH 10/11] Update scmogcn_graph.py --- dance/transforms/graph/scmogcn_graph.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dance/transforms/graph/scmogcn_graph.py b/dance/transforms/graph/scmogcn_graph.py index d17ec986..29d284e5 100644 --- a/dance/transforms/graph/scmogcn_graph.py +++ b/dance/transforms/graph/scmogcn_graph.py @@ -189,6 +189,7 @@ def construct_enhanced_feature_graph(u, v, e, train_size, feature_size, cell_nod graph.nodes['cell'].data['id'] = cell_node_features[:train_size] if not _test_graph else cell_node_features else: graph.nodes['cell'].data['id'] = cell_node_features + feature_size=min(graph.num_nodes('feature'),feature_size) graph.nodes['feature'].data['id'] = torch.arange(feature_size).long() graph.edges['feature2cell'].data['weight'] = e graph.edges['cell2feature'].data['weight'] = e[:graph.edges(etype='cell2feature')[0].shape[0]] From e839fe883a2c5e66b4d1f87697d344b72dcdfe17 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:35:26 +0000 Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dance/transforms/graph/scmogcn_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dance/transforms/graph/scmogcn_graph.py b/dance/transforms/graph/scmogcn_graph.py index 29d284e5..428c0469 100644 --- a/dance/transforms/graph/scmogcn_graph.py +++ b/dance/transforms/graph/scmogcn_graph.py @@ -189,7 +189,7 @@ def construct_enhanced_feature_graph(u, v, e, train_size, feature_size, cell_nod graph.nodes['cell'].data['id'] = cell_node_features[:train_size] if not _test_graph else cell_node_features else: graph.nodes['cell'].data['id'] = cell_node_features - feature_size=min(graph.num_nodes('feature'),feature_size) + feature_size = min(graph.num_nodes('feature'), feature_size) graph.nodes['feature'].data['id'] = torch.arange(feature_size).long() graph.edges['feature2cell'].data['weight'] = e graph.edges['cell2feature'].data['weight'] = e[:graph.edges(etype='cell2feature')[0].shape[0]]