refactor multi dataset object for predict modality tasks #284

Merged: 7 commits, Jun 24, 2023
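In short, this PR moves the predict-modality datasets onto the shared BaseDataset interface: preprocessing becomes a constructor argument, and load_data() now returns a dance Data object instead of mutating the dataset in place. A minimal before/after sketch of the API change, using only names that appear in this diff (the subtask string is taken from the BABEL example's docstring):

from dance.datasets.multimodality import ModalityPredictionDataset

# Old usage (pre-refactor): load, then preprocess the dataset in place.
# dataset = ModalityPredictionDataset("openproblems_bmmc_cite_phase2_rna")
# dataset.load_data().preprocess("feature_selection")

# New usage (this PR): declare preprocessing up front; load_data() builds a
# MuData-backed Data object with "mod1" as features and "mod2" as labels.
dataset = ModalityPredictionDataset("openproblems_bmmc_cite_phase2_rna",
                                    preprocess="feature_selection")
data = dataset.load_data()
x_train, y_train = data.get_train_data(return_type="torch")
x_test, y_test = data.get_test_data(return_type="torch")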
111 changes: 64 additions & 47 deletions dance/datasets/multimodality.py
@@ -1,77 +1,80 @@
 import os
 import os.path as osp
 import pickle
+from abc import ABC
 
 import anndata as ad
+import mudata as md
 import numpy as np
 import scanpy as sc
 import torch
 
 from dance import logger
+from dance.data import Data
+from dance.datasets.base import BaseDataset
 from dance.transforms.preprocess import lsiTransformer
 from dance.typing import List
 from dance.utils.download import download_file, unzip_file
 
 
-class MultiModalityDataset:
+class MultiModalityDataset(BaseDataset, ABC):
 
     TASK = "N/A"
     URL_DICT = {}
     SUBTASK_NAME_MAP = {}
     AVAILABLE_DATA = []
 
-    def __init__(self, subtask, data_dir="./data"):
+    def __init__(self, subtask, root="./data"):
         assert subtask in self.AVAILABLE_DATA, f"Undefined subtask {subtask!r}."
         assert self.TASK in ["predict_modality", "match_modality", "joint_embedding"]
 
         self.subtask = self.SUBTASK_NAME_MAP.get(subtask, subtask)
         self.data_url = self.URL_DICT[subtask]
 
-        self.data_dir = data_dir
-        self.loaded = False
+        super().__init__(root=root, full_download=False)
+
+    def download(self):
+        self.download_data()
 
     def download_data(self):
-        download_file(self.data_url, osp.join(self.data_dir, f"{self.subtask}.zip"))
-        unzip_file(osp.join(self.data_dir, f"{self.subtask}.zip"), self.data_dir)
+        download_file(self.data_url, osp.join(self.root, f"{self.subtask}.zip"))
+        unzip_file(osp.join(self.root, f"{self.subtask}.zip"), self.root)
         return self
 
     def download_pathway(self):
         download_file("https://www.dropbox.com/s/uqoakpalr3albiq/h.all.v7.4.entrez.gmt?dl=1",
-                      osp.join(self.data_dir, "h.all.v7.4.entrez.gmt"))
+                      osp.join(self.root, "h.all.v7.4.entrez.gmt"))
         download_file("https://www.dropbox.com/s/yjrcsd2rpmahmfo/h.all.v7.4.symbols.gmt?dl=1",
-                      osp.join(self.data_dir, "h.all.v7.4.symbols.gmt"))
+                      osp.join(self.root, "h.all.v7.4.symbols.gmt"))
         return self
 
     @property
     def mod_data_paths(self) -> List[str]:
         if self.TASK == "joint_embedding":
             paths = [
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_mod1.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_mod2.h5ad"),
             ]
         else:
             paths = [
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod2.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad"),
             ]
         return paths
 
     def is_complete(self) -> bool:
         return all(map(osp.exists, self.mod_data_paths))
 
-    def load_data(self):
-        # Load data from existing h5ad files, or download files and load data.
-        if not self.is_complete():
-            self.download_data()
-            assert self.is_complete()
-
-        self.modalities = []
+    def _load_raw_data(self) -> List[ad.AnnData]:
+        modalities = []
         for mod_path in self.mod_data_paths:
-            self.modalities.append(ad.read_h5ad(mod_path))
-        self.loaded = True
-        return self
+            logger.info(f"Loading {mod_path}")
+            modalities.append(ad.read_h5ad(mod_path))
+        return modalities
 
     def sparse_features(self, index=None, count=False):
         assert self.loaded, "Data have not been loaded."
@@ -138,25 +141,39 @@ class ModalityPredictionDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, data_dir="./data"):
-        super().__init__(subtask, data_dir)
+    def __init__(self, subtask, root="./data", preprocess=None):
+        # TODO: factor out preprocess
+        self.preprocess = preprocess
+        super().__init__(subtask, root)
 
-    def preprocess(self, kind="feature_selection", selection_threshold=10000):
-        if kind == "pca":
-            logger.info("Preprocessing method not supported.")
-            return self
-        elif kind == "feature_selection":
-            if self.modalities[0].shape[1] > selection_threshold:
-                sc.pp.highly_variable_genes(self.modalities[0], layer="counts", flavor="seurat_v3",
+    def _raw_to_dance(self, raw_data):
+        train_mod1, train_mod2, test_mod1, test_mod2 = self._maybe_preprocess(raw_data)
+
+        mod1 = ad.concat((train_mod1, test_mod1))
+        mod2 = ad.concat((train_mod2, test_mod2))
+        mod1.var_names_make_unique()
+        mod2.var_names_make_unique()
+
+        mdata = md.MuData({"mod1": mod1, "mod2": mod2})
+        mdata.var_names_make_unique()
+
+        data = Data(mdata, train_size=train_mod1.shape[0])
+        data.set_config(feature_mod="mod1", label_mod="mod2")
+
+        return data
+
+    def _maybe_preprocess(self, raw_data, selection_threshold=10000):
+        if self.preprocess == "feature_selection":
+            if raw_data[0].shape[1] > selection_threshold:
+                sc.pp.highly_variable_genes(raw_data[0], layer="counts", flavor="seurat_v3",
                                             n_top_genes=selection_threshold)
-                self.modalities[2].var["highly_variable"] = self.modalities[0].var["highly_variable"]
+                raw_data[2].var["highly_variable"] = raw_data[0].var["highly_variable"]
                 for i in [0, 2]:
-                    self.modalities[i] = self.modalities[i][:, self.modalities[i].var["highly_variable"]]
-        else:
-            logger.info("Preprocessing method not supported.")
-            return self
+                    raw_data[i] = raw_data[i][:, raw_data[i].var["highly_variable"]]
+        elif self.preprocess not in (None, "none"):
+            logger.info(f"Preprocessing method {self.preprocess!r} not supported.")
         logger.info("Preprocessing done.")
-        return self
+        return raw_data
 
 
 class ModalityMatchingDataset(MultiModalityDataset):
@@ -180,16 +197,16 @@ class ModalityMatchingDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, data_dir="./data"):
-        super().__init__(subtask, data_dir)
+    def __init__(self, subtask, root="./data"):
+        super().__init__(subtask, root)
         self.preprocessed = False
 
     def load_sol(self):
         assert (self.loaded)
         self.train_sol = ad.read_h5ad(
-            osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_train_sol.h5ad"))
+            osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_sol.h5ad"))
         self.test_sol = ad.read_h5ad(
-            osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_test_sol.h5ad"))
+            osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_sol.h5ad"))
         self.modalities[1] = self.modalities[1][self.train_sol.to_df().values.argmax(1)]
         return self

@@ -273,8 +290,8 @@ class JointEmbeddingNIPSDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, data_dir="./data"):
-        super().__init__(subtask, data_dir)
+    def __init__(self, subtask, root="./data"):
+        super().__init__(subtask, root)
         self.preprocessed = False
 
     def load_metadata(self):
@@ -287,15 +304,15 @@ def load_metadata(self):
mod = "atac"
meta = "multiome"
self.exploration = [
ad.read_h5ad(osp.join(self.data_dir, self.subtask, f"{meta}_gex_processed_training.h5ad")),
ad.read_h5ad(osp.join(self.data_dir, self.subtask, f"{meta}_{mod}_processed_training.h5ad")),
ad.read_h5ad(osp.join(self.root, self.subtask, f"{meta}_gex_processed_training.h5ad")),
ad.read_h5ad(osp.join(self.root, self.subtask, f"{meta}_{mod}_processed_training.h5ad")),
]
return self

def load_sol(self):
assert (self.loaded)
self.test_sol = ad.read_h5ad(
osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_solution.h5ad"))
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_solution.h5ad"))
return self

def preprocess(self, kind="aux", pretrained_folder=".", selection_threshold=10000):
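The dataset diff above is a template-method refactor: subclasses now implement _load_raw_data() and _raw_to_dance(), and the inherited load_data() (defined in BaseDataset, which is not part of this diff, so the exact call order is an assumption) chains download, raw loading, and conversion. A hypothetical subclass sketch, mirroring ModalityPredictionDataset._raw_to_dance above (var_names_make_unique calls omitted for brevity):

import anndata as ad
import mudata as md

from dance.data import Data
from dance.datasets.multimodality import MultiModalityDataset


class MyPredictionDataset(MultiModalityDataset):
    # Hypothetical subclass for illustration; the URL and subtask name are
    # placeholders, not part of this PR.
    TASK = "predict_modality"
    URL_DICT = {"my_subtask": "https://example.com/my_subtask.zip"}
    AVAILABLE_DATA = ["my_subtask"]

    def _raw_to_dance(self, raw_data):
        # raw_data comes from the inherited _load_raw_data(), ordered as
        # (train_mod1, train_mod2, test_mod1, test_mod2) for this task.
        train_mod1, train_mod2, test_mod1, test_mod2 = raw_data
        mod1 = ad.concat((train_mod1, test_mod1))
        mod2 = ad.concat((train_mod2, test_mod2))
        mdata = md.MuData({"mod1": mod1, "mod2": mod2})
        data = Data(mdata, train_size=train_mod1.shape[0])
        data.set_config(feature_mod="mod1", label_mod="mod2")
        return data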
2 changes: 1 addition & 1 deletion dance/modules/multi_modality/predict_modality/cmae.py
@@ -583,7 +583,6 @@ def fit(self, train_mod1, train_mod2, aux_labels=None, checkpoint_directory='./c
         while True:
             print('Iteration: ', iterations)
             for it, batch_idx in enumerate(train_loader):
-                self._update_learning_rate()
                 mod1, mod2 = train_mod1[batch_idx], train_mod2[batch_idx]
 
                 for _ in range(num_disc):
@@ -594,6 +593,7 @@ def fit(self, train_mod1, train_mod2, aux_labels=None, checkpoint_directory='./c
                                      aux_labels[batch_idx], variational=False)
                 else:
                     self._gen_update(mod1, mod2, mod1, mod2, hyperparameters, variational=False)
+                self._update_learning_rate()
 
             print('RMSE Loss:', self.score(train_mod1[val_idx], train_mod2[val_idx]))
 
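The one-line move above changes when the learning rate advances: previously _update_learning_rate() ran at the top of each batch, before any update in that iteration; now it runs once per batch after the discriminator and generator updates. Stepping the scheduler after the optimizer is the convention PyTorch itself recommends. A generic sketch of the ordering, using standard torch APIs rather than the CMAE internals (which this diff does not show):

import torch

model = torch.nn.Linear(16, 1)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=100, gamma=0.5)

for step in range(300):
    x = torch.randn(8, 16)
    loss = model(x).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()    # update parameters first...
    sched.step()  # ...then advance the schedule, as in the reordered loop above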
26 changes: 5 additions & 21 deletions examples/multi_modality/predict_modality/babel.py
@@ -3,12 +3,9 @@
 import os
 import random
 
-import anndata
-import mudata
 import torch
 
 from dance import logger
-from dance.data import Data
 from dance.datasets.multimodality import ModalityPredictionDataset
 from dance.modules.multi_modality.predict_modality.babel import BabelWrapper
 from dance.utils import set_seed
@@ -40,16 +37,14 @@
 torch.set_num_threads(args.cpus)
 rndseed = args.rnd_seed
 set_seed(rndseed)
-dataset = ModalityPredictionDataset(args.subtask).load_data().preprocess("feature_selection")
+dataset = ModalityPredictionDataset(args.subtask, preprocess="feature_selection")
+data = dataset.load_data()
 
 device = args.device
+args.outdir = os.path.abspath(args.outdir)
 os.makedirs(args.model_folder, exist_ok=True)
+os.makedirs(args.outdir, exist_ok=True)
 
-args.outdir = os.path.abspath(args.outdir)
-
-if not os.path.isdir(os.path.dirname(args.outdir)):
-    os.makedirs(os.path.dirname(args.outdir))
-
 # Specify output log file
 fh = logging.FileHandler(f"{args.outdir}/training_{args.subtask}_{args.rnd_seed}.log", "w")
 fh.setLevel(logging.INFO)
@@ -58,17 +53,6 @@
 for arg in vars(args):
     logger.info(f"Parameter {arg}: {getattr(args, arg)}")
 
-# Construct data object
-mod1 = anndata.concat((dataset.modalities[0], dataset.modalities[2]))
-mod2 = anndata.concat((dataset.modalities[1], dataset.modalities[3]))
-mod1.var_names_make_unique()
-mod2.var_names_make_unique()
-mdata = mudata.MuData({"mod1": mod1, "mod2": mod2})
-mdata.var_names_make_unique()
-train_size = dataset.modalities[0].shape[0]
-data = Data(mdata, train_size=train_size)
-data.set_config(feature_mod="mod1", label_mod="mod2")
-
 # Obtain training and testing data
 x_train, y_train = data.get_train_data(return_type="torch")
 x_test, y_test = data.get_test_data(return_type="torch")
@@ -78,7 +62,7 @@
 model.fit(x_train, y_train, val_ratio=0.15)
 print(model.predict(x_test))
 print(model.score(x_test, y_test))
-""" To reproduce BABEL on other samples, please refer to command lines belows:
+"""To reproduce BABEL on other samples, please refer to the command lines below:
 GEX to ADT:
 python babel.py --subtask openproblems_bmmc_cite_phase2_rna --device cuda
 