refactor multi dataset object for predict modality tasks #284

Merged: 7 commits, Jun 24, 2023
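In short, this PR moves the predict-modality datasets onto the shared BaseDataset interface: preprocessing becomes a constructor argument, and load_data() now returns a dance Data object instead of mutating the dataset in place. A minimal before/after sketch of the API change, using only names that appear in this diff (the subtask string is taken from the BABEL example's docstring):

from dance.datasets.multimodality import ModalityPredictionDataset

# Old usage (pre-refactor): load, then preprocess the dataset in place.
# dataset = ModalityPredictionDataset("openproblems_bmmc_cite_phase2_rna")
# dataset.load_data().preprocess("feature_selection")

# New usage (this PR): declare preprocessing up front; load_data() builds a
# MuData-backed Data object with "mod1" as features and "mod2" as labels.
dataset = ModalityPredictionDataset("openproblems_bmmc_cite_phase2_rna",
                                    preprocess="feature_selection")
data = dataset.load_data()
x_train, y_train = data.get_train_data(return_type="torch")
x_test, y_test = data.get_test_data(return_type="torch")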
111 changes: 64 additions & 47 deletions dance/datasets/multimodality.py
@@ -1,77 +1,80 @@
 import os
 import os.path as osp
 import pickle
+from abc import ABC
 
 import anndata as ad
+import mudata as md
 import numpy as np
 import scanpy as sc
 import torch
 
 from dance import logger
+from dance.data import Data
+from dance.datasets.base import BaseDataset
 from dance.transforms.preprocess import lsiTransformer
 from dance.typing import List
 from dance.utils.download import download_file, unzip_file
 
 
-class MultiModalityDataset:
+class MultiModalityDataset(BaseDataset, ABC):
 
     TASK = "N/A"
     URL_DICT = {}
     SUBTASK_NAME_MAP = {}
     AVAILABLE_DATA = []
 
-    def __init__(self, subtask, data_dir="./data"):
+    def __init__(self, subtask, root="./data"):
         assert subtask in self.AVAILABLE_DATA, f"Undefined subtask {subtask!r}."
         assert self.TASK in ["predict_modality", "match_modality", "joint_embedding"]
 
         self.subtask = self.SUBTASK_NAME_MAP.get(subtask, subtask)
         self.data_url = self.URL_DICT[subtask]
 
-        self.data_dir = data_dir
-        self.loaded = False
+        super().__init__(root=root, full_download=False)
+
+    def download(self):
+        self.download_data()
 
     def download_data(self):
-        download_file(self.data_url, osp.join(self.data_dir, f"{self.subtask}.zip"))
-        unzip_file(osp.join(self.data_dir, f"{self.subtask}.zip"), self.data_dir)
+        download_file(self.data_url, osp.join(self.root, f"{self.subtask}.zip"))
+        unzip_file(osp.join(self.root, f"{self.subtask}.zip"), self.root)
         return self
 
     def download_pathway(self):
         download_file("https://www.dropbox.com/s/uqoakpalr3albiq/h.all.v7.4.entrez.gmt?dl=1",
-                      osp.join(self.data_dir, "h.all.v7.4.entrez.gmt"))
+                      osp.join(self.root, "h.all.v7.4.entrez.gmt"))
         download_file("https://www.dropbox.com/s/yjrcsd2rpmahmfo/h.all.v7.4.symbols.gmt?dl=1",
-                      osp.join(self.data_dir, "h.all.v7.4.symbols.gmt"))
+                      osp.join(self.root, "h.all.v7.4.symbols.gmt"))
         return self
 
     @property
     def mod_data_paths(self) -> List[str]:
         if self.TASK == "joint_embedding":
             paths = [
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_mod1.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_mod2.h5ad"),
             ]
         else:
             paths = [
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod2.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"),
-                osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod2.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"),
+                osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad"),
             ]
         return paths
 
     def is_complete(self) -> bool:
         return all(map(osp.exists, self.mod_data_paths))
 
-    def load_data(self):
-        # Load data from existing h5ad files, or download files and load data.
-        if not self.is_complete():
-            self.download_data()
-            assert self.is_complete()
-
-        self.modalities = []
+    def _load_raw_data(self) -> List[ad.AnnData]:
+        modalities = []
         for mod_path in self.mod_data_paths:
-            self.modalities.append(ad.read_h5ad(mod_path))
-        self.loaded = True
-        return self
+            logger.info(f"Loading {mod_path}")
+            modalities.append(ad.read_h5ad(mod_path))
+        return modalities
 
     def sparse_features(self, index=None, count=False):
         assert self.loaded, "Data have not been loaded."
@@ -138,25 +141,39 @@ class ModalityPredictionDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, data_dir="./data"):
-        super().__init__(subtask, data_dir)
+    def __init__(self, subtask, root="./data", preprocess=None):
+        # TODO: factor out preprocess
+        self.preprocess = preprocess
+        super().__init__(subtask, root)
 
-    def preprocess(self, kind="feature_selection", selection_threshold=10000):
-        if kind == "pca":
-            logger.info("Preprocessing method not supported.")
-            return self
-        elif kind == "feature_selection":
-            if self.modalities[0].shape[1] > selection_threshold:
-                sc.pp.highly_variable_genes(self.modalities[0], layer="counts", flavor="seurat_v3",
+    def _raw_to_dance(self, raw_data):
+        train_mod1, train_mod2, test_mod1, test_mod2 = self._maybe_preprocess(raw_data)
+
+        mod1 = ad.concat((train_mod1, test_mod1))
+        mod2 = ad.concat((train_mod2, test_mod2))
+        mod1.var_names_make_unique()
+        mod2.var_names_make_unique()
+
+        mdata = md.MuData({"mod1": mod1, "mod2": mod2})
+        mdata.var_names_make_unique()
+
+        data = Data(mdata, train_size=train_mod1.shape[0])
+        data.set_config(feature_mod="mod1", label_mod="mod2")
+
+        return data
+
+    def _maybe_preprocess(self, raw_data, selection_threshold=10000):
+        if self.preprocess == "feature_selection":
+            if raw_data[0].shape[1] > selection_threshold:
+                sc.pp.highly_variable_genes(raw_data[0], layer="counts", flavor="seurat_v3",
                                             n_top_genes=selection_threshold)
-                self.modalities[2].var["highly_variable"] = self.modalities[0].var["highly_variable"]
+                raw_data[2].var["highly_variable"] = raw_data[0].var["highly_variable"]
                 for i in [0, 2]:
-                    self.modalities[i] = self.modalities[i][:, self.modalities[i].var["highly_variable"]]
-        else:
-            logger.info("Preprocessing method not supported.")
-            return self
+                    raw_data[i] = raw_data[i][:, raw_data[i].var["highly_variable"]]
+        elif self.preprocess not in (None, "none"):
+            logger.info(f"Preprocessing method {self.preprocess!r} not supported.")
         logger.info("Preprocessing done.")
-        return self
+        return raw_data
 
 
 class ModalityMatchingDataset(MultiModalityDataset):
@@ -180,16 +197,16 @@ class ModalityMatchingDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, data_dir="./data"):
-        super().__init__(subtask, data_dir)
+    def __init__(self, subtask, root="./data"):
+        super().__init__(subtask, root)
         self.preprocessed = False
 
     def load_sol(self):
         assert (self.loaded)
         self.train_sol = ad.read_h5ad(
-            osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_train_sol.h5ad"))
+            osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_sol.h5ad"))
         self.test_sol = ad.read_h5ad(
-            osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_test_sol.h5ad"))
+            osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_sol.h5ad"))
         self.modalities[1] = self.modalities[1][self.train_sol.to_df().values.argmax(1)]
         return self

@@ -273,8 +290,8 @@ class JointEmbeddingNIPSDataset(MultiModalityDataset):
     }
     AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))
 
-    def __init__(self, subtask, data_dir="./data"):
-        super().__init__(subtask, data_dir)
+    def __init__(self, subtask, root="./data"):
+        super().__init__(subtask, root)
         self.preprocessed = False
 
     def load_metadata(self):
@@ -287,15 +304,15 @@ def load_metadata(self):
mod = "atac"
meta = "multiome"
self.exploration = [
ad.read_h5ad(osp.join(self.data_dir, self.subtask, f"{meta}_gex_processed_training.h5ad")),
ad.read_h5ad(osp.join(self.data_dir, self.subtask, f"{meta}_{mod}_processed_training.h5ad")),
ad.read_h5ad(osp.join(self.root, self.subtask, f"{meta}_gex_processed_training.h5ad")),
ad.read_h5ad(osp.join(self.root, self.subtask, f"{meta}_{mod}_processed_training.h5ad")),
]
return self

def load_sol(self):
assert (self.loaded)
self.test_sol = ad.read_h5ad(
osp.join(self.data_dir, self.subtask, f"{self.subtask}.censor_dataset.output_solution.h5ad"))
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_solution.h5ad"))
return self

def preprocess(self, kind="aux", pretrained_folder=".", selection_threshold=10000):
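The dataset diff above is a template-method refactor: subclasses now implement _load_raw_data() and _raw_to_dance(), and the inherited load_data() (defined in BaseDataset, which is not part of this diff, so the exact call order is an assumption) chains download, raw loading, and conversion. A hypothetical subclass sketch, mirroring ModalityPredictionDataset._raw_to_dance above (var_names_make_unique calls omitted for brevity):

import anndata as ad
import mudata as md

from dance.data import Data
from dance.datasets.multimodality import MultiModalityDataset


class MyPredictionDataset(MultiModalityDataset):
    # Hypothetical subclass for illustration; the URL and subtask name are
    # placeholders, not part of this PR.
    TASK = "predict_modality"
    URL_DICT = {"my_subtask": "https://example.com/my_subtask.zip"}
    AVAILABLE_DATA = ["my_subtask"]

    def _raw_to_dance(self, raw_data):
        # raw_data comes from the inherited _load_raw_data(), ordered as
        # (train_mod1, train_mod2, test_mod1, test_mod2) for this task.
        train_mod1, train_mod2, test_mod1, test_mod2 = raw_data
        mod1 = ad.concat((train_mod1, test_mod1))
        mod2 = ad.concat((train_mod2, test_mod2))
        mdata = md.MuData({"mod1": mod1, "mod2": mod2})
        data = Data(mdata, train_size=train_mod1.shape[0])
        data.set_config(feature_mod="mod1", label_mod="mod2")
        return data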
2 changes: 1 addition & 1 deletion dance/modules/multi_modality/predict_modality/cmae.py
@@ -583,7 +583,6 @@ def fit(self, train_mod1, train_mod2, aux_labels=None, checkpoint_directory='./c
         while True:
             print('Iteration: ', iterations)
             for it, batch_idx in enumerate(train_loader):
-                self._update_learning_rate()
                 mod1, mod2 = train_mod1[batch_idx], train_mod2[batch_idx]
 
                 for _ in range(num_disc):
@@ -594,6 +593,7 @@ def fit(self, train_mod1, train_mod2, aux_labels=None, checkpoint_directory='./c
                                      aux_labels[batch_idx], variational=False)
                 else:
                     self._gen_update(mod1, mod2, mod1, mod2, hyperparameters, variational=False)
+                self._update_learning_rate()
 
             print('RMSE Loss:', self.score(train_mod1[val_idx], train_mod2[val_idx]))
 
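The one-line move above changes when the learning rate advances: previously _update_learning_rate() ran at the top of each batch, before any update in that iteration; now it runs once per batch after the discriminator and generator updates. Stepping the scheduler after the optimizer is the convention PyTorch itself recommends. A generic sketch of the ordering, using standard torch APIs rather than the CMAE internals (which this diff does not show):

import torch

model = torch.nn.Linear(16, 1)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=100, gamma=0.5)

for step in range(300):
    x = torch.randn(8, 16)
    loss = model(x).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()    # update parameters first...
    sched.step()  # ...then advance the schedule, as in the reordered loop above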
26 changes: 5 additions & 21 deletions examples/multi_modality/predict_modality/babel.py
@@ -3,12 +3,9 @@
 import os
 import random
 
-import anndata
-import mudata
 import torch
 
 from dance import logger
-from dance.data import Data
 from dance.datasets.multimodality import ModalityPredictionDataset
 from dance.modules.multi_modality.predict_modality.babel import BabelWrapper
 from dance.utils import set_seed
@@ -40,16 +37,14 @@
 torch.set_num_threads(args.cpus)
 rndseed = args.rnd_seed
 set_seed(rndseed)
-dataset = ModalityPredictionDataset(args.subtask).load_data().preprocess("feature_selection")
+dataset = ModalityPredictionDataset(args.subtask, preprocess="feature_selection")
+data = dataset.load_data()
 
 device = args.device
+args.outdir = os.path.abspath(args.outdir)
 os.makedirs(args.model_folder, exist_ok=True)
+os.makedirs(args.outdir, exist_ok=True)
 
-args.outdir = os.path.abspath(args.outdir)
-
-if not os.path.isdir(os.path.dirname(args.outdir)):
-    os.makedirs(os.path.dirname(args.outdir))
-
 # Specify output log file
 fh = logging.FileHandler(f"{args.outdir}/training_{args.subtask}_{args.rnd_seed}.log", "w")
 fh.setLevel(logging.INFO)
@@ -58,17 +53,6 @@
 for arg in vars(args):
     logger.info(f"Parameter {arg}: {getattr(args, arg)}")
 
-# Construct data object
-mod1 = anndata.concat((dataset.modalities[0], dataset.modalities[2]))
-mod2 = anndata.concat((dataset.modalities[1], dataset.modalities[3]))
-mod1.var_names_make_unique()
-mod2.var_names_make_unique()
-mdata = mudata.MuData({"mod1": mod1, "mod2": mod2})
-mdata.var_names_make_unique()
-train_size = dataset.modalities[0].shape[0]
-data = Data(mdata, train_size=train_size)
-data.set_config(feature_mod="mod1", label_mod="mod2")
-
 # Obtain training and testing data
 x_train, y_train = data.get_train_data(return_type="torch")
 x_test, y_test = data.get_test_data(return_type="torch")
@@ -78,7 +62,7 @@
 model.fit(x_train, y_train, val_ratio=0.15)
 print(model.predict(x_test))
 print(model.score(x_test, y_test))
-""" To reproduce BABEL on other samples, please refer to command lines belows:
+"""To reproduce BABEL on other samples, please refer to the command lines below:
 GEX to ADT:
 python babel.py --subtask openproblems_bmmc_cite_phase2_rna --device cuda
 