diff --git a/dance/modules/single_modality/clustering/graphsc.py b/dance/modules/single_modality/clustering/graphsc.py
index 5c9b0a4f..17d6d97d 100644
--- a/dance/modules/single_modality/clustering/graphsc.py
+++ b/dance/modules/single_modality/clustering/graphsc.py
@@ -9,10 +9,7 @@
 """
 
-import time
-
 import dgl
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import scanpy as sc
@@ -24,32 +21,90 @@
 from dgl.nn.pytorch import GraphConv
 from dgl.utils import expand_as_pair
 from sklearn.cluster import KMeans
-from sklearn.decomposition import PCA
-from sklearn.metrics import adjusted_rand_score, calinski_harabasz_score, normalized_mutual_info_score, silhouette_score
 from torch.nn.functional import binary_cross_entropy_with_logits as BCELoss
 from tqdm import tqdm
 
+from dance.modules.base import BaseClusteringMethod
 from dance.transforms import AnnDataTransform, Compose, SetConfig
 from dance.transforms.graph import PCACellFeatureGraph
-from dance.typing import LogLevel
+from dance.typing import Any, Literal, LogLevel, Optional
+from dance.utils import get_device
 
 
-class GraphSC:
+class GraphSC(BaseClusteringMethod):
     """GraphSC class.
 
     Parameters
    ----------
-    args : argparse.Namespace
-        a Namespace contains arguments of GCNAE. For details of parameters in parser args, please refer to link
-        (parser help document).
+    agg
+        Aggregation layer.
+    activation
+        Activation function.
+    in_feats
+        Dimension of input features.
+    n_hidden
+        Number of hidden layers.
+    hidden_dim
+        Input dimension of hidden layer 1.
+    hidden_1
+        Output dimension of hidden layer 1.
+    hidden_2
+        Output dimension of hidden layer 2.
+    dropout
+        Dropout rate.
+    n_layers
+        Number of graph convolutional layers.
+    hidden_relu
+        Use relu activation in hidden layers or not.
+    hidden_bn
+        Use batch norm in hidden layers or not.
+    cluster_method
+        Method for clustering.
+    num_workers
+        Number of workers.
+    device
+        Computation device to use.
 
     """
 
-    def __init__(self, args):
+    def __init__(
+        self,
+        agg: str = "sum",
+        activation: str = "relu",
+        in_feats: int = 50,
+        n_hidden: int = 1,
+        hidden_dim: int = 200,
+        hidden_1: int = 300,
+        hidden_2: int = 0,
+        dropout: float = 0.1,
+        n_layers: int = 1,
+        hidden_relu: bool = False,
+        hidden_bn: bool = False,
+        n_clusters: int = 10,
+        cluster_method: Literal["kmeans", "leiden"] = "kmeans",
+        num_workers: int = 1,
+        device: str = "auto",
+    ):
         super().__init__()
-        self.args = args
-        self.device = get_device(args.use_cpu)
-        self.model = GCNAE(args).to(self.device)
+        self.n_layers = n_layers
+        self.n_clusters = n_clusters
+        self.cluster_method = cluster_method
+        self.num_workers = num_workers
+        self.device = get_device(device)
+
+        self.model = GCNAE(
+            agg=agg,
+            activation=activation,
+            in_feats=in_feats,
+            n_hidden=n_hidden,
+            hidden_dim=hidden_dim,
+            hidden_1=hidden_1,
+            hidden_2=hidden_2,
+            dropout=dropout,
+            n_layers=n_layers,
+            hidden_relu=hidden_relu,
+            hidden_bn=hidden_bn,
+        ).to(self.device)
 
     @staticmethod
     def preprocessing_pipeline(n_top_genes: int = 3000, normalize_weights: str = "log_per_cell", n_components: int = 50,
@@ -90,300 +145,231 @@ def preprocessing_pipeline(n_top_genes: int = 3000, normalize_weights: str = "lo
         return Compose(*transforms, log_level=log_level)
 
-    def fit(self, g, n_epochs, n_clusters, lr, cluster=["KMeans"]):
+    def fit(
+        self,
+        g: dgl.DGLGraph,
+        y: Optional[Any] = None,
+        *,
+        n_epochs: int = 100,
+        lr: float = 1e-5,
+        batch_size: int = 128,
+        show_epoch_ari: bool = False,
+        eval_epoch: bool = False,
+    ):
         """Train graph-sc.
 
         Parameters
         ----------
-        g : dgl.DGLGraph
-            input cell-gene graph.
-        n_epochs : int
-            number of epochs.
-        n_clusters : int
-            number of clusters.
-        lr : float
-            learning rate.
-        cluster : list optional
-            clustering method.
+        g
+            Input cell-gene graph.
+        y
+            Not used, for compatibility with the BaseClusteringMethod class.
+        n_epochs
+            Number of epochs.
+        lr
+            Learning rate.
+        batch_size
+            Batch size.
+        show_epoch_ari
+            Show ARI score for each epoch.
+        eval_epoch
+            Evaluate every epoch and keep the embedding with the best ARI.
 
         """
         g.ndata["order"] = g.ndata["label"] = g.ndata["feat_id"]
         train_ids = np.where(g.ndata["label"] != -1)[0]
-        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(self.args.n_layers)
-        dataloader = dgl.dataloading.NodeDataLoader(g, train_ids, sampler, batch_size=self.args.batch_size,
-                                                    shuffle=True, drop_last=False, num_workers=self.args.num_workers)
+        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(self.n_layers)
+        dataloader = dgl.dataloading.NodeDataLoader(g, train_ids, sampler, batch_size=batch_size, shuffle=True,
+                                                    drop_last=False, num_workers=self.num_workers)
 
-        device = get_device(self.args.use_cpu)
+        device = self.device
         optim = torch.optim.Adam(self.model.parameters(), lr=lr)
         losses = []
-        aris_kmeans = []
+        aris = []
         Z = {}
 
-        with dataloader.enable_cpu_affinity():
-            for epoch in tqdm(range(n_epochs)):
-                self.model.train()
-                z = []
-                y = []
-                order = []
-
-                for input_nodes, output_nodes, blocks in dataloader:
-                    blocks = [b.to(device) for b in blocks]
-                    input_features = blocks[0].srcdata['features']
-                    g = blocks[-1]
-
-                    adj_logits, emb = self.model.forward(blocks, input_features)
-                    z.extend(emb.detach().cpu().numpy())
-                    if "label" in blocks[-1].dstdata:
-                        y.extend(blocks[-1].dstdata["label"].cpu().numpy())
-                    order.extend(blocks[-1].dstdata["order"].cpu().numpy())
-
-                    adj = g.adjacency_matrix().to_dense().to(device)
-                    adj = adj[g.dstnodes()]
-                    pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
-                    factor = float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)
-                    if factor == 0:
-                        factor = 1
-                    norm = adj.shape[0] * adj.shape[0] / factor
-                    adj_logits, _ = self.model.forward(blocks, input_features)
-                    loss = norm * BCELoss(adj_logits, adj.to(device), pos_weight=pos_weight.to(device))
-                    optim.zero_grad()
-                    loss.backward()
-                    optim.step()
-                    losses.append(loss.item())
-
-                z = np.array(z)
-                y = np.array(y)
-                order = np.array(order)
-                order = np.argsort(order)
-                z = z[order]
-                y = y[order]
-                if pd.isnull(y[0]):
-                    y = None
+        for epoch in tqdm(range(n_epochs)):
+            self.model.train()
+            z = []
+            y = []
+            order = []
+
+            for input_nodes, output_nodes, blocks in dataloader:
+                blocks = [b.to(device) for b in blocks]
+                input_features = blocks[0].srcdata["features"]
+                g = blocks[-1]
+
+                adj_logits, emb = self.model.forward(blocks, input_features)
+                z.extend(emb.detach().cpu().numpy())
+                if "label" in blocks[-1].dstdata:
+                    y.extend(blocks[-1].dstdata["label"].cpu().numpy())
+                order.extend(blocks[-1].dstdata["order"].cpu().numpy())
+
+                adj = g.adjacency_matrix().to_dense().to(device)
+                adj = adj[g.dstnodes()]
+                pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
+                factor = float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)
+                if factor == 0:
+                    factor = 1
+                norm = adj.shape[0] * adj.shape[0] / factor
+                adj_logits, _ = self.model.forward(blocks, input_features)
+                loss = norm * BCELoss(adj_logits, adj.to(device), pos_weight=pos_weight.to(device))
+                optim.zero_grad()
+                loss.backward()
+                optim.step()
+                losses.append(loss.item())
+
+            z = np.array(z)
+            y = np.array(y)
+            order = np.array(order)
+            order = np.argsort(order)
+            z = z[order]
+            y = y[order]
+            if pd.isnull(y[0]):
+                y = None
+            self.z = z
+
+            if eval_epoch:
+                score = self.score(None, y)
+                aris.append(score)
+                if show_epoch_ari:
+                    print(f"epoch {epoch}, ARI {score}")
+                z_ = {f"epoch{epoch}": z}
+                Z = {**Z, **z_}
+
+            elif epoch == n_epochs - 1:
+                self.z = z
 
-                if self.args.eval_epoch:
-                    score = self.score(y, n_clusters, cluster=cluster)
-                    aris_kmeans.append(score["kmeans_ari"])
-                    if self.args.show_epoch_ari:
-                        print(f'epoch {epoch}, ARI {score.get("kmeans_ari")}')
-                    z_ = {f'epoch{epoch}': z}
-                    Z = {**Z, **z_}
-
-                elif epoch == n_epochs - 1:
-                    self.z = z
 
+        if eval_epoch:
+            index = np.argmax(aris)
+            self.z = Z[f"epoch{index}"]
 
-        if self.args.eval_epoch:
-            index = np.argmax(aris_kmeans)
-            self.z = Z[f'epoch{index}']
-
-    def predict(self, n_clusters, cluster=["KMeans"]):
+    def predict(self, x: Optional[Any] = None):
         """Get predictions from the graph autoencoder model.
 
         Parameters
         ----------
-        n_clusters : int
-            number of clusters.
-        cluster : list optional
-            clustering method.
+        x
+            Not used, for compatibility with BaseClusteringMethod class.
 
         Returns
         -------
-        pred : dict
-            prediction of given clustering method.
+        pred
+            Predictions from the configured clustering method.
 
         """
-        z = self.z
-        pred = {}
-
-        if "KMeans" in cluster:
-            kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=5, n_init=10)
-            kmeans_pred = {"kmeans_pred": kmeans.fit_predict(z)}
-            pred = {**pred, **kmeans_pred}
-
-        if "Leiden" in cluster:
-            leiden_pred = {"leiden_pred": run_leiden(z)}
-            pred = {**pred, **leiden_pred}
-
+        if self.cluster_method == "kmeans":
+            kmeans = KMeans(n_clusters=self.n_clusters, init="k-means++", random_state=5, n_init=10)
+            pred = kmeans.fit_predict(self.z)
+        elif self.cluster_method == "leiden":
+            pred = run_leiden(self.z)
+        else:
+            raise ValueError(f"Unknown clustering method {self.cluster_method}, available options are: 'kmeans', 'leiden'")
         return pred
 
-    def score(self, y, n_clusters, plot=False, cluster=["KMeans"]):
-        """Evaluate the graph autoencoder model.
-
-        Parameters
-        ----------
-        y : list
-            true labels.
-        n_clusters : int
-            number of clusters.
-        plot : bool optional
-            show plot or not.
-        cluster : list optional
-            clustering method.
-
-        Returns
-        -------
-        scores : dict
-            metric evaluation scores.
-
-        """
-        z = self.z
-        self.model.eval()
-
-        k_start = time.time()
-        scores = {"ae_end": k_start}
-
-        if "KMeans" in cluster:
-            kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=5, n_init=10)
-            kmeans_pred = kmeans.fit_predict(z)
-            ari_k = None
-            nmi_k = None
-            if y is not None:
-                ari_k = round(adjusted_rand_score(y, kmeans_pred), 4)
-                nmi_k = round(normalized_mutual_info_score(y, kmeans_pred), 4)
-            sil_k = silhouette_score(z, kmeans_pred)
-            cal_k = calinski_harabasz_score(z, kmeans_pred)
-            k_end = time.time()
-            scores_k = {
-                "kmeans_ari": ari_k,
-                "kmeans_nmi": nmi_k,
-                "kmeans_sil": sil_k,
-                "kmeans_cal": cal_k,
-                "kmeans_pred": kmeans_pred,
-                "kmeans_time": k_end - k_start,
-            }
-            scores = {**scores, **scores_k}
-
-        if "Leiden" in cluster:
-            l_start = time.time()
-            leiden_pred = run_leiden(z)
-            ari_l = None
-            nmi_l = None
-            if y is not None:
-                ari_l = round(adjusted_rand_score(y, leiden_pred), 4)
-                nmi_l = round(normalized_mutual_info_score(y, leiden_pred), 4)
-            sil_l = silhouette_score(z, leiden_pred)
-            cal_l = calinski_harabasz_score(z, leiden_pred)
-            l_end = time.time()
-            scores_l = {
-                "leiden_ari": ari_l,
-                "leiden_nmi": nmi_l,
-                "leiden_sil": sil_l,
-                "leiden_cal": cal_l,
-                "leiden_pred": leiden_pred,
-                "leiden_time": l_end - l_start,
-            }
-            scores = {**scores, **scores_l}
-
-        if plot:
-            pca = PCA(2).fit_transform(z)
-            plt.figure(figsize=(12, 4))
-            plt.subplot(131)
-            plt.title("Ground truth")
-            plt.scatter(pca[:, 0], pca[:, 1], c=y, s=4)
-
-            plt.subplot(132)
-            plt.title("K-Means pred")
-            plt.scatter(pca[:, 0], pca[:, 1], c=kmeans_pred, s=4)
-
-            plt.subplot(133)
-            plt.title("Leiden pred")
-            plt.scatter(pca[:, 0], pca[:, 1], c=leiden_pred, s=4)
-            plt.show()
-        return scores
-
 
 class GCNAE(nn.Module):
     """Graph convolutional autoencoder class.
 
     Parameters
     ----------
-    args : argparse.Namespace
-        a Namespace contains arguments of scDSC. For details of parameters in parser args, please refer to link
-        (parser help document).
-    agg : str
-        aggregation layer.
-    activation :str
-        activation function.
-    in_feats : int
-        dimension of input feature
-    n_hidden : int
-        number of hidden layer.
-    hidden_dim :int
-        input dimension of hidden layer 1.
-    hidden_1 : int
-        output dimension of hidden layer 1.
-    hidden_2 : int
-        output dimension of hidden layer 2.
-    dropout : float
-        dropout rate.
-    n_layers :int
-        number of graph convolutional layers.
-    hidden_relu : bool
-        use relu activation in hidden layers or not.
-    hidden_bn : bool
-        use batch norm in hidden layers or not.
+    agg
+        Aggregation layer.
+    activation
+        Activation function.
+    in_feats
+        Dimension of input features.
+    n_hidden
+        Number of hidden layers.
+    hidden_dim
+        Input dimension of hidden layer 1.
+    hidden_1
+        Output dimension of hidden layer 1.
+    hidden_2
+        Output dimension of hidden layer 2.
+    dropout
+        Dropout rate.
+    n_layers
+        Number of graph convolutional layers.
+    hidden_relu
+        Use relu activation in hidden layers or not.
+    hidden_bn
+        Use batch norm in hidden layers or not.
 
     Returns
     -------
-    adj_rec :
-        reconstructed adjacency matrix.
-    x :
-        embedding.
+    adj_rec
+        Reconstructed adjacency matrix.
+    x
+        Embedding.
 
""" - def __init__(self, args): - + def __init__( + self, + *, + agg: str, + activation: str, + in_feats: int, + n_hidden: int, + hidden_dim: int, + hidden_1: int, + hidden_2: int, + dropout: float, + n_layers: int, + hidden_relu: bool, + hidden_bn: bool, + ): super().__init__() - self.args = args - self.agg = args.agg - if args.activation == 'gelu': + self.agg = agg + + if activation == "gelu": activation = F.gelu - elif args.activation == 'prelu': + elif activation == "prelu": activation = F.prelu - elif args.activation == 'relu': + elif activation == "relu": activation = F.relu - elif args.activation == 'leaky_relu': + elif activation == "leaky_relu": activation = F.leaky_relu - if args.n_hidden == 0: + if n_hidden == 0: hidden = None - elif args.n_hidden == 1: - hidden = [args.hidden_1] - elif args.n_hidden == 2: - hidden = [args.hidden_1, args.hidden_2] + elif n_hidden == 1: + hidden = [hidden_1] + elif n_hidden == 2: + hidden = [hidden_1, hidden_2] - if args.dropout != 0: - self.dropout = nn.Dropout(p=args.dropout) + if dropout != 0: + self.dropout = nn.Dropout(p=dropout) else: self.dropout = None - self.layer1 = WeightedGraphConv(in_feats=args.in_feats, out_feats=args.hidden_dim, activation=activation) - if args.n_layers == 2: - self.layer2 = WeightedGraphConv(in_feats=args.hidden_dim, out_feats=args.hidden_dim, activation=activation) + self.layer1 = WeightedGraphConv(in_feats=in_feats, out_feats=hidden_dim, activation=activation) + if n_layers == 2: + self.layer2 = WeightedGraphConv(in_feats=hidden_dim, out_feats=hidden_dim, activation=activation) self.decoder = InnerProductDecoder(activation=lambda x: x) self.hidden = hidden if hidden is not None: enc = [] for i, s in enumerate(hidden): if i == 0: - enc.append(nn.Linear(args.hidden_dim, hidden[i])) + enc.append(nn.Linear(hidden_dim, hidden[i])) else: enc.append(nn.Linear(hidden[i - 1], hidden[i])) - if args.hidden_bn and i != len(hidden): + if hidden_bn and i != len(hidden): enc.append(nn.BatchNorm1d(hidden[i])) - if args.hidden_relu and i != len(hidden): + if hidden_relu and i != len(hidden): enc.append(nn.ReLU()) self.encoder = nn.Sequential(*enc) def forward(self, blocks, features): - x = blocks[0].srcdata['features'] + x = blocks[0].srcdata["features"] for i in range(len(blocks)): with blocks[i].local_scope(): if self.dropout is not None: x = self.dropout(x) - blocks[i].srcdata['h'] = x + blocks[i].srcdata["h"] = x if i == 0: x = self.layer1(blocks[i], x, agg=self.agg) else: @@ -399,15 +385,15 @@ class InnerProductDecoder(nn.Module): Parameters ---------- - activation : optional - activation function. - dropout : float optional - dropout rate. + activation + Activation function. + dropout + Dropout rate. Returns ------- - adj : - reconstructed adjacency matrix. + adj + Reconstructed adjacency matrix. """ @@ -430,29 +416,29 @@ def edge_selection_simple(self, edges): Parameters ---------- - edges : - edges of graph. + edges + Edges of graph. """ - return {'m': edges.src['h'] * edges.data['weight']} + return {"m": edges.src["h"] * edges.data["weight"]} def forward(self, graph, feat, weight=None, agg="sum"): with graph.local_scope(): if not self._allow_zero_in_degree: if (graph.in_degrees() == 0).any(): - raise DGLError('There are 0-in-degree nodes in the graph, ' - 'output for those nodes will be invalid. ' - 'This is harmful for some applications, ' - 'causing silent performance regression. ' - 'Adding self-loop on the input graph by ' - 'calling `g = dgl.add_self_loop(g)` will resolve ' - 'the issue. 
Setting ``allow_zero_in_degree`` ' - 'to be `True` when constructing this module will ' - 'suppress the check and let the code run.') + raise DGLError("There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run.") # (BarclayII) For RGCN on heterogeneous graphs we need to support GCN on bipartite. feat_src, feat_dst = expand_as_pair(feat, graph) - if self._norm == 'both': + if self._norm == "both": degs = graph.out_degrees().float().clamp(min=1) norm = torch.pow(degs, -0.5) shp = norm.shape + (1, ) * (feat_src.dim() - 1) @@ -461,24 +447,24 @@ def forward(self, graph, feat, weight=None, agg="sum"): if weight is not None: if self.weight is not None: - raise DGLError('External weight is provided while at the same time the' - ' module has defined its own weight parameter. Please' - ' create the module with flag weight=False.') + raise DGLError("External weight is provided while at the same time the" + " module has defined its own weight parameter. Please" + " create the module with flag weight=False.") else: weight = self.weight if weight is not None: feat_src = torch.matmul(feat_src, weight) - graph.srcdata['h'] = feat_src + graph.srcdata["h"] = feat_src if agg == "sum": - graph.update_all(self.edge_selection_simple, fn.sum(msg='m', out='h')) + graph.update_all(self.edge_selection_simple, fn.sum(msg="m", out="h")) if agg == "mean": - graph.update_all(self.edge_selection_simple, fn.mean(msg='m', out='h')) - rst = graph.dstdata['h'] - if self._norm != 'none': + graph.update_all(self.edge_selection_simple, fn.mean(msg="m", out="h")) + rst = graph.dstdata["h"] + if self._norm != "none": degs = graph.in_degrees().float().clamp(min=1) - if self._norm == 'both': + if self._norm == "both": norm = torch.pow(degs, -0.5) else: norm = 1.0 / degs @@ -504,20 +490,20 @@ def edge_selection_simple(self, edges): Parameters ---------- - edges : - edges of graph. + edges + Edges of graph. """ - number_of_edges = edges.src['h'].shape[0] + number_of_edges = edges.src["h"].shape[0] indices = np.expand_dims(np.array([self.gene_num + 1] * number_of_edges, dtype=np.int32), axis=1) - src_id, dst_id = edges.src['id'].cpu().numpy(), edges.dst['id'].cpu().numpy() + src_id, dst_id = edges.src["id"].cpu().numpy(), edges.dst["id"].cpu().numpy() indices = np.where((src_id >= 0) & (dst_id < 0), src_id, indices) # gene->cell indices = np.where((dst_id >= 0) & (src_id < 0), dst_id, indices) # cell->gene indices = np.where((dst_id >= 0) & (src_id >= 0), self.gene_num, indices) # gene-gene - h = edges.src['h'] * self.alpha[indices.squeeze()] - return {'m': h} + h = edges.src["h"] * self.alpha[indices.squeeze()] + return {"m": h} - # return {'m': h * edges.data['weight']} + # return {"m": h * edges.data["weight"]} def forward(self, graph, feat, weight=None, alpha=None, gene_num=None): self.alpha = alpha @@ -525,20 +511,20 @@ def forward(self, graph, feat, weight=None, alpha=None, gene_num=None): with graph.local_scope(): if not self._allow_zero_in_degree: if (graph.in_degrees() == 0).any(): - raise DGLError('There are 0-in-degree nodes in the graph, ' - 'output for those nodes will be invalid. ' - 'This is harmful for some applications, ' - 'causing silent performance regression. 
'
-                               'Adding self-loop on the input graph by '
-                               'calling `g = dgl.add_self_loop(g)` will resolve '
-                               'the issue. Setting ``allow_zero_in_degree`` '
-                               'to be `True` when constructing this module will '
-                               'suppress the check and let the code run.')
+                raise DGLError("There are 0-in-degree nodes in the graph, "
+                               "output for those nodes will be invalid. "
+                               "This is harmful for some applications, "
+                               "causing silent performance regression. "
+                               "Adding self-loop on the input graph by "
+                               "calling `g = dgl.add_self_loop(g)` will resolve "
+                               "the issue. Setting ``allow_zero_in_degree`` "
+                               "to be `True` when constructing this module will "
+                               "suppress the check and let the code run.")
 
             # (BarclayII) For RGCN on heterogeneous graphs we need to support GCN on bipartite.
             feat_src, feat_dst = expand_as_pair(feat, graph)
             # print(f"feat_src : {feat_src.shape}, feat_dst {feat_dst.shape}")
-            if self._norm == 'both':
+            if self._norm == "both":
                 degs = graph.out_degrees().float().clamp(min=1)
                 norm = torch.pow(degs, -0.5)
                 shp = norm.shape + (1, ) * (feat_src.dim() - 1)
@@ -547,22 +533,22 @@ def forward(self, graph, feat, weight=None, alpha=None, gene_num=None):
 
         if weight is not None:
             if self.weight is not None:
-                raise DGLError('External weight is provided while at the same time the'
-                               ' module has defined its own weight parameter. Please'
-                               ' create the module with flag weight=False.')
+                raise DGLError("External weight is provided while at the same time the"
+                               " module has defined its own weight parameter. Please"
+                               " create the module with flag weight=False.")
             else:
                 feat_src = torch.matmul(feat_src, weight)
         else:
             weight = self.weight
 
-        graph.srcdata['h'] = feat_src
-        graph.update_all(self.edge_selection_simple, fn.sum(msg='m', out='h'))
-        rst = graph.dstdata['h']
+        graph.srcdata["h"] = feat_src
+        graph.update_all(self.edge_selection_simple, fn.sum(msg="m", out="h"))
+        rst = graph.dstdata["h"]
 
-        if self._norm != 'none':
+        if self._norm != "none":
             degs = graph.in_degrees().float().clamp(min=1)
-            if self._norm == 'both':
+            if self._norm == "both":
                 norm = torch.pow(degs, -0.5)
             else:
                 norm = 1.0 / degs
@@ -584,39 +570,18 @@ def run_leiden(data):
 
     Parameters
     ----------
-    data :
-        data for leiden
+    data
+        Data for Leiden.
 
     Returns
     -------
-    pred : list
-        prediction of leiden.
+    pred
+        Predictions from Leiden clustering.
 
     """
     adata = sc.AnnData(data)
-    sc.pp.neighbors(adata, use_rep='X', n_neighbors=300, n_pcs=0)
+    sc.pp.neighbors(adata, use_rep="X", n_neighbors=300, n_pcs=0)
     sc.tl.leiden(adata)
-    pred = adata.obs['leiden'].to_list()
+    pred = adata.obs["leiden"].to_list()
     pred = [int(x) for x in pred]
     return pred
-
-
-def get_device(use_cpu=False):
-    """Get device for training.
-
-    Parameters
-    ----------
-    use_cpu : bool optional
-        use cpu or not.
-
-    Returns
-    -------
-    device :
-        torch device.
-
-    """
-    if torch.cuda.is_available() and not use_cpu:
-        device = torch.device('cuda')
-    else:
-        device = torch.device('cpu')
-    return device
diff --git a/dance/utils/__init__.py b/dance/utils/__init__.py
index 30a85b97..d60e02a3 100644
--- a/dance/utils/__init__.py
+++ b/dance/utils/__init__.py
@@ -8,6 +8,13 @@
 
 from torch.utils.data import Dataset
 
+def get_device(device: str) -> str:
+    """Resolve "auto" into an explicit device string ("cuda" if available, else "cpu")."""
+    if device == "auto":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    return device
+
+
 def hexdigest(x: str, /) -> str:
     return hashlib.md5(x.encode()).hexdigest()
diff --git a/examples/single_modality/clustering/graphsc.py b/examples/single_modality/clustering/graphsc.py
index 650dcc8c..469ff5f1 100644
--- a/examples/single_modality/clustering/graphsc.py
+++ b/examples/single_modality/clustering/graphsc.py
@@ -10,7 +10,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("-e", "--epochs", default=100, type=int)
-    parser.add_argument("-cpu", "--use_cpu", default=False, action="store_true")
+    parser.add_argument("-dv", "--device", default="auto")
     parser.add_argument("-if", "--in_feats", default=50, type=int)
     parser.add_argument("-bs", "--batch_size", default=128, type=int)
     parser.add_argument("-nw", "--normalize_weights", default="log_per_cell", choices=["log_per_cell", "per_cell"])
@@ -31,8 +31,8 @@
     parser.add_argument("-ng", "--nb_genes", type=int, default=3000)
     parser.add_argument("-nr", "--num_run", type=int, default=1)
     parser.add_argument("-nbw", "--num_workers", type=int, default=1)
-    parser.add_argument("-eve", "--eval_epoch", default=True, action="store_true")
-    parser.add_argument("-show", "--show_epoch_ari", default=False, action="store_true")
+    parser.add_argument("-eve", "--eval_epoch", action="store_true")
+    parser.add_argument("-show", "--show_epoch_ari", action="store_true")
     parser.add_argument("-plot", "--plot", default=False, action="store_true")
     parser.add_argument("-dd", "--data_dir", default="./data", type=str)
     parser.add_argument("-data", "--dataset", default="10X_PBMC",
@@ -56,13 +56,15 @@
     for run in range(args.num_run):
         set_seed(run)
-        model = GraphSC(args)
-        model.fit(graph, args.epochs, n_clusters, args.learning_rate, cluster=["KMeans", "Leiden"])
-        pred = model.predict(n_clusters, cluster=["KMeans", "Leiden"])
-        print(f"kmeans_pred (first 10): {pred.get('kmeans_pred')[:10]}")
-        print(f"leiden_pred (first 10): {pred.get('leiden_pred')[:10]}")
-        score = model.score(y, n_clusters, plot=False, cluster=["KMeans", "Leiden"])
-        print(f"kmeans_ari: {score.get('kmeans_ari')}, leiden_ari: {score.get('leiden_ari')}")
+        model = GraphSC(agg=args.agg, activation=args.activation, in_feats=args.in_feats, n_hidden=args.n_hidden,
+                        hidden_dim=args.hidden_dim, hidden_1=args.hidden_1, hidden_2=args.hidden_2,
+                        dropout=args.dropout, n_layers=args.n_layers, hidden_relu=args.hidden_relu,
+                        hidden_bn=args.hidden_bn, n_clusters=n_clusters, cluster_method="leiden",
+                        num_workers=args.num_workers, device=args.device)
+        model.fit(graph, n_epochs=args.epochs, lr=args.learning_rate, show_epoch_ari=args.show_epoch_ari,
+                  eval_epoch=args.eval_epoch)
+        score = model.score(None, y)
+        print(f"{score=:.4f}")
 
 """Reproduction information
 10X PBMC:
 python graphsc.py --dataset 10X_PBMC
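
For reference, the refactored interface can be exercised on its own as sketched below. This is a minimal, hypothetical usage sketch, not part of the patch: it assumes `graph` is a cell-gene DGLGraph produced by `GraphSC.preprocessing_pipeline` and `y` holds ground-truth labels, prepared as in the example script above, and that `score` (inherited from `BaseClusteringMethod`) returns the ARI, as the updated example implies.

    # Hypothetical end-to-end sketch of the new API; `graph` and `y` are assumed
    # to be prepared as in examples/single_modality/clustering/graphsc.py.
    from dance.modules.single_modality.clustering.graphsc import GraphSC

    model = GraphSC(in_feats=50, n_clusters=10, cluster_method="kmeans", device="auto")
    model.fit(graph, n_epochs=100, lr=1e-5, batch_size=128)  # train the graph autoencoder
    pred = model.predict()      # KMeans labels computed on the learned embedding self.z
    ari = model.score(None, y)  # ARI of predicted vs. true labels
    print(f"{ari=:.4f}")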