From 9ed2f3c60f072ba5f8e6ee3b4bff388389213f94 Mon Sep 17 00:00:00 2001
From: Remy
Date: Sat, 18 Feb 2023 09:40:20 -0500
Subject: [PATCH 1/4] move get_device to utils

---
 .../single_modality/clustering/graphsc.py     | 21 -------------------
 dance/utils/__init__.py                       |  6 ++++++
 2 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/dance/modules/single_modality/clustering/graphsc.py b/dance/modules/single_modality/clustering/graphsc.py
index 5c9b0a4f..1904ba66 100644
--- a/dance/modules/single_modality/clustering/graphsc.py
+++ b/dance/modules/single_modality/clustering/graphsc.py
@@ -599,24 +599,3 @@ def run_leiden(data):
     pred = adata.obs['leiden'].to_list()
     pred = [int(x) for x in pred]
     return pred
-
-
-def get_device(use_cpu=False):
-    """Get device for training.
-
-    Parameters
-    ----------
-    use_cpu : bool optional
-        use cpu or not.
-
-    Returns
-    -------
-    device :
-        torch device.
-
-    """
-    if torch.cuda.is_available() and not use_cpu:
-        device = torch.device('cuda')
-    else:
-        device = torch.device('cpu')
-    return device
diff --git a/dance/utils/__init__.py b/dance/utils/__init__.py
index 30a85b97..d60e02a3 100644
--- a/dance/utils/__init__.py
+++ b/dance/utils/__init__.py
@@ -8,6 +8,12 @@
 from torch.utils.data import Dataset


+def get_device(device: str) -> str:
+    if device == "auto":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    return device
+
+
 def hexdigest(x: str, /) -> str:
     return hashlib.md5(x.encode()).hexdigest()

From 92fb73317df11d061cedc48bdf1e513ac9136a26 Mon Sep 17 00:00:00 2001
From: Remy
Date: Sat, 18 Feb 2023 09:41:07 -0500
Subject: [PATCH 2/4] remove argparse namespace arg, unwrap arguments; update
 docs

---
 .../single_modality/clustering/graphsc.py     | 386 +++++++++++-------
 .../single_modality/clustering/graphsc.py     |   7 +-
 2 files changed, 237 insertions(+), 156 deletions(-)

diff --git a/dance/modules/single_modality/clustering/graphsc.py b/dance/modules/single_modality/clustering/graphsc.py
index 1904ba66..fff26873 100644
--- a/dance/modules/single_modality/clustering/graphsc.py
+++ b/dance/modules/single_modality/clustering/graphsc.py
@@ -32,6 +32,7 @@
 from dance.transforms import AnnDataTransform, Compose, SetConfig
 from dance.transforms.graph import PCACellFeatureGraph
 from dance.typing import LogLevel
+from dance.utils import get_device


 class GraphSC:
@@ -39,17 +40,69 @@ class GraphSC:

     Parameters
     ----------
-    args : argparse.Namespace
-        a Namespace contains arguments of GCNAE. For details of parameters in parser args, please refer to link
-        (parser help document).
+    agg
+        Aggregation method ("sum" or "mean").
+    activation
+        Activation function.
+    in_feats
+        Dimension of the input features.
+    n_hidden
+        Number of hidden layers (0, 1, or 2).
+    hidden_dim
+        Input dimension of hidden layer 1.
+    hidden_1
+        Output dimension of hidden layer 1.
+    hidden_2
+        Output dimension of hidden layer 2.
+    dropout
+        Dropout rate.
+    n_layers
+        Number of graph convolutional layers.
+    hidden_relu
+        Whether to use ReLU activation in the hidden layers.
+    hidden_bn
+        Whether to use batch normalization in the hidden layers.
+    num_workers
+        Number of data-loader workers.
+    device
+        Computation device to use ("auto" resolves to "cuda" when available).
""" - def __init__(self, args): + def __init__( + self, + agg: str = "sum", + activation: str = "relu", + in_feats: int = 50, + n_hidden: int = 1, + hidden_dim: int = 200, + hidden_1: int = 300, + hidden_2: int = 0, + dropout: float = 0.1, + n_layers: int = 1, + hidden_relu: bool = False, + hidden_bn: bool = False, + num_workers: int = 1, + device: str = "auto", + ): super().__init__() - self.args = args - self.device = get_device(args.use_cpu) - self.model = GCNAE(args).to(self.device) + self.n_layers = n_layers + self.num_workers = num_workers + self.device = get_device(device) + + self.model = GCNAE( + agg=agg, + activation=activation, + in_feats=in_feats, + n_hidden=n_hidden, + hidden_dim=hidden_dim, + hidden_1=hidden_1, + hidden_2=hidden_2, + dropout=dropout, + n_layers=n_layers, + hidden_relu=hidden_relu, + hidden_bn=hidden_bn, + ).to(self.device) @staticmethod def preprocessing_pipeline(n_top_genes: int = 3000, normalize_weights: str = "log_per_cell", n_components: int = 50, @@ -90,89 +143,104 @@ def preprocessing_pipeline(n_top_genes: int = 3000, normalize_weights: str = "lo return Compose(*transforms, log_level=log_level) - def fit(self, g, n_epochs, n_clusters, lr, cluster=["KMeans"]): + def fit( + self, + g, + n_epochs, + n_clusters, + lr, + cluster=["KMeans"], + batch_size: int = 128, + show_epoch_ari: bool = False, + eval_epoch: bool = False, + ): """Train graph-sc. Parameters ---------- - g : dgl.DGLGraph - input cell-gene graph. - n_epochs : int - number of epochs. - n_clusters : int - number of clusters. - lr : float - learning rate. - cluster : list optional - clustering method. + g + Input cell-gene graph. + n_epochs + Number of epochs. + n_clusters + Number of clusters. + lr + Learning rate. + cluster + Clustering method. + batch_size + Batch size. + show_epoch_ari + Show ARI score for each epoch + eval_epoch + Evaluate every epoch. 
""" g.ndata["order"] = g.ndata["label"] = g.ndata["feat_id"] train_ids = np.where(g.ndata["label"] != -1)[0] - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(self.args.n_layers) - dataloader = dgl.dataloading.NodeDataLoader(g, train_ids, sampler, batch_size=self.args.batch_size, - shuffle=True, drop_last=False, num_workers=self.args.num_workers) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(self.n_layers) + dataloader = dgl.dataloading.NodeDataLoader(g, train_ids, sampler, batch_size=batch_size, shuffle=True, + drop_last=False, num_workers=self.num_workers) - device = get_device(self.args.use_cpu) + device = get_device(self.device) optim = torch.optim.Adam(self.model.parameters(), lr=lr) losses = [] aris_kmeans = [] Z = {} - with dataloader.enable_cpu_affinity(): - for epoch in tqdm(range(n_epochs)): - self.model.train() - z = [] - y = [] - order = [] - - for input_nodes, output_nodes, blocks in dataloader: - blocks = [b.to(device) for b in blocks] - input_features = blocks[0].srcdata['features'] - g = blocks[-1] - - adj_logits, emb = self.model.forward(blocks, input_features) - z.extend(emb.detach().cpu().numpy()) - if "label" in blocks[-1].dstdata: - y.extend(blocks[-1].dstdata["label"].cpu().numpy()) - order.extend(blocks[-1].dstdata["order"].cpu().numpy()) - - adj = g.adjacency_matrix().to_dense().to(device) - adj = adj[g.dstnodes()] - pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()]) - factor = float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2) - if factor == 0: - factor = 1 - norm = adj.shape[0] * adj.shape[0] / factor - adj_logits, _ = self.model.forward(blocks, input_features) - loss = norm * BCELoss(adj_logits, adj.to(device), pos_weight=pos_weight.to(device)) - optim.zero_grad() - loss.backward() - optim.step() - losses.append(loss.item()) - - z = np.array(z) - y = np.array(y) - order = np.array(order) - order = np.argsort(order) - z = z[order] - y = y[order] - if pd.isnull(y[0]): - y = None + for epoch in tqdm(range(n_epochs)): + self.model.train() + z = [] + y = [] + order = [] + + for input_nodes, output_nodes, blocks in dataloader: + blocks = [b.to(device) for b in blocks] + input_features = blocks[0].srcdata['features'] + g = blocks[-1] + + adj_logits, emb = self.model.forward(blocks, input_features) + z.extend(emb.detach().cpu().numpy()) + if "label" in blocks[-1].dstdata: + y.extend(blocks[-1].dstdata["label"].cpu().numpy()) + order.extend(blocks[-1].dstdata["order"].cpu().numpy()) + + adj = g.adjacency_matrix().to_dense().to(device) + adj = adj[g.dstnodes()] + pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()]) + factor = float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2) + if factor == 0: + factor = 1 + norm = adj.shape[0] * adj.shape[0] / factor + adj_logits, _ = self.model.forward(blocks, input_features) + loss = norm * BCELoss(adj_logits, adj.to(device), pos_weight=pos_weight.to(device)) + optim.zero_grad() + loss.backward() + optim.step() + losses.append(loss.item()) + + z = np.array(z) + y = np.array(y) + order = np.array(order) + order = np.argsort(order) + z = z[order] + y = y[order] + if pd.isnull(y[0]): + y = None + self.z = z + + if eval_epoch: + score = self.score(y, n_clusters, cluster=cluster) + aris_kmeans.append(score["kmeans_ari"]) + if show_epoch_ari: + print(f'epoch {epoch}, ARI {score.get("kmeans_ari")}') + z_ = {f'epoch{epoch}': z} + Z = {**Z, **z_} + + elif epoch == n_epochs - 1: self.z = z - if self.args.eval_epoch: - score = self.score(y, n_clusters, 
cluster=cluster)
-                    aris_kmeans.append(score["kmeans_ari"])
-                    if self.args.show_epoch_ari:
-                        print(f'epoch {epoch}, ARI {score.get("kmeans_ari")}')
-                    z_ = {f'epoch{epoch}': z}
-                    Z = {**Z, **z_}
-
-                elif epoch == n_epochs - 1:
-                    self.z = z
-
-        if self.args.eval_epoch:
+        if eval_epoch:
             index = np.argmax(aris_kmeans)
             self.z = Z[f'epoch{index}']

@@ -181,15 +249,15 @@ def predict(self, n_clusters, cluster=["KMeans"]):

         Parameters
         ----------
-        n_clusters : int
-            number of clusters.
-        cluster : list optional
-            clustering method.
+        n_clusters
+            Number of clusters.
+        cluster
+            Clustering method.

         Returns
         -------
-        pred : dict
-            prediction of given clustering method.
+        pred
+            Prediction of the given clustering method.

         """
         z = self.z
@@ -211,19 +279,19 @@ def score(self, y, n_clusters, plot=False, cluster=["KMeans"]):

         Parameters
         ----------
-        y : list
-            true labels.
-        n_clusters : int
-            number of clusters.
-        plot : bool optional
-            show plot or not.
-        cluster : list optional
-            clustering method.
+        y
+            True labels.
+        n_clusters
+            Number of clusters.
+        plot
+            Show plot or not.
+        cluster
+            Clustering method.

         Returns
         -------
-        scores : dict
-            metric evaluation scores.
+        scores
+            Metric evaluation scores.

         """
         z = self.z
@@ -297,83 +365,93 @@ class GCNAE(nn.Module):

     Parameters
     ----------
-    args : argparse.Namespace
-        a Namespace contains arguments of scDSC. For details of parameters in parser args, please refer to link
-        (parser help document).
-    agg : str
-        aggregation layer.
-    activation :str
-        activation function.
-    in_feats : int
-        dimension of input feature
-    n_hidden : int
-        number of hidden layer.
-    hidden_dim :int
-        input dimension of hidden layer 1.
-    hidden_1 : int
-        output dimension of hidden layer 1.
-    hidden_2 : int
-        output dimension of hidden layer 2.
-    dropout : float
-        dropout rate.
-    n_layers :int
-        number of graph convolutional layers.
-    hidden_relu : bool
-        use relu activation in hidden layers or not.
-    hidden_bn : bool
-        use batch norm in hidden layers or not.
+    agg
+        Aggregation method ("sum" or "mean").
+    activation
+        Activation function.
+    in_feats
+        Dimension of the input features.
+    n_hidden
+        Number of hidden layers (0, 1, or 2).
+    hidden_dim
+        Input dimension of hidden layer 1.
+    hidden_1
+        Output dimension of hidden layer 1.
+    hidden_2
+        Output dimension of hidden layer 2.
+    dropout
+        Dropout rate.
+    n_layers
+        Number of graph convolutional layers.
+    hidden_relu
+        Whether to use ReLU activation in the hidden layers.
+    hidden_bn
+        Whether to use batch normalization in the hidden layers.

     Returns
     -------
-    adj_rec :
-        reconstructed adjacency matrix.
-    x :
-        embedding.
+    adj_rec
+        Reconstructed adjacency matrix.
+    x
+        Embedding.
""" - def __init__(self, args): - + def __init__( + self, + *, + agg: str, + activation: str, + in_feats: int, + n_hidden: int, + hidden_dim: int, + hidden_1: int, + hidden_2: int, + dropout: float, + n_layers: int, + hidden_relu: bool, + hidden_bn: bool, + ): super().__init__() - self.args = args - self.agg = args.agg - if args.activation == 'gelu': + self.agg = agg + + if activation == 'gelu': activation = F.gelu - elif args.activation == 'prelu': + elif activation == 'prelu': activation = F.prelu - elif args.activation == 'relu': + elif activation == 'relu': activation = F.relu - elif args.activation == 'leaky_relu': + elif activation == 'leaky_relu': activation = F.leaky_relu - if args.n_hidden == 0: + if n_hidden == 0: hidden = None - elif args.n_hidden == 1: - hidden = [args.hidden_1] - elif args.n_hidden == 2: - hidden = [args.hidden_1, args.hidden_2] + elif n_hidden == 1: + hidden = [hidden_1] + elif n_hidden == 2: + hidden = [hidden_1, hidden_2] - if args.dropout != 0: - self.dropout = nn.Dropout(p=args.dropout) + if dropout != 0: + self.dropout = nn.Dropout(p=dropout) else: self.dropout = None - self.layer1 = WeightedGraphConv(in_feats=args.in_feats, out_feats=args.hidden_dim, activation=activation) - if args.n_layers == 2: - self.layer2 = WeightedGraphConv(in_feats=args.hidden_dim, out_feats=args.hidden_dim, activation=activation) + self.layer1 = WeightedGraphConv(in_feats=in_feats, out_feats=hidden_dim, activation=activation) + if n_layers == 2: + self.layer2 = WeightedGraphConv(in_feats=hidden_dim, out_feats=hidden_dim, activation=activation) self.decoder = InnerProductDecoder(activation=lambda x: x) self.hidden = hidden if hidden is not None: enc = [] for i, s in enumerate(hidden): if i == 0: - enc.append(nn.Linear(args.hidden_dim, hidden[i])) + enc.append(nn.Linear(hidden_dim, hidden[i])) else: enc.append(nn.Linear(hidden[i - 1], hidden[i])) - if args.hidden_bn and i != len(hidden): + if hidden_bn and i != len(hidden): enc.append(nn.BatchNorm1d(hidden[i])) - if args.hidden_relu and i != len(hidden): + if hidden_relu and i != len(hidden): enc.append(nn.ReLU()) self.encoder = nn.Sequential(*enc) @@ -399,15 +477,15 @@ class InnerProductDecoder(nn.Module): Parameters ---------- - activation : optional - activation function. - dropout : float optional - dropout rate. + activation + Activation function. + dropout + Dropout rate. Returns ------- - adj : - reconstructed adjacency matrix. + adj + Reconstructed adjacency matrix. """ @@ -430,8 +508,8 @@ def edge_selection_simple(self, edges): Parameters ---------- - edges : - edges of graph. + edges + Edges of graph. """ return {'m': edges.src['h'] * edges.data['weight']} @@ -504,8 +582,8 @@ def edge_selection_simple(self, edges): Parameters ---------- - edges : - edges of graph. + edges + Edges of graph. """ number_of_edges = edges.src['h'].shape[0] @@ -584,13 +662,13 @@ def run_leiden(data): Parameters ---------- - data : - data for leiden + data + Aata for leiden Returns ------- - pred : list - prediction of leiden. + pred + Prediction of leiden. 
""" adata = sc.AnnData(data) diff --git a/examples/single_modality/clustering/graphsc.py b/examples/single_modality/clustering/graphsc.py index 650dcc8c..929de0b6 100644 --- a/examples/single_modality/clustering/graphsc.py +++ b/examples/single_modality/clustering/graphsc.py @@ -10,7 +10,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-e", "--epochs", default=100, type=int) - parser.add_argument("-cpu", "--use_cpu", default=False, action="store_true") + parser.add_argument("-dv", "--device", default="auto") parser.add_argument("-if", "--in_feats", default=50, type=int) parser.add_argument("-bs", "--batch_size", default=128, type=int) parser.add_argument("-nw", "--normalize_weights", default="log_per_cell", choices=["log_per_cell", "per_cell"]) @@ -56,7 +56,10 @@ for run in range(args.num_run): set_seed(run) - model = GraphSC(args) + model = GraphSC(agg=args.agg, activation=args.activation, in_feats=args.in_feats, n_hidden=args.n_hidden, + hidden_dim=args.hidden_dim, hidden_1=args.hidden_1, hidden_2=args.hidden_2, + dropout=args.dropout, n_layers=args.n_layers, hidden_relu=args.hidden_relu, + hidden_bn=args.hidden_bn, num_workers=args.num_workers, device=args.device) model.fit(graph, args.epochs, n_clusters, args.learning_rate, cluster=["KMeans", "Leiden"]) pred = model.predict(n_clusters, cluster=["KMeans", "Leiden"]) print(f"kmeans_pred (first 10): {pred.get('kmeans_pred')[:10]}") From 5171ac79d17e8e932dce181f3187865ba9d67723 Mon Sep 17 00:00:00 2001 From: Remy Date: Sat, 18 Feb 2023 09:43:50 -0500 Subject: [PATCH 3/4] use double quote --- .../single_modality/clustering/graphsc.py | 110 +++++++++--------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/dance/modules/single_modality/clustering/graphsc.py b/dance/modules/single_modality/clustering/graphsc.py index fff26873..2dd066bd 100644 --- a/dance/modules/single_modality/clustering/graphsc.py +++ b/dance/modules/single_modality/clustering/graphsc.py @@ -196,7 +196,7 @@ def fit( for input_nodes, output_nodes, blocks in dataloader: blocks = [b.to(device) for b in blocks] - input_features = blocks[0].srcdata['features'] + input_features = blocks[0].srcdata["features"] g = blocks[-1] adj_logits, emb = self.model.forward(blocks, input_features) @@ -233,8 +233,8 @@ def fit( score = self.score(y, n_clusters, cluster=cluster) aris_kmeans.append(score["kmeans_ari"]) if show_epoch_ari: - print(f'epoch {epoch}, ARI {score.get("kmeans_ari")}') - z_ = {f'epoch{epoch}': z} + print(f"epoch {epoch}, ARI {score.get('kmeans_ari')}") + z_ = {f"epoch{epoch}": z} Z = {**Z, **z_} elif epoch == n_epochs - 1: @@ -242,7 +242,7 @@ def fit( if eval_epoch: index = np.argmax(aris_kmeans) - self.z = Z[f'epoch{index}'] + self.z = Z[f"epoch{index}"] def predict(self, n_clusters, cluster=["KMeans"]): """Get predictions from the graph autoencoder model. 
@@ -416,13 +416,13 @@ def __init__( self.agg = agg - if activation == 'gelu': + if activation == "gelu": activation = F.gelu - elif activation == 'prelu': + elif activation == "prelu": activation = F.prelu - elif activation == 'relu': + elif activation == "relu": activation = F.relu - elif activation == 'leaky_relu': + elif activation == "leaky_relu": activation = F.leaky_relu if n_hidden == 0: @@ -456,12 +456,12 @@ def __init__( self.encoder = nn.Sequential(*enc) def forward(self, blocks, features): - x = blocks[0].srcdata['features'] + x = blocks[0].srcdata["features"] for i in range(len(blocks)): with blocks[i].local_scope(): if self.dropout is not None: x = self.dropout(x) - blocks[i].srcdata['h'] = x + blocks[i].srcdata["h"] = x if i == 0: x = self.layer1(blocks[i], x, agg=self.agg) else: @@ -512,25 +512,25 @@ def edge_selection_simple(self, edges): Edges of graph. """ - return {'m': edges.src['h'] * edges.data['weight']} + return {"m": edges.src["h"] * edges.data["weight"]} def forward(self, graph, feat, weight=None, agg="sum"): with graph.local_scope(): if not self._allow_zero_in_degree: if (graph.in_degrees() == 0).any(): - raise DGLError('There are 0-in-degree nodes in the graph, ' - 'output for those nodes will be invalid. ' - 'This is harmful for some applications, ' - 'causing silent performance regression. ' - 'Adding self-loop on the input graph by ' - 'calling `g = dgl.add_self_loop(g)` will resolve ' - 'the issue. Setting ``allow_zero_in_degree`` ' - 'to be `True` when constructing this module will ' - 'suppress the check and let the code run.') + raise DGLError("There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run.") # (BarclayII) For RGCN on heterogeneous graphs we need to support GCN on bipartite. feat_src, feat_dst = expand_as_pair(feat, graph) - if self._norm == 'both': + if self._norm == "both": degs = graph.out_degrees().float().clamp(min=1) norm = torch.pow(degs, -0.5) shp = norm.shape + (1, ) * (feat_src.dim() - 1) @@ -539,24 +539,24 @@ def forward(self, graph, feat, weight=None, agg="sum"): if weight is not None: if self.weight is not None: - raise DGLError('External weight is provided while at the same time the' - ' module has defined its own weight parameter. Please' - ' create the module with flag weight=False.') + raise DGLError("External weight is provided while at the same time the" + " module has defined its own weight parameter. 
Please" + " create the module with flag weight=False.") else: weight = self.weight if weight is not None: feat_src = torch.matmul(feat_src, weight) - graph.srcdata['h'] = feat_src + graph.srcdata["h"] = feat_src if agg == "sum": - graph.update_all(self.edge_selection_simple, fn.sum(msg='m', out='h')) + graph.update_all(self.edge_selection_simple, fn.sum(msg="m", out="h")) if agg == "mean": - graph.update_all(self.edge_selection_simple, fn.mean(msg='m', out='h')) - rst = graph.dstdata['h'] - if self._norm != 'none': + graph.update_all(self.edge_selection_simple, fn.mean(msg="m", out="h")) + rst = graph.dstdata["h"] + if self._norm != "none": degs = graph.in_degrees().float().clamp(min=1) - if self._norm == 'both': + if self._norm == "both": norm = torch.pow(degs, -0.5) else: norm = 1.0 / degs @@ -586,16 +586,16 @@ def edge_selection_simple(self, edges): Edges of graph. """ - number_of_edges = edges.src['h'].shape[0] + number_of_edges = edges.src["h"].shape[0] indices = np.expand_dims(np.array([self.gene_num + 1] * number_of_edges, dtype=np.int32), axis=1) - src_id, dst_id = edges.src['id'].cpu().numpy(), edges.dst['id'].cpu().numpy() + src_id, dst_id = edges.src["id"].cpu().numpy(), edges.dst["id"].cpu().numpy() indices = np.where((src_id >= 0) & (dst_id < 0), src_id, indices) # gene->cell indices = np.where((dst_id >= 0) & (src_id < 0), dst_id, indices) # cell->gene indices = np.where((dst_id >= 0) & (src_id >= 0), self.gene_num, indices) # gene-gene - h = edges.src['h'] * self.alpha[indices.squeeze()] - return {'m': h} + h = edges.src["h"] * self.alpha[indices.squeeze()] + return {"m": h} - # return {'m': h * edges.data['weight']} + # return {"m": h * edges.data["weight"]} def forward(self, graph, feat, weight=None, alpha=None, gene_num=None): self.alpha = alpha @@ -603,20 +603,20 @@ def forward(self, graph, feat, weight=None, alpha=None, gene_num=None): with graph.local_scope(): if not self._allow_zero_in_degree: if (graph.in_degrees() == 0).any(): - raise DGLError('There are 0-in-degree nodes in the graph, ' - 'output for those nodes will be invalid. ' - 'This is harmful for some applications, ' - 'causing silent performance regression. ' - 'Adding self-loop on the input graph by ' - 'calling `g = dgl.add_self_loop(g)` will resolve ' - 'the issue. Setting ``allow_zero_in_degree`` ' - 'to be `True` when constructing this module will ' - 'suppress the check and let the code run.') + raise DGLError("There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run.") # (BarclayII) For RGCN on heterogeneous graphs we need to support GCN on bipartite. feat_src, feat_dst = expand_as_pair(feat, graph) # print(f"feat_src : {feat_src.shape}, feat_dst {feat_dst.shape}") - if self._norm == 'both': + if self._norm == "both": degs = graph.out_degrees().float().clamp(min=1) norm = torch.pow(degs, -0.5) shp = norm.shape + (1, ) * (feat_src.dim() - 1) @@ -625,22 +625,22 @@ def forward(self, graph, feat, weight=None, alpha=None, gene_num=None): if weight is not None: if self.weight is not None: - raise DGLError('External weight is provided while at the same time the' - ' module has defined its own weight parameter. 
Please' - ' create the module with flag weight=False.') + raise DGLError("External weight is provided while at the same time the" + " module has defined its own weight parameter. Please" + " create the module with flag weight=False.") else: feat_src = torch.matmul(feat_src, weight) else: weight = self.weight - graph.srcdata['h'] = feat_src - graph.update_all(self.edge_selection_simple, fn.sum(msg='m', out='h')) - rst = graph.dstdata['h'] + graph.srcdata["h"] = feat_src + graph.update_all(self.edge_selection_simple, fn.sum(msg="m", out="h")) + rst = graph.dstdata["h"] - if self._norm != 'none': + if self._norm != "none": degs = graph.in_degrees().float().clamp(min=1) - if self._norm == 'both': + if self._norm == "both": norm = torch.pow(degs, -0.5) else: norm = 1.0 / degs @@ -672,8 +672,8 @@ def run_leiden(data): """ adata = sc.AnnData(data) - sc.pp.neighbors(adata, use_rep='X', n_neighbors=300, n_pcs=0) + sc.pp.neighbors(adata, use_rep="X", n_neighbors=300, n_pcs=0) sc.tl.leiden(adata) - pred = adata.obs['leiden'].to_list() + pred = adata.obs["leiden"].to_list() pred = [int(x) for x in pred] return pred From ff0ce06fc4835c4625da5301b4127203470f64bc Mon Sep 17 00:00:00 2001 From: Remy Date: Sat, 18 Feb 2023 10:19:18 -0500 Subject: [PATCH 4/4] adapt graph-sc to BaseClusteringMethod object --- .../single_modality/clustering/graphsc.py | 154 ++++-------------- .../single_modality/clustering/graphsc.py | 17 +- 2 files changed, 39 insertions(+), 132 deletions(-) diff --git a/dance/modules/single_modality/clustering/graphsc.py b/dance/modules/single_modality/clustering/graphsc.py index 2dd066bd..17d6d97d 100644 --- a/dance/modules/single_modality/clustering/graphsc.py +++ b/dance/modules/single_modality/clustering/graphsc.py @@ -9,10 +9,7 @@ """ -import time - import dgl -import matplotlib.pyplot as plt import numpy as np import pandas as pd import scanpy as sc @@ -24,18 +21,17 @@ from dgl.nn.pytorch import GraphConv from dgl.utils import expand_as_pair from sklearn.cluster import KMeans -from sklearn.decomposition import PCA -from sklearn.metrics import adjusted_rand_score, calinski_harabasz_score, normalized_mutual_info_score, silhouette_score from torch.nn.functional import binary_cross_entropy_with_logits as BCELoss from tqdm import tqdm +from dance.modules.base import BaseClusteringMethod from dance.transforms import AnnDataTransform, Compose, SetConfig from dance.transforms.graph import PCACellFeatureGraph -from dance.typing import LogLevel +from dance.typing import Any, Literal, LogLevel, Optional from dance.utils import get_device -class GraphSC: +class GraphSC(BaseClusteringMethod): """GraphSC class. Parameters @@ -62,6 +58,8 @@ class GraphSC: Use relu activation in hidden layers or not. hidden_bn Use batch norm in hidden layers or not. + cluster_method + Method for clustering. num_workers Number of workers. 
device @@ -82,11 +80,15 @@ def __init__( n_layers: int = 1, hidden_relu: bool = False, hidden_bn: bool = False, + n_clusters: int = 10, + cluster_method: Literal["kmeans", "leiden"] = "kmeans", num_workers: int = 1, device: str = "auto", ): super().__init__() self.n_layers = n_layers + self.n_clusters = n_clusters + self.cluster_method = cluster_method self.num_workers = num_workers self.device = get_device(device) @@ -145,11 +147,11 @@ def preprocessing_pipeline(n_top_genes: int = 3000, normalize_weights: str = "lo def fit( self, - g, - n_epochs, - n_clusters, - lr, - cluster=["KMeans"], + g: dgl.DGLGraph, + y: Optional[Any] = None, + *, + n_epochs: int = 100, + lr: float = 1e-5, batch_size: int = 128, show_epoch_ari: bool = False, eval_epoch: bool = False, @@ -160,14 +162,12 @@ def fit( ---------- g Input cell-gene graph. + y + Not used, for compatibility with the BaseClusteringMethod class. n_epochs Number of epochs. - n_clusters - Number of clusters. lr Learning rate. - cluster - Clustering method. batch_size Batch size. show_epoch_ari @@ -185,7 +185,7 @@ def fit( device = get_device(self.device) optim = torch.optim.Adam(self.model.parameters(), lr=lr) losses = [] - aris_kmeans = [] + aris = [] Z = {} for epoch in tqdm(range(n_epochs)): @@ -230,10 +230,10 @@ def fit( self.z = z if eval_epoch: - score = self.score(y, n_clusters, cluster=cluster) - aris_kmeans.append(score["kmeans_ari"]) + score = self.score(None, y) + aris.append(score) if show_epoch_ari: - print(f"epoch {epoch}, ARI {score.get('kmeans_ari')}") + print(f"epoch {epoch}, ARI {score}") z_ = {f"epoch{epoch}": z} Z = {**Z, **z_} @@ -241,18 +241,16 @@ def fit( self.z = z if eval_epoch: - index = np.argmax(aris_kmeans) + index = np.argmax(aris) self.z = Z[f"epoch{index}"] - def predict(self, n_clusters, cluster=["KMeans"]): + def predict(self, x: Optional[Any] = None): """Get predictions from the graph autoencoder model. Parameters ---------- - n_clusters - Number of clusters. - cluster - Clustering method. + x + Not used, for compatibility with BaseClusteringMethod class. Returns ------- @@ -260,105 +258,15 @@ def predict(self, n_clusters, cluster=["KMeans"]): Prediction of given clustering method. """ - z = self.z - pred = {} - - if "KMeans" in cluster: - kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=5, n_init=10) - kmeans_pred = {"kmeans_pred": kmeans.fit_predict(z)} - pred = {**pred, **kmeans_pred} - - if "Leiden" in cluster: - leiden_pred = {"leiden_pred": run_leiden(z)} - pred = {**pred, **leiden_pred} - + if self.cluster_method == "kmeans": + kmeans = KMeans(n_clusters=self.n_clusters, init="k-means++", random_state=5, n_init=10) + pred = kmeans.fit_predict(self.z) + elif self.cluster_method == "leiden": + pred = run_leiden(self.z) + else: + raise ValueError(f"Unknown clustering {self.cluster_method}, available options are: 'kmeans', 'leiden'") return pred - def score(self, y, n_clusters, plot=False, cluster=["KMeans"]): - """Evaluate the graph autoencoder model. - - Parameters - ---------- - y - True labels. - n_clusters - Number of clusters. - plot - Show plot or not. - cluster - Clustering method. - - Returns - ------- - scores - Metric evaluation scores. 
- - """ - z = self.z - self.model.eval() - - k_start = time.time() - scores = {"ae_end": k_start} - - if "KMeans" in cluster: - kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=5, n_init=10) - kmeans_pred = kmeans.fit_predict(z) - ari_k = None - nmi_k = None - if y is not None: - ari_k = round(adjusted_rand_score(y, kmeans_pred), 4) - nmi_k = round(normalized_mutual_info_score(y, kmeans_pred), 4) - sil_k = silhouette_score(z, kmeans_pred) - cal_k = calinski_harabasz_score(z, kmeans_pred) - k_end = time.time() - scores_k = { - "kmeans_ari": ari_k, - "kmeans_nmi": nmi_k, - "kmeans_sil": sil_k, - "kmeans_cal": cal_k, - "kmeans_pred": kmeans_pred, - "kmeans_time": k_end - k_start, - } - scores = {**scores, **scores_k} - - if "Leiden" in cluster: - l_start = time.time() - leiden_pred = run_leiden(z) - ari_l = None - nmi_l = None - if y is not None: - ari_l = round(adjusted_rand_score(y, leiden_pred), 4) - nmi_l = round(normalized_mutual_info_score(y, leiden_pred), 4) - sil_l = silhouette_score(z, leiden_pred) - cal_l = calinski_harabasz_score(z, leiden_pred) - l_end = time.time() - scores_l = { - "leiden_ari": ari_l, - "leiden_nmi": nmi_l, - "leiden_sil": sil_l, - "leiden_cal": cal_l, - "leiden_pred": leiden_pred, - "leiden_time": l_end - l_start, - } - scores = {**scores, **scores_l} - - if plot: - pca = PCA(2).fit_transform(z) - plt.figure(figsize=(12, 4)) - plt.subplot(131) - plt.title("Ground truth") - plt.scatter(pca[:, 0], pca[:, 1], c=y, s=4) - - plt.subplot(132) - plt.title("K-Means pred") - plt.scatter(pca[:, 0], pca[:, 1], c=kmeans_pred, s=4) - - plt.subplot(133) - plt.title("Leiden pred") - plt.scatter(pca[:, 0], pca[:, 1], c=leiden_pred, s=4) - plt.show() - return scores - class GCNAE(nn.Module): """Graph convolutional autoencoder class. 
diff --git a/examples/single_modality/clustering/graphsc.py b/examples/single_modality/clustering/graphsc.py
index 929de0b6..469ff5f1 100644
--- a/examples/single_modality/clustering/graphsc.py
+++ b/examples/single_modality/clustering/graphsc.py
@@ -31,8 +31,8 @@
     parser.add_argument("-ng", "--nb_genes", type=int, default=3000)
     parser.add_argument("-nr", "--num_run", type=int, default=1)
     parser.add_argument("-nbw", "--num_workers", type=int, default=1)
-    parser.add_argument("-eve", "--eval_epoch", default=True, action="store_true")
-    parser.add_argument("-show", "--show_epoch_ari", default=False, action="store_true")
+    parser.add_argument("-eve", "--eval_epoch", action="store_true")
+    parser.add_argument("-show", "--show_epoch_ari", action="store_true")
     parser.add_argument("-plot", "--plot", default=False, action="store_true")
     parser.add_argument("-dd", "--data_dir", default="./data", type=str)
    parser.add_argument("-data", "--dataset", default="10X_PBMC",
@@ -59,13 +59,12 @@
         model = GraphSC(agg=args.agg, activation=args.activation, in_feats=args.in_feats, n_hidden=args.n_hidden,
                         hidden_dim=args.hidden_dim, hidden_1=args.hidden_1, hidden_2=args.hidden_2,
                         dropout=args.dropout, n_layers=args.n_layers, hidden_relu=args.hidden_relu,
-                        hidden_bn=args.hidden_bn, num_workers=args.num_workers, device=args.device)
-        model.fit(graph, args.epochs, n_clusters, args.learning_rate, cluster=["KMeans", "Leiden"])
-        pred = model.predict(n_clusters, cluster=["KMeans", "Leiden"])
-        print(f"kmeans_pred (first 10): {pred.get('kmeans_pred')[:10]}")
-        print(f"leiden_pred (first 10): {pred.get('leiden_pred')[:10]}")
-        score = model.score(y, n_clusters, plot=False, cluster=["KMeans", "Leiden"])
-        print(f"kmeans_ari: {score.get('kmeans_ari')}, leiden_ari: {score.get('leiden_ari')}")
+                        hidden_bn=args.hidden_bn, n_clusters=n_clusters, cluster_method="leiden",
+                        num_workers=args.num_workers, device=args.device)
+        model.fit(graph, n_epochs=args.epochs, lr=args.learning_rate, show_epoch_ari=args.show_epoch_ari,
+                  eval_epoch=args.eval_epoch)
+        score = model.score(None, y)
+        print(f"{score:.4f}")
 """ Reproduction information
 10X PBMC:
 python graphsc.py --dataset 10X_PBMC
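For quick reference, a condensed sketch of the call pattern that PATCH 4/4 establishes, with graph, y, and n_clusters assumed to be prepared as in the example script above; that score returns a single ARI-style number is inferred from how the example prints it:

    from dance.modules.single_modality.clustering.graphsc import GraphSC

    model = GraphSC(in_feats=50, n_clusters=n_clusters, cluster_method="leiden", device="auto")
    model.fit(graph, n_epochs=100, lr=1e-5)  # y is optional and only consulted when eval_epoch=True
    pred = model.predict()                   # one cluster label per cell, from the stored embedding
    score = model.score(None, y)             # evaluated against the ground-truth labels y

One detail in the fit loop that is easy to misread is the reweighting of the adjacency-reconstruction loss. A toy sketch on a 3-node dense adjacency matrix, mirroring the pos_weight and norm formulas from the training loop above (the random logits stand in for the inner-product decoder output):

    import torch
    from torch.nn.functional import binary_cross_entropy_with_logits as BCELoss

    adj = torch.tensor([[0., 1., 0.],
                        [1., 0., 1.],
                        [0., 1., 0.]])
    n = adj.shape[0]
    # Up-weight the rare positive edges relative to the many zero entries.
    pos_weight = torch.Tensor([float(n * n - adj.sum()) / adj.sum()])  # (9 - 4) / 4 = 1.25
    factor = float((n * n - adj.sum()) * 2)  # 10.0
    if factor == 0:  # guard against a fully dense block, as in the training loop
        factor = 1
    norm = n * n / factor  # 0.9
    adj_logits = torch.randn(n, n)
    loss = norm * BCELoss(adj_logits, adj, pos_weight=pos_weight)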