diff --git a/dance/modules/spatial/cell_type_deconvo/__init__.py b/dance/modules/spatial/cell_type_deconvo/__init__.py
index 2276b403..605033e3 100644
--- a/dance/modules/spatial/cell_type_deconvo/__init__.py
+++ b/dance/modules/spatial/cell_type_deconvo/__init__.py
@@ -5,7 +5,7 @@
 
 __all__ = [
     "Card",
-    "DSTGLearner",
+    "DSTG",
     "SpatialDecon",
     "SPOTlight",
 ]
diff --git a/dance/modules/spatial/cell_type_deconvo/card.py b/dance/modules/spatial/cell_type_deconvo/card.py
index 265fa0f2..6d0780cb 100644
--- a/dance/modules/spatial/cell_type_deconvo/card.py
+++ b/dance/modules/spatial/cell_type_deconvo/card.py
@@ -111,35 +111,36 @@ def CARDref(Xinput, U, W, phi, max_iter, epsilon, V, b, sigma_e2, Lambda):
 
 
 class Card:
-    """The CARD cell-type deconvolution model.
-
-    Parameters
-    ----------
-    sc_count : pd.DataFrame
-        Reference single cell RNA-seq counts data.
-    sc_meta : pd.DataFrame
-        Reference cell-type label information.
-    ct_varname : str, optional
-        Name of the cell-types column.
-    ct_select : str, optional
-        Selected cell-types to be considered for deconvolution.
-    cell_varname : str, optional
-        Name of the cells column.
-    sample_varname : str, optional
-        Name of the samples column.
-    minCountGene : int
-        Minimum number of genes required.
-    minCountSpot : int
-        Minimum number of spots required.
-    basis
-        The basis parameter.
-    markers
-        Markers.
-
-    """
+    """The CARD cell-type deconvolution model."""
 
     def __init__(self, sc_count, sc_meta, ct_varname=None, ct_select=None, cell_varname=None, sample_varname=None,
                  minCountGene=100, minCountSpot=5, basis=None, markers=None):
+        """Initialize Card.
+
+        Parameters
+        ----------
+        sc_count : pd.DataFrame
+            Reference single cell RNA-seq counts data.
+        sc_meta : pd.DataFrame
+            Reference cell-type label information.
+        ct_varname : str, optional
+            Name of the cell-types column.
+        ct_select : str, optional
+            Selected cell-types to be considered for deconvolution.
+        cell_varname : str, optional
+            Name of the cells column.
+        sample_varname : str, optional
+            Name of the samples column.
+        minCountGene : int
+            Minimum number of genes required.
+        minCountSpot : int
+            Minimum number of spots required.
+        basis
+            The basis parameter.
+        markers
+            Markers.
+
+        """
         self.sc_count = sc_count
         self.sc_meta = sc_meta
         self.ct_varname = ct_varname
diff --git a/dance/modules/spatial/cell_type_deconvo/spatialdecon.py b/dance/modules/spatial/cell_type_deconvo/spatialdecon.py
index c320b365..24bb4440 100644
--- a/dance/modules/spatial/cell_type_deconvo/spatialdecon.py
+++ b/dance/modules/spatial/cell_type_deconvo/spatialdecon.py
@@ -77,31 +77,32 @@ def cell_topic_profile(X, groups, ct_select, axis=0, method='median'):
 
 
 class SpatialDecon:
-    """SpatialDecon.
-
-    Parameters
-    ----------
-    sc_count : pd.DataFrame
-        Reference single cell RNA-seq counts data.
-    sc_annot : pd.DataFrame
-        Reference cell-type label information.
-    mix_count : pd.DataFrame
-        Target mixed-cell RNA-seq counts data to be deconvoluted.
-    ct_varname : str, optional
-        Name of the cell-types column.
-    ct_select : str, optional
-        Selected cell-types to be considered for deconvolution.
-    sc_profile: numpy array optional
-        Pre-constructed cell profile matrix.
-    bias : boolean optional
-        Include bias term, default False.
-    init_bias: numpy array optional
-        Initial bias term (background estimate).
-
-    """
+    """SpatialDecon."""
 
     def __init__(self, sc_count, sc_annot, ct_varname, ct_select, sc_profile=None, bias=False, init_bias=None,
                  device="cpu"):
+        """Initialize SpatialDecon.
+
+        Parameters
+        ----------
+        sc_count : pd.DataFrame
+            Reference single cell RNA-seq counts data.
+        sc_annot : pd.DataFrame
+            Reference cell-type label information.
+        ct_varname : str
+            Name of the cell-types column.
+        ct_select : str
+            Selected cell-types to be considered for deconvolution.
+        sc_profile : np.ndarray, optional
+            Pre-constructed cell profile matrix.
+        bias : bool
+            Include bias term, default False.
+        init_bias : np.ndarray, optional
+            Initial bias term (background estimate).
+        device : str
+            Computation device, default "cpu".
+
+        """
         super().__init__()
 
         self.device = device
@@ -199,9 +200,9 @@ def score(self, pred, true):
 
         Parameters
         ----------
-        pred :
+        pred
             Predicted cell-type proportions.
-        true :
+        true
             True cell-type proportions.
 
         Returns
diff --git a/dance/modules/spatial/cell_type_deconvo/spotlight.py b/dance/modules/spatial/cell_type_deconvo/spotlight.py
index 5b792392..223956f8 100644
--- a/dance/modules/spatial/cell_type_deconvo/spotlight.py
+++ b/dance/modules/spatial/cell_type_deconvo/spotlight.py
@@ -49,20 +49,21 @@ def cell_topic_profile(x, groups, ct_select, axis=0, method="median"):
 
 
 class NNLS(nn.Module):
-    """NNLS.
+    """NNLS."""
 
-    Parameters
-    ----------
-    in_dim : int
-        Input dimension.
-    out_dim : int
-        Output dimension.
-    bias : bool
-        Include bias term, default False.
+    def __init__(self, in_dim, out_dim, bias=False, init_bias=None, device="cpu"):
+        """Initialize NNLS.
 
-    """
+        Parameters
+        ----------
+        in_dim : int
+            Input dimension.
+        out_dim : int
+            Output dimension.
+        bias : bool
+            Include bias term, default False.
 
-    def __init__(self, in_dim, out_dim, bias=False, init_bias=None, device="cpu"):
+        """
         super().__init__()
         self.device = device
         self.model = nn.Linear(in_features=in_dim, out_features=out_dim, bias=bias)
@@ -116,35 +117,36 @@ def fit(self, x, y, max_iter, lr, print_res=False, print_period=100):
 
 
 class SPOTlight:
-    """SPOTlight class.
-
-    Parameters
-    ----------
-    ref_count : pd.DataFrame
-        Reference single cell RNA-seq counts data (cell x gene).
-    ref_annot : pd.DataFrame
-        Reference cell-type label information.
-    mix_count : pd.DataFrame
-        Target mixed-cell RNA-seq counts data to be deconvoluted.
-    ct_varname : str
-        Name of the cell-types column.
-    ct_select : str
-        Selected cell-types to be considered for deconvolution.
-    rank : int
-        Rank of the matrix factorization.
-    sc_profile: np.ndarray
-        Pre-constructed cell profile matrix.
-    bias : bool
-        Include bias term, default False.
-    init_bias: np.ndarray
-        Initial bias term (background estimate).
-    init : str
-        Initialization method for matrix factorization solver (see NMF from sklearn).
-
-    """
+    """SPOTlight."""
 
     def __init__(self, ref_count, ref_annot, ct_varname, ct_select, rank=2, sc_profile=None, bias=False, init_bias=None,
                  init="random", device="cpu"):
+        """Initialize SPOTlight.
+
+        Parameters
+        ----------
+        ref_count : pd.DataFrame
+            Reference single cell RNA-seq counts data (cell x gene).
+        ref_annot : pd.DataFrame
+            Reference cell-type label information.
+        ct_varname : str
+            Name of the cell-types column.
+        ct_select : str
+            Selected cell-types to be considered for deconvolution.
+        rank : int
+            Rank of the matrix factorization.
+        sc_profile : np.ndarray
+            Pre-constructed cell profile matrix.
+        bias : bool
+            Include bias term, default False.
+        init_bias : np.ndarray
+            Initial bias term (background estimate).
+        init : str
+            Initialization method for matrix factorization solver (see NMF from sklearn).
+        device : str
+            Computation device, default "cpu".
+
+        """
         super().__init__()
         self.device = device
         self.bias = bias
diff --git a/dance/modules/spatial/spatial_domain/louvain.py b/dance/modules/spatial/spatial_domain/louvain.py
index f919e3ad..a9f17b50 100644
--- a/dance/modules/spatial/spatial_domain/louvain.py
+++ b/dance/modules/spatial/spatial_domain/louvain.py
@@ -175,7 +175,7 @@ def modularity(partition, graph, weight="weight"):
        and values the communities
     graph : networkx.Graph
        the networkx graph which is decomposed
-    weight : str, optional
+    weight : str
         the key in graph to use as weight. Default to "weight"
 
 
@@ -245,20 +245,20 @@ def best_partition(graph, partition=None, weight="weight", resolution=1., random
     ----------
     graph : networkx.Graph
        the networkx graph which is decomposed
-    partition : dict, optional
+    partition : dict
        the algorithm will start using this partition of the nodes.
        It's a dictionary where keys are their nodes and values the communities
-    weight : str, optional
+    weight : str
         the key in graph to use as weight. Default to "weight"
-    resolution :  double, optional
+    resolution :  double
         Will change the size of the communities, default to 1.
         represents the time described in
         "Laplacian Dynamics and Multiscale Modular Structure in Networks",
         R. Lambiotte, J.-C. Delvenne, M. Barahona
-    randomize : boolean, optional
+    randomize : boolean
         Will randomize the node evaluation order and the community evaluation
         order to get different partitions at each call
-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance or None
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
         If None, the random number generator is the RandomState instance used
@@ -335,21 +335,21 @@ def __init__(self, resolution: float = 1):
         self.resolution = resolution
 
     def fit(self, adj, partition=None, weight="weight", randomize=None, random_state=None):
-        """fit function for model training.
+        """Fit function for model training.
 
         Parameters
         ----------
         adj :
             adjacent matrix.
-        partition : dict optional
+        partition : dict
             a dictionary where keys are graph nodes and values the part the node
             belongs to
-        weight : str, optional
+        weight : str
             the key in graph to use as weight. Default to "weight"
-        randomize : boolean, optional
+        randomize : boolean
             Will randomize the node evaluation order and the community evaluation
             order to get different partitions at each call
-        random_state : int, RandomState instance or None, optional (default=None)
+        random_state : int, RandomState instance or None
             If int, random_state is the seed used by the random number generator;
             If RandomState instance, random_state is the random number generator;
             If None, the random number generator is the RandomState instance used
@@ -368,31 +368,23 @@ def fit(self, adj, partition=None, weight="weight", randomize=None, random_state
         print("fit over ")
 
     def predict(self):
-        """prediction function.
-        Parameters
-        ----------
-
-        Returns
-        -------
-        self.predict_result :
-            predicted label.
-
-        """
+        """Prediction function."""
         self.predict_result = partition_at_level(self.dendo, len(self.dendo) - 1)
         self.y_pred = self.predict_result
         return self.predict_result
 
     def score(self, y_true):
-        """score function to get score of prediction.
+        """Score function to evaluate the prediction performance.
+
         Parameters
         ----------
-        y_true :
-            ground truth label.
+        y_true
+            Ground truth label.
 
         Returns
         -------
-        score : float
-            metric eval score.
+        float
+            Evaluation score.
 
         """
         pred_val = []
diff --git a/dance/modules/spatial/spatial_domain/spagcn.py b/dance/modules/spatial/spatial_domain/spagcn.py
index 8bcee803..30a189ab 100644
--- a/dance/modules/spatial/spatial_domain/spagcn.py
+++ b/dance/modules/spatial/spatial_domain/spagcn.py
@@ -19,10 +19,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-from scipy.sparse import issparse
 from sklearn.cluster import KMeans
-from sklearn.decomposition import PCA
-# from models import GraphConvolution
 from torch.nn.parameter import Parameter
 
 from dance import utils
@@ -37,20 +34,19 @@ def refine(sample_id, pred, dis, shape="hexagon"):
 
     Parameters
     ----------
-    sample_id :
-        sample id
-    pred :
-        initial prediction
-    dis :
-        graph structure
-    shape : str optional
-        by default as "hexagon"
-
+    sample_id
+        Sample id.
+    pred
+        Initial prediction.
+    dis
+        Graph structure.
+    shape : str
+        Shape parameter, by default "hexagon".
 
     Returns
     -------
-    refined_pred :
-        refined prediction.
+    refined_pred
+        Refined predictions.
 
     """
     refined_pred = []
@@ -105,53 +101,34 @@ def forward(self, input, adj):
             return output
 
     def __repr__(self):
-        return self.__class__.__name__ + ' (' \
-               + str(self.in_features) + ' -> ' \
-               + str(self.out_features) + ')'
+        return f"{self.__class__.__name__}({self.in_features} -> {self.out_features})"
 
 
 class SimpleGCDEC(nn.Module):
-    """Basic model used in SpaGCN training.
-
-    Parameters
-    ----------
-    nfeat : int
-        input feature dimension
+    """Basic model used in SpaGCN training."""
 
-    nhid : int
-        output feature dimension
+    def __init__(self, nfeat, nhid, alpha=0.2, device="cpu"):
+        """Initialize SimpleGCDEC.
 
-    alpha : float optional
-        alpha, by default as 0.2
-
-    """
+        Parameters
+        ----------
+        nfeat : int
+            Input feature dimension.
+        nhid : int
+            Output feature dimension.
+        alpha : float
+            Alpha parameter used when computing the soft cluster assignment q.
 
-    def __init__(self, nfeat, nhid, alpha=0.2):
+        """
         super().__init__()
         self.gc = GraphConvolution(nfeat, nhid)
         self.nhid = nhid
-        #self.mu determined by the init method
+        # self.mu is determined by the init method
         self.alpha = alpha
+        self.device = device
 
     def forward(self, x, adj):
-        """forward function.
-
-        Parameters
-        ----------
-        x :
-            node features.
-        adj :
-            adjacent matrix.
-
-
-        Returns
-        -------
-        x :
-            the output of graph convolution layer.
-        q :
-            the probability of assigning cell i to cluster j.
-
-        """
+        """Forward function."""
         x = self.gc(x, adj)
         q = 1.0 / ((1.0 + torch.sum((x.unsqueeze(1) - self.mu)**2, dim=2) / self.alpha) + 1e-8)
         q = q**(self.alpha + 1.0) / 2.0
@@ -159,22 +136,7 @@ def forward(self, x, adj):
         return x, q
 
     def loss_function(self, p, q):
-        """objective function as a Kullback–Leibler (KL) divergence loss.
-
-        Parameters
-        ----------
-        p :
-            target distribution.
-        q :
-            the probability of assigning cell i to cluster j.
-
-
-        Returns
-        -------
-        loss :
-            Kullback–Leibler (KL) divergence loss.
-
-        """
+        """Objective function as a Kullback–Leibler (KL) divergence loss."""
 
         def kld(target, pred):
             return torch.mean(torch.sum(target * torch.log(target / (pred + 1e-6)), dim=1))
@@ -183,18 +145,18 @@ def kld(target, pred):
         return loss
 
     def target_distribution(self, q):
-        """generate an auxiliary target distribution based on q the probability of
+        """Generate an auxiliary target distribution based on q the probability of
         assigning cell i to cluster j.
 
         Parameters
         ----------
-        q :
-            the probability of assigning cell i to cluster j.
+        q
+            The probability of assigning cell i to cluster j.
 
         Returns
         -------
-        p :
-            target distribution.
+        p
+            Target distribution.
 
         """
         p = q**2 / torch.sum(q, dim=0)
@@ -202,43 +164,39 @@ def target_distribution(self, q):
         return p
 
     def fit(self, X, adj, lr=0.001, max_epochs=5000, update_interval=3, trajectory_interval=50, weight_decay=5e-4,
-            opt="sgd", init="louvain", n_neighbors=10, res=0.4, n_clusters=10, init_spa=True, tol=1e-3, device="cuda"):
-        """fit function for model training.
+            opt="sgd", init="louvain", n_neighbors=10, res=0.4, n_clusters=10, init_spa=True, tol=1e-3):
+        """Fit function for model training.
 
         Parameters
         ----------
-        X :
-            node features.
-        adj :
-            adjacent matrix.
-        lr : float optional
-            learning rate.
-        max_epochs : int optional
-            max epochs.
-        update_interval: int optional
-            interval for update
-        trajectory_interval: int optional
-            trajectory interval
-        weight_decay : float optional
-            weight decay.
-        opt : str optional
-            optimization.
-        init : str optional
+        X
+            Node features.
+        adj
+            Adjacent matrix.
+        lr : float
+            Learning rate.
+        max_epochs : int
+            Maximum number of epochs.
+        update_interval : int
+            Interval for update.
+        trajectory_interval : int
+            Trajectory interval.
+        weight_decay : float
+            Weight decay.
+        opt : str
+            Optimizer.
+        init : str
             "louvain" or "kmeans".
-        n_neighbors : int optional
-            the number of neighbors used in louvain.
-        res : float optional
-            used for louvain .
-        n_clusters : int optional
-            the number of clusters usedd in kmeans.
-        init_spa : bool optional
-            initialize spatial.
-        tol : float optional
-            tolerant value for searching l.
-
-        Returns
-        -------
-        None.
+        n_neighbors : int
+            The number of neighbors used in louvain.
+        res : float
+            Used for louvain.
+        n_clusters : int
+            The number of clusters used in kmeans.
+        init_spa : bool
+            Initialize spatial.
+        tol : float
+            Tolerance value for searching l.
 
         """
         self.trajectory = []
@@ -248,17 +206,17 @@ def fit(self, X, adj, lr=0.001, max_epochs=5000, update_interval=3, trajectory_i
             optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
 
         features = self.gc(torch.FloatTensor(X), torch.FloatTensor(adj))
-        #----------------------------------------------------------------
+        # ---------------------------------------------------------------------
         if init == "kmeans":
             print("Initializing cluster centers with kmeans, n_clusters known")
             self.n_clusters = n_clusters
             kmeans = KMeans(self.n_clusters, n_init=20)
             if init_spa:
-                #------Kmeans use exp and spatial
+                # Kmeans using both expression and spatial information
                 y_pred = kmeans.fit_predict(features.detach().numpy())
             else:
-                #------Kmeans only use exp info, no spatial
-                y_pred = kmeans.fit_predict(X)  #Here we use X as numpy
+                # Kmeans using only expression information
+                y_pred = kmeans.fit_predict(X)  # use X as numpy
         elif init == "louvain":
             print("Initializing cluster centers with louvain, resolution = ", res)
             if init_spa:
@@ -271,7 +229,7 @@ def fit(self, X, adj, lr=0.001, max_epochs=5000, update_interval=3, trajectory_i
 
             y_pred = adata.obs['louvain'].astype(int).to_numpy()
             self.n_clusters = len(np.unique(y_pred))
-        #----------------------------------------------------------------
+        # ---------------------------------------------------------------------
         y_pred_last = y_pred
         self.mu = Parameter(torch.Tensor(self.n_clusters, self.nhid))
         X = torch.FloatTensor(X)
@@ -281,17 +239,10 @@ def fit(self, X, adj, lr=0.001, max_epochs=5000, update_interval=3, trajectory_i
         Group = pd.Series(y_pred, index=np.arange(0, features.shape[0]), name="Group")
         Mergefeature = pd.concat([features, Group], axis=1)
         cluster_centers = np.asarray(Mergefeature.groupby("Group").mean())
-
-        # judge have or no cuda device in torch
-        if torch.cuda.is_available():
-            if device == "cuda":
-                device = torch.device("cuda")
-        else:
-            device = torch.device("cpu")
-
         self.mu.data.copy_(torch.Tensor(cluster_centers))
 
-        # copy data and model in cuda
+        # Move data and model to the configured device
+        device = self.device
         self = self.to(device)
         X = X.to(device)
         adj = adj.to(device)
@@ -311,7 +262,7 @@ def fit(self, X, adj, lr=0.001, max_epochs=5000, update_interval=3, trajectory_i
             if epoch % trajectory_interval == 0:
                 self.trajectory.append(torch.argmax(q, dim=1).data.cpu().numpy())
 
-            #Check stop criterion
+            # Check stop criterion
             y_pred = torch.argmax(q, dim=1).data.detach().cpu().numpy()
             delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / X.shape[0]
             y_pred_last = y_pred
@@ -321,7 +272,7 @@ def fit(self, X, adj, lr=0.001, max_epochs=5000, update_interval=3, trajectory_i
                 print("Total epoch:", epoch)
                 break
 
-        # recover model and data in cpu
+        # Move model and data back to the CPU
         self = self.cpu()
         X = X.cpu()
         adj = adj.cpu()
@@ -342,14 +293,8 @@ def fit_with_init(self, X, adj, init_y, lr=0.001, max_epochs=5000, update_interv
         cluster_centers = np.asarray(Mergefeature.groupby("Group").mean())
         self.mu.data.copy_(torch.Tensor(cluster_centers))
 
-        # judge have or no cuda device in torch
-        if torch.cuda.is_available():
-            if device == "cuda":
-                device = torch.device("cuda")
-        else:
-            device = torch.device("cpu")
-
-        # copy data and model in cuda
+        # Move data and model to the configured device
+        device = self.device
         self = self.to(device)
         X = X.to(device)
         adj = adj.to(device)
@@ -366,7 +311,8 @@ def fit_with_init(self, X, adj, init_y, lr=0.001, max_epochs=5000, update_interv
             loss = self.loss_function(p, q)
             loss.backward()
             optimizer.step()
-        # recover model and data in cpu
+
+        # Move model and data back to the CPU
         self = self.cpu()
         X = X.cpu()
         adj = adj.cpu()
@@ -409,8 +355,6 @@ def kld(target, pred):
         return loss
 
     def target_distribution(self, q):
-        #weight = q ** 2 / q.sum(0)
-        #return torch.transpose((torch.transpose(weight,0,1) / weight.sum(1)),0,1)e
         p = q**2 / torch.sum(q, dim=0)
         p = p / torch.sum(p, dim=1, keepdim=True)
         return p
@@ -425,20 +369,18 @@ def fit(self, X, adj, lr=0.001, max_epochs=10, update_interval=5, weight_decay=5
             optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
 
         features, _ = self.forward(torch.FloatTensor(X), torch.FloatTensor(adj))
-        #----------------------------------------------------------------
+        # ---------------------------------------------------------------------
         if init == "kmeans":
-            #Kmeans only use exp info, no spatial
-            #kmeans = KMeans(self.n_clusters, n_init=20)
-            #y_pred = kmeans.fit_predict(X)  #Here we use X as numpy
-            #Kmeans use exp and spatial
+            # Kmeans using both expression and spatial information
             kmeans = KMeans(self.n_clusters, n_init=20)
             y_pred = kmeans.fit_predict(features.detach().numpy())
         elif init == "louvain":
+            # Louvain on the graph-convolved features (expression and spatial information)
             adata = sc.AnnData(features.detach().numpy())
             sc.pp.neighbors(adata, n_neighbors=n_neighbors)
             sc.tl.louvain(adata, resolution=res)
             y_pred = adata.obs['louvain'].astype(int).to_numpy()
-        #----------------------------------------------------------------
+        # ---------------------------------------------------------------------
         X = torch.FloatTensor(X)
         adj = torch.FloatTensor(adj)
         self.trajectory.append(y_pred)
@@ -515,17 +457,17 @@ def search_l(self, p, adj, start=0.01, end=1000, tol=0.01, max_run=100):
         Parameters
         ----------
         p : float
-            percentage.
+            Percentage.
         adj :
-            adjacent matrix.
-        start : float optional
-            starting value for searching l.
-        end : float optional
-            ending value for searching l.
-        tol : float optional
-            tolerant value for searching l.
-        max_run : int optional
-            Max runs.
+            Adjacent matrix.
+        start : float
+            Starting value for searching l.
+        end : float
+            Ending value for searching l.
+        tol : float
+            Tolerant value for searching l.
+        max_run : int
+            Maximum number of runs.
 
         Returns
         -------
@@ -537,27 +479,19 @@ def search_l(self, p, adj, start=0.01, end=1000, tol=0.01, max_run=100):
         return l
 
     def set_l(self, l):
-        """set l.
+        """Set l.
 
         Parameters
         ----------
         l : float
-            the parameter to control percentage p.
-
-        Returns
-        -------
-        None.
+            The parameter to control percentage p.
 
         """
         self.l = l
 
     def search_set_res(self, embed, adj, l, target_num, start=0.4, step=0.1, tol=5e-3, lr=0.05, max_epochs=10,
                        r_seed=100, t_seed=100, n_seed=100, max_run=10):
-        """search res.
-
-        res: Resolution in the initial Louvain's Clustering methods.
-
-        """
+        """Search for optimal resolution parameter."""
         random.seed(r_seed)
         torch.manual_seed(t_seed)
         np.random.seed(n_seed)
@@ -603,55 +537,38 @@ def search_set_res(self, embed, adj, l, target_num, start=0.4, step=0.1, tol=5e-
         self.res = res
         return res
 
-    def fit(
-            self,
-            embed,
-            adj,
-            num_pcs=50,
-            lr=0.005,
-            max_epochs=2000,
-            weight_decay=0,
-            opt="admin",
-            init_spa=True,
-            init="louvain",  #louvain or kmeans
-            n_neighbors=10,  #for louvain
-            n_clusters=None,  #for kmeans
-            res=0.4,  #for louvain
-            tol=1e-3):
-        """fit function for model training.
+    def fit(self, embed, adj, num_pcs=50, lr=0.005, max_epochs=2000, weight_decay=0, opt="admin", init_spa=True,
+            init="louvain", n_neighbors=10, n_clusters=None, res=0.4, tol=1e-3):
+        """Fit function for model training.
 
         Parameters
         ----------
-        embed :
-            input data.
-        adj :
-            adjacent matrix.
+        embed
+            Input data.
+        adj
+            Adjacent matrix.
         num_pcs : int
-            the number of component used in PCA.
+            The number of components used in PCA.
         lr : float
-            learning rate.
+            Learning rate.
         max_epochs : int
-            max epochs.
+            Maximum number of epochs.
         weight_decay : float
-            weight decay.
+            Weight decay.
         opt : str
-            optimization.
+            Optimizer.
         init_spa : bool
-            initialize spatial.
+            Initialize spatial.
         init : str
             "louvain" or "kmeans".
         n_neighbors : int
-            the number of neighbors used in louvain.
+            The number of neighbors used by Louvain.
         n_clusters : int
-            the number of clusters usedd in kmeans.
+            The number of clusters used by kmeans.
         res : float
-            used for louvain .
+            The resolution parameter used by Louvain.
         tol : float
-            tolerant value for searching l.
-
-        Returns
-        -------
-        None.
+            Tolerance value for searching l.
 
         """
         self.num_pcs = num_pcs
@@ -678,17 +595,12 @@ def fit(
         self.adj_exp = adj_exp
 
     def predict(self):
-        """prediction function.
-
-        Parameters
-        ----------
+        """Prediction function.
 
         Returns
         -------
-        y_pred : numpy
-            predicted label.
-        prob : numpy
-            predicted probability.
+        Tuple[np.ndarray, np.ndarray]
+            The predicted labels and the predicted probabilities.
 
         """
         z, q = self.model.predict(self.embed, self.adj_exp)
@@ -699,7 +611,7 @@ def predict(self):
         return y_pred, prob
 
     def score(self, y_true):
-        """score function to get score of prediction.
+        """Score function to evaluate the prediction performance.
 
         Parameters
         ----------
diff --git a/dance/modules/spatial/spatial_domain/stagate.py b/dance/modules/spatial/spatial_domain/stagate.py
index 11c8e90f..bdb74c39 100644
--- a/dance/modules/spatial/spatial_domain/stagate.py
+++ b/dance/modules/spatial/spatial_domain/stagate.py
@@ -157,16 +157,17 @@ def __repr__(self):
 
 
 class Stagate(torch.nn.Module):
-    """Stagate class.
+    """Stagate class."""
 
-    Parameters
-    ----------
-    hidden_dims : int
-        hidden dimensions
+    def __init__(self, hidden_dims):
+        """Initialize Stagate.
 
-    """
+        Parameters
+        ----------
+        hidden_dims : int
+            Hidden dimensions.
 
-    def __init__(self, hidden_dims):
+        """
         super().__init__()
 
         [in_dim, num_hidden, out_dim] = hidden_dims
@@ -213,27 +214,27 @@ def fit(self, adata, graph, n_epochs=1, lr=0.001, key_added="STAGATE", gradient_
             Input data.
         graph :
             Graph structure.
-        n_epochs : int optional
+        n_epochs : int
             Number of epochs.
-        lr : float optional
+        lr : float
             Learning rate.
-        key_added : str optional
+        key_added : str
             Default "STAGATE".
-        gradient_clipping : float optional
+        gradient_clipping : float
             Gradient clipping.
-        pre_resolution : float optional
+        pre_resolution : float
             Pre-resolution.
-        weight_decay : float optional
+        weight_decay : float
             Weight decay.
-        verbose : bool optional
+        verbose : bool
             Verbosity, by default to be True.
-        random_seed : int optional
+        random_seed : int
             Random seed.
-        save_loss : bool optional
+        save_loss : bool
             Whether to save loss or not.
-        save_reconstrction : bool optional
+        save_reconstrction : bool
             Whether to save reconstruction or not.
-        device : str optional
+        device : str
             Computation device.
 
         """
diff --git a/dance/modules/spatial/spatial_domain/stlearn.py b/dance/modules/spatial/spatial_domain/stlearn.py
index d6a8f00b..5147f610 100644
--- a/dance/modules/spatial/spatial_domain/stlearn.py
+++ b/dance/modules/spatial/spatial_domain/stlearn.py
@@ -16,37 +16,38 @@
 
 
 class StKmeans:
-    """StKmeans class.
-
-    Parameters
-    ----------
-    n_clusters : int optional
-        The number of clusters to form as well as the number of centroids to generate.
-    init : str optional
-        Method for initialization: {‘k-means++’, ‘random’}.
-    n_init : int optional
-        Number of time the k-means algorithm will be run with different centroid seeds.
-        The final results will be the best output of n_init consecutive runs in terms of inertia.
-    max_iter : int optional
-        Maximum number of iterations of the k-means algorithm for a single run.
-    tol : float optional
-        Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive
-        iterations to declare convergence.
-    algorithm : str optional
-        {“lloyd”, “elkan”, “auto”, “full”}, default is "auto".
-    verbose : bool optional
-        Verbosity.
-    random_state : int optional
-        Determines random number generation for centroid initialization.
-    use_data : str optional
-        Default "X_pca".
-    key_added : str optional
-        Default "X_pca_kmeans".
-
-    """
+    """StKmeans class."""
 
     def __init__(self, n_clusters=19, init="k-means++", n_init=10, max_iter=300, tol=1e-4, algorithm="auto",
                  verbose=False, random_state=None, use_data="X_pca", key_added="X_pca_kmeans"):
+        """Initialize StKmeans.
+
+        Parameters
+        ----------
+        n_clusters : int
+            The number of clusters to form as well as the number of centroids to generate.
+        init : str
+            Method for initialization: {‘k-means++’, ‘random’}.
+        n_init : int
+            Number of times the k-means algorithm will be run with different centroid seeds.
+            The final results will be the best output of n_init consecutive runs in terms of inertia.
+        max_iter : int
+            Maximum number of iterations of the k-means algorithm for a single run.
+        tol : float
+            Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two
+            consecutive iterations to declare convergence.
+        algorithm : str
+            {“lloyd”, “elkan”, “auto”, “full”}, default is "auto".
+        verbose : bool
+            Verbosity.
+        random_state : int
+            Determines random number generation for centroid initialization.
+        use_data : str
+            Default "X_pca".
+        key_added : str
+            Default "X_pca_kmeans".
+
+        """
         self.use_data = use_data
         self.key_added = key_added
         self.model = KMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
@@ -91,6 +92,14 @@ class StLouvain:
     """StLouvain class."""
 
     def __init__(self, resolution: float = 1):
+        """Initialize StLouvain.
+
+        Parameters
+        ----------
+        resolution : float
+            Resolution parameter.
+
+        """
         self.model = Louvain(resolution)
 
     def fit(self, adj, partition=None, weight="weight", randomize=None, random_state=None):
@@ -100,17 +109,17 @@ def fit(self, adj, partition=None, weight="weight", randomize=None, random_state
         ----------
         adj
             Adjacent matrix.
-        partition : dict optional
+        partition : dict
             A dictionary where keys are graph nodes and values the part the node
             belongs to
-        weight : str, optional
+        weight : str
             The key in graph to use as weight. Default to "weight"
-        resolution : float optional
+        resolution : float
             Resolution.
-        randomize : boolean, optional
+        randomize : boolean
             Will randomize the node evaluation order and the community evaluation
             order to get different partitions at each call
-        random_state : int, RandomState instance or None, optional (default=None)
+        random_state : int, RandomState instance or None
             If int, random_state is the seed used by the random number generator; If RandomState instance, random_state
             is the random number generator; If None, the random number generator is the RandomState instance used by
             `np.random`.
diff --git a/examples/spatial/cell_type_deconvo/dstg.py b/examples/spatial/cell_type_deconvo/dstg.py
index a2e12d94..28d04291 100644
--- a/examples/spatial/cell_type_deconvo/dstg.py
+++ b/examples/spatial/cell_type_deconvo/dstg.py
@@ -8,7 +8,7 @@
 
 from dance.data import Data
 from dance.datasets.spatial import CellTypeDeconvoDatasetLite
-from dance.modules.spatial.cell_type_deconvo.dstg import DSTG
+from dance.modules.spatial.cell_type_deconvo import DSTG
 from dance.transforms.graph import DSTGraph
 from dance.transforms.preprocess import pseudo_spatial_process
 from dance.utils.matrix import normalize