OmicsML · RemyLau · Jan 20, 2023 · Jan 20, 2023 · Jan 20, 2023 · Jan 20, 2023
diff --git a/dance/modules/single_modality/cell_type_annotation/celltypist.py b/dance/modules/single_modality/cell_type_annotation/celltypist.py
@@ -274,17 +274,17 @@ def to_plots(self, folder: str, plot_probability: bool = False, format: str = 'p
         if not os.path.isdir(folder):
             raise FileNotFoundError(f" Output folder {folder} does not exist. Please provide a valid folder")
         if 'X_umap' in self.adata.obsm:
-            logger.info(" Detected existing UMAP coordinates, will plot the results accordingly")
+            logger.info("Detected existing UMAP coordinates, will plot the results accordingly")
         elif 'connectivities' in self.adata.obsp:
             logger.info(" Generating UMAP coordinates based on the neighborhood graph")
             sc.tl.umap(self.adata)
         else:
-            logger.info(" Constructing the neighborhood graph and generating UMAP coordinates")
+            logger.info("Constructing the neighborhood graph and generating UMAP coordinates")
             adata = self.adata.copy()
             self.adata.obsm['X_pca'], self.adata.obsp['connectivities'], self.adata.obsp['distances'], self.adata.uns[
                 'neighbors'] = Classifier._construct_neighbor_graph(adata)
             sc.tl.umap(self.adata)
-        logger.info(" Plotting the results")
+        logger.info("Plotting the results")
         sc.settings.set_figure_params(figsize=[6.4, 6.4], format=format)
         self.adata.obs[self.predicted_labels.columns] = self.predicted_labels
         for column in self.predicted_labels:
@@ -464,13 +464,12 @@ def over_cluster(self, resolution: Optional[float] = None) -> pd.Series:
 
         """
         if 'connectivities' not in self.adata.obsp:
-            logger.info(" Can not detect a neighborhood graph, will construct one before the over-clustering")
+            logger.info("Can not detect a neighborhood graph, will construct one before the over-clustering")
             adata = self.adata.copy()
             self.adata.obsm['X_pca'], self.adata.obsp['connectivities'], self.adata.obsp['distances'], self.adata.uns[
                 'neighbors'] = Classifier._construct_neighbor_graph(adata)
         else:
-            logger.info(
-                " Detected a neighborhood graph in the input object, will run over-clustering on the basis of it")
+            logger.info("Detected a neighborhood graph in the input object, will run overclustering on the basis of it")
         if resolution is None:
             if self.adata.n_obs < 5000:
                 resolution = 5
@@ -484,7 +483,7 @@ def over_cluster(self, resolution: Optional[float] = None) -> pd.Series:
                 resolution = 25
             else:
                 resolution = 30
-        logger.info(f" Over-clustering input data with resolution set to {resolution}")
+        logger.info(f"Over-clustering input data with resolution set to {resolution}")
         sc.tl.leiden(self.adata, resolution=resolution, key_added='over_clustering')
         return self.adata.obs.pop('over_clustering')
 

diff --git a/examples/single_modality/cell_type_annotation/celltypist.py b/examples/single_modality/cell_type_annotation/celltypist.py
@@ -10,19 +10,15 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--cell_type_test", type=str, help="name for the cell type information for test data",
-                        default="Cell_type")
-    parser.add_argument("--cell_type_train", type=str, help="name for the cell type information for training data",
-                        default="Cell_type")
     parser.add_argument("--log_level", type=str, default="INFO", choices=LOGLEVELS)
     parser.add_argument("--max_iter", type=int, help="Max iteration during training", default=200)
+    parser.add_argument("--majority_voting", action="store_true",
+                        help="Whether to refine the predicted labels via majority voting after over-clustering.")
     parser.add_argument("--n_jobs", type=int, help="Number of jobs", default=10)
-    parser.add_argument("--random_seed", type=int, default=10)
     parser.add_argument("--species", default="mouse", type=str)
-    parser.add_argument("--test_dataset", type=int, nargs="+", default=[1759],
-                        help="List testing training dataset ids.")
+    parser.add_argument("--test_dataset", nargs="+", default=[1759], help="List of testing dataset ids.")
     parser.add_argument("--tissue", default="Spleen", type=str)
-    parser.add_argument("--train_dataset", type=int, nargs="+", default=[1970], help="List of training dataset ids.")
+    parser.add_argument("--train_dataset", nargs="+", default=[1970], help="List of training dataset ids.")
     parser.add_argument("--not_use_SGD", action="store_true",
                         help="Training algorithm -- weather it will be stochastic gradient descent.")
 
@@ -48,7 +44,7 @@
     # Train and evaluate the model
     model = Celltypist()
     model.fit(x_train, y_train, n_jobs=args.n_jobs, max_iter=args.max_iter, use_SGD=not args.not_use_SGD)
-    pred = model.predict(x_test)
+    pred = model.predict(x_test, majority_voting=args.majority_voting)
     score = model.score(pred, y_test)
     print(f"{score=:.4f}")
 """To reproduce CellTypist benchmarks, please refer to command lines below: