From 6cafee7a45dfba9f50e78455d9c46ea9231de5b3 Mon Sep 17 00:00:00 2001 From: nfancy Date: Sat, 18 Sep 2021 09:42:26 +0100 Subject: [PATCH 1/7] bug fixes --- bin/scflow_dge.r | 52 ++- bin/scflow_finalize_sce.r | 111 ++++-- bin/scflow_integrate.r | 124 ++---- bin/scflow_ipa.r | 94 +++-- bin/scflow_qc.r | 5 + conf/modules.config | 10 +- conf/scflow_analysis.config | 59 ++- modules/local/get_software_versions.nf | 11 +- modules/local/process/scflow/dge.nf | 2 +- modules/local/process/scflow/ipa.nf | 2 +- nextflow_schema.json | 497 ++++++++++--------------- workflows/scflow.nf | 23 +- 12 files changed, 451 insertions(+), 539 deletions(-) diff --git a/bin/scflow_dge.r b/bin/scflow_dge.r index 1893c7a..ef4d27e 100755 --- a/bin/scflow_dge.r +++ b/bin/scflow_dge.r @@ -146,6 +146,14 @@ required$add_argument( help = "p-value cutoff for DE [default %(default)s]" ) +required$add_argument( + "--n_label", + type = "integer", + default = 5, + metavar = "number", + help = "Number of genes to be highlighted on volcano plot" +) + required$add_argument( "--ensembl_mappings", help = "path to ensembl mappings file", @@ -177,9 +185,9 @@ options("scflow_species" = args$species) args$rescale_numerics <- as.logical(args$rescale_numerics) args$pseudobulk <- as.logical(args$pseudobulk) args$force_run <- as.logical(args$force_run) -if (tolower(args$random_effects_var) == "null") args$random_effects_var <- NULL +if(tolower(args$random_effects_var) == "null") args$random_effects_var <- NULL -args$max_cores <- if (toupper(args$max_cores) == "NULL") NULL else { +args$max_cores <- if(toupper(args$max_cores) == "NULL") NULL else { as.numeric(as.character(args$max_cores)) } @@ -202,6 +210,8 @@ cli::cli_alert(sprintf( n_cores )) +# RhpcBLASctl::omp_set_num_threads(1L) + library(scFlow) # ____________________________________________________________________________ @@ -220,9 +230,7 @@ if (args$pseudobulk) { pb_str <- "_pb" sce_subset <- pseudobulk_sce( sce_subset, - keep_vars = c( - args$dependent_var, 
args$confounding_vars, args$random_effects_var - ), + keep_vars = c(args$dependent_var, args$confounding_vars, args$random_effects_var), assay_name = "counts", celltype_var = args$celltype_var, sample_var = args$sample_var @@ -255,20 +263,36 @@ file_name <- paste0(args$celltype, "_", for (result in names(de_results)) { if (dim(de_results[[result]])[[1]] > 0) { write.table(de_results[[result]], - file = file.path(getwd(), - paste0(file_name, result, "_DE.tsv")), + file = file.path(getwd(), + paste0(file_name, result, "_DE.tsv")), quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE) + report_de(de_results[[result]], + fc_threshold = args$fc_threshold, + pval_cutoff = args$pval_cutoff, + n_label = args$n_label, report_folder_path = file.path(getwd()), report_file = paste0(file_name, result, "_scflow_de_report")) + print("report generated") - png(file.path(getwd(), - paste0(file_name, result, "_volcano_plot.png")), - width = 247, height = 170, units = "mm", res = 600) - print(attr(de_results[[result]], "plot")) - dev.off() - + + p <- scFlow::volcano_plot( + dt = de_results[[result]], + fc_threshold = args$fc_threshold, + pval_cutoff = args$pval_cutoff, + n_label = args$n_label + ) + ggplot2::ggsave(filename = file.path(getwd(), + paste0(file_name, result, "_volcano_plot.png")), + plot = p, + width = 7, height = 5, units = "in", dpi = 600) + + print("Volcano plot generated") + + } else { print(sprintf("No DE genes found for %s", result)) - } + } } + + diff --git a/bin/scflow_finalize_sce.r b/bin/scflow_finalize_sce.r index 58813af..ffa1743 100755 --- a/bin/scflow_finalize_sce.r +++ b/bin/scflow_finalize_sce.r @@ -5,12 +5,11 @@ # ____________________________________________________________________________ # Initialization #### +options(mc.cores = future::availableCores()) + ## ............................................................................ 
## Load packages #### library(argparse) -library(scFlow) -library(magrittr) -library(SingleCellExperiment) ## ............................................................................ ## Parse command-line arguments #### @@ -25,42 +24,42 @@ optional <- parser$add_argument_group("Optional", "required arguments") required$add_argument( "--sce_path", help = "-path to the SingleCellExperiment", - metavar = "dir", + metavar = "dir", required = TRUE ) required$add_argument( "--celltype_mappings", help = "path to a tsv file with revised celltype mappings", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--clusters_colname", help = "name of the column with cluster numbers", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--celltype_var", help = "name of the column with celltype names", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--unique_id_var", help = "name of the column with unique sample ids", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--facet_vars", help = "names of variables to examine in the celltype metrics report", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) @@ -68,14 +67,14 @@ required$add_argument( required$add_argument( "--input_reduced_dim", help = "name of the reduced dimension slot to use for plots in the report", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--metric_vars", help = "names of variables to examine in the celltype metrics report", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) @@ -84,7 +83,7 @@ required$add_argument( default = 5, type = "integer", required = TRUE, - help = "The number of top marker genes", + help ="The number of top marker genes", metavar = "N" ) @@ -106,6 +105,13 @@ required$add_argument( metavar = "N" ) +required$add_argument( + "--max_cores", + default = NULL, + help = "override 
for lower cpu core usage", + metavar = "N", + required = TRUE +) ### . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .. ### Pre-process args #### @@ -117,6 +123,31 @@ args$metric_vars <- strsplit(args$metric_vars, ",")[[1]] options("scflow_reddimplot_pointsize" = args$reddimplot_pointsize) options("scflow_reddimplot_alpha" = args$reddimplot_alpha) +args$max_cores <- if(toupper(args$max_cores) == "NULL") NULL else { + as.numeric(as.character(args$max_cores)) +} + +# ____________________________________________________________________________ +# Delay Package Loading for Optional Max Cores Override + +n_cores <- future::availableCores(methods = "mc.cores") + +if (is.null(args$max_cores)) { + options(mc.cores = n_cores) +} else { + options(mc.cores = min(args$max_cores, n_cores)) +} + +cli::cli_alert(sprintf( + "Using %s cores on system with %s available cores.", + getOption("mc.cores"), + n_cores +)) + +library(scFlow) +library(magrittr) +library(SingleCellExperiment) + ## ............................................................................ 
## Start #### @@ -161,8 +192,8 @@ celltypes <- as.data.frame(SummarizedExperiment::colData(sce)) %>% colnames(celltypes) <- c("celltype", "n_cells") write.table( - data.frame(celltypes), - file = "celltypes.tsv", + data.frame(celltypes), + file = "celltypes.tsv", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") ### Save Marker Gene Plots @@ -170,58 +201,58 @@ folder_path <- file.path(getwd(), "celltype_marker_plots") dir.create(folder_path) for (group in names(sce@metadata$markers)) { + pwidth <- max(10, - length( - unique(sce@metadata$markers[[group]]$marker_plot$data$Group) - ) + length(unique(sce@metadata$markers[[group]]$marker_plot$data$Group)) ) - pheight <- length( - unique(sce@metadata$markers[[group]]$marker_plot$data$Gene) - ) + pheight <- length(unique(sce@metadata$markers[[group]]$marker_plot$data$Gene)) + p <- sce@metadata$markers[[group]]$marker_plot + plot_file_name <- paste0(group, "_markers") + # save PNG - png(file.path(folder_path, paste0(plot_file_name, ".png")), - width = pwidth * 12, height = pheight * 5, units = "mm", res = 600) + png(file.path(folder_path, paste0(plot_file_name, ".png")), + width = pwidth * 12, height = pheight*5, units = "mm", res = 600) print(p) dev.off() - + # save PDF ggsave( file.path(folder_path, paste0(group, ".pdf")), - p, - width = pwidth * 12, - height = pheight * 5, - units = "mm", + p, + width = pwidth * 12, + height = pheight * 5, + units = "mm", scale = 1 ) - + } ### Save Marker Gene Tables folder_path <- file.path(getwd(), "celltype_marker_tables") dir.create(folder_path) for (group in names(sce@metadata$markers)) { - + marker_test_file_name <- paste0(group, "_markers_test.tsv") top_markers_file_name <- paste0(group, "_top_markers.tsv") - + write.table( - sce@metadata$markers[[group]]$marker_test_res, - file = file.path(folder_path, marker_test_file_name), - row.names = FALSE, - col.names = TRUE, + sce@metadata$markers[[group]]$marker_test_res, + file = file.path(folder_path, 
marker_test_file_name), + row.names = FALSE, + col.names = TRUE, sep = "\t" ) - + write.table( - sce@metadata$markers[[group]]$top_specific_markers, - file = file.path(folder_path, top_markers_file_name), - row.names = FALSE, - col.names = TRUE, + sce@metadata$markers[[group]]$top_specific_markers, + file = file.path(folder_path, top_markers_file_name), + row.names = FALSE, + col.names = TRUE, sep = "\t" ) - + } diff --git a/bin/scflow_integrate.r b/bin/scflow_integrate.r index b9879e0..48ec53a 100755 --- a/bin/scflow_integrate.r +++ b/bin/scflow_integrate.r @@ -5,13 +5,13 @@ # ____________________________________________________________________________ # Initialization #### -options(mc.cores = future::availableCores()) +options(mc.cores = future::availableCores(methods = "mc.cores")) ## ............................................................................ ## Load packages #### -library(argparse) library(scFlow) -library(parallel) +library(argparse) +#library(parallel) ## ............................................................................ 
## Parse command-line arguments #### @@ -33,14 +33,14 @@ required$add_argument( required$add_argument( "--method", required = TRUE, - help = "The integration method to use", + help ="The integration method to use", metavar = "Liger" ) required$add_argument( "--unique_id_var", required = TRUE, - help = "Unique id variable", + help ="Unique id variable", metavar = "manifest" ) @@ -48,7 +48,7 @@ required$add_argument( "--take_gene_union", default = FALSE, required = TRUE, - help = "Whether to fill out raw.data matrices with union of genes", + help ="Whether to fill out raw.data matrices with union of genes across all datasets (filling in 0 for missing data)", metavar = "Boolean" ) @@ -56,7 +56,7 @@ required$add_argument( "--remove_missing", default = TRUE, required = TRUE, - help = "Remove non-expressive genes and cells", + help ="Whether to remove cells not expressing any measured genes, and genes not expressed in any cells", metavar = "Boolean" ) @@ -65,7 +65,7 @@ required$add_argument( default = 3000, type = "integer", required = TRUE, - help = "Number of genes to find for each dataset", + help ="Number of genes to find for each dataset", metavar = "N" ) @@ -73,23 +73,15 @@ required$add_argument( "--combine", default = "union", required = TRUE, - help = "How to combine variable genes across experiments", + help ="How to combine variable genes across experiments", metavar = "union,intersect" ) -required$add_argument( - "--keep_unique", - default = FALSE, - required = TRUE, - help = "Keep genes that occur only in one dataset", - metavar = "Boolean" -) - required$add_argument( "--capitalize", default = FALSE, required = TRUE, - help = "Capitalize gene names to match homologous genes(i.e. across species)", + help ="Capitalize gene names to match homologous genes(ie. 
across species)", metavar = "Boolean" ) @@ -97,7 +89,7 @@ required$add_argument( "--use_cols", default = TRUE, required = TRUE, - help = "Treat each column as a cell", + help ="Treat each column as a cell", metavar = "Boolean" ) @@ -106,7 +98,7 @@ required$add_argument( default = 30, type = "integer", required = TRUE, - help = "Inner dimension of factorization (number of factors)", + help ="Inner dimension of factorization (number of factors)", metavar = "N" ) @@ -115,7 +107,7 @@ required$add_argument( default = 5.0, type = "double", required = TRUE, - help = "Regularization parameter", + help ="Regularization parameter. Larger values penalize dataset-specific effects more strongly (ie. alignment should increase as lambda increases)", metavar = "N" ) @@ -124,7 +116,7 @@ required$add_argument( default = 0.0001, type = "double", required = TRUE, - help = "Convergence threshold.", + help ="Convergence threshold. Convergence occurs when |obj0-obj|/(mean(obj0,obj)) < thresh", metavar = "N" ) @@ -133,7 +125,7 @@ required$add_argument( default = 100, type = "integer", required = TRUE, - help = "Maximum number of block coordinate descent iterations to perform", + help ="Maximum number of block coordinate descent iterations to perform", metavar = "N" ) @@ -142,7 +134,7 @@ required$add_argument( default = 1, type = "integer", required = TRUE, - help = "Number of restarts to perform", + help ="Number of restarts to perform", metavar = "N" ) @@ -151,7 +143,7 @@ required$add_argument( default = 1, type = "integer", required = TRUE, - help = "Random seed to allow reproducible results", + help ="Random seed to allow reproducible results", metavar = "N" ) @@ -160,33 +152,15 @@ required$add_argument( default = 20, type = "integer", required = TRUE, - help = "Number of nearest neighbors for within-dataset knn graph", - metavar = "N" -) - -required$add_argument( - "--k2", - default = 500, - type = "integer", - required = TRUE, - help = "Horizon parameter for shared nearest factor 
graph", - metavar = "N" -) - -required$add_argument( - "--prune_thresh", - default = 0.2, - type = "double", - required = TRUE, - help = "Minimum allowed edge weight. Any edges below this are removed", + help ="Number of nearest neighbors for within-dataset knn graph", metavar = "N" ) required$add_argument( "--ref_dataset", - default = "", + default = '', required = TRUE, - help = "Name of dataset to use as a reference for normalization", + help ="Name of dataset to use as a reference for normalization", metavar = "ref" ) @@ -195,7 +169,7 @@ required$add_argument( default = 2, type = "integer", required = TRUE, - help = "Minimum number of cells to consider a cluster shared across datasets", + help ="Minimum number of cells to consider a cluster shared across datasets", metavar = "N" ) @@ -204,16 +178,7 @@ required$add_argument( default = 50, type = "integer", required = TRUE, - help = "Number of quantiles to use for quantile normalization", - metavar = "N" -) - -required$add_argument( - "--nstart", - default = 10, - type = "integer", - required = TRUE, - help = "Number of times to perform Louvain community detection", + help ="Number of quantiles to use for quantile normalization", metavar = "N" ) @@ -222,43 +187,18 @@ required$add_argument( default = 1, type = "double", required = TRUE, - help = "Controls the number of communities detected", + help ="Controls the number of communities detected (Higher resolution -> more communities)", metavar = "N" ) -required$add_argument( - "--dims_use", - default = "null", - required = TRUE, - help = "Indices of factors to use for shared nearest factor determination", - metavar = "Indices" -) - -required$add_argument( - "--dist_use", - default = "CR", - required = TRUE, - help = "Distance metric to use in calculating nearest neighbors", - metavar = "CR" -) - required$add_argument( "--center", default = FALSE, required = TRUE, - help = "Centers the data when scaling factors", + help ="Centers the data when scaling factors 
(useful for less sparse modalities like methylation data)", metavar = "Boolean" ) -required$add_argument( - "--small_clust_thresh", - default = 0, - type = "double", - required = TRUE, - help = "Extracts small clusters loading highly on single factor", - metavar = "N" -) - ### . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .. ### Pre-process args #### @@ -290,12 +230,11 @@ sce <- integrate_sce( unique_id_var = args$unique_id_var, take_gene_union = args$take_gene_union, remove.missing = args$remove_missing, - make.sparse = T, num_genes = args$num_genes, combine = args$combine, - keep_unique = args$keep_unique, capitalize = args$capitalize, use_cols = args$use_cols, + num_cores = future::availableCores(methods = "mc.cores"), k = args$k, lambda = args$lambda, thresh = args$thresh, @@ -306,24 +245,15 @@ sce <- integrate_sce( V_init = NULL, rand_seed = args$rand_seed, knn_k = args$knn_k, - k2 = args$k2, - prune_thresh = args$prune_thresh, ref_dataset = args$ref_dataset, min_cells = args$min_cells, quantiles = args$quantiles, - nstart = args$nstart, resolution = args$resolution, - dims_use = args$dims_use, - dist_use = args$dist_use, center = args$center, - small_clust_thresh = args$small_clust_thresh, - do_plot = FALSE, - id_number = NULL, - print_obj = FALSE, - print_mod = FALSE, - print_align_summary = FALSE + print_obj = FALSE ) + ## ............................................................................ ## Save Outputs #### diff --git a/bin/scflow_ipa.r b/bin/scflow_ipa.r index 0cca3a8..10f6335 100755 --- a/bin/scflow_ipa.r +++ b/bin/scflow_ipa.r @@ -12,6 +12,7 @@ options(mc.cores = parallel::detectCores()) library(argparse) library(scFlow) library(cli) +library(dplyr) ## ............................................................................ 
## Parse command-line arguments #### @@ -50,9 +51,32 @@ required$add_argument( required$add_argument( "--enrichment_database", help = "name of the enrichment databases", - metavar = "GO_Biological_Process,GO_Cellular_Component,GO_Molecular_Function", + metavar = "GO_Biological_Process,Reactome,Wikipathway", required = TRUE, - default = "KEGG" + default = "GO_Biological_Process" +) + +required$add_argument( + "--species", + help = "the biological species (e.g. mouse, human)", + default = "human", + required = TRUE +) + +required$add_argument( + "--fc_threshold", + type = "double", + default = 1.1, + metavar = "number", + help = "Absolute fold-change cutoff for DE [default %(default)s]" +) + +required$add_argument( + "--pval_cutoff", + type = "double", + default = 0.05, + metavar = "number", + help = "p-value cutoff for DE [default %(default)s]" ) @@ -62,6 +86,8 @@ required$add_argument( args <- parser$parse_args() +options("scflow_species" = args$species) + args$enrichment_method <- strsplit(args$enrichment_method, ",")[[1]] args$enrichment_tool <- strsplit(args$enrichment_tool, ",")[[1]] args$enrichment_database <- strsplit(args$enrichment_database, ",")[[1]] @@ -91,23 +117,47 @@ dir.create(output_dir) dir.create(report_dir) for (gene_file in args$gene_file) { - enrichment_result <- find_impacted_pathways( - gene_file = gene_file, - enrichment_tool = args$enrichment_tool, - enrichment_method = args$enrichment_method, - enrichment_database = args$enrichment_database, - is_output = TRUE, - output_dir = output_dir - ) - report_name <- tools::file_path_sans_ext(gene_file) - report_fp <- paste0(report_name, "_scflow_ipa_report") - report_impacted_pathway( - res = enrichment_result, - report_folder_path = report_dir, - report_file = report_fp - ) - cli::cli_text(c( - "{cli::col_green(symbol$tick)} Analysis complete, output is found at: ", - "{.file {output_dir}}" - )) -} + + dt <- read.delim(gene_file) + + dt <- dt %>% + dplyr::filter(padj <= args$pval_cutoff, + 
abs(logFC) >= log2(args$fc_threshold)) + + if (nrow(dt) < 5 ) { + cli::cli_alert_danger("Gene list is very short!") + } else { + + enrichment_result <- find_impacted_pathways( + gene_file = dt, + reference_file = NULL, + organism = getOption("scflow_species"), + enrichment_tool = args$enrichment_tool, + enrichment_method = args$enrichment_method, + enrichment_database = args$enrichment_database, + is_output = TRUE, + output_dir = output_dir + ) + + if (all(unlist(lapply( + enrichment_result, function(dt){ + isFALSE(dt$metadata$result)})))) { + cli::cli_alert_danger("No significant pathway was found at FDR 0.05") + } else { + + report_name <- tools::file_path_sans_ext(gene_file) + report_fp <- paste0(report_name, "_scflow_ipa_report") + + report_impacted_pathway( + res = enrichment_result, + report_folder_path = report_dir, + report_file = report_fp + ) + + cli::cli_text(c( + "{cli::col_green(symbol$tick)} Analysis complete, output is found at: ", + "{.file {output_dir}}" + )) + } + } +} \ No newline at end of file diff --git a/bin/scflow_qc.r b/bin/scflow_qc.r index 726a29a..6fff875 100755 --- a/bin/scflow_qc.r +++ b/bin/scflow_qc.r @@ -422,6 +422,11 @@ if (args$find_singlets) { ) } + +sce <- sce[ , sce$total_counts >= args$min_library_size] +sce <- sce[ , sce$total_features_by_counts >= args$min_features] + + dir.create(file.path(getwd(), "qc_report")) report_qc_sce( diff --git a/conf/modules.config b/conf/modules.config index 92362c1..ce287c0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -70,8 +70,10 @@ params { } 'scflow_reportintegrated' { - publish_dir = 'integration' - publish_files = ['integration_report':'../reports'] + publish_dir = 'reports' + publish_files = [ + 'integration_report':'' + ] } 'scflow_mapcelltypes' { @@ -112,9 +114,9 @@ params { } 'scflow_dirichlet' { - publish_dir = 'dirichlet' + publish_dir = 'reports' publish_files = [ - 'dirichlet_report':'../reports' + 'dirichlet_report':'' ] } diff --git a/conf/scflow_analysis.config 
b/conf/scflow_analysis.config index 88ac706..6077605 100644 --- a/conf/scflow_analysis.config +++ b/conf/scflow_analysis.config @@ -14,13 +14,13 @@ params { qc_max_ribo = 1 qc_min_counts = 2 qc_min_cells = 2 - qc_drop_unmapped = true - qc_drop_mito = true - qc_drop_ribo = true + qc_drop_unmapped = 'true' + qc_drop_mito = 'true' + qc_drop_ribo = 'true' qc_nmads = 4.0 // Options: Ambient RNA Profiling - amb_find_cells = false + amb_find_cells = 'false' amb_lower = 100 amb_retain = 'auto' // if numeric, pass as string amb_alpha_cutoff = 0.001 @@ -28,7 +28,7 @@ params { amb_expect_cells = 3000 // Options: Multiplet Identification - mult_find_singlets = false + mult_find_singlets = 'false' mult_singlets_method = 'doubletfinder' mult_vars_to_regress_out = 'nCount_RNA,pc_mito' // * mult_pca_dims = 10 @@ -39,32 +39,25 @@ params { // Options: Integration integ_method = 'Liger' + integ_k = 30 integ_unique_id_var = 'manifest' - integ_take_gene_union = false - integ_remove_missing = true + integ_take_gene_union = 'false' + integ_remove_missing = 'true' integ_num_genes = 3000 integ_combine = 'union' - integ_keep_unique = false - integ_capitalize = false - integ_use_cols = true - integ_k = 30 + integ_capitalize = 'false' + integ_use_cols = 'true' integ_lambda = 5.0 integ_thresh = 0.0001 integ_max_iters = 100 integ_nrep = 1 integ_rand_seed = 1 - integ_knn_k = 20 - integ_k2 = 500 - integ_prune_thresh = 0.2 - integ_ref_dataset = null - integ_min_cells = 2 integ_quantiles = 50 - integ_nstart = 10 + integ_ref_dataset = 'NULL' + integ_min_cells = 2 + integ_knn_k = 20 + integ_center = 'false' integ_resolution = 1 - integ_dims_use = null - integ_dist_use = 'CR' - integ_center = false - integ_small_clust_thresh = 0 // Options: Integration report integ_categorical_covariates = 'manifest,diagnosis,sex' // * @@ -72,7 +65,7 @@ params { // Options: Merge merge_plot_vars = 'total_features_by_counts,total_counts,pc_mito,pc_ribo' - merge_facet_vars = null // * + merge_facet_vars = 'null' // * 
merge_outlier_vars = 'total_features_by_counts,total_counts' // * // Options: Dimensionality Reduction @@ -93,7 +86,7 @@ params { reddim_umap_local_connectivity = 1 reddim_umap_repulsion_strength = 1 reddim_umap_negative_sample_rate = 5 - reddim_umap_fast_sgd = false + reddim_umap_fast_sgd = 'false' // tsne reddim_tsne_dims = 2 reddim_tsne_initial_dims = 50 @@ -102,9 +95,9 @@ params { reddim_tsne_stop_lying_iter = 250 reddim_tsne_mom_switch_iter = 250 reddim_tsne_max_iter = 1000 - reddim_tsne_pca_center = true - reddim_tsne_pca_scale = false - reddim_tsne_normalize = true + reddim_tsne_pca_center = 'true' + reddim_tsne_pca_scale = 'false' + reddim_tsne_normalize = 'true' reddim_tsne_momentum = 0.5 reddim_tsne_final_momentum = 0.8 reddim_tsne_eta = 1000 @@ -133,18 +126,19 @@ params { dge_mast_method = 'bayesglm' dge_min_counts = 1 dge_min_cells_pc = 0.1 - dge_rescale_numerics = true - dge_pseudobulk = false + dge_rescale_numerics = 'true' + dge_pseudobulk = 'false' dge_celltype_var = 'cluster_celltype' dge_sample_var = 'manifest' dge_dependent_var = 'diagnosis' dge_ref_class = 'Control' dge_confounding_vars = 'cngeneson' // * - dge_random_effects_var = null + dge_random_effects_var = 'null' dge_fc_threshold = 1.1 dge_pval_cutoff = 0.05 - dge_force_run = false - dge_max_cores = null + dge_n_label = 5 + dge_force_run = 'false' + dge_max_cores = 'null' // Options: Integrated Pathway Analysis ipa_enrichment_tool = 'WebGestaltR' @@ -156,7 +150,7 @@ params { dirich_celltype_var = 'cluster_celltype' dirich_dependent_var = 'diagnosis' dirich_ref_class = 'Control' - dirich_var_order = null // * + dirich_var_order = 'null' // * // Options: Plots (Reduced Dim) plotreddim_reduction_methods = 'UMAP_Liger' // * @@ -165,4 +159,5 @@ params { // Misc species = 'human' + max_cores = 'null' } diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index 7c83440..1e70bdc 100644 --- a/modules/local/get_software_versions.nf +++ 
b/modules/local/get_software_versions.nf @@ -7,13 +7,14 @@ process GET_SOFTWARE_VERSIONS { publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } - - tag 'Version Info' - label 'process_tiny' + + tag "Version Info" + label 'process_low' //cache false + output: - path 'software_versions.tsv' , emit: tsv + path "software_versions.tsv" , emit: tsv script: // This script is bundled with the pipeline, in nf-core/scflow/bin/ """ @@ -21,4 +22,4 @@ process GET_SOFTWARE_VERSIONS { echo $workflow.nextflow.version > nextflow.version.txt scrape_software_versions.r software_versions.tsv """ -} +} \ No newline at end of file diff --git a/modules/local/process/scflow/dge.nf b/modules/local/process/scflow/dge.nf index 7459e3b..c866516 100644 --- a/modules/local/process/scflow/dge.nf +++ b/modules/local/process/scflow/dge.nf @@ -1,5 +1,5 @@ /* - * Generate 2D reduced dimension plots of gene expression + * Run differential gene expression analysis */ // Import generic module functions diff --git a/modules/local/process/scflow/ipa.nf b/modules/local/process/scflow/ipa.nf index 79c1169..42791b2 100644 --- a/modules/local/process/scflow/ipa.nf +++ b/modules/local/process/scflow/ipa.nf @@ -1,5 +1,5 @@ /* - * Integrated pathway analysis of differentially expressed genes + * Impacted pathway analysis of differentially expressed genes */ // Import generic module functions diff --git a/nextflow_schema.json b/nextflow_schema.json index 35c68a1..37e35c9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -55,11 +55,16 @@ "description": "Input sample species.", "help_text": "Currently, \"human\" and \"mouse\" are supported." }, + "max_cores": { + "type": "string", + "default": "'null'", + "description": "Maximum CPU cores.", + "help_text": "The default value of 'null' utilizes all available CPU cores. 
Manually overriding this parameter can reduce the memory demands of parallelization across multiple cores." + }, "outdir": { "type": "string", "default": "./results", - "description": "Outputs directory.", - "fa_icon": "fas fa-folder-open" + "description": "Outputs directory." } }, "required": [ @@ -67,7 +72,8 @@ "input", "ensembl_mappings", "ctd_path", - "species" + "species", + "max_cores" ], "help_text": "" }, @@ -88,7 +94,7 @@ "default": "seqdate", "description": "The sample sheet variables to treat as factors.", "help_text": "All sample sheet columns with numbers which should be treated as factors should be specified here separated by commas. Examples include columns with dates, numeric sample identifiers, etc.", - "fa_icon": "fas fa-layer-group" + "fa_icon": "fas fa-quote-left" }, "qc_min_library_size": { "type": "integer", @@ -123,7 +129,7 @@ }, "qc_max_ribo": { "type": "number", - "default": 1.0, + "default": 1, "description": "Maximum proportion of counts mapping to ribosomal genes.", "fa_icon": "fas fa-less-than-equal", "minimum": 0, @@ -150,25 +156,26 @@ "fa_icon": "fas fa-greater-than-equal" }, "qc_drop_unmapped": { - "type": "boolean", - "default": true, + "type": "string", + "default": "True", "description": "Option to drop unmapped genes.", "fa_icon": "fas fa-cut" }, "qc_drop_mito": { - "type": "boolean", - "default": true, + "type": "string", + "default": "True", "description": "Option to drop mitochondrial genes.", "fa_icon": "fas fa-cut" }, "qc_drop_ribo": { - "type": "boolean", + "type": "string", "description": "Option to drop ribosomal genes.", - "fa_icon": "fas fa-cut" + "fa_icon": "fas fa-cut", + "default": "false" }, "qc_nmads": { "type": "number", - "default": 4.0, + "default": 4, "description": "The number of MADs for outlier detection.", "help_text": "The number of median absolute deviations (MADs) used to define outliers for adaptive thresholding.", "fa_icon": "fas fa-mountain" @@ -199,8 +206,8 @@ "default": "", "properties": { 
"amb_find_cells": { - "type": "boolean", - "default": true, + "type": "string", + "default": "true", "description": "Enable ambient RNA / empty droplet profiling.", "fa_icon": "fas fa-cut" }, @@ -257,40 +264,34 @@ "default": "", "properties": { "mult_find_singlets": { - "type": "boolean", - "default": true, - "description": "Enable doublet/multiplet identification.", - "fa_icon": "fas fa-cut" + "type": "string", + "default": "true", + "description": "Enable doublet/multiplet identification." }, "mult_singlets_method": { "type": "string", "default": "doubletfinder", - "description": "Algorithm to use for doublet/multiplet identification.", - "fa_icon": "fas fa-toolbox" + "description": "Algorithm to use for doublet/multiplet identification." }, "mult_vars_to_regress_out": { "type": "string", "default": "nCount_RNA,pc_mito", - "description": "Variables to regress out for dimensionality reduction.", - "fa_icon": "fas fa-layer-group" + "description": "Variables to regress out for dimensionality reduction." }, "mult_pca_dims": { "type": "integer", "default": 10, - "description": "Number of PCA dimensions to use.", - "fa_icon": "fas fa-calculator" + "description": "Number of PCA dimensions to use." }, "mult_var_features": { "type": "integer", "default": 2000, - "description": "The top n most variable features to use.", - "fa_icon": "fas fa-calculator" + "description": "The top n most variable features to use." }, "mult_doublet_rate": { "type": "number", "description": "A fixed doublet rate.", - "help_text": "Use a fixed default rate (e.g. 0.075 to specify that 7.5% of all cells should be marked as doublets), or set to 0 to use the \"dpk\" method (recommended).", - "fa_icon": "fas fa-calculator" + "help_text": "Use a fixed default rate (e.g. 0.075 to specify that 7.5% of all cells should be marked as doublets), or set to 0 to use the \"dpk\" method (recommended)." 
}, "mult_dpk": { "type": "integer", @@ -298,15 +299,13 @@ "description": "Doublets per thousand cells increment.", "help_text": "The doublets per thousand cell increment specifies the expected doublet rate based on the number of cells, i.e. with a dpk of 8 (recommended by 10X), a dataset with 1000 cells is expected to contain 8 doublets per thousand cells, a dataset with 2000 cells is expected to contain 16 doublets per thousand cells, and a dataset with 10000 cells is expected to contain 80 cells per thousand cells (or 800 doublets in total). If the \"doublet_rate\" parameter is manually specified this recommended incremental behaviour is overridden.", "minimum": 0, - "maximum": 1000, - "fa_icon": "fas fa-calculator" + "maximum": 1000 }, "mult_pK": { "type": "number", "default": 0.02, "description": "Specify a pK value instead of parameter sweep.", - "help_text": "The optimal pK value used by the doubletFinder algorithm is determined following a compute-intensive parameter sweep. The parameter sweep can be overridden by manually specifying a pK value.", - "fa_icon": "fas fa-calculator" + "help_text": "The optimal pK value used by the doubletFinder algorithm is determined following a compute-intensive parameter sweep. The parameter sweep can be overridden by manually specifying a pK value." } }, "fa_icon": "fas fa-adjust", @@ -330,26 +329,25 @@ "type": "string", "default": "total_features_by_counts,total_counts,pc_mito,pc_ribo", "description": "Numeric variables for inter-sample metrics.", - "help_text": "A comma-separated list of numeric variables which differ between individual cells of each sample. The merged sample report will include plots facilitating between-sample comparisons for each of these numeric variables.", - "fa_icon": "fas fa-layer-group" + "help_text": "A comma-separated list of numeric variables which differ between individual cells of each sample. 
The merged sample report will include plots facilitating between-sample comparisons for each of these numeric variables." }, "merge_facet_vars": { "type": "string", + "default": "NULL", "description": "Categorical variables for further sub-setting of plots", - "help_text": "A comma-separated list of categorical variables. The merged sample report will include additional plots of sample metrics subset by each of these variables (e.g. sex, diagnosis).", - "fa_icon": "fas fa-layer-group" + "help_text": "A comma-separated list of categorical variables. The merged sample report will include additional plots of sample metrics subset by each of these variables (e.g. sex, diagnosis)." }, "merge_outlier_vars": { "type": "string", "default": "total_features_by_counts,total_counts", "description": "Numeric variables for outlier identification.", - "help_text": "The merged report will include tables highlighting samples that are putative outliers for each of these numeric variables.", - "fa_icon": "fas fa-layer-group" + "help_text": "The merged report will include tables highlighting samples that are putative outliers for each of these numeric variables." } }, "fa_icon": "fas fa-object-ungroup", "required": [ "merge_plot_vars", + "merge_facet_vars", "merge_outlier_vars" ] }, @@ -362,223 +360,155 @@ "integ_method": { "type": "string", "default": "Liger", - "description": "Choice of integration method.", - "fa_icon": "fas fa-toolbox" + "description": "Choice of integration method." + }, + "integ_k": { + "type": "integer", + "default": 30, + "description": "Inner dimension of factorization (n factors).", + "help_text": "See rliger::optimizeALS(). Inner dimension of factorization (number of factors). Run suggestK to determine appropriate value; a general rule of thumb is that a higher k will be needed for datasets with more sub-structure." 
}, "integ_unique_id_var": { "type": "string", "default": "manifest", - "description": "Unique sample identifier variable.", - "fa_icon": "fas fa-key" + "description": "Unique sample identifier variable." }, "integ_take_gene_union": { - "type": "boolean", + "type": "string", + "default": "false", "description": "Fill out matrices with union of genes.", - "help_text": "See rliger::createLiger(). Whether to fill out raw.data matrices with union of genes across all datasets (filling in 0 for missing data) (requires make.sparse = TRUE) (default FALSE).", - "fa_icon": "fas fa-cut" + "help_text": "See rliger::createLiger(). Whether to fill out raw.data matrices with union of genes across all datasets (filling in 0 for missing data) (requires make.sparse = TRUE) (default FALSE)." }, "integ_remove_missing": { - "type": "boolean", - "default": true, + "type": "string", + "default": "true", "description": "Remove non-expressing cells/genes.", - "help_text": "See rliger::createLiger(). Whether to remove cells not expressing any measured genes, and genes not expressed in any cells (if take.gene.union = TRUE, removes only genes not expressed in any dataset) (default TRUE).", - "fa_icon": "fas fa-cut" + "help_text": "See rliger::createLiger(). Whether to remove cells not expressing any measured genes, and genes not expressed in any cells (if take.gene.union = TRUE, removes only genes not expressed in any dataset) (default TRUE)." }, "integ_num_genes": { "type": "integer", "default": 3000, "description": "Number of genes to find for each dataset.", - "help_text": "See rliger::selectGenes(). Number of genes to find for each dataset. Optimises the value of var.thresh for each dataset to get this number of genes.", - "fa_icon": "fas fa-calculator" + "help_text": "See rliger::selectGenes(). Number of genes to find for each dataset. Optimises the value of var.thresh for each dataset to get this number of genes." 
}, "integ_combine": { "type": "string", "default": "union", "description": "How to combine variable genes across experiments.", - "help_text": "See rliger::selectGenes(). Either \"union\" or \"intersection\".", - "fa_icon": "fas fa-calculator" - }, - "integ_keep_unique": { - "type": "boolean", - "description": "Keep unique genes.", - "help_text": "See rliger::selectGenes().", - "fa_icon": "fas fa-cut" + "help_text": "See rliger::selectGenes(). Either \"union\" or \"intersection\"." }, "integ_capitalize": { - "type": "boolean", + "type": "string", + "default": "false", "description": "Capitalize gene names to match homologous genes.", - "help_text": "See rliger::selectGenes().", - "fa_icon": "fab fa-adn" + "help_text": "See rliger::selectGenes()." }, "integ_use_cols": { - "type": "boolean", - "default": true, + "type": "string", + "default": "true", "description": "Treat each column as a cell.", - "help_text": "See rliger::removeMissingObs().", - "fa_icon": "fas fa-columns" - }, - "integ_k": { - "type": "integer", - "default": 30, - "description": "Inner dimension of factorization (n factors).", - "help_text": "See rliger::optimizeALS(). Inner dimension of factorization (number of factors). Run suggestK to determine appropriate value; a general rule of thumb is that a higher k will be needed for datasets with more sub-structure.", - "fa_icon": "fas fa-calculator" + "help_text": "See rliger::removeMissingObs()." }, "integ_lambda": { "type": "number", - "default": 5.0, + "default": 5, "description": "Regularization parameter.", - "help_text": "See rliger::optimizeALS(). Regularization parameter. Larger values penalize dataset-specific effects more strongly (ie. alignment should increase as lambda increases). Run suggestLambda to determine most appropriate value for balancing dataset alignment and agreement (default 5.0).", - "fa_icon": "fas fa-calculator" + "help_text": "See rliger::optimizeALS(). Regularization parameter. 
Larger values penalize dataset-specific effects more strongly (ie. alignment should increase as lambda increases). Run suggestLambda to determine most appropriate value for balancing dataset alignment and agreement (default 5.0)." }, "integ_thresh": { "type": "number", "default": 0.0001, "description": "Convergence threshold.", - "help_text": "See rliger::optimizeALS().", - "fa_icon": "fas fa-calculator" + "help_text": "See rliger::optimizeALS()." }, "integ_max_iters": { "type": "integer", "default": 100, "description": "Maximum number of block coordinate descent iterations.", - "help_text": "See rliger::optimizeALS().", - "fa_icon": "fas fa-less-than-equal" + "help_text": "See rliger::optimizeALS()." }, "integ_nrep": { "type": "integer", "default": 1, "description": "Number of restarts to perform.", - "help_text": "See rliger::optimizeALS().", - "fa_icon": "fas fa-calculator" + "help_text": "See rliger::optimizeALS()." }, "integ_rand_seed": { "type": "integer", "default": 1, - "description": "Random seed for reproducible results.", - "fa_icon": "fas fa-calculator" + "description": "Random seed for reproducible results." }, - "integ_knn_k": { - "type": "integer", - "default": 20, - "description": "Number of neearest neighbours for within-dataset knn graph.", - "help_text": "See rliger::quantile_norm().", - "fa_icon": "fas fa-calculator" - }, - "integ_k2": { + "integ_quantiles": { "type": "integer", - "default": 500, - "description": "Horizon parameter for shared nearest factor graph.", - "help_text": "See rliger::quantileAlignSNF(). 
Distances to all but the k2 nearest neighbors are set to 0 (cuts down on memory usage for very large graphs).", - "fa_icon": "fas fa-calculator" - }, - "integ_prune_thresh": { - "type": "number", - "default": 0.2, - "description": "Minimum allowed edge weight.", - "help_text": "See rliger::quantileAlignSNF().", - "fa_icon": "fas fa-greater-than-equal" + "default": 50, + "description": "Number of quantiles to use for normalization.", + "help_text": "See rliger::quantile_norm()." }, "integ_ref_dataset": { "type": "string", + "default": "NULL", "description": "Name of dataset to use as a reference.", - "help_text": "See rliger::quantile_norm(). Name of dataset to use as a \"reference\" for normalization. By default, the dataset with the largest number of cells is used.", - "fa_icon": "fas fa-quote-left" + "help_text": "See rliger::quantile_norm(). Name of dataset to use as a \"reference\" for normalization. By default, the dataset with the largest number of cells is used." }, "integ_min_cells": { "type": "integer", "default": 2, "description": "Minimum number of cells to consider a cluster shared across datasets.", - "help_text": "See rliger::quantile_norm().", - "fa_icon": "fas fa-greater-than-equal" + "help_text": "See rliger::quantile_norm()." }, - "integ_quantiles": { + "integ_knn_k": { "type": "integer", - "default": 50, - "description": "Number of quantiles to use for normalization.", - "help_text": "See rliger::quantile_norm().", - "fa_icon": "fas fa-calculator" + "default": 20, + "description": "Number of nearest neighbours for within-dataset knn graph.", + "help_text": "See rliger::quantile_norm()." }, - "integ_nstart": { - "type": "integer", - "default": 10, - "description": "Number of times to perform Louvain community detection.", - "help_text": "See rliger::quantileAlignSNF().
Number of times to perform Louvain community detection with different random starts (default 10).", - "fa_icon": "fas fa-recycle" + "integ_center": { + "type": "string", + "default": "false", + "description": "Center the data when scaling factors.", + "help_text": "See rliger::quantile_norm()." }, "integ_resolution": { "type": "integer", "default": 1, "description": "Controls the number of communities detected.", - "help_text": "See rliger::quantileAlignSNF().", - "fa_icon": "fas fa-calculator" - }, - "integ_dims_use": { - "type": "string", - "description": "Indices of factors to use for shared nearest factor determination.", - "help_text": "See rliger::quantile_norm().", - "fa_icon": "fas fa-calculator" - }, - "integ_dist_use": { - "type": "string", - "default": "CR", - "description": "Distance metric to use in calculating nearest neighbour.", - "help_text": "See rliger::quantileAlignSNF(). Default \"CR\".", - "fa_icon": "fas fa-digital-tachograph" - }, - "integ_center": { - "type": "boolean", - "description": "Center the data when scaling factors.", - "help_text": "See rliger::quantile_norm().", - "fa_icon": "fas fa-compress-arrows-alt" - }, - "integ_small_clust_thresh": { - "type": "integer", - "help_text": "See rliger::quantileAlignSNF(). Extracts small clusters loading highly on single factor with fewer cells than this before regular alignment (default 0 \u2013 no small cluster extraction).", - "description": "Small cluster extraction cells threshold.", - "fa_icon": "fas fa-calculator" + "help_text": "See rliger::quantileAlignSNF()." 
}, "integ_categorical_covariates": { "type": "string", "default": "individual,diagnosis,region,sex", "description": "Categorical variables for integration report metrics.", - "help_text": "The integration report will provide plots and integration metrics for these categorical variables.", - "fa_icon": "fas fa-layer-group" + "help_text": "The integration report will provide plots and integration metrics for these categorical variables." }, "integ_input_reduced_dim": { "type": "string", "default": "UMAP", "description": "Reduced dimension embedding for the integration report.", - "help_text": "The integration report will provide with and without integration plots using this embedding.", - "fa_icon": "fas fa-chess-board" + "help_text": "The integration report will provide with and without integration plots using this embedding." } }, "fa_icon": "far fa-object-group", "required": [ "integ_method", + "integ_k", "integ_unique_id_var", "integ_take_gene_union", "integ_remove_missing", "integ_num_genes", "integ_combine", - "integ_keep_unique", "integ_capitalize", "integ_use_cols", - "integ_k", "integ_lambda", "integ_thresh", "integ_max_iters", "integ_nrep", "integ_rand_seed", - "integ_knn_k", - "integ_k2", - "integ_prune_thresh", - "integ_min_cells", "integ_quantiles", - "integ_nstart", - "integ_resolution", - "integ_dist_use", + "integ_ref_dataset", + "integ_min_cells", + "integ_knn_k", "integ_center", + "integ_resolution", "integ_categorical_covariates", "integ_input_reduced_dim" ] @@ -592,42 +522,36 @@ "reddim_input_reduced_dim": { "type": "string", "default": "PCA,Liger", - "description": "Input matrix for dimension reduction.", - "fa_icon": "fas fa-chess-board" + "description": "Input matrix for dimension reduction." 
}, "reddim_reduction_methods": { "type": "string", "default": "tSNE,UMAP,UMAP3D", "description": "Dimension reduction outputs to generate.", - "help_text": "Typically 'UMAP,UMAP3D' or 'tSNE'.", - "fa_icon": "fas fa-toolbox" + "help_text": "Typically 'UMAP,UMAP3D' or 'tSNE'." }, "reddim_vars_to_regress_out": { "type": "string", "default": "nCount_RNA,pc_mito", - "description": "Variables to regress out before dimension reduction.", - "fa_icon": "fas fa-layer-group" + "description": "Variables to regress out before dimension reduction." }, "reddim_umap_pca_dims": { "type": "integer", "default": 30, "description": "Number of PCA dimensions.", - "help_text": "See uwot::umap().", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap()." }, "reddim_umap_n_neighbors": { "type": "integer", "default": 35, "description": "Number of nearest neighbours to use.", - "help_text": "See uwot::umap().", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap()." }, "reddim_umap_n_components": { "type": "integer", "default": 2, "description": "The dimension of the space to embed into.", - "help_text": "See uwot::umap(). The dimension of the space to embed into. This defaults to 2 to provide easy visualization, but can reasonably be set to any integer value in the range 2 to 100.", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap(). The dimension of the space to embed into. This defaults to 2 to provide easy visualization, but can reasonably be set to any integer value in the range 2 to 100." 
}, "reddim_umap_init": { "type": "string", @@ -643,8 +567,7 @@ "pca", "spca", "agspectral" - ], - "fa_icon": "fas fa-calculator" + ] }, "reddim_umap_metric": { "type": "string", @@ -658,169 +581,147 @@ "hamming", "correlation", "categorical" - ], - "fa_icon": "fas fa-digital-tachograph" + ] }, "reddim_umap_n_epochs": { "type": "integer", "default": 200, "description": "Number of epochs to us during optimization of embedded coordinates.", - "help_text": "See uwot::umap().", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap()." }, "reddim_umap_learning_rate": { "type": "integer", "default": 1, "description": "Initial learning rate used in optimization of coordinates.", - "help_text": "See uwot::umap().", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap()." }, "reddim_umap_min_dist": { "type": "number", "default": 0.4, "description": "Effective minimum distance between embedded points.", - "help_text": "See uwot::umap(). Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the spread value, which determines the scale at which embedded points will be spread out.", - "fa_icon": "fas fa-greater-than-equal" + "help_text": "See uwot::umap(). Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the spread value, which determines the scale at which embedded points will be spread out." }, "reddim_umap_spread": { "type": "number", "default": 0.85, "description": "Effective scale of embedded points.", - "help_text": "See uwot::umap(). In combination with min_dist, this determines how clustered/clumped the embedded points are.", - "fa_icon": "fas fa-arrows-alt-h" + "help_text": "See uwot::umap(). 
In combination with min_dist, this determines how clustered/clumped the embedded points are." }, "reddim_umap_set_op_mix_ratio": { "type": "number", - "default": 1.0, + "default": 1, "description": "Interpolation to combine local fuzzy sets.", "help_text": "See uwot::umap(). The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.", "minimum": 0, - "maximum": 1, - "fa_icon": "fas fa-adjust" + "maximum": 1 }, "reddim_umap_local_connectivity": { "type": "integer", "default": 1, "description": "Local connectivity required.", - "help_text": "See uwot::umap(). The local connectivity required \u2013 i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally.", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap(). The local connectivity required \u2013 i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally." }, "reddim_umap_repulsion_strength": { "type": "integer", "default": 1, "description": "Weighting applied to negative samples in embedding optimization.", - "help_text": "See uwot::umap(). Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples.", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap(). Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples." }, "reddim_umap_negative_sample_rate": { "type": "integer", "default": 5, "description": "Number of negative edge samples to use per positive edge sample.", - "help_text": "See uwot::umap(). 
The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding.", - "fa_icon": "fas fa-calculator" + "help_text": "See uwot::umap(). The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding." }, "reddim_umap_fast_sgd": { - "type": "boolean", + "type": "string", + "default": "false", "description": "Use fast SGD.", - "help_text": "See uwot::umap(). Setting this to TRUE will speed up the stochastic optimization phase, but give a potentially less accurate embedding, and which will not be exactly reproducible even with a fixed seed. For visualization, fast_sgd = TRUE will give perfectly good results. For more generic dimensionality reduction, it's safer to leave fast_sgd = FALSE.", - "fa_icon": "fas fa-skiing" + "help_text": "See uwot::umap(). Setting this to TRUE will speed up the stochastic optimization phase, but give a potentially less accurate embedding, and which will not be exactly reproducible even with a fixed seed. For visualization, fast_sgd = TRUE will give perfectly good results. For more generic dimensionality reduction, it's safer to leave fast_sgd = FALSE." }, "reddim_tsne_dims": { "type": "integer", "default": 2, "description": "Output dimensionality.", - "help_text": "See Rtsne::Rtsne().", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne()." }, "reddim_tsne_initial_dims": { "type": "integer", "default": 50, "description": "Number of dimensions retained in the initial PCA step.", - "help_text": "See Rtsne::Rtsne().", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne()." }, "reddim_tsne_perplexity": { "type": "integer", "default": 150, "description": "Perplexity parameter.", - "help_text": "See Rtsne::Rtsne().", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne()." 
}, "reddim_tsne_theta": { "type": "number", "default": 0.5, "description": "Speed/accuracy trade-off.", - "help_text": "See Rtsne::Rtsne(). Speed/accuracy trade-off (increase for less accuracy), set to 0.0 for exact TSNE (default: 0.5).", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne(). Speed/accuracy trade-off (increase for less accuracy), set to 0.0 for exact TSNE (default: 0.5)." }, "reddim_tsne_stop_lying_iter": { "type": "integer", "default": 250, "description": "Iteration after which perplexities are no longer exaggerated.", - "help_text": "See Rtsne::Rtsne(). Iteration after which the perplexities are no longer exaggerated (default: 250, except when Y_init is used, then 0).", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne(). Iteration after which the perplexities are no longer exaggerated (default: 250, except when Y_init is used, then 0)." }, "reddim_tsne_mom_switch_iter": { "type": "integer", "default": 250, "description": "Iteration after which the final momentum is used.", - "help_text": "See Rtsne::Rtsne(). Iteration after which the final momentum is used (default: 250, except when Y_init is used, then 0).", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne(). Iteration after which the final momentum is used (default: 250, except when Y_init is used, then 0)." }, "reddim_tsne_max_iter": { "type": "integer", "default": 1000, "description": "Number of iterations.", - "help_text": "See Rtsne::Rtsne(). ", - "fa_icon": "fas fa-less-than-equal" + "help_text": "See Rtsne::Rtsne(). " }, "reddim_tsne_pca_center": { - "type": "boolean", - "default": true, + "type": "string", + "default": "true", "description": "Center data before PCA.", - "help_text": "See Rtsne::Rtsne(). Should data be centered before pca is applied? (default: TRUE)", - "fa_icon": "fas fa-compress-arrows-alt" + "help_text": "See Rtsne::Rtsne(). Should data be centered before pca is applied? 
(default: TRUE)" }, "reddim_tsne_pca_scale": { - "type": "boolean", + "type": "string", + "default": "false", "description": "Scale data before PCA.", - "help_text": "See Rtsne::Rtsne(). Should data be scaled before pca is applied? (default: FALSE).", - "fa_icon": "fas fa-balance-scale" + "help_text": "See Rtsne::Rtsne(). Should data be scaled before pca is applied? (default: FALSE)." }, "reddim_tsne_normalize": { - "type": "boolean", - "default": true, + "type": "string", + "default": "true", "description": "Normalize data before distance calculations.", - "help_text": "See Rtsne::Rtsne(). Should data be normalized internally prior to distance calculations with normalize_input? (default: TRUE)", - "fa_icon": "fas fa-balance-scale" + "help_text": "See Rtsne::Rtsne(). Should data be normalized internally prior to distance calculations with normalize_input? (default: TRUE)" }, "reddim_tsne_momentum": { "type": "number", "default": 0.5, "description": "Momentum used in the first part of optimization.", - "help_text": "See Rtsne::Rtsne(). ", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne(). " }, "reddim_tsne_final_momentum": { "type": "number", "default": 0.8, "description": "Momentum used in the final part of optimization.", - "help_text": "See Rtsne::Rtsne(). ", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne(). " }, "reddim_tsne_eta": { "type": "integer", "default": 1000, "description": "Learning rate.", - "help_text": "See Rtsne::Rtsne(). ", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne(). " }, "reddim_tsne_exaggeration_factor": { "type": "integer", "default": 12, "description": "Exaggeration factor used in the first part of the optimization.", - "help_text": "See Rtsne::Rtsne(). Exaggeration factor used to multiply the P matrix in the first part of the optimization (default: 12.0).", - "fa_icon": "fas fa-calculator" + "help_text": "See Rtsne::Rtsne(). 
Exaggeration factor used to multiply the P matrix in the first part of the optimization (default: 12.0)." } }, "fa_icon": "fas fa-cubes", @@ -868,34 +769,29 @@ "type": "string", "default": "leiden", "description": "Clustering method.", - "help_text": "Specify \"leiden\" or \"louvain\".", - "fa_icon": "fas fa-toolbox" + "help_text": "Specify \"leiden\" or \"louvain\"." }, "clust_reduction_method": { "type": "string", "default": "UMAP_Liger", "description": "Reduced dimension input(s) for clustering.", - "help_text": "One or more of \"UMAP\", \"tSNE\", \"PCA\", \"LSI\".", - "fa_icon": "fas fa-chess-board" + "help_text": "One or more of \"UMAP\", \"tSNE\", \"PCA\", \"LSI\"." }, "clust_res": { "type": "number", "default": 0.001, - "description": "The resolution of clustering.", - "fa_icon": "fas fa-calculator" + "description": "The resolution of clustering." }, "clust_k": { "type": "integer", "default": 50, "description": "Integer number of nearest neighbours for clustering.", - "help_text": "Integer number of nearest neighbors to use when creating the k nearest neighbor graph for Louvain/Leiden clustering. k is related to the resolution of the clustering result, a bigger k will result in lower resolution and vice versa.", - "fa_icon": "fas fa-calculator" + "help_text": "Integer number of nearest neighbors to use when creating the k nearest neighbor graph for Louvain/Leiden clustering. k is related to the resolution of the clustering result, a bigger k will result in lower resolution and vice versa." }, "clust_louvain_iter": { "type": "integer", "default": 1, - "description": "The number of iterations for clustering.", - "fa_icon": "fas fa-recycle" + "description": "The number of iterations for clustering." 
} }, "fa_icon": "fas fa-braille", @@ -916,44 +812,37 @@ "cta_clusters_colname": { "type": "string", "default": "clusters", - "description": "SingleCellExperiment clusters colData variable name.", - "fa_icon": "fas fa-quote-left" + "description": "SingleCellExperiment clusters colData variable name." }, "cta_cells_to_sample": { "type": "integer", "default": 10000, - "description": "Max cells to sample.", - "fa_icon": "fas fa-calculator" + "description": "Max cells to sample." }, "cta_unique_id_var": { "type": "string", "default": "individual", - "description": "A sample metadata unique sample ID.", - "fa_icon": "fas fa-key" + "description": "A sample metadata unique sample ID." }, "cta_celltype_var": { "type": "string", "default": "cluster_celltype", - "description": "SingleCellExperiment cell-type colData variable name.", - "fa_icon": "fas fa-quote-left" + "description": "SingleCellExperiment cell-type colData variable name." }, "cta_facet_vars": { "type": "string", "default": "manifest,diagnosis,sex,capdate,prepdate,seqdate", - "description": "Cell-type metrics for categorical variables.", - "fa_icon": "fas fa-layer-group" + "description": "Cell-type metrics for categorical variables." }, "cta_metric_vars": { "type": "string", "default": "pc_mito,pc_ribo,total_counts,total_features_by_counts", - "description": "Cell-type metrics for numeric variables.", - "fa_icon": "fas fa-layer-group" + "description": "Cell-type metrics for numeric variables." }, "cta_top_n": { "type": "integer", "default": 5, - "description": "Number of top marker genes for plot/table generation.", - "fa_icon": "fas fa-calculator" + "description": "Number of top marker genes for plot/table generation." } }, "fa_icon": "fas fa-brain", @@ -976,8 +865,7 @@ "dge_de_method": { "type": "string", "default": "MASTZLM", - "description": "Differential gene expression method.", - "fa_icon": "fas fa-toolbox" + "description": "Differential gene expression method." 
}, "dge_mast_method": { "type": "string", @@ -988,15 +876,13 @@ "glm", "glmer", "bayesglm" - ], - "fa_icon": "fas fa-toolbox" + ] }, "dge_min_counts": { "type": "integer", "default": 1, "description": "Expressive gene minimum counts.", - "help_text": "Only genes with at least min_counts in min_cells_pc will be tested for differential gene expression.", - "fa_icon": "fas fa-greater-than-equal" + "help_text": "Only genes with at least min_counts in min_cells_pc will be tested for differential gene expression." }, "dge_min_cells_pc": { "type": "number", @@ -1004,87 +890,83 @@ "minimum": 0, "maximum": 1, "description": "Expressive gene minimum cells fraction.", - "help_text": "Only genes with at least min_counts in min_cells_pc will be tested for differential gene expression. Default 0.1 (i.e. 10% of cells).", - "fa_icon": "fas fa-greater-than-equal" + "help_text": "Only genes with at least min_counts in min_cells_pc will be tested for differential gene expression. Default 0.1 (i.e. 10% of cells)." }, "dge_rescale_numerics": { - "type": "boolean", - "default": true, + "type": "string", + "default": "true", "description": "Re-scale numeric covariates.", - "help_text": "Re-scaling and centring numeric covariates in a model can improve model performance.", - "fa_icon": "fas fa-balance-scale" + "help_text": "Re-scaling and centring numeric covariates in a model can improve model performance." }, "dge_pseudobulk": { - "type": "boolean", + "type": "string", + "default": "false", "description": "Pseudobulked differential gene expression.", - "help_text": "Perform differential gene expression on a smaller matrix where counts are first summed across all cells within a sample (defined by dge_sample_var level).", - "fa_icon": "far fa-object-group" + "help_text": "Perform differential gene expression on a smaller matrix where counts are first summed across all cells within a sample (defined by dge_sample_var level)." 
}, "dge_celltype_var": { "type": "string", "default": "cluster_celltype", "description": "Cell-type annotation variable name.", - "help_text": "Differential gene expression is performed separately for each cell-type of this colData variable.", - "fa_icon": "fas fa-quote-left" + "help_text": "Differential gene expression is performed separately for each cell-type of this colData variable." }, "dge_sample_var": { "type": "string", "default": "manifest", - "description": "Unique sample identifier variable.", - "fa_icon": "fas fa-key" + "description": "Unique sample identifier variable." }, "dge_dependent_var": { "type": "string", "default": "group", "description": "Dependent variable of DGE model.", - "help_text": "The dependent variable may be a categorical (e.g. diagnosis) or a numeric (e.g. histopathology score) variable.", - "fa_icon": "fas fa-quote-left" + "help_text": "The dependent variable may be a categorical (e.g. diagnosis) or a numeric (e.g. histopathology score) variable." }, "dge_ref_class": { "type": "string", "default": "Control", "help_text": "If a categorical dependent variable is specified, then the reference class of the dependent variable is specified here (e.g. 'Control').", - "description": "Reference class of categorical dependent variable.", - "fa_icon": "fas fa-quote-left" + "description": "Reference class of categorical dependent variable." }, "dge_confounding_vars": { "type": "string", "default": "cngeneson,seqdate,pc_mito", "description": "Confounding variables.", - "help_text": "A comma-separated list of confounding variables to account for in the DGE model.", - "fa_icon": "fas fa-layer-group" + "help_text": "A comma-separated list of confounding variables to account for in the DGE model." 
}, "dge_random_effects_var": { "type": "string", + "default": "NULL", "description": "Random effect confounding variable.", - "help_text": "If specified, the term `+ (1 | x ) +`is added to the model, where x is the specified random effects variable.", - "fa_icon": "fas fa-quote-left" + "help_text": "If specified, the term `+ (1 | x ) +`is added to the model, where x is the specified random effects variable." }, "dge_fc_threshold": { "type": "number", "default": 1.1, "description": "Fold-change threshold for plotting.", - "help_text": "This absolute fold-change cut-off value is used in plots (e.g. volcano) and the DGE report.", - "fa_icon": "fas fa-calculator" + "help_text": "This absolute fold-change cut-off value is used in plots (e.g. volcano) and the DGE report." }, "dge_pval_cutoff": { "type": "number", "default": 0.05, "description": "Adjusted p-value cutoff.", - "help_text": "The adjusted p-value cutoff value is used in plots (e.g. volcano) and the DGE report.", - "fa_icon": "fas fa-less-than-equal" + "help_text": "The adjusted p-value cutoff value is used in plots (e.g. volcano) and the DGE report." + }, + "dge_n_label": { + "type": "number", + "default": 5, + "help_text": "The number of genes to label in plots (e.g. volcano) and the DGE report." }, "dge_force_run": { - "type": "boolean", + "type": "string", + "default": "false", "description": "Force model fit for non-full rank.", - "help_text": "A non-full rank model specification will return an error; to override this to return a warning only, set to TRUE.", - "fa_icon": "fas fa-exclamation" + "help_text": "A non-full rank model specification will return an error; to override this to return a warning only, set to TRUE." }, "dge_max_cores": { - "type": "integer", + "type": "string", + "default": "'null'", "description": "Maximum CPU cores.", - "help_text": "The default value of 'null' utilizes all available CPU cores. 
As each additional CPU core increases the number of genes simultaneously fit, the RAM/memory demand increases concomitantly. Manually overriding this parameter can reduce the memory demands of parallelization across multiple cores.", - "fa_icon": "fas fa-microchip" + "help_text": "The default value of 'null' utilizes all available CPU cores. As each additional CPU core increases the number of genes simultaneously fit, the RAM/memory demand increases concomitantly. Manually overriding this parameter can reduce the memory demands of parallelization across multiple cores." } }, "fa_icon": "fas fa-chart-bar", @@ -1100,9 +982,12 @@ "dge_dependent_var", "dge_ref_class", "dge_confounding_vars", + "dge_random_effects_var", "dge_fc_threshold", "dge_pval_cutoff", - "dge_force_run" + "dge_n_label", + "dge_force_run", + "dge_max_cores" ] }, "impacted_pathway_analysis": { @@ -1114,26 +999,18 @@ "ipa_enrichment_tool": { "type": "string", "default": "WebGestaltR", - "description": "Pathway enrichment tool(s) to use.", - "enum": [ - "WebGestaltR", - "ROntoTools", - "enrichR" - ], - "fa_icon": "fas fa-toolbox" + "description": "Pathway enrichment tool(s) to use." }, "ipa_enrichment_method": { "type": "string", "default": "ORA", - "description": "Enrichment method.", - "fa_icon": "fas fa-layer-group" + "description": "Enrichment method." }, "ipa_enrichment_database": { "type": "string", "default": "GO_Biological_Process", "description": "Database(s) to use for enrichment.", - "help_text": "See scFlow::list_databases(). Name of the database(s) for enrichment. Examples include \"GO_Biological_Process\", \"GO_Cellular_Component\", \"GO_Molecular_Function\", \"KEGG\", \"Reactome\", \"Wikipathway\".", - "fa_icon": "fas fa-layer-group" + "help_text": "See scFlow::list_databases(). Name of the database(s) for enrichment. Examples include \"GO_Biological_Process\", \"GO_Cellular_Component\", \"GO_Molecular_Function\", \"KEGG\", \"Reactome\", \"Wikipathway\"." 
} }, "fa_icon": "fas fa-project-diagram", @@ -1152,33 +1029,28 @@ "dirich_unique_id_var": { "type": "string", "default": "individual", - "description": "Unique sampler identifier.", - "fa_icon": "fas fa-key" + "description": "Unique sampler identifier." }, "dirich_celltype_var": { "type": "string", "default": "cluster_celltype", - "description": "Cell-type annotation variable name.", - "fa_icon": "fas fa-quote-left" + "description": "Cell-type annotation variable name." }, "dirich_dependent_var": { "type": "string", "default": "group", - "description": "Dependent variable of Dirichlet model.", - "fa_icon": "fas fa-quote-left" + "description": "Dependent variable of Dirichlet model." }, "dirich_ref_class": { "type": "string", "default": "Control", - "description": "Reference class of categorical dependent variable.", - "fa_icon": "fas fa-quote-left" + "description": "Reference class of categorical dependent variable." }, "dirich_var_order": { "type": "string", "default": "Control,Low,High", "description": "Dependent variable classes order.", - "help_text": "For plotting and reports, the order of classes for the dependent variable can be manually specified (e.g. 'Control,Low,High').", - "fa_icon": "fas fa-layer-group" + "help_text": "For plotting and reports, the order of classes for the dependent variable can be manually specified (e.g. 'Control,Low,High')." 
} }, "fa_icon": "fas fa-chart-pie", @@ -1401,6 +1273,11 @@ "description": "Send plain-text email instead of HTML.", "hidden": true, "fa_icon": "fas fa-envelope" + }, + "options": { + "type": "string", + "description": "NA", + "hidden": true } } } diff --git a/workflows/scflow.nf b/workflows/scflow.nf index 9f27efd..d79ff84 100644 --- a/workflows/scflow.nf +++ b/workflows/scflow.nf @@ -94,32 +94,25 @@ scflow_merge_options.args = def scflow_integrate_options = modules['scflow_integrate'] scflow_integrate_options.args = "--method ${params.integ_method} \ + --k ${params.integ_k} \ --unique_id_var ${params.integ_unique_id_var} \ --take_gene_union ${params.integ_take_gene_union} \ --remove_missing ${params.integ_remove_missing} \ --num_genes ${params.integ_num_genes} \ --combine ${params.integ_combine} \ - --keep_unique ${params.integ_keep_unique} \ --capitalize ${params.integ_capitalize} \ --use_cols ${params.integ_use_cols} \ - --k ${params.integ_k} \ --lambda ${params.integ_lambda} \ --thresh ${params.integ_thresh} \ --max_iters ${params.integ_max_iters} \ --nrep ${params.integ_nrep} \ --rand_seed ${params.integ_rand_seed} \ - --knn_k ${params.integ_knn_k} \ - --k2 ${params.integ_k2} \ - --prune_thresh ${params.integ_prune_thresh} \ + --quantiles ${params.integ_quantiles} \ --ref_dataset ${params.integ_ref_dataset} \ --min_cells ${params.integ_min_cells} \ - --quantiles ${params.integ_quantiles} \ - --nstart ${params.integ_nstart} \ - --resolution ${params.integ_resolution} \ - --dims_use ${params.integ_dims_use} \ - --dist_use ${params.integ_dist_use} \ + --knn_k ${params.integ_knn_k} \ --center ${params.integ_center} \ - --small_clust_thresh ${params.integ_small_clust_thresh}" + --resolution ${params.integ_resolution}" def scflow_reducedims_options = modules['scflow_reducedims'] scflow_reducedims_options.args = @@ -188,7 +181,8 @@ scflow_finalize_options.args = --metric_vars ${params.cta_metric_vars} \ --top_n ${params.cta_top_n} \ --reddimplot_pointsize 
${params.reddimplot_pointsize} \ - --reddimplot_alpha ${params.reddimplot_alpha}" + --reddimplot_alpha ${params.reddimplot_alpha} \ + --max_cores ${params.max_cores}" def scflow_dge_options = modules['scflow_dge'] scflow_dge_options.args = @@ -219,7 +213,10 @@ def scflow_ipa_options = modules['scflow_ipa'] scflow_ipa_options.args = "--enrichment_tool ${params.ipa_enrichment_tool} \ --enrichment_method ${params.ipa_enrichment_method} \ - --enrichment_database ${params.ipa_enrichment_database}" + --enrichment_database ${params.ipa_enrichment_database} \ + --pval_cutoff ${params.dge_pval_cutoff} \ + --fc_threshold ${params.dge_fc_threshold} \ + --species ${params.species}" def scflow_dirichlet_options = modules['scflow_dirichlet'] scflow_dirichlet_options.args = From ed044bb0c609be2ddc4ef391b5a36852156e015c Mon Sep 17 00:00:00 2001 From: nfancy Date: Mon, 11 Oct 2021 14:06:34 +0100 Subject: [PATCH 2/7] minor updates in cluster.nf --- bin/scflow_integrate.r | 2 +- modules/local/process/scflow/cluster.nf | 2 +- nextflow.config | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/scflow_integrate.r b/bin/scflow_integrate.r index 48ec53a..d0c2266 100755 --- a/bin/scflow_integrate.r +++ b/bin/scflow_integrate.r @@ -11,7 +11,7 @@ options(mc.cores = future::availableCores(methods = "mc.cores")) ## Load packages #### library(scFlow) library(argparse) -#library(parallel) +library(parallel) ## ............................................................................ 
## Parse command-line arguments #### diff --git a/modules/local/process/scflow/cluster.nf b/modules/local/process/scflow/cluster.nf index 9c2ab10..4f5f42e 100644 --- a/modules/local/process/scflow/cluster.nf +++ b/modules/local/process/scflow/cluster.nf @@ -10,7 +10,7 @@ def options = initOptions(params.options) process SCFLOW_CLUSTER { tag 'MERGED' - label 'process_low' + label 'process_medium' publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } diff --git a/nextflow.config b/nextflow.config index 7de6214..39a402b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,7 +18,7 @@ manifest { // Container slug. Stable releases should specify release tag! // Developmental code should specify :dev -process.container = 'almurphy/scfdev:dev' +process.container = 'almurphy/scfdev:0.7.1' //workDir = "/rds/general/user/$USER/ephemeral/tmp" workDir = './work' From 1b2fc11cd7c74c8c8cb241676ca7196500417695 Mon Sep 17 00:00:00 2001 From: nfancy Date: Mon, 11 Oct 2021 15:15:12 +0100 Subject: [PATCH 3/7] lint checks --- bin/scflow_dge.r | 65 +++++++++++++++----------- bin/scflow_ipa.r | 29 ++++++------ modules/local/get_software_versions.nf | 1 - 3 files changed, 52 insertions(+), 43 deletions(-) diff --git a/bin/scflow_dge.r b/bin/scflow_dge.r index ef4d27e..b7114b5 100755 --- a/bin/scflow_dge.r +++ b/bin/scflow_dge.r @@ -185,9 +185,11 @@ options("scflow_species" = args$species) args$rescale_numerics <- as.logical(args$rescale_numerics) args$pseudobulk <- as.logical(args$pseudobulk) args$force_run <- as.logical(args$force_run) -if(tolower(args$random_effects_var) == "null") args$random_effects_var <- NULL +if (tolower(args$random_effects_var) == "null") args$random_effects_var <- NULL -args$max_cores <- if(toupper(args$max_cores) == "NULL") NULL else { +args$max_cores <- if (toupper(args$max_cores) == "NULL") { + NULL +} else { 
as.numeric(as.character(args$max_cores)) } @@ -210,7 +212,6 @@ cli::cli_alert(sprintf( n_cores )) -# RhpcBLASctl::omp_set_num_threads(1L) library(scFlow) @@ -230,7 +231,9 @@ if (args$pseudobulk) { pb_str <- "_pb" sce_subset <- pseudobulk_sce( sce_subset, - keep_vars = c(args$dependent_var, args$confounding_vars, args$random_effects_var), + keep_vars = c(args$dependent_var, + args$confounding_vars, + args$random_effects_var), assay_name = "counts", celltype_var = args$celltype_var, sample_var = args$sample_var @@ -257,42 +260,48 @@ de_results <- perform_de( species = getOption("scflow_species") ) -file_name <- paste0(args$celltype, "_", - args$de_method, pb_str, "_") +file_name <- paste0( + args$celltype, "_", + args$de_method, pb_str, "_" +) for (result in names(de_results)) { if (dim(de_results[[result]])[[1]] > 0) { write.table(de_results[[result]], - file = file.path(getwd(), - paste0(file_name, result, "_DE.tsv")), - quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE) - + file = file.path( + getwd(), + paste0(file_name, result, "_DE.tsv") + ), + quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE + ) + report_de(de_results[[result]], - fc_threshold = args$fc_threshold, - pval_cutoff = args$pval_cutoff, - n_label = args$n_label, - report_folder_path = file.path(getwd()), - report_file = paste0(file_name, result, "_scflow_de_report")) - + fc_threshold = args$fc_threshold, + pval_cutoff = args$pval_cutoff, + n_label = args$n_label, + report_folder_path = file.path(getwd()), + report_file = paste0(file_name, result, "_scflow_de_report") + ) + print("report generated") - + p <- scFlow::volcano_plot( dt = de_results[[result]], fc_threshold = args$fc_threshold, - pval_cutoff = args$pval_cutoff, + pval_cutoff = args$pval_cutoff, n_label = args$n_label ) - ggplot2::ggsave(filename = file.path(getwd(), - paste0(file_name, result, "_volcano_plot.png")), - plot = p, - width = 7, height = 5, units = "in", dpi = 600) - + ggplot2::ggsave( + filename = 
file.path( + getwd(), + paste0(file_name, result, "_volcano_plot.png") + ), + plot = p, + width = 7, height = 5, units = "in", dpi = 600 + ) + print("Volcano plot generated") - - } else { print(sprintf("No DE genes found for %s", result)) } -} - - +} \ No newline at end of file diff --git a/bin/scflow_ipa.r b/bin/scflow_ipa.r index 10f6335..3b18b5e 100755 --- a/bin/scflow_ipa.r +++ b/bin/scflow_ipa.r @@ -117,17 +117,17 @@ dir.create(output_dir) dir.create(report_dir) for (gene_file in args$gene_file) { - dt <- read.delim(gene_file) - + dt <- dt %>% - dplyr::filter(padj <= args$pval_cutoff, - abs(logFC) >= log2(args$fc_threshold)) - - if (nrow(dt) < 5 ) { + dplyr::filter( + padj <= args$pval_cutoff, + abs(logFC) >= log2(args$fc_threshold) + ) + + if (nrow(dt) < 5) { cli::cli_alert_danger("Gene list is very short!") } else { - enrichment_result <- find_impacted_pathways( gene_file = dt, reference_file = NULL, @@ -138,22 +138,23 @@ for (gene_file in args$gene_file) { is_output = TRUE, output_dir = output_dir ) - + if (all(unlist(lapply( - enrichment_result, function(dt){ - isFALSE(dt$metadata$result)})))) { + enrichment_result, function(dt) { + isFALSE(dt$metadata$result) + } + )))) { cli::cli_alert_danger("No significant pathway was found at FDR 0.05") } else { - - report_name <- tools::file_path_sans_ext(gene_file) + report_name <- tools::file_path_sans_ext(gene_file) report_fp <- paste0(report_name, "_scflow_ipa_report") - + report_impacted_pathway( res = enrichment_result, report_folder_path = report_dir, report_file = report_fp ) - + cli::cli_text(c( "{cli::col_green(symbol$tick)} Analysis complete, output is found at: ", "{.file {output_dir}}" diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index 9acdf16..e910f09 100644 --- a/modules/local/get_software_versions.nf +++ b/modules/local/get_software_versions.nf @@ -12,7 +12,6 @@ process GET_SOFTWARE_VERSIONS { label 'process_tiny' errorStrategy 'ignore' //cache false - 
output: path "software_versions.tsv" , emit: tsv From 63028a1de9ff686d509de7a8044c6fd9f92dde7e Mon Sep 17 00:00:00 2001 From: nfancy Date: Mon, 11 Oct 2021 15:24:47 +0100 Subject: [PATCH 4/7] EClint checks --- bin/scflow_dge.r | 11 ++++-- bin/scflow_finalize_sce.r | 79 ++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/bin/scflow_dge.r b/bin/scflow_dge.r index b7114b5..d419394 100755 --- a/bin/scflow_dge.r +++ b/bin/scflow_dge.r @@ -231,9 +231,11 @@ if (args$pseudobulk) { pb_str <- "_pb" sce_subset <- pseudobulk_sce( sce_subset, - keep_vars = c(args$dependent_var, - args$confounding_vars, - args$random_effects_var), + keep_vars = c( + args$dependent_var, + args$confounding_vars, + args$random_effects_var + ), assay_name = "counts", celltype_var = args$celltype_var, sample_var = args$sample_var @@ -291,6 +293,7 @@ for (result in names(de_results)) { pval_cutoff = args$pval_cutoff, n_label = args$n_label ) + ggplot2::ggsave( filename = file.path( getwd(), @@ -304,4 +307,4 @@ for (result in names(de_results)) { } else { print(sprintf("No DE genes found for %s", result)) } -} \ No newline at end of file +} diff --git a/bin/scflow_finalize_sce.r b/bin/scflow_finalize_sce.r index ffa1743..aed989e 100755 --- a/bin/scflow_finalize_sce.r +++ b/bin/scflow_finalize_sce.r @@ -24,42 +24,42 @@ optional <- parser$add_argument_group("Optional", "required arguments") required$add_argument( "--sce_path", help = "-path to the SingleCellExperiment", - metavar = "dir", + metavar = "dir", required = TRUE ) required$add_argument( "--celltype_mappings", help = "path to a tsv file with revised celltype mappings", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--clusters_colname", help = "name of the column with cluster numbers", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--celltype_var", help = "name of the column with celltype names", - metavar = 
"foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--unique_id_var", help = "name of the column with unique sample ids", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--facet_vars", help = "names of variables to examine in the celltype metrics report", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) @@ -67,14 +67,14 @@ required$add_argument( required$add_argument( "--input_reduced_dim", help = "name of the reduced dimension slot to use for plots in the report", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) required$add_argument( "--metric_vars", help = "names of variables to examine in the celltype metrics report", - metavar = "foo/bar", + metavar = "foo/bar", required = TRUE ) @@ -83,7 +83,7 @@ required$add_argument( default = 5, type = "integer", required = TRUE, - help ="The number of top marker genes", + help = "The number of top marker genes", metavar = "N" ) @@ -123,7 +123,9 @@ args$metric_vars <- strsplit(args$metric_vars, ",")[[1]] options("scflow_reddimplot_pointsize" = args$reddimplot_pointsize) options("scflow_reddimplot_alpha" = args$reddimplot_alpha) -args$max_cores <- if(toupper(args$max_cores) == "NULL") NULL else { +args$max_cores <- if (toupper(args$max_cores) == "NULL") { + NULL +} else { as.numeric(as.character(args$max_cores)) } @@ -192,67 +194,66 @@ celltypes <- as.data.frame(SummarizedExperiment::colData(sce)) %>% colnames(celltypes) <- c("celltype", "n_cells") write.table( - data.frame(celltypes), - file = "celltypes.tsv", - row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") + data.frame(celltypes), + file = "celltypes.tsv", + row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t" +) ### Save Marker Gene Plots folder_path <- file.path(getwd(), "celltype_marker_plots") dir.create(folder_path) for (group in names(sce@metadata$markers)) { - - pwidth <- max(10, - 
length(unique(sce@metadata$markers[[group]]$marker_plot$data$Group)) + pwidth <- max( + 10, + length(unique(sce@metadata$markers[[group]]$marker_plot$data$Group)) ) pheight <- length(unique(sce@metadata$markers[[group]]$marker_plot$data$Gene)) - + p <- sce@metadata$markers[[group]]$marker_plot - + plot_file_name <- paste0(group, "_markers") - + # save PNG - png(file.path(folder_path, paste0(plot_file_name, ".png")), - width = pwidth * 12, height = pheight*5, units = "mm", res = 600) + png(file.path(folder_path, paste0(plot_file_name, ".png")), + width = pwidth * 12, height = pheight * 5, units = "mm", res = 600 + ) print(p) dev.off() - + # save PDF ggsave( file.path(folder_path, paste0(group, ".pdf")), - p, - width = pwidth * 12, - height = pheight * 5, - units = "mm", + p, + width = pwidth * 12, + height = pheight * 5, + units = "mm", scale = 1 ) - } ### Save Marker Gene Tables folder_path <- file.path(getwd(), "celltype_marker_tables") dir.create(folder_path) for (group in names(sce@metadata$markers)) { - marker_test_file_name <- paste0(group, "_markers_test.tsv") top_markers_file_name <- paste0(group, "_top_markers.tsv") - + write.table( - sce@metadata$markers[[group]]$marker_test_res, - file = file.path(folder_path, marker_test_file_name), - row.names = FALSE, - col.names = TRUE, + sce@metadata$markers[[group]]$marker_test_res, + file = file.path(folder_path, marker_test_file_name), + row.names = FALSE, + col.names = TRUE, sep = "\t" ) - + write.table( - sce@metadata$markers[[group]]$top_specific_markers, - file = file.path(folder_path, top_markers_file_name), - row.names = FALSE, - col.names = TRUE, + sce@metadata$markers[[group]]$top_specific_markers, + file = file.path(folder_path, top_markers_file_name), + row.names = FALSE, + col.names = TRUE, sep = "\t" ) - } From 5d218aba1c5750525d83eaa080c0793023ffb471 Mon Sep 17 00:00:00 2001 From: nfancy Date: Mon, 11 Oct 2021 15:28:36 +0100 Subject: [PATCH 5/7] EClint checks --- bin/scflow_finalize_sce.r | 2 +- 
bin/scflow_ipa.r | 2 +- modules/local/get_software_versions.nf | 2 +- nextflow.config | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/scflow_finalize_sce.r b/bin/scflow_finalize_sce.r index aed989e..d82cac2 100755 --- a/bin/scflow_finalize_sce.r +++ b/bin/scflow_finalize_sce.r @@ -264,4 +264,4 @@ write_sce( ) ## ............................................................................ -## Clean up #### +## Clean up #### \ No newline at end of file diff --git a/bin/scflow_ipa.r b/bin/scflow_ipa.r index 3b18b5e..5e09573 100755 --- a/bin/scflow_ipa.r +++ b/bin/scflow_ipa.r @@ -161,4 +161,4 @@ for (gene_file in args$gene_file) { )) } } -} \ No newline at end of file +} diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index e910f09..76a74f5 100644 --- a/modules/local/get_software_versions.nf +++ b/modules/local/get_software_versions.nf @@ -22,4 +22,4 @@ process GET_SOFTWARE_VERSIONS { echo $workflow.nextflow.version > nextflow.version.txt scrape_software_versions.r software_versions.tsv """ -} \ No newline at end of file +} diff --git a/nextflow.config b/nextflow.config index db640b7..c90fd42 100644 --- a/nextflow.config +++ b/nextflow.config @@ -192,4 +192,4 @@ def check_max(obj, type) { return obj } } -} \ No newline at end of file +} From 08e388c3ab7958c0efb95028ce26a9f572712bbc Mon Sep 17 00:00:00 2001 From: nfancy Date: Mon, 11 Oct 2021 15:29:33 +0100 Subject: [PATCH 6/7] EClint checks --- bin/scflow_finalize_sce.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scflow_finalize_sce.r b/bin/scflow_finalize_sce.r index d82cac2..aed989e 100755 --- a/bin/scflow_finalize_sce.r +++ b/bin/scflow_finalize_sce.r @@ -264,4 +264,4 @@ write_sce( ) ## ............................................................................ 
-## Clean up #### \ No newline at end of file +## Clean up #### From 4eb8e301864ca436d5980d4ba6bd6883bad8770c Mon Sep 17 00:00:00 2001 From: nfancy Date: Mon, 11 Oct 2021 15:30:35 +0100 Subject: [PATCH 7/7] EClint checks --- bin/scflow_finalize_sce.r | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/scflow_finalize_sce.r b/bin/scflow_finalize_sce.r index aed989e..2b4e473 100755 --- a/bin/scflow_finalize_sce.r +++ b/bin/scflow_finalize_sce.r @@ -263,5 +263,3 @@ write_sce( folder_path = file.path(getwd(), "final_sce") ) -## ............................................................................ -## Clean up ####