diff --git a/.gitignore b/.gitignore index de1abeac..43a5787a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,8 @@ scpca-references/ # ignore template htmls -qc_report.html +main_qc_report.html +celltypes_supplemental_report.html *_qc.html # ignore hidden `DS_Store` diff --git a/main.nf b/main.nf index 3d21e686..05973dba 100644 --- a/main.nf +++ b/main.nf @@ -26,7 +26,7 @@ cellhash_techs = single_cell_techs.findAll{it.startsWith('cellhash')} // report template path report_template_dir = file("${projectDir}/templates/qc_report", type: 'dir') -report_template_file = "qc_report.rmd" +report_template_file = "main_qc_report.rmd" report_template_tuple = tuple(report_template_dir, report_template_file) diff --git a/templates/qc_report/celltypes_qc.rmd b/templates/qc_report/celltypes_qc.rmd index b02bc35e..708113c0 100644 --- a/templates/qc_report/celltypes_qc.rmd +++ b/templates/qc_report/celltypes_qc.rmd @@ -1,5 +1,122 @@ # Cell type Annotation Summary +```{r} +## function definitions ## + +# Source functions for preparing cell type data +source(file.path("utils", "celltype_functions.R")) + +#' Create tables of cell type annotation counts +#' +#' @param df Data frame with cell types +#' @param celltype_column Column with cell type annotations, not a string. +#' +#' @return table with cell type counts +create_celltype_n_table <- function(df, celltype_column) { + df |> + dplyr::count({{ celltype_column }}) |> + # Add percentage column + dplyr::mutate( + `Percent of cells` = paste0(round(n / sum(n) * 100, digits = 2), "%") + ) |> + # set column order & rename + dplyr::select( + `Annotated cell type` = {{ celltype_column }}, + `Number of cells` = n, + `Percent of cells` + ) +} + +#' Format tables of cell type counts as kable +#' +#' @param df Data frame to format +#' +#' @return kable table of cell type counts +format_celltype_n_table <- function(df) { + df |> + knitr::kable(align = "r") |> + kableExtra::kable_styling( + bootstrap_options = "striped", + full_width = FALSE, + position = "left" + ) |> + kableExtra::column_spec(2, monospace = TRUE) +} + +#' Function to lump celltype columns in an existing data frame for all of the +#' following columns, if they exist: `_celltype_annotation`. +#' The resulting lumped column will be named: +#' `_celltype_annotation_lumped`. +#' +#' +#' @param df Data frame to manipulate +#' @param n_celltypes Number of groups to lump into, with rest put into "Other" group. Default is 7. +#' +#' @return Updated df with new column of lumped celltypes for each present method +lump_celltypes <- function(df, n_celltypes = 7) { + + # Update the df for each of the following, if present + methods <- c("singler", "cellassign", "submitter") + df <- df |> + dplyr::mutate( + across( + ends_with("_celltype_annotation"), + \(x) forcats::fct_lump_n(x, n_celltypes, other_level = "Other cell type"), + .names = "{.col}_lumped" + ) + ) + + return(df) +} + + + + + +#' Make UMAP colored by given variable +#' +#' @param umap_df Data frame with UMAP1 and UMAP2 columns +#' @param color_variable Column in data frame to color by, not a string. +#' @param legend_title Title for legend. +#' @param legend_nrow Number of rows in legend. Default is 2. +#' +#' @return UMAP plot as a ggplot2 object +plot_umap <- function( + umap_df, + color_variable, + legend_title, + legend_nrow = 2 +){ + ggplot(umap_df) + + aes( + x = UMAP1, + y = UMAP2, + color = {{ color_variable }} + ) + + geom_point( + size = 0.3, + alpha = 0.5 + ) + + # remove axis numbers and background grid + scale_x_continuous(labels = NULL, breaks = NULL) + + scale_y_continuous(labels = NULL, breaks = NULL) + + coord_fixed() + + guides( + color = guide_legend( + title = legend_title, + nrow = legend_nrow, + # more visible points in legend + override.aes = list( + alpha = 1, + size = 1.5 + ) + ) + ) + + theme(legend.position = "bottom") +} +``` + + @@ -7,7 +124,7 @@ ```{r, eval = has_submitter & !(has_singler | has_cellassign)} knitr::asis_output( glue::glue(" - This section details cell type annotations provided by the original lab ('submitter') which generated this data. + This section details cell type annotations provided by the original lab (`submitter-provided`) which generated this data. ") ) ``` @@ -55,339 +172,46 @@ knitr::asis_output( ) ``` +For additional information about cell typing, including diagnostic plots and/or heatmap comparisons among annotations (if available), please refer to the [supplementary cell type QC report](`r params$celltype_report`). ## Statistics -Below, cells labeled as "Unknown cell type" are those which could not be confidently identified using the given annotation method. - -```{r} -# This chunk defines a helper function to reformat annotations: -# * Annotations that are `NA`/"other" are changed to "Unknown cell type" -# * Annotations are converted to a factor ordered by frequency, but with -# "Unknown cell type" always last. -prepare_annotation_values <- function(df, annotation_column) { - df |> - dplyr::mutate( - {{ annotation_column }} := dplyr::case_when( - # singler condition - is.na({{ annotation_column }}) ~ "Unknown cell type", - # cellassign conditon - {{ annotation_column }} == "other" ~ "Unknown cell type", - # otherwise, keep it - TRUE ~ {{ annotation_column }} - ) |> - forcats::fct_infreq() |> - forcats::fct_relevel("Unknown cell type", after = Inf) - ) -} -``` - ```{r} # Create data frame of cell types -celltype_df <- colData(processed_sce) |> - as.data.frame() |> - # barcodes to a column - tibble::rownames_to_column(var = "barcode") |> - # keep only cell name, celltyping, and clusters - dplyr::select( - barcode, - clusters, - contains("singler"), - contains("cellassign"), - contains("submitter") - ) - - - -if (has_singler) { - celltype_df <- celltype_df |> - prepare_annotation_values(singler_celltype_annotation) -} -if (has_cellassign) { - celltype_df <- celltype_df |> - prepare_annotation_values(cellassign_celltype_annotation) -} -``` - -```{r} -# Define a helper function to create tables for singler and cellassign annotations -create_celltype_n_table <- function(df, celltype_column) { - df |> - dplyr::count({{ celltype_column }}) |> - # Add percentage column - dplyr::mutate( - `Percent of cells` = paste0(round(n / sum(n) * 100, digits = 2), "%") - ) |> - # set column order & rename - dplyr::select( - `Annotated cell type` = {{ celltype_column }}, - `Number of cells` = n, - `Percent of cells` - ) |> - # kable formatting - knitr::kable(align = "r") |> - kableExtra::kable_styling( - bootstrap_options = "striped", - full_width = FALSE, - position = "left" - ) |> - kableExtra::column_spec(2, monospace = TRUE) -} -``` - - - -```{r, eval = has_singler} -knitr::asis_output("### `SingleR` cell type annotations\n") -create_celltype_n_table(celltype_df, singler_celltype_annotation) +celltype_df <- create_celltype_df(processed_sce) ``` -```{r, eval = has_cellassign} -knitr::asis_output("### `CellAssign` cell type annotations\n") -create_celltype_n_table(celltype_df, cellassign_celltype_annotation) -``` ```{r, eval = has_submitter} knitr::asis_output(' ### Submitter-provided cell type annotations\n -Below, cells labeled "Unclassified cell" are those for which submitters did not provide an annotation. - ') -create_celltype_n_table(celltype_df, submitter_celltype_annotation) -``` - - -```{r, eval = has_singler | has_cellassign } -knitr::asis_output(" -## Annotation Assessment - -In this section, we assess the reliability of cell type annotations using diagnostic plots. -") +In this table, cells labeled "Unclassified cell" are those for which submitters did not provide an annotation. +') +create_celltype_n_table(celltype_df, submitter_celltype_annotation) |> + format_celltype_n_table() ``` ```{r, eval = has_singler} -knitr::asis_output(" -### `SingleR` assessment - -`SingleR` assigns cell type scores based on Spearman correlations across features in the reference dataset. -We evaluate the reliability of cell type annotations using the per-cell _delta median_ statistic, which is the difference between the score for the cell's assigned label and the median score of all labels for the given cell. -Higher _delta median_ values indicate more confidence in the cell type annotation, although there is no specific threshold for calling absolute high vs. low confidence. -For more information, refer to the [`SingleR` book section on 'Annotation diagnostics'](https://bioconductor.org/books/release/SingleRBook/annotation-diagnostics.html#annotation-diagnostics). - - -In the plot below, each point is the _delta median_ statistic of a given cell with the given cell type annotation. -Points (cells) are colored by `SingleR`'s internal confidence assessment: High-quality cell annotations are shown in black, and low-quality cell annotations are shown in blue. -All blue points correspond to cells labeled as `Unknown cell type` in the `SingleR` result table in the previous section. -The red overlayed boxes represent the median ± interquartile range (IQR), specifically for high-quality annotations. -") +knitr::asis_output('### `SingleR` cell type annotations\n + +In this table, cells labeled as "Unknown cell type" are those which `SingleR` could not confidently identify. +') +create_celltype_n_table(celltype_df, singler_celltype_annotation) |> + format_celltype_n_table() ``` - -```{r, eval = has_singler, warning=FALSE, message=FALSE,fig.height = 6, fig.width = 8} -# Prepare SingleR scores for plot - -# extract scores into matrix -singler_scores <- metadata(processed_sce)$singler_result$scores - -# Create data frame for plotting with delta median and the full *non-pruned* cell labels -delta_median_df <- tibble::tibble( - delta_median = rowMaxs(singler_scores) - rowMedians(singler_scores), - # Need to grab the non-pruned label for this plot - full_labels = metadata(processed_sce)$singler_result$labels, - # TRUE pruned values ==> low confidence - # so, negate for this variable: - confident = !is.na(metadata(processed_sce)$singler_result$pruned.labels) -) - -# If ontologies were used for `full_labels`, we'll need to map back to cell names -# for the plot itself. -if ("singler_celltype_ontology" %in% names(celltype_df)) { - # we use inner_join b/c the above tibble does NOT contain "Unknown cell type", which - # we do not want to display here - delta_median_df <- delta_median_df |> - dplyr::inner_join( - tibble::tibble( - full_labels = celltype_df$singler_celltype_ontology, - celltype = celltype_df$singler_celltype_annotation - ) |> dplyr::distinct() - ) |> - dplyr::select(-full_labels) -} else { - # otherwise, full_labels already contain what we want to plot, so just rename it - delta_median_df <- delta_median_df |> - dplyr::rename(celltype = full_labels) - # still need to add levels though: - # unknown will still be present, but it's not in the data, so it won't matter - levels(delta_median_df$celltype) <- levels(celltype_df$singler_celltype_annotation) -} - -# Ensure we have no "Unknown cell type" values left: -if (sum(delta_median_df$celltype == "Unknown cell type") > 0) { - stop("Failed to process SingleR data for diagnostic plot.") -} - -# add column with ordered levels with wrapped labels for visualization -delta_median_df$annotation_wrapped <- factor( - delta_median_df$celltype, - levels = levels(delta_median_df$celltype), - labels = stringr::str_wrap(levels(delta_median_df$celltype), 30) -) - -# Subset the data to just confident points for median+/-IQR -delta_median_confident_df <- delta_median_df |> - dplyr::filter(confident) - -# Plot delta_median across celltypes colored by pruning -ggplot(delta_median_df) + - aes( - x = annotation_wrapped, - y = delta_median, - color = confident - ) + - ggforce::geom_sina( - size = 0.75, - alpha = 0.5, - # Keep red points mostly in line with black - position = position_dodge(width = 0.05) - ) + - labs( - x = "Cell type annotation", - y = "Delta median statistic", - color = "Confident cell type assignment" - ) + - scale_color_manual(values = c("blue", "black")) + - # add median/IQR - geom_boxplot( - data = delta_median_confident_df, # only use black points for median - color = "red", - width = 0.2, - size = 0.3, - alpha = 0, - # remove whiskers, outliers - outlier.shape = 0, - coef = 0 - ) + - guides( - color = guide_legend(override.aes = list(size = 1, alpha = 0.9)) - ) + - theme( - axis.text.x = element_text(angle = 55, hjust = 1, size = rel(0.85)), - legend.title = element_text(size = rel(0.75)), - legend.text = element_text(size = rel(0.75)), - legend.position = "bottom" - ) -``` - - - ```{r, eval = has_cellassign} -knitr::asis_output(" -### `CellAssign` assessment - -`CellAssign` computes the probability that each cell in the library is one of the provided cell types in the reference and ultimately annotates cells by assigning the cell type with the highest probability. -We therefore expect that cell-level probabilities that correspond to the annotated cell types will be high. -Conversely, we expect that cell-level probabilities that correspond to unannotated cell types will be low. - -In the plot below, we show distributions of the `CellAssign` probabilities for each assigned cell type, where the median is shown as a vertical line in each. -Colors represent whether the probabilities correspond to a cell with that given annotation: Purple distributions represent probabilies for cells that _were_ annotated as the given cell type. -Yellow distributions represent probabilities for cells that _were not_ annotated as the given cell type. -For distributions with fewer than three points, density plots cannot be calculated. -In these cases, we directly show values for individual cell probabilities as line segments. -") -``` - - -```{r, eval = has_cellassign, warning=FALSE, message=FALSE, fig.height = 8, fig.width = 7} -# Prepare CellAssign scores for plot -cellassign_prob_df <- metadata(processed_sce)$cellassign_predictions |> - # Change "other" to "Unknown cell type" - dplyr::rename(`Unknown cell type` = other) |> - tidyr::pivot_longer( - -barcode, - names_to = "celltype", - values_to = "probability" - ) |> - # remove cell types that were not annotated - dplyr::filter(celltype %in% celltype_df$cellassign_celltype_annotation) |> - # join with actual annotations - dplyr::inner_join( - tibble::tibble( - cellassign_celltype_annotation = celltype_df$cellassign_celltype_annotation, - barcode = colData(processed_sce)$barcodes # plural! - ) - ) |> - # add indicator for whether the celltype matches the annotation - dplyr::mutate( - annotated = cellassign_celltype_annotation == celltype - ) - -# reverse levels for ridgeplot layout, and wrap the labels -new_levels <- rev(levels(cellassign_prob_df$cellassign_celltype_annotation)) -cellassign_prob_df$annotation_wrapped <- factor( - cellassign_prob_df$celltype, - levels = new_levels, - labels = stringr::str_wrap(new_levels, 30) -) - -# find groups with <=2 cells to add back into plot -# These will always be `annotated = TRUE` -celltypes_leq2_cells <- cellassign_prob_df |> - dplyr::count(annotation_wrapped, annotated) |> - dplyr::filter(n <= 2) |> - dplyr::pull(annotation_wrapped) - -# data frame for adding those points back -leq2_probabilities_df <- cellassign_prob_df |> - dplyr::filter( - annotation_wrapped %in% celltypes_leq2_cells, - annotated - ) |> - dplyr::select(annotation_wrapped, probability, annotated) - -# Finally, the plot: -ggplot(cellassign_prob_df) + - aes( - x = probability, - y = annotation_wrapped, - fill = annotated - ) + - ggridges::stat_density_ridges( - quantile_lines = TRUE, - quantiles = 2, - alpha = 0.6, - # avoid overlap to extent possible in a template - - scale = 0.85 - ) + - scale_fill_viridis_d(direction = -1) + - labs( - x = "CellAssign probability", - y = "Annotated cell type", - fill = "Cell annotated as given cell type" - ) + - theme( - axis.text.y = element_text(size = 9), - legend.position = "bottom" - ) + - #### add line segment for N<3 distributions - geom_segment( - data = leq2_probabilities_df, - aes( - x = probability, - xend = probability, - y = annotation_wrapped, - yend = as.numeric(annotation_wrapped) + 0.25, - color = annotated - ), - show.legend = FALSE - ) + - scale_color_viridis_d() +knitr::asis_output('### `CellAssign` cell type annotations\n + +In this table, cells labeled as "Unknown cell type" are those which `CellAssign` could not confidently identify. +') +create_celltype_n_table(celltype_df, cellassign_celltype_annotation) |> + format_celltype_n_table() ``` - -## Comparison with clusters - -### UMAPs +## UMAPs In this section, we show UMAPs colored by clusters and cell types. A separate UMAP is shown for each cell typing method used. @@ -399,77 +223,10 @@ All other cell types are grouped together and labeled "Other cell type" (not to ```{r} # Create dataset for plotting UMAPs with lumped cell types -umap_df <- tibble::as_tibble( - reducedDim(processed_sce, "UMAP") -) |> - dplyr::mutate(clusters = processed_sce$clusters) - -# helper function for lumping cell types -lump_celltypes <- function(umap_df, - celltype_df, - celltype_method, - n_celltypes = 7) { - umap_df |> - dplyr::mutate( - celltypes = celltype_df[[glue::glue("{celltype_method}_celltype_annotation")]] |> - forcats::fct_lump_n( - n_celltypes, - other_level = "Other cell type" - ) - ) |> - # finally, rename temporary `celltypes` column to the provided method - dplyr::rename({{ celltype_method }} := celltypes) -} - -if (has_singler) { - umap_df <- lump_celltypes(umap_df, celltype_df, "singler") -} -if (has_cellassign) { - umap_df <- lump_celltypes(umap_df, celltype_df, "cellassign") -} -if (has_submitter) { - umap_df <- lump_celltypes(umap_df, celltype_df, "submitter") -} +umap_df <- lump_celltypes(celltype_df) ``` -```{r} -# Define helper function for making UMAPs in this section -# this function uses the default palette, which can be customized when -# calling the function -plot_umap <- function(umap_df, - color_variable, - legend_title, - legend_nrow = 2) { - ggplot(umap_df) + - aes( - x = UMAP1, - y = UMAP2, - color = {{ color_variable }} - ) + - geom_point( - size = 0.3, - alpha = 0.5 - ) + - # remove axis numbers and background grid - scale_x_continuous(labels = NULL, breaks = NULL) + - scale_y_continuous(labels = NULL, breaks = NULL) + - coord_fixed() + - guides( - color = guide_legend( - title = legend_title, - nrow = legend_nrow, - # more visible points in legend - override.aes = list( - alpha = 1, - size = 1.5 - ) - ) - ) + - theme(legend.position = "bottom") -} -``` - ```{r message=FALSE, warning=FALSE} clusters_plot <- plot_umap( @@ -492,173 +249,34 @@ if (length(levels(umap_df$clusters)) <= 8) { -```{r eval=has_singler, message=FALSE, warning=FALSE} +```{r eval = has_submitter, message=FALSE, warning=FALSE} plot_umap(umap_df, - singler, + submitter_celltype_annotation_lumped, "Cell types", legend_nrow = 4 ) + - ggtitle("UMAP colored by SingleR annotations") + + ggtitle("UMAP colored by submitter-provided annotations") + scale_color_brewer(palette = "Dark2") ``` -```{r, eval = has_cellassign, message=FALSE, warning=FALSE} +```{r eval=has_singler, message=FALSE, warning=FALSE} plot_umap(umap_df, - cellassign, + singler_celltype_annotation_lumped, "Cell types", legend_nrow = 4 ) + - ggtitle("UMAP colored by CellAssign annotations") + + ggtitle("UMAP colored by SingleR annotations") + scale_color_brewer(palette = "Dark2") ``` -```{r eval = has_submitter, message=FALSE, warning=FALSE} +```{r, eval = has_cellassign, message=FALSE, warning=FALSE} plot_umap(umap_df, - submitter, + cellassign_celltype_annotation_lumped, "Cell types", legend_nrow = 4 ) + - ggtitle("UMAP colored by submitter-provided annotations") + + ggtitle("UMAP colored by CellAssign annotations") + scale_color_brewer(palette = "Dark2") ``` - -### Heatmaps - -Below, we show heat maps comparing cell type annotations (along the y-axis) to clustering results (along the x-axis). -Heatmap colors represent the log number of cells present in both the given cell type and cluster. - - -```{r} -# Helper function for making a heatmap -create_celltype_heatmap <- function(x_vector, - y_vector, - x_label = "", - y_label = "", - y_title_location = "bottom", - column_names_rotation = 0, - row_font_size = 8, - column_font_size = 10) { - # build a matrix for making a heatmap - celltype_mtx <- table( - x_vector, - y_vector - ) |> - log1p() # log transform for visualization - - # Define CVD-friendly palette - heatmap_palette <- viridisLite::inferno(7, alpha = 1, begin = 0, end = 1, direction = 1) - - # heatmap - heat <- ComplexHeatmap::Heatmap(celltype_mtx, - # Overall heatmap parameters - col = heatmap_palette, - # Column parameters - column_title = y_label, - column_title_side = y_title_location, - column_dend_side = "top", - column_names_rot = column_names_rotation, - column_names_gp = grid::gpar(fontsize = column_font_size), - # Row parameters - row_dend_side = "left", - row_title_side = "right", - row_title = x_label, - row_names_gp = grid::gpar(fontsize = row_font_size), - # Legend parameters - heatmap_legend_param = list( - title = "Log(Number of cells)", - title_position = "leftcenter-rot", - legend_height = unit(4, "cm") - ) - ) - # draw with legend on left for spacing - ComplexHeatmap::draw(heat, heatmap_legend_side = "left") -} -``` - - -```{r, eval = has_singler, fig.height=5, fig.width=7} -knitr::asis_output("### `SingleR` cell type and cluster heatmap\n") -create_celltype_heatmap( - x_vector = celltype_df$singler_celltype_annotation, - y_vector = celltype_df$clusters, - y_label = "Clusters" -) -``` - -```{r, eval = has_cellassign, fig.height=5, fig.width=7} -knitr::asis_output("### `CellAssign` cell type and cluster heatmap\n") -create_celltype_heatmap( - x_vector = celltype_df$cellassign_celltype_annotation, - y_vector = celltype_df$clusters, - y_label = "Clusters" -) -``` - -```{r, eval = has_submitter, fig.height=5, fig.width=7} -knitr::asis_output("### Submitter-provided cell type and cluster heatmap\n") -create_celltype_heatmap( - x_vector = celltype_df$submitter_celltype_annotation, - y_vector = celltype_df$clusters, - y_label = "Clusters" -) -``` - - - -```{r, eval = has_cellassign & has_singler, fig.height=7, fig.width=8} -knitr::asis_output(" -## Comparison between annotations - -Below, we show a heatmap directly comparing `SingleR` and `CellAssign` cell type annotations. -Note that due to different annotations references, these methods may use different names for similar cell types. -") - -create_celltype_heatmap( - x_vector = celltype_df$singler_celltype_annotation, - y_vector = celltype_df$cellassign_celltype_annotation, - x_label = "SingleR annotations", - y_label = "CellAssign annotations", - column_names_rotation = 55, - column_font_size = 8 -) -``` - -```{r, eval = has_submitter & (has_cellassign | has_singler)} -knitr::asis_output(" -## Comparison with submitter annotations - -This section shows heatmap(s) directly comparing submitter-provided cell type annotations to cell type inferred with the given method(s). -Again, different annotations may use different names for similar cell types. -") -``` - - -```{r, eval = has_submitter & has_singler, fig.height=7, fig.width=8} - -create_celltype_heatmap( - x_vector = celltype_df$submitter_celltype_annotation, - y_vector = celltype_df$singler_celltype_annotation, - x_label = "Submitter-provided annotations", - y_label = "SingleR annotations", - y_title_location = "top", - column_names_rotation = 55, - column_font_size = 8 -) -``` - - -
- -```{r, eval = has_submitter & has_cellassign, fig.height=7, fig.width=8} -create_celltype_heatmap( - x_vector = celltype_df$submitter_celltype_annotation, - y_vector = celltype_df$cellassign_celltype_annotation, - x_label = "Submitter-provided annotations", - y_label = "CellAssign annotations", - y_title_location = "top", - column_names_rotation = 55, - column_font_size = 8 -) -``` \ No newline at end of file diff --git a/templates/qc_report/celltypes_supplemental_report.rmd b/templates/qc_report/celltypes_supplemental_report.rmd new file mode 100644 index 00000000..944d2a97 --- /dev/null +++ b/templates/qc_report/celltypes_supplemental_report.rmd @@ -0,0 +1,433 @@ +--- +params: + library: Example + processed_sce: NULL + date: !r Sys.Date() + +title: "`r glue::glue('ScPCA Cell type annotation supplemental QC report for {params$library}')`" +author: "Childhood Cancer Data Lab" +date: "`r params$date`" +output: + html_document: + toc: true + toc_depth: 2 + toc_float: + collapsed: false + number_sections: false + code_download: true +--- + +```{r setup, message = FALSE, echo = FALSE} +# knitr options +knitr::opts_chunk$set( + echo = FALSE +) + +library(SingleCellExperiment) +library(ggplot2) + +# Set default ggplot theme +theme_set( + theme_bw() + + theme( + plot.margin = margin(rep(20, 4)), + strip.background = element_rect(fill = "transparent") + ) +) + +# Source functions for preparing cell type data +source(file.path("utils", "celltype_functions.R")) + +# define library and sce object +library_id <- params$library +processed_sce <- params$processed_sce + +# check for annotation methods +has_singler <- "singler" %in% metadata(processed_sce)$celltype_methods +has_cellassign <- "cellassign" %in% metadata(processed_sce)$celltype_methods +has_submitter <- "submitter" %in% metadata(processed_sce)$celltype_methods + +# Create data frame of cell types +celltype_df <- create_celltype_df(processed_sce) +``` + +This supplemental cell type annotation report provides additional information about cell type annotation results. + + +```{r, eval = has_singler | has_cellassign } +knitr::asis_output(" +## Annotation Assessment + +In this section, we assess the reliability of cell type annotations using diagnostic plots. +") +``` + +```{r, eval = has_singler} +knitr::asis_output(" +### `SingleR` assessment + +`SingleR` assigns cell type scores based on Spearman correlations across features in the reference dataset. +We evaluate the reliability of cell type annotations using the per-cell _delta median_ statistic, which is the difference between the score for the cell's assigned label and the median score of all labels for the given cell. +Higher _delta median_ values indicate more confidence in the cell type annotation, although there is no specific threshold for calling absolute high vs. low confidence. +For more information, refer to the [`SingleR` book section on 'Annotation diagnostics'](https://bioconductor.org/books/release/SingleRBook/annotation-diagnostics.html#annotation-diagnostics). + + +In the plot below, each point is the _delta median_ statistic of a given cell with the given cell type annotation. +Points (cells) are colored by `SingleR`'s internal confidence assessment: High-quality cell annotations are shown in black, and low-quality cell annotations are shown in blue. +All blue points correspond to cells labeled as `Unknown cell type` in the `SingleR` result table in the previous section. +The red overlayed boxes represent the median ± interquartile range (IQR), specifically for high-quality annotations. +") +``` + + +```{r, eval = has_singler, warning=FALSE, message=FALSE,fig.height = 6, fig.width = 8} +# Prepare SingleR scores for plot + +# extract scores into matrix +singler_scores <- metadata(processed_sce)$singler_result$scores + +# Create data frame for plotting with delta median and the full *non-pruned* cell labels +delta_median_df <- tibble::tibble( + delta_median = rowMaxs(singler_scores) - rowMedians(singler_scores), + # Need to grab the non-pruned label for this plot + full_labels = metadata(processed_sce)$singler_result$labels, + # if pruned.labels are NA ==> low confidence + # so, negate for this variable: + confident = !is.na(metadata(processed_sce)$singler_result$pruned.labels) +) + +# If ontologies were used for `full_labels`, we'll need to map back to cell type names +# for the plot itself. +if ("singler_celltype_ontology" %in% names(celltype_df)) { + # we use inner_join b/c the above tibble does NOT contain "Unknown cell type", which + # we do not want to display here + delta_median_df <- delta_median_df |> + dplyr::inner_join( + tibble::tibble( + full_labels = celltype_df$singler_celltype_ontology, + celltype = celltype_df$singler_celltype_annotation + ) |> dplyr::distinct() + ) |> + dplyr::select(-full_labels) +} else { + # otherwise, full_labels already contain what we want to plot, so just rename it + delta_median_df <- delta_median_df |> + dplyr::rename(celltype = full_labels) + # still need to add levels: + # Note that the level "Unknown cell type" will still be present, but there is no such value in the data, so it won't matter + levels(delta_median_df$celltype) <- levels(celltype_df$singler_celltype_annotation) +} + +# Ensure we have no "Unknown cell type" values left: +if (any(delta_median_df$celltype == "Unknown cell type")) { + stop("Failed to process SingleR data for diagnostic plot.") +} + +# add column with ordered levels with wrapped labels for visualization +delta_median_df$annotation_wrapped <- factor( + delta_median_df$celltype, + levels = levels(delta_median_df$celltype), + labels = stringr::str_wrap(levels(delta_median_df$celltype), 30) +) + +# Subset the data to just confident points for median+/-IQR +delta_median_confident_df <- delta_median_df |> + dplyr::filter(confident) + +# Plot delta_median across celltypes colored by pruning +ggplot(delta_median_df) + + aes( + x = annotation_wrapped, + y = delta_median, + color = confident + ) + + ggforce::geom_sina( + size = 0.75, + alpha = 0.5, + # Keep red points mostly in line with black + position = position_dodge(width = 0.05) + ) + + labs( + x = "Cell type annotation", + y = "Delta median statistic", + color = "Confident cell type assignment" + ) + + scale_color_manual(values = c("blue", "black")) + + # add median/IQR + geom_boxplot( + data = delta_median_confident_df, # only use black points for median + color = "red", + width = 0.2, + size = 0.3, + alpha = 0, + # remove whiskers, outliers + outlier.shape = 0, + coef = 0 + ) + + guides( + color = guide_legend(override.aes = list(size = 1, alpha = 0.9)) + ) + + theme( + axis.text.x = element_text(angle = 55, hjust = 1, size = rel(0.85)), + legend.title = element_text(size = rel(0.75)), + legend.text = element_text(size = rel(0.75)), + legend.position = "bottom" + ) +``` + + + +```{r, eval = has_cellassign} +knitr::asis_output(" +### `CellAssign` assessment + +`CellAssign` computes the probability that each cell in the library is one of the provided cell types in the reference and ultimately annotates cells by assigning the cell type with the highest probability. +We therefore expect that cell-level probabilities that correspond to the annotated cell types will be high. +Conversely, we expect that cell-level probabilities that correspond to unannotated cell types will be low. + +In the plot below, we show distributions of the `CellAssign` probabilities for each assigned cell type, where the median is shown as a vertical line in each. +Colors represent whether the probabilities correspond to a cell with that given annotation: Purple distributions represent probabilies for cells that _were_ annotated as the given cell type. +Yellow distributions represent probabilities for cells that _were not_ annotated as the given cell type. +For distributions with fewer than three points, density plots cannot be calculated. +In these cases, we directly show values for individual cell probabilities as line segments. +") +``` + + +```{r, eval = has_cellassign, warning=FALSE, message=FALSE, fig.height = 8, fig.width = 7} +# Prepare CellAssign scores for plot +cellassign_prob_df <- metadata(processed_sce)$cellassign_predictions |> + # Change "other" to "Unknown cell type" + dplyr::rename(`Unknown cell type` = other) |> + tidyr::pivot_longer( + -barcode, + names_to = "celltype", + values_to = "probability" + ) |> + # remove cell types that were not annotated + dplyr::filter(celltype %in% celltype_df$cellassign_celltype_annotation) |> + # join with actual annotations + dplyr::inner_join( + tibble::tibble( + cellassign_celltype_annotation = celltype_df$cellassign_celltype_annotation, + barcode = colData(processed_sce)$barcodes # plural! + ) + ) |> + # add indicator for whether the celltype matches the annotation + dplyr::mutate( + annotated = cellassign_celltype_annotation == celltype + ) + +# reverse levels for ridgeplot layout, and wrap the labels +new_levels <- rev(levels(cellassign_prob_df$cellassign_celltype_annotation)) +cellassign_prob_df$annotation_wrapped <- factor( + cellassign_prob_df$celltype, + levels = new_levels, + labels = stringr::str_wrap(new_levels, 30) +) + +# find groups with <=2 cells to add back into plot +# These will always be `annotated = TRUE` +celltypes_leq2_cells <- cellassign_prob_df |> + dplyr::count(annotation_wrapped, annotated) |> + dplyr::filter(n <= 2) |> + dplyr::pull(annotation_wrapped) + +# data frame for adding those points back +leq2_probabilities_df <- cellassign_prob_df |> + dplyr::filter( + annotation_wrapped %in% celltypes_leq2_cells, + annotated + ) |> + dplyr::select(annotation_wrapped, probability, annotated) + +# Finally, the plot: +ggplot(cellassign_prob_df) + + aes( + x = probability, + y = annotation_wrapped, + fill = annotated + ) + + ggridges::stat_density_ridges( + quantile_lines = TRUE, + quantiles = 2, + alpha = 0.6, + # avoid overlap to extent possible in a template - + scale = 0.85 + ) + + scale_fill_viridis_d(direction = -1) + + labs( + x = "CellAssign probability", + y = "Annotated cell type", + fill = "Cell annotated as given cell type" + ) + + theme( + axis.text.y = element_text(size = 9), + legend.position = "bottom" + ) + + #### add line segment for N<3 distributions + geom_segment( + data = leq2_probabilities_df, + aes( + x = probability, + xend = probability, + y = annotation_wrapped, + yend = as.numeric(annotation_wrapped) + 0.25, + color = annotated + ), + show.legend = FALSE + ) + + scale_color_viridis_d() +``` + + +## Heatmaps + +Below, we show heat maps comparing cell type annotations (along the y-axis) to clustering results (along the x-axis). +Heatmap colors represent the log number of cells present in both the given cell type and cluster. + +```{r} +# heatmap function definition: + +#' Create a heatmap of cell type annotations with log1p transformation +#' +#' @param x_vector Vector of values for the x-axis (rows) +#' @param y_vector Vector of values for the y-axis (columns) +#' @param x_label x-axis label. Default is no label. +#' @param y_label y-axis label. Default is no label. +#' @param y_title_location location of the y-axis title. Default is on the bottom. +#' @param column_names_rotation degree to rotate column names. Default is 0. +#' @param row_font_size Size of row font. Default is 8. +#' @param column_font_size. Size of column font. Default is 10. +#' +#' @return heatmap output from `ComplexHeatmap::draw()` +create_celltype_heatmap <- function( + x_vector, + y_vector, + x_label = "", + y_label = "", + y_title_location = "bottom", + column_names_rotation = 0, + row_font_size = 8, + column_font_size = 10) { + # build a matrix for making a heatmap + celltype_mtx <- table( + x_vector, + y_vector + ) |> + log1p() # log transform for visualization + + # Define CVD-friendly palette + heatmap_palette <- viridisLite::inferno(7, alpha = 1, begin = 0, end = 1, direction = 1) + + # heatmap + heat <- ComplexHeatmap::Heatmap( + celltype_mtx, + # Overall heatmap parameters + col = heatmap_palette, + # Column parameters + column_title = y_label, + column_title_side = y_title_location, + column_dend_side = "top", + column_names_rot = column_names_rotation, + column_names_gp = grid::gpar(fontsize = column_font_size), + row_dend_side = "left", + row_title_side = "right", + row_title = x_label, + row_names_gp = grid::gpar(fontsize = row_font_size), + # Legend parameters + heatmap_legend_param = list( + title = "Log(Number of cells)", + title_position = "leftcenter-rot", + legend_height = unit(4, "cm") + ) + ) + # draw with legend on left for spacing + ComplexHeatmap::draw(heat, heatmap_legend_side = "left") +} +``` + +```{r, eval = has_submitter, fig.height=5, fig.width=7} +knitr::asis_output("### Submitter-provided cell type and cluster heatmap\n") +create_celltype_heatmap( + x_vector = celltype_df$submitter_celltype_annotation, + y_vector = celltype_df$clusters, + y_label = "Clusters" +) +``` + +```{r, eval = has_singler, fig.height=5, fig.width=7} +knitr::asis_output("### `SingleR` cell type and cluster heatmap\n") +create_celltype_heatmap( + x_vector = celltype_df$singler_celltype_annotation, + y_vector = celltype_df$clusters, + y_label = "Clusters" +) +``` + +```{r, eval = has_cellassign, fig.height=5, fig.width=7} +knitr::asis_output("### `CellAssign` cell type and cluster heatmap\n") +create_celltype_heatmap( + x_vector = celltype_df$cellassign_celltype_annotation, + y_vector = celltype_df$clusters, + y_label = "Clusters" +) +``` + + +```{r, eval = has_submitter & (has_cellassign | has_singler)} +knitr::asis_output(" +## Comparison with submitter annotations + +This section shows heatmap(s) directly comparing submitter-provided cell type annotations to cell type inferred with the given method(s). +Again, different annotations may use different names for similar cell types. +") +``` + + +```{r, eval = has_submitter & has_singler, fig.height=7, fig.width=8} +create_celltype_heatmap( + x_vector = celltype_df$submitter_celltype_annotation, + y_vector = celltype_df$singler_celltype_annotation, + x_label = "Submitter-provided annotations", + y_label = "SingleR annotations", + y_title_location = "top", + column_names_rotation = 55, + column_font_size = 8 +) +``` + + +
+ +```{r, eval = has_submitter & has_cellassign, fig.height=7, fig.width=8} +create_celltype_heatmap( + x_vector = celltype_df$submitter_celltype_annotation, + y_vector = celltype_df$cellassign_celltype_annotation, + x_label = "Submitter-provided annotations", + y_label = "CellAssign annotations", + y_title_location = "top", + column_names_rotation = 55, + column_font_size = 8 +) +``` + + +```{r, eval = has_cellassign & has_singler, fig.height=7, fig.width=8} +knitr::asis_output(" +## Comparison between `SingleR` and `CellAssign` annotations + +Below is a heatmap directly comparing `SingleR` and `CellAssign` cell type annotations. +Note that due to different annotations references, these methods may use different names for similar cell types. +") + +create_celltype_heatmap( + x_vector = celltype_df$singler_celltype_annotation, + y_vector = celltype_df$cellassign_celltype_annotation, + x_label = "SingleR annotations", + y_label = "CellAssign annotations", + column_names_rotation = 55, + column_font_size = 8 +) +``` + diff --git a/templates/qc_report/qc_report.rmd b/templates/qc_report/main_qc_report.rmd similarity index 98% rename from templates/qc_report/qc_report.rmd rename to templates/qc_report/main_qc_report.rmd index 7171a76f..0b506ca4 100644 --- a/templates/qc_report/qc_report.rmd +++ b/templates/qc_report/main_qc_report.rmd @@ -4,6 +4,7 @@ params: unfiltered_sce: !r scpcaTools:::sim_sce() filtered_sce: NULL processed_sce: NULL + celltype_report: NULL date: !r Sys.Date() seed: NULL @@ -121,6 +122,11 @@ if (has_processed) { has_umap <- FALSE has_celltypes <- FALSE } + +# check for celltypes_report if celltypes are present +if (has_celltypes & is.null(params$celltype_report)) { + stop("Cell type annotations were provided but the parameter specifying the cell type report file is missing.") +} ``` diff --git a/templates/qc_report/utils/celltype_functions.R b/templates/qc_report/utils/celltype_functions.R new file mode 100644 index 00000000..bf342202 --- /dev/null +++ b/templates/qc_report/utils/celltype_functions.R @@ -0,0 +1,69 @@ +library(SingleCellExperiment) + +# This script contains function definitions that are used by both +# `main_qc_report.rmd` and `celltypes_supplemental_report.rmd` to prepare +# cell typing results for analysis/visualization. + +#' Create `celltype_df` data frame for use in cell type QC reports +#' +#' @param processed_sce The processed sce object with cell type annotations in colData +#' +#' @return `celltype_df` with column of cell types, as factors, for each annotation method +create_celltype_df <- function(processed_sce) { + celltype_df <- processed_sce |> + scuttle::makePerCellDF(use.dimred = "UMAP") |> + # rename UMAP columns as needed to remove potential period added by `scuttle::makePerCellDF` + dplyr::rename_with( + \(x) stringr::str_replace(x, "^UMAP\\.", "UMAP"), + starts_with("UMAP") + ) |> + # only keep columns of interest + dplyr::select( + barcodes, + clusters, + contains("UMAP"), + contains("singler"), + contains("cellassign"), + contains("submitter") + ) + + if ("singler_celltype_annotation" %in% names(celltype_df)) { + celltype_df <- celltype_df |> + prepare_annotation_values(singler_celltype_annotation) + } + if ("cellassign_celltype_annotation" %in% names(celltype_df)) { + celltype_df <- celltype_df |> + prepare_annotation_values(cellassign_celltype_annotation) + } + + return (celltype_df) +} + +# Functions for working with cell type annotations in QC reports + +#' Prepare and reformat cell type annotation values for use in QC reports +#' Unknown cell types are updated with the label "Unknown cell type", and +#' cell types are ordered in order of descending frequency, but with +#' "Unknown cell type" as the last level +#' +#' @param df The data frame containing cell type annotations, one row per cell +#' @param annotation_column The column (written plainly, not a string) containing annotations to reformat +#' +#' @return Updated data frame with the `annotation_column` reformatted +prepare_annotation_values <- function(df, annotation_column) { + df |> + dplyr::mutate( + {{ annotation_column }} := dplyr::case_when( + # singler condition + is.na({{ annotation_column }}) ~ "Unknown cell type", + # cellassign conditon + {{ annotation_column }} == "other" ~ "Unknown cell type", + # otherwise, keep it + .default = {{ annotation_column }} + ) |> + forcats::fct_infreq() |> + forcats::fct_relevel("Unknown cell type", after = Inf) + ) +} + +